#### In this notebook I will pre-process the data to get it ready for modeling. 

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import pickle
import pytz 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Load the raw Reddit post dump and discard the 'subreddit' and 'selftext'
# columns in the same expression, so the cell always rebuilds posts_df from disk.
raw_posts_path = '../food_trends/Data/reddit_6_months.csv'
posts_df = pd.read_csv(raw_posts_path).drop(columns=['subreddit', 'selftext'])

# Preview the first five rows.
posts_df.head()

Unnamed: 0,id,title,created_utc,num_comments,score
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1
2,uu6apo,[homemade] 🇲🇦,1653076639,0,1
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1


#### First, I will check for any null values.

In [3]:
# Number of missing values in each column of the posts frame.
posts_df.isna().sum()

id              0
title           0
created_utc     0
num_comments    0
score           0
dtype: int64

#### Convert epoch time to datetime, then convert UTC to US Eastern time (EST/EDT, depending on the date)

resource used - https://www.adamsmith.haus/python/answers/how-to-convert-a-utc-datetime-to-a-local-datetime-in-python

In [4]:
# Worked example on a single value: interpret one epoch timestamp (seconds)
# as a timezone-aware UTC datetime.
sample_epoch = 1653077037
date = dt.datetime.fromtimestamp(sample_epoch, tz=pytz.utc)
date

datetime.datetime(2022, 5, 20, 20, 3, 57, tzinfo=<UTC>)

In [5]:
# Despite its name, this holds a pytz timezone (tzinfo) object, not a datetime.
# NOTE(review): the other cells visible here call pytz.timezone('US/Eastern')
# directly and never reference this variable — confirm whether it is still needed.
local_datetime = pytz.timezone('US/Eastern')

In [87]:
# Render the sample timestamp in US/Eastern as a "YYYY-MM-DD, HH:MM" string.
# The value is built straight from the epoch rather than from the previous
# contents of `date`: the earlier form rebound `date` from a datetime to a str,
# so executing the cell a second time raised AttributeError (a str has no
# .astimezone). Building from the epoch makes the cell safe to re-run and
# yields the same string.
eastern = pytz.timezone('US/Eastern')
date = dt.datetime.fromtimestamp(1653077037, eastern).strftime("%Y-%m-%d, %H:%M")

In [88]:
# Show the converted sample; at this point `date` is a formatted string,
# not a datetime object.
date

'2022-05-20, 16:03'

In [89]:
# Convert every post's epoch seconds to US/Eastern wall-clock time in a single
# vectorized pass, instead of a per-row Python loop that round-trips each value
# through a formatted string.
#   - unit='s', utc=True : treat created_utc as seconds since the epoch, in UTC.
#   - tz_convert         : shift to US/Eastern (daylight saving handled per date).
#   - tz_localize(None)  : drop the tz marker but keep the Eastern wall-clock
#                          value, i.e. the same naive timestamps as before.
#   - floor('min')       : the previous "%Y-%m-%d, %H:%M" format discarded the
#                          seconds, so minute resolution is kept for compatibility.
posts_df['post_time_est'] = (
    pd.to_datetime(posts_df['created_utc'], unit='s', utc=True)
    .dt.tz_convert('US/Eastern')
    .dt.tz_localize(None)
    .dt.floor('min')
)

In [91]:
# Make sure the Eastern-time column is stored with a datetime dtype.
parsed_post_times = pd.to_datetime(posts_df['post_time_est'])
posts_df['post_time_est'] = parsed_post_times

#### combine scores data with post data

In [94]:
# Tag the score / comment counts that came with the post dump as "old",
# which frees the `score` name for the values read from the score files below.
legacy_column_names = {'score': 'score_old', 'num_comments': 'comments_old'}
posts_df = posts_df.rename(columns=legacy_column_names)

Read in the scores and comment counts retrieved from the API.

In [95]:
# Load the ten score/comment batch files, in order, into df_10 ... df_100.
# NOTE(review): the file names do not follow a single pattern ('0_to_10k',
# '10_to_20k', then '2_to_3k', ...), and '7_to_7k' looks like it may be a typo
# since no '6_to_7k' file is read — confirm against the contents of ./Data.
score_batch_paths = [
    './Data/0_to_10k_scores.csv',
    './Data/10_to_20k_scores.csv',
    './Data/2_to_3k_scores.csv',
    './Data/3_to_4k_scores.csv',
    './Data/4_to_5k_scores.csv',
    './Data/5_to_6k_scores.csv',
    './Data/7_to_7k_scores.csv',
    './Data/7_to_8k_scores.csv',
    './Data/8_to_9k_scores.csv',
    './Data/9_to_10k_scores.csv',
]

(df_10, df_20, df_30, df_40, df_50,
 df_60, df_70, df_80, df_90, df_100) = [pd.read_csv(path) for path in score_batch_paths]

Concatenate the batches into one dataframe.

In [96]:
# Stack the ten batch frames into a single scores table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# batch keeps its own row labels, so the combined frame carries repeated index
# values and label-based lookups (.loc) would return several rows.
score_batches = [df_10, df_20, df_30, df_40, df_50, df_60, df_70, df_80, df_90, df_100]
scores_df = pd.concat(score_batches, ignore_index=True)

In [97]:
# (rows, columns) of the scores table and the posts table, in that order.
tuple(frame.shape for frame in (scores_df, posts_df))

((11238, 3), (50177, 6))

In [98]:
# Compare column dtypes of both frames before joining them.
scores_df.dtypes, posts_df.dtypes

(id           object
 score         int64
 comments    float64
 dtype: object,
 id                       object
 title                    object
 created_utc               int64
 comments_old              int64
 score_old                 int64
 post_time_est    datetime64[ns]
 dtype: object)

In [99]:
# Number of missing values in each column of the scores table.
scores_df.isna().sum()

id          0
score       0
comments    1
dtype: int64

In [100]:
# Discard any row that has a missing value.
scores_df = scores_df.dropna()

In [101]:
# Remove rows that are exact duplicates across all columns (first occurrence kept).
scores_df = scores_df.drop_duplicates()

In [102]:
# (rows, columns) remaining after the null and duplicate rows were removed.
scores_df.shape

(10795, 3)

Join the scores to the posts dataframe.

In [103]:
# Inner-join posts with their scores on the post id, keeping only posts that
# have a matching row in scores_df.
# The key is named explicitly: with no `on`, merge joins on every column name
# the two frames happen to share, so adding or renaming a column later would
# silently change the join.
posts_scores_df = posts_df.merge(right=scores_df, how='inner', on='id')
posts_scores_df.head(5)

Unnamed: 0,id,title,created_utc,comments_old,score_old,post_time_est,score,comments
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1,2022-05-20 16:03:00,69,7.0
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1,2022-05-20 15:59:00,37,4.0
2,uu6apo,[homemade] 🇲🇦,1653076639,0,1,2022-05-20 15:57:00,1,0.0
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1,2022-05-20 15:48:00,16,2.0
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1,2022-05-20 15:38:00,1,0.0


In [104]:
# Column dtypes of the merged posts + scores frame.
posts_scores_df.dtypes

id                       object
title                    object
created_utc               int64
comments_old              int64
score_old                 int64
post_time_est    datetime64[ns]
score                     int64
comments                float64
dtype: object

In [106]:
# Number of missing values in each column of the merged frame.
posts_scores_df.isna().sum()

id               0
title            0
created_utc      0
comments_old     0
score_old        0
post_time_est    0
score            0
comments         0
dtype: int64

In [107]:
# (rows, columns) of the merged posts + scores frame.
posts_scores_df.shape

(10795, 8)

#### Engineer a field to denote 'viral'. A post is defined as viral if its score is in the top 2% of all posts.

In [108]:
# Percentile rank of each post's score, expressed as a fraction of all posts
# (tied scores share their average rank).
# https://www.geeksforgeeks.org/percentile-rank-of-a-column-in-a-pandas-dataframe/
score_percentile = posts_scores_df['score'].rank(pct=True)
posts_scores_df['percentile'] = score_percentile

# A post is flagged viral (1) when its percentile rank is strictly above 0.98,
# otherwise 0.
above_cutoff = posts_scores_df['percentile'] > .98
posts_scores_df['viral'] = np.where(above_cutoff, 1, 0)

posts_scores_df.head()

Unnamed: 0,id,title,created_utc,comments_old,score_old,post_time_est,score,comments,percentile,viral
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1,2022-05-20 16:03:00,69,7.0,0.83673,0
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1,2022-05-20 15:59:00,37,4.0,0.699491,0
2,uu6apo,[homemade] 🇲🇦,1653076639,0,1,2022-05-20 15:57:00,1,0.0,0.117647,0
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1,2022-05-20 15:48:00,16,2.0,0.443585,0
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1,2022-05-20 15:38:00,1,0.0,0.117647,0


In [109]:
# Write the merged, labelled frame to disk without the row index.
processed_output_path = './Data/posts_scores_dates.csv'
posts_scores_df.to_csv(processed_output_path, index=False)