#### In this notebook I will pre-process the data to get it ready for modeling. Since I will be using Natural Language Porcessing (NLP) models I will focus on vectoring the text of the titles

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
from sklearn.neighbors import KNeighborsRegressor

In [2]:
posts_df = pd.read_csv('../food_trends/Data/reddit_6_months.csv')

posts_df.drop(columns=['subreddit','selftext'], inplace=True)
posts_df

Unnamed: 0,id,title,created_utc,num_comments,score
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1
2,uu6apo,[homemade] üá≤üá¶,1653076639,0,1
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1
...,...,...,...,...,...
50172,r8dw7i,[homemade] Korean beef lettuce wraps,1638580177,1,1
50173,r8du3q,[homemade] Ramen. Inexperienced cook and I‚Äôm p...,1638580002,0,1
50174,r8dsud,[homemade] I am a very inexperienced cook and ...,1638579895,2,1
50175,r8dquz,[Homemade] Beef stew w/ fresh baked bread,1638579723,18,1


#### first, I will check for any null values

In [3]:
posts_df.isnull().sum()

id              0
title           0
created_utc     0
num_comments    0
score           0
dtype: int64

#### convert epoch time to datetime

In [4]:
date = dt.datetime.fromtimestamp(1653077037)
str(date)

'2022-05-20 16:03:57'

In [5]:
posts_df['post_time_utc']= [str(dt.datetime.fromtimestamp(x)) for x in posts_df['created_utc']]

In [6]:
posts_df.dtypes

id               object
title            object
created_utc       int64
num_comments      int64
score             int64
post_time_utc    object
dtype: object

In [7]:
posts_df.to_csv('./Data/posts_with_date.csv', index=False) ##emily go back and re-export this

#### combine scores data with post data

In [8]:
posts_df.rename(columns={'score':'score_old', 'num_comments':'comments_old'}, inplace=True)

read in scores and comments from api

In [140]:
df_10 = pd.read_csv('./Data/0_to_10k_scores.csv')

df_20 = pd.read_csv('./Data/10_to_20k_scores.csv')

df_30 = pd.read_csv('./Data/2_to_3k_scores.csv')

df_40 = pd.read_csv('./Data/3_to_4k_scores.csv')

df_50 = pd.read_csv('./Data/4_to_5k_scores.csv')

df_60 = pd.read_csv('./Data/5_to_6k_scores.csv')

df_70 = pd.read_csv('./Data/7_to_7k_scores.csv')

df_80 = pd.read_csv('./Data/7_to_8k_scores.csv')

df_90 = pd.read_csv('./Data/8_to_9k_scores.csv')

df_100 = pd.read_csv('./Data/9_to_10k_scores.csv')

In [144]:
df_90.shape

(1000, 3)

concatenate into 1 dataframe

In [146]:
scores_df = pd.concat([df_10, df_20, df_30, df_40, df_50, df_60, df_70, df_80, df_90, df_100])

In [147]:
scores_df.shape, posts_df.shape

((11238, 3), (50177, 6))

In [150]:
scores_df.dtypes, posts_df.dtypes

(id           object
 score         int64
 comments    float64
 dtype: object,
 id               object
 title            object
 created_utc       int64
 comments_old      int64
 score_old         int64
 post_time_utc    object
 dtype: object)

In [153]:
scores_df.isnull().sum()

id          0
score       0
comments    1
dtype: int64

In [154]:
scores_df.dropna(inplace=True)

In [156]:
scores_df.drop_duplicates(inplace=True)

join to the posts dataframe

In [157]:
posts_scores_df = posts_df.merge(right=scores_df, how='inner')
posts_scores_df

Unnamed: 0,id,title,created_utc,comments_old,score_old,post_time_utc,score,comments
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1,2022-05-20 16:03:57,69,7.0
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1,2022-05-20 15:59:59,37,4.0
2,uu6apo,[homemade] üá≤üá¶,1653076639,0,1,2022-05-20 15:57:19,1,0.0
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1,2022-05-20 15:48:11,16,2.0
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1,2022-05-20 15:38:20,1,0.0
...,...,...,...,...,...,...,...,...
10790,u3jog2,[Homemade] Spinach fettuccine alfredo,1649948916,0,1,2022-04-14 11:08:36,13,0.0
10791,u3jezk,[homemade] Glowup of instant noodles for me an...,1649948229,0,1,2022-04-14 10:57:09,15,2.0
10792,u3jcej,[I ate] Napoleon cake and lemon curd tartlet.,1649948026,0,1,2022-04-14 10:53:46,14,1.0
10793,u3j8ay,[Homemade] Tonkatsu bento box that I made for ...,1649947706,0,1,2022-04-14 10:48:26,30,5.0


In [158]:
posts_scores_df.dtypes

id                object
title             object
created_utc        int64
comments_old       int64
score_old          int64
post_time_utc     object
score              int64
comments         float64
dtype: object

In [159]:
posts_scores_df.isnull().sum()

id               0
title            0
created_utc      0
comments_old     0
score_old        0
post_time_utc    0
score            0
comments         0
dtype: int64

Engineering a field to connote 'viral'. Viral is being defined as a post with a score in the top 2% of all posts

In [162]:
posts_scores_df['percentile']= posts_scores_df.score.rank(pct=True)
#https://www.geeksforgeeks.org/percentile-rank-of-a-column-in-a-pandas-dataframe/

In [163]:
posts_scores_df['viral'] = np.where(posts_scores_df['percentile'] > .98, 1, 0 )

In [164]:
posts_scores_df.to_csv('./Data/posts_scores_dates.csv', index=False)

#### next, I will work on vectorizing the text of the titles

In [24]:
X = posts_scores_df['title']
y = posts_scores_df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [26]:
cv = CountVectorizer()

In [27]:
cv.fit(X_train)

CountVectorizer()

In [28]:
X_train_cv = cv.transform(X_train)

In [29]:
X_test_cv = cv.transform(X_test)

In [30]:
X_train_cv

<12816x6169 sparse matrix of type '<class 'numpy.int64'>'
	with 89528 stored elements in Compressed Sparse Row format>

In [31]:
text_df = pd.DataFrame(X_train_cv.A, columns = cv.get_feature_names_out())
text_df

Unnamed: 0,00,04,048,05unamksrom,06,10,100,1000,1000x,10pm,...,Ëá™Âà∂,Ë±öÈ™®„É©„Éº„É°„É≥,ùóõùóºùòÑ,ùó∂ùóª,ùóπùó≤ùòÄùòÄ,ùó∫ùóÆùó∏ùó≤,ùó∫ùó∂ùóªùòÇùòÅùó≤,ùó∫ùòÇùó¥,ùóΩùóÆùóªùó∞ùóÆùó∏ùó≤,ùòÅùóµùóÆùóª
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12811,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
text_df['post_time_utc'] = posts_df['post_time_utc']

In [34]:
text_df.sum(numeric_only=True).sort_values(ascending=False).head(25)

homemade    10061
and          4176
with         3122
ate          2628
chicken      1532
cheese        729
the           670
in            660
sauce         607
amp           598
rice          591
fried         580
garlic        539
pizza         525
of            492
on            472
my            451
pork          440
made          430
salad         412
for           410
chef          406
potatoes      403
bacon         394
beef          380
dtype: int64

### try with make column transformer

In [None]:
X2 = posts_df[['title', 'post_time_utc']]
y2 = posts_df['score']

In [None]:
X2.dtypes

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2, random_state=20)

In [None]:
ct = make_column_transformer(
    (CountVectorizer(),'title'),
    remainder= 'passthrough',
    n_jobs= -1,
    sparse_threshold= 0,
    verbose_feature_names_out= False
)

In [None]:
ct.fit(X2_train)

In [None]:
ct.transform(X2_train)

In [None]:
ct.get_feature_names_out()

#### pickling

In [None]:
with open('./Models/column_transformer.pkl', 'wb') as pickle_out:
    pickle.dump(ct, pickle_out)

In [None]:
with open('./Data/X_train.pkl', 'wb') as pickle_out:
    pickle.dump(X2_train, pickle_out)

In [None]:
with open('./Data/X_test.pkl', 'wb') as pickle_out:
    pickle.dump(X2_test, pickle_out)

In [None]:
with open('./Data/y_train.pkl', 'wb') as pickle_out:
    pickle.dump(y2_train, pickle_out)

In [None]:
with open('./Data/y_test.pkl', 'wb') as pickle_out:
    pickle.dump(y2_test, pickle_out)