#### In this notebook I will pre-process the data to get it ready for modeling. Since I will be using Natural Language Porcessing (NLP) models I will focus on vectoring the text of the titles

In [220]:
import pandas as pd
import datetime as dt
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
from sklearn.neighbors import KNeighborsRegressor

In [2]:
posts_df = pd.read_csv('../food_trends/Data/reddit_6_months.csv')

posts_df.drop(columns=['subreddit','selftext'], inplace=True)
posts_df

Unnamed: 0,id,title,created_utc,num_comments,score
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1
2,uu6apo,[homemade] 🇲🇦,1653076639,0,1
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1
...,...,...,...,...,...
50172,r8dw7i,[homemade] Korean beef lettuce wraps,1638580177,1,1
50173,r8du3q,[homemade] Ramen. Inexperienced cook and I’m p...,1638580002,0,1
50174,r8dsud,[homemade] I am a very inexperienced cook and ...,1638579895,2,1
50175,r8dquz,[Homemade] Beef stew w/ fresh baked bread,1638579723,18,1


#### first, I will check for any null values

In [169]:
posts_df.isnull().sum()

id               0
title            0
created_utc      0
num_comments     0
score            0
post_time_utc    0
dtype: int64

#### convert epoch time to datetime

In [4]:
date = dt.datetime.fromtimestamp(1653077037)
str(date)

'2022-05-20 16:03:57'

In [200]:
posts_df['post_time_utc']= [str(dt.datetime.fromtimestamp(x)) for x in posts_df['created_utc']]

In [201]:
posts_df.dtypes

id               object
title            object
created_utc       int64
num_comments      int64
score             int64
post_time_utc    object
dtype: object

In [238]:
posts_df.to_csv('./Data/posts_with_date.csv', index=False)

#### next, I will work on vectorizing the text of the titles

In [34]:
X = posts_df['title']
y = posts_df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [35]:
X_train.dtypes

dtype('O')

In [36]:
cv = CountVectorizer()

In [37]:
cv.fit(X_train)

CountVectorizer()

In [38]:
X_train_cv = cv.transform(X_train)

In [39]:
X_test_cv = cv.transform(X_test)

In [40]:
X_train_cv

<37632x14354 sparse matrix of type '<class 'numpy.int64'>'
	with 257085 stored elements in Compressed Sparse Row format>

In [41]:
text_df = pd.DataFrame(X_train_cv.A, columns = cv.get_feature_names_out())
text_df

Unnamed: 0,00,000,007,00pm,03,0336,04,048,05unamksrom,06,...,𝓼𝓾𝓫,𝔀𝓲𝓽𝓱,𝗛𝗼𝘄,𝗶𝗻,𝗹𝗲𝘀𝘀,𝗺𝗮𝗸𝗲,𝗺𝗶𝗻𝘂𝘁𝗲,𝗺𝘂𝗴,𝗽𝗮𝗻𝗰𝗮𝗸𝗲,𝘁𝗵𝗮𝗻
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37627,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37628,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37629,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37630,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
text_df['post_time_utc'] = posts_df['post_time_utc']

In [92]:
text_df.sum(numeric_only=True).sort_values(ascending=False).head(25)

homemade     30443
and          11363
with          8634
ate           6964
chicken       4323
cheese        2154
the           1836
in            1815
amp           1756
rice          1703
sauce         1691
fried         1684
pizza         1597
beef          1438
garlic        1414
of            1402
on            1361
pork          1306
for           1268
my            1258
potatoes      1189
chocolate     1175
bacon         1154
steak         1121
cake          1084
dtype: int64

### try with make column transformer

In [202]:
X2 = posts_df[['title', 'post_time_utc']]
y2 = posts_df['score']

In [73]:
X2.dtypes

title                    object
post_time_utc    datetime64[ns]
dtype: object

In [205]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2, random_state=20)

In [216]:
ct = make_column_transformer(
    (CountVectorizer(),'title'),
    remainder= 'passthrough',
    n_jobs= -1,
    sparse_threshold= 0,
    verbose_feature_names_out= False
)

In [217]:
ct.fit(X2_train)

ColumnTransformer(n_jobs=-1, remainder='passthrough', sparse_threshold=0,
                  transformers=[('countvectorizer', CountVectorizer(),
                                 'title')],
                  verbose_feature_names_out=False)

In [218]:
ct.transform(X2_train)

array([[0, 0, 0, ..., 0, 0, '2022-05-01 23:13:06'],
       [0, 0, 0, ..., 0, 0, '2022-03-12 11:22:58'],
       [0, 0, 0, ..., 0, 0, '2022-04-21 23:23:24'],
       ...,
       [0, 0, 0, ..., 0, 0, '2022-01-11 16:36:43'],
       [0, 0, 0, ..., 0, 0, '2022-02-12 20:32:52'],
       [0, 0, 0, ..., 0, 0, '2021-12-09 01:25:25']], dtype=object)

In [219]:
ct.get_feature_names_out()

array(['00', '000', '007', ..., '𝗽𝗮𝗻𝗰𝗮𝗸𝗲', '𝘁𝗵𝗮𝗻', 'post_time_utc'],
      dtype=object)

#### pickling

In [225]:
with open('./Models/column_transformer.pkl', 'wb') as pickle_out:
    pickle.dump(ct, pickle_out)

In [236]:
with open('./Data/X_train.pkl', 'wb') as pickle_out:
    pickle.dump(X2_train, pickle_out)

In [233]:
with open('./Data/X_test.pkl', 'wb') as pickle_out:
    pickle.dump(X2_test, pickle_out)

In [234]:
with open('./Data/y_train.pkl', 'wb') as pickle_out:
    pickle.dump(y2_train, pickle_out)

In [235]:
with open('./Data/y_test.pkl', 'wb') as pickle_out:
    pickle.dump(y2_test, pickle_out)