# Project 3 - 2. EDA and Data Cleaning

In [356]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LinearRegression

#### Read combined reddit posts csv file

In [357]:
df = pd.read_csv('./reddit_posts.csv')
df.head()
df.shape

(1738, 2)

In [358]:
# Check the type of the data
type(df['data'][0])

str

In [359]:
# Convert the data type to dictionary to make it easy to utilize.
df['data'] = df['data'].map(lambda x: ast.literal_eval(x))
df.head()

Unnamed: 0,data,kind
0,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3
1,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3
2,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3
3,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3
4,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3


In [360]:
df['data'][0]

{'approved_at_utc': None,
 'subreddit': 'FoodNYC',
 'selftext': "My boyfriend and I are going to NYC tomorrow through friday and are looking for some cool (and affordable) restaurant suggestions! We are staying in Koreatown but our schedule is wide open so we're open to places anywhere that we can get to via subway. One thing I'd really like to try is soul food in Harlem and would also like to find a cheap, generic new york pizza place for a quick bite. We're open to literally anything unless its crazy expensive. Thanks in advance!!! :)",
 'author_fullname': 't2_l0i2u8p',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'First timer looking for suggestions!',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/FoodNYC',
 'hidden': False,
 'pwls': None,
 'link_flair_css_class': None,
 'downs': 0,
 'thumbnail_height': None,
 'hide_score': False,
 'name': 't3_a7gkn5',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'author_flair_backgr

> Select some keys that might be meaningful by intuition.

In [361]:
# Keys that seems to be meaningful.
keys = ['title',
        'selftext',
        'subreddit',
        'num_comments',
        'is_video',
        'over_18',
        'likes',
        'score',
        'ups',
        'downs',
        'link_flair_text_color']

In [362]:
# Append dataframe with the keys above.
for i in range(len(df)):
    for j in keys:
        df.loc[i, j] = df['data'][i][j]

In [363]:
# Check the number of values in each feature, except title, selftext, and subreddit.
check_keys =['num_comments',
        'is_video',
        'over_18',
        'likes',
        'score',
        'ups',
        'downs',
        'link_flair_text_color']

for i in check_keys:
    print(df[i].value_counts())
    print('-'*30)

0.0     217
2.0     146
3.0     131
1.0     127
4.0     108
5.0     100
6.0      97
7.0      89
8.0      85
10.0     64
9.0      58
11.0     55
12.0     49
15.0     43
13.0     41
14.0     33
16.0     30
17.0     25
18.0     25
19.0     21
20.0     15
22.0     15
25.0     15
21.0     15
24.0     13
26.0     13
31.0     11
23.0      9
27.0      9
46.0      6
       ... 
32.0      5
29.0      5
30.0      4
28.0      3
49.0      3
50.0      3
40.0      3
34.0      3
35.0      3
38.0      3
39.0      3
33.0      3
41.0      3
43.0      3
42.0      2
54.0      2
45.0      2
51.0      2
44.0      2
65.0      1
52.0      1
61.0      1
74.0      1
47.0      1
58.0      1
48.0      1
60.0      1
36.0      1
55.0      1
73.0      1
Name: num_comments, Length: 61, dtype: int64
------------------------------
False    1738
Name: is_video, dtype: int64
------------------------------
False    1737
True        1
Name: over_18, dtype: int64
------------------------------
Series([], Name: likes, dtype: 

> all null values - [likes] - Drop

> all same values - [downs, over_18, is_video, link_flair_text_color] - Drop

In [364]:
# Drop the meaningless features.
df.drop(['likes', 'downs', 'over_18', 'is_video', 'link_flair_text_color'], axis=1, inplace=True)

In [365]:
df.head()

Unnamed: 0,data,kind,title,selftext,subreddit,num_comments,score,ups
0,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,First timer looking for suggestions!,My boyfriend and I are going to NYC tomorrow t...,FoodNYC,20.0,5.0,5.0
1,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,Chinatown on Christmas Day,Dumb question:. Can I walk in to a restaurant ...,FoodNYC,5.0,4.0,4.0
2,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,Hidden Gem: Hi Collar's Katsu Sando,,FoodNYC,11.0,36.0,36.0
3,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,anybody want to open a restaurant in Howard be...,,FoodNYC,0.0,0.0,0.0
4,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,Midtown/Hell's Kitchen Suggestions for a small...,"Hey, I've got a group of six hanging around Mi...",FoodNYC,5.0,1.0,1.0


> similar values - [score, ups] - check if they are same and then drop one of them.

In [366]:
# Since 'score' and 'like' are numeric values, use .describe() to see the distribution.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_comments,1738.0,8.869965,9.965704,0.0,2.0,6.0,12.0,74.0
score,1738.0,13.926352,15.849911,0.0,5.0,9.0,17.0,142.0
ups,1738.0,13.926352,15.849911,0.0,5.0,9.0,17.0,142.0


> We can assume that score and ups has exact same statistics. We will drop ups features.

In [367]:
df.drop('ups', axis=1, inplace=True)
df.head()

Unnamed: 0,data,kind,title,selftext,subreddit,num_comments,score
0,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,First timer looking for suggestions!,My boyfriend and I are going to NYC tomorrow t...,FoodNYC,20.0,5.0
1,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,Chinatown on Christmas Day,Dumb question:. Can I walk in to a restaurant ...,FoodNYC,5.0,4.0
2,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,Hidden Gem: Hi Collar's Katsu Sando,,FoodNYC,11.0,36.0
3,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,anybody want to open a restaurant in Howard be...,,FoodNYC,0.0,0.0
4,"{'approved_at_utc': None, 'subreddit': 'FoodNY...",t3,Midtown/Hell's Kitchen Suggestions for a small...,"Hey, I've got a group of six hanging around Mi...",FoodNYC,5.0,1.0


#### Let's drop 'data', 'kind' columns since we already extracted information.

In [368]:
df.drop(['data', 'kind'], axis=1, inplace=True)

#### Assign classes ( FoodNYC = 1, FoodLosAngeles = 0 )

In [369]:
df['subreddit'] = df.apply(lambda row: 1 if row.subreddit == 'FoodNYC' else 0, axis=1)

#### Fill empty string in the posts instead of null value.

In [370]:
df.replace('', 'notext', inplace=True)

#### Save as csv file

In [371]:
df.to_csv('./reddit_text.csv', index=False)