# Project 3: Web APIs & Classification

### Contents

- [Problem Statement](#Problem-Statement)
- [Functions](#Functions)
- [Importing](#Importing)


- [Inspect Data](#Inspect-Data)
- [Clean Data](#Clean-Data)
- [Output Clean Data](#Output-Clean-Data)


- [EDA](#EDA)


- [Create Feature Matrix and Target](#Create-Feature-Matrix-and-Target)



- [Output Model Predictions](#Output-Model-Predictions)


- [Descriptive and Inferential Statistics](#Descriptive-and-Inferential-Statistics)
- [Outside Research](#Outside-Research)
- [Conclusions and Recommendations](#Conclusions-and-Recommendations)

### Problem Statement

- Blank

### Configurations and Libraries

In [1]:
# user configuration

# True: scape data from website -> save as json files
# False: load json files
scape_data = False

# scape data for subreddits[0] if scape_data=True
scape_index = 0
# scape data for subreddits[1] if scape_data=True
#scape_index = 1

num_requests = 50

posts_limit = 900

subreddits = ['boardgames','mobilegames']

url = "https://www.reddit.com/r/"
headers = {'User-agent':'Bleep blorp bot 0.1'}

In [2]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix

# nlp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# web
import requests
import json
from IPython.display import Image
from IPython.core.display import HTML

# others
import time
import datetime as dt
#import re
import os

In [3]:
# file paths

input_path = '../data/input/'
mid_path = '../data/mid/'
output_path = '../data/output/'

image_path = '../images/'

### Functions

In [4]:
# output scaped data to json files
def write_json_files(data):    
    
    timestamp = dt.datetime.now()
    timestamp = timestamp.strftime(" %Y_%m_%d %H_%M_%S")
    
    filepath = input_path + subreddits[scape_index] + '/'
    filename = subreddits[scape_index] + timestamp + '.json'
    
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    
    file = open(filepath + filename,"w+")    
    json.dump(data,file)
    
    print('created',filename)

In [5]:
# create posts list from json files
def read_json_files(subreddit):   
    
    files = []    
    filepath = input_path + subreddit

    # r=root, d=directories, f = files
    for r, d, f in os.walk(filepath):
        for file in f:
            if '.json' in file:
                files.append(os.path.join(r, file))
                
    posts = []

    # load in newer jsons first
    for f in files[::-1]:
        print(f)
        file_data = open(f).read()
        json_data = json.loads(file_data)

        posts.extend(json_data['data']['children'])
        after = json_data['data']['after']
        
    return posts

In [6]:
# created df with posts

def create_df(posts,df):

    for idx,p in enumerate(posts):

        p_keys = p['data'].keys()

        for col in cols:

            if col in p_keys:
                df.at[idx,col] = p['data'][col]
            else:
                df.at[idx,col] = np.nan
                print(idx,col,'nan')
                
    return df

### Scape or Import Data

In [7]:
# old method (not used)
# get json from reddit boardgames

# if scape_data == True:

#     response = requests.get(url,headers=headers)
#     print(response.status_code)

#     json_data = response.json()
#     sorted(json_data.keys())

In [8]:
# old method (not used)
# print number of posts in json_data

# if scape_data == True:

#     print(len(json_data['data']['children']))

#     # print info in kind
#     print(json_data['kind'])
#     print('')

#     # print data keys
#     print(sorted(json_data['data'].keys()))
#     print('')

#     # print 1st post
#     print(json_data['data']['children'][0]['data'])
#     print('')

#     # print id of of last post
#     print(json_data['data']['after'])

In [9]:
# old method (not used)
# print ids of all posts

# if scape_data == True:

#     [post['data']['name'] for post in json_data['data']['children']]

In [10]:
# get json_data with multiple requests

if scape_data == True:

    posts = []
    after = None

    for i in range(1,num_requests+1):

        if after == None:
            params = {}
        else:
            params = {'after':after}

        json_url = url + subreddits[scape_index] + '.json'
        response = requests.get(json_url,params=params,headers=headers)

        if response.status_code == 200:       

            print('process request',i)

            json_data = response.json()
            posts.extend(json_data['data']['children'])
            after = json_data['data']['after']

            # save scaped data to files
            write_json_files(json_data)        

        else:
            print(response.status_code)
            break

        time.sleep(2)

In [11]:
# load json data from files (saved previously in input folder)

if scape_data == False:
    
    posts_a = read_json_files(subreddits[0])
    posts_b = read_json_files(subreddits[1])      

../data/input/boardgames\boardgames 2019_07_19 21_42_59.json
../data/input/boardgames\boardgames 2019_07_19 21_42_56.json
../data/input/boardgames\boardgames 2019_07_19 21_42_53.json
../data/input/boardgames\boardgames 2019_07_19 21_42_50.json
../data/input/boardgames\boardgames 2019_07_19 21_42_48.json
../data/input/boardgames\boardgames 2019_07_19 21_42_45.json
../data/input/boardgames\boardgames 2019_07_19 21_42_42.json
../data/input/boardgames\boardgames 2019_07_19 21_42_39.json
../data/input/boardgames\boardgames 2019_07_19 21_42_36.json
../data/input/boardgames\boardgames 2019_07_19 21_42_32.json
../data/input/boardgames\boardgames 2019_07_19 21_42_29.json
../data/input/boardgames\boardgames 2019_07_19 21_42_27.json
../data/input/boardgames\boardgames 2019_07_19 21_42_24.json
../data/input/boardgames\boardgames 2019_07_19 21_42_21.json
../data/input/boardgames\boardgames 2019_07_19 21_42_18.json
../data/input/boardgames\boardgames 2019_07_19 21_42_15.json
../data/input/boardgames

../data/input/mobilegames\mobilegames 2019_07_17 20_16_04.json
../data/input/mobilegames\mobilegames 2019_07_17 20_16_01.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_57.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_53.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_50.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_46.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_42.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_39.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_35.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_32.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_28.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_24.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_21.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_17.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_14.json
../data/input/mobilegames\mobilegames 2019_07_17 20_15_

In [12]:
# posts count info

if scape_data == False:
    
    total_posts_a = len(posts_a)
    unique_posts_a = len(set(p['data']['name'] for p in posts_a))
    total_posts_b = len(posts_b)
    unique_post_b = len(set(p['data']['name'] for p in posts_b))
    
    print('total number of posts:',total_posts_a)
    print('unique number of posts:',unique_posts_a)
    print('')
    print('total number of posts:',total_posts_b)
    print('unique number of posts:',unique_post_b)

total number of posts: 2235
unique number of posts: 1165

total number of posts: 2204
unique number of posts: 956


### Inspect Data

In [13]:
# print some info from posts_a (subreddit[0])

# for p in posts_a:
    
#     title = p['data']['title']
#     #subreddit = p['data']['subreddit']
#     score = p['data']['score']
#     pinned = p['data']['pinned'] 
    
#     print(title)
#     print(subreddit)
#     print(score)
#     print(pinned)
#     print('')
    
#     keys = p['data'].keys()    
#     print(len(keys))   

In [14]:
# create empty df

#cols = list(posts_a[0]['data'].keys())
cols = ['name','title','score','selftext']

df = pd.DataFrame(columns=cols)
df_a = pd.DataFrame(columns=cols)
df_b = pd.DataFrame(columns=cols)

In [15]:
# convert posts to df

df_a = create_df(posts_a,df_a)
df_b = create_df(posts_b,df_b)

In [16]:
# drop rows with empty selftext

# need to check for empty string in selftext???
if False:

    print(len(df_a))
    print(len(df_b))

    df_a['selftext'].replace('',np.nan,inplace=True)
    df_b['selftext'].replace('',np.nan,inplace=True)

    df_a.dropna(subset=['selftext'],inplace=True)
    df_b.dropna(subset=['selftext'],inplace=True)

    print(len(df_a))
    print(len(df_b))

In [17]:
# remove rows with duplicated name

print(len(df_a))
print(len(df_b))

df_a.drop_duplicates(subset='name',inplace=True)
df_b.drop_duplicates(subset='name',inplace=True)

print(len(df_a))
print(len(df_b))

2235
2204
1165
956


In [18]:
df_a.head()

Unnamed: 0,name,title,score,selftext
0,t3_ccwwo2,What player mats look like after lamination,12,http://imgur.com/gallery/Ab1Q9CQ\n\nSearched t...
1,t3_ccppxz,Anyone playing in downtown Toronto?,69,I've built up a small game collection lately -...
2,t3_ccqzz1,I made an app to help run my PitchCar tourname...,46,
3,t3_ccwwjl,Pandemic Legacy s1 *September spoilers*,8,Just want to make sure cause this seems weird....
4,t3_cctvi9,Rumbleslam is the best game I almost never hea...,17,https://imgur.com/a/tPFXI9B\n\nI think I gloss...


In [19]:
df_a['selftext'].apply(len).sort_values(ascending=True)

495         0
361         0
944         0
904         0
138         0
139         0
639         0
339         0
640         0
336         0
777         0
778         0
709         0
328         0
162         0
790         0
642         0
547         0
634         0
936         0
1434        0
368         0
1444        0
393         0
1442        0
391         0
94          0
97          0
98          0
551         0
        ...  
1352     4880
276      5077
236      5402
798      5594
52       5596
396      5952
522      6261
323      6542
506      6545
420      7006
605      7067
1357     7215
504      7566
491      7586
583      7773
312      7928
42       8129
916      8507
647      8771
157      9212
341     11174
839     11663
654     12309
536     12548
927     13138
723     15055
1273    19660
55      21592
741     24697
430     32425
Name: selftext, Length: 1165, dtype: int64

In [20]:
# list all columns

print(df_a.columns)

Index(['name', 'title', 'score', 'selftext'], dtype='object')


In [21]:
# output 1st 5 records

df_a.head()

Unnamed: 0,name,title,score,selftext
0,t3_ccwwo2,What player mats look like after lamination,12,http://imgur.com/gallery/Ab1Q9CQ\n\nSearched t...
1,t3_ccppxz,Anyone playing in downtown Toronto?,69,I've built up a small game collection lately -...
2,t3_ccqzz1,I made an app to help run my PitchCar tourname...,46,
3,t3_ccwwjl,Pandemic Legacy s1 *September spoilers*,8,Just want to make sure cause this seems weird....
4,t3_cctvi9,Rumbleslam is the best game I almost never hea...,17,https://imgur.com/a/tPFXI9B\n\nI think I gloss...


In [22]:
df_a_desc = df_a.describe()
df_a_desc

Unnamed: 0,name,title,score,selftext
count,1165,1165,1165,1165.0
unique,1165,1165,202,987.0
top,t3_cazjn5,Any updates on Reich Busters?,0,
freq,1,1,76,137.0


In [23]:
# Check for nulls in columns

null_cols = df_a.isnull().sum()
mask_null = null_cols > 0
null_cols[mask_null].sort_values(ascending=False)

Series([], dtype: int64)

In [24]:
# Check for nulls in rows

null_rows = df_a.isnull().sum(axis=1)
mask_null = null_rows > 0
null_rows[mask_null].sort_values(ascending=False)

Series([], dtype: int64)

In [25]:
# show column summary

df_a_info = df_a.info()
df_a_info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1165 entries, 0 to 2178
Data columns (total 4 columns):
name        1165 non-null object
title       1165 non-null object
score       1165 non-null object
selftext    1165 non-null object
dtypes: object(4)
memory usage: 45.5+ KB


In [26]:
# find columns with the most empty cells

df_a.count(axis=0).sort_values()

name        1165
title       1165
score       1165
selftext    1165
dtype: int64

In [27]:
pd.value_counts(df_a['name'].values, sort=True)

t3_cazjn5    1
t3_cb16j1    1
t3_cdk284    1
t3_cai7jn    1
t3_c5wlk5    1
t3_c89pkm    1
t3_c6abqt    1
t3_c9redn    1
t3_c85y0l    1
t3_ca7btu    1
t3_ca05ru    1
t3_cbf5im    1
t3_cd1b07    1
t3_c4rpie    1
t3_cezj3y    1
t3_cd6xu1    1
t3_cedg57    1
t3_c4tzdr    1
t3_c56s0u    1
t3_ceww30    1
t3_c5j2og    1
t3_c9ej3e    1
t3_cdk6dv    1
t3_cavjcg    1
t3_c5th6b    1
t3_ceugae    1
t3_c6a1n1    1
t3_cb82b5    1
t3_c901qn    1
t3_c4dx7w    1
            ..
t3_c8b5iu    1
t3_cbykh0    1
t3_c6tozf    1
t3_cbihmk    1
t3_cdwbai    1
t3_c69f62    1
t3_c9di1o    1
t3_caa2sq    1
t3_cbzuje    1
t3_c5arp0    1
t3_c9xbx7    1
t3_cbz7xk    1
t3_cdpltu    1
t3_ceq674    1
t3_c9vzs6    1
t3_c7hv4n    1
t3_cei5mb    1
t3_c5zgf2    1
t3_c7r4qq    1
t3_cbea27    1
t3_c5dykj    1
t3_cdco7u    1
t3_c5uqj6    1
t3_c9mjgz    1
t3_cc0pdh    1
t3_c8sosr    1
t3_c9lmgl    1
t3_c53rrf    1
t3_cd9zij    1
t3_cesprz    1
Length: 1165, dtype: int64

In [28]:
# limit no of rows in each df

df_a = df_a[:posts_limit]
df_b = df_b[:posts_limit]

### Clean Data

In [29]:
# rename columns

In [30]:
# convert all string in cells to lowercase -> prevent duplicates when creating dummies

#df_train = df_train.applymap(lambda s:s.lower() if type(s) == str else s)
#df_test = df_test.applymap(lambda s:s.lower() if type(s) == str else s)

In [31]:
# fill nan/empty cells with na

#cols = ['pool_qual','bsmt_qual','fireplace_score','garage_qual','garage_cond','bsmt_type1_score','bsmt_type2_score']

#for col in cols:
    #df_train[col] = df_train[col].fillna(value='na')
    #df_test[col] = df_test[col].fillna(value='na')

In [32]:
# # fill nan/empty cells with 0

# cols = ['bath_half_bsmt_num','bath_full_bsmt_num','garage_area','garage_car_num','bsmt_total_area',
#         'bsmt_unfinish_area','bsmt_type2_area','bsmt_type1_area']

# for col in cols:
#     df_train[col] = df_train[col].fillna(value='0')
#     df_test[col] = df_test[col].fillna(value='0')

### Output Clean Data

In [33]:
# output to csv

#df_a.to_csv(mid_path + 'df_a.csv')

### EDA

In [None]:
features = 'title'
#features = 'selftext'

#X = df_a[[features]]
#y = df['is_a'].values

In [54]:
cvec = CountVectorizer(stop_words='english')
cvec.fit(df_a[features])

len_features = len(cvec.get_feature_names())
print(len_features)

df_a_cv = pd.DataFrame(cvec.transform(df_a[features]).todense(),columns=cvec.get_feature_names())

top_words_a = df_a_cv.sum(axis=0)
top_words_a.sort_values(ascending = False).head(20)

2111


game               249
board              114
games               94
2019                55
play                41
question            40
july                38
help                31
new                 31
card                30
boardgames          29
discussion          25
kickstarter         25
player              25
table               23
recommendations     22
best                22
review              21
comc                21
daily               21
dtype: int64

In [55]:
cvec = CountVectorizer(stop_words='english')
cvec.fit(df_b[features])

len_features = len(cvec.get_feature_names())
print(len_features)

df_b_cv = pd.DataFrame(cvec.transform(df_b[features]).todense(),columns=cvec.get_feature_names())

top_words_b = df_b_cv.sum(axis=0)
top_words_b.sort_values(ascending = False).head(20)

1933


game        334
mobile      204
games       103
android     101
new          73
ios          65
play         63
gameplay     52
like         39
looking      38
best         36
good         35
free         28
help         26
2018         24
amp          22
look         22
heroes       20
review       20
pubg         20
dtype: int64

In [34]:
# df_train heatmap (staircase)

#corr = df_train.corr()
#mask = np.zeros_like(corr)
#mask[np.triu_indices_from(mask)] = True

#fig, ax = plt.subplots(figsize=(20, 10))
#sns.heatmap(corr, mask=mask, vmax=.3, square=True,cmap="coolwarm_r");

In [35]:
#create_scatterplot('Sale Price vs House Quality',df_train,x ='house_qual',y ='sale_price',hue ='lot_subclass')

#print("sale price tends to increase as house quality increases.")

In [36]:
# create histograms for all numeric columns
#df_train.hist(figsize=(15, 15));

#print('plot histograms for all numeric columns to check for zeros and abnormalities.')

In [37]:
#create_boxplot(df_train,x='lot_subclass',y='sale_price',title='Sale Price vs Lot Subclass')

#print("Most subclasses has sale price between 100K to 200K.")
#print("4 subclasses have sale price above 200k and they have more outliers.")
#print("I will convert lot_subclass to dummy variables for model predictions.")

### Logistic Regression Model

In [38]:
print(df_a.shape)
print(df_b.shape)

(900, 4)
(900, 4)


In [39]:
df_a.head()

Unnamed: 0,name,title,score,selftext
0,t3_ccwwo2,What player mats look like after lamination,12,http://imgur.com/gallery/Ab1Q9CQ\n\nSearched t...
1,t3_ccppxz,Anyone playing in downtown Toronto?,69,I've built up a small game collection lately -...
2,t3_ccqzz1,I made an app to help run my PitchCar tourname...,46,
3,t3_ccwwjl,Pandemic Legacy s1 *September spoilers*,8,Just want to make sure cause this seems weird....
4,t3_cctvi9,Rumbleslam is the best game I almost never hea...,17,https://imgur.com/a/tPFXI9B\n\nI think I gloss...


In [40]:
df_b.head()

Unnamed: 0,name,title,score,selftext
0,t3_bbdq7o,"Age Of Z, recruiting",1,I'm looking for players who play the game age ...
1,t3_bbb1np,There was a java game...,1,"I remember playing an old game on my nokia, ab..."
2,t3_bb4yj9,Mmorpg?,1,Hi guys! Can you suggest some mobile mmorpg? S...
3,t3_bar748,The worst mobile game ad I have ever seen,1,
4,t3_baqg2e,Searching for games to be transplanted from st...,1,"Hi, I am a steam game player with more than 30..."


In [41]:
# create df by appending df_a and df_b

df = df.append(df_a)
df = df.append(df_b)

df.reset_index(drop=True)

Unnamed: 0,name,title,score,selftext
0,t3_ccwwo2,What player mats look like after lamination,12,http://imgur.com/gallery/Ab1Q9CQ\n\nSearched t...
1,t3_ccppxz,Anyone playing in downtown Toronto?,69,I've built up a small game collection lately -...
2,t3_ccqzz1,I made an app to help run my PitchCar tourname...,46,
3,t3_ccwwjl,Pandemic Legacy s1 *September spoilers*,8,Just want to make sure cause this seems weird....
4,t3_cctvi9,Rumbleslam is the best game I almost never hea...,17,https://imgur.com/a/tPFXI9B\n\nI think I gloss...
5,t3_ccvrtl,About to play eldritch horror for the first ti...,11,I'm buying Eldritch Horror tonight and I'm hop...
6,t3_ccvffm,What game is this from?,12,https://imgur.com/cM0HZh3\n\nI'm cleaning up a...
7,t3_ccx4vu,Hey! I recently discovered a board game cafe...,6,Just a couple days ago I realized there was a ...
8,t3_ccwizc,I gotta get out of this rabbit hole!,7,I just saw Dungeon Degenerates and now I want ...
9,t3_ccvqtl,Need Help Remembering the Name of this Game,8,So everyone is assigned a character synonymous...


In [42]:
# create target (use one hot encoding)

df['is_a'] = 0
df['is_a'].iloc[0:len(df_a)] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [43]:
# create X (feature matrix) and y (target)
features = 'title'
#features = 'selftext'

X = df[[features]]
y = df['is_a'].values

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,test_size=0.3,stratify=y,random_state=3050)

In [44]:
# check train and test data

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1260, 1)
(540, 1)
(1260,)
(540,)


In [45]:
# use CountVectorizer

cvec = CountVectorizer(stop_words='english')
cvec.fit(X_train[features])

len_features = len(cvec.get_feature_names())
print(len_features)

X_train_cv = pd.DataFrame(cvec.transform(X_train[features]).todense(),columns=cvec.get_feature_names())
X_test_cv = pd.DataFrame(cvec.transform(X_test[features]).todense(),columns=cvec.get_feature_names())

2768


In [46]:
# use LogisticRegression

lr = LogisticRegression()
lr.fit(X_train_cv,y_train)

lr_score = lr.score(X_test_cv, y_test)
print(lr_score)

0.85




### Bayes Classifer

In [47]:
# create MultinomialNB model
mnb = MultinomialNB()

mnb.fit(X_train_cv,y_train)
y_pred = mnb.predict(X_test_cv)

In [48]:
# score MultinomialNB model

train_score = mnb.score(X_train_cv,y_train)
print(train_score)

test_score = mnb.score(X_test_cv,y_test)
print(test_score)

0.9714285714285714
0.8703703703703703


In [49]:
# create confusion matrix

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print("True Positives: %s" % tp)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Negatives: %s" % tn)

True Positives: 225
False Positives: 25
False Negatives: 45
True Negatives: 245


In [50]:
# cm kpi

accuracy = (tp+tn)/(tp+fp+fn+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)

print('Accuracy:',round(accuracy,3))
print('Sensitivity:',round(sensitivity,3))
print('Specificity:',round(specificity,3))
print('Precision:',round(precision,3))

Accuracy: 0.87
Sensitivity: 0.833
Specificity: 0.907
Precision: 0.9


### Output Model Predictions

In [51]:
# output to csv

#timestamp = dt.datetime.now()
#timestamp = timestamp.strftime(" %Y_%m_%d %H_%M_%S ")
    
# contains selected columns for feature matrix
#df_cols.to_csv(output_path + 'columns' + timestamp + '.csv')

### Descriptive and Inferential Statistics

### Outside Research

### Conclusions and Recommendations

In [52]:
df_b.shape

(900, 4)