# Project 3: Web APIs & Classification

### Contents

- [Problem Statement](#Problem-Statement)
- [Functions](#Functions)
- [Importing](#Importing)


- [Inspect Data](#Inspect-Data)
- [Clean Data](#Clean-Data)
- [Output Clean Data](#Output-Clean-Data)


- [EDA](#EDA)


- [Create Feature Matrix and Target](#Create-Feature-Matrix-and-Target)



- [Output Model Predictions](#Output-Model-Predictions)


- [Descriptive and Inferential Statistics](#Descriptive-and-Inferential-Statistics)
- [Outside Research](#Outside-Research)
- [Conclusions and Recommendations](#Conclusions-and-Recommendations)

### Problem Statement

- Classify a Reddit post as belonging to r/boardgames or r/mobilegames using only the text of its title.

### Configurations and Libraries

In [None]:
# user configuration

# True: scrape data from the website -> save as json files
# False: load previously saved json files
scape_data = False

# index into subreddits of the subreddit to scrape when scape_data=True
# (0 -> subreddits[0], 1 -> subreddits[1])
scape_index = 1

subreddits = ['boardgames','mobilegames']

# base url of a subreddit listing; https avoids the redirect issued for plain http
url = "https://www.reddit.com/r/"
headers = {'User-agent':'Bleep blorp bot 0.1'}

# maximum number of listing requests sent in one scrape run
num_requests = 40

In [None]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score,mean_squared_error

# nlp
from sklearn.feature_extraction.text import CountVectorizer

# web
import requests
import json
from IPython.display import Image
from IPython.core.display import HTML

# others
import time
import datetime as dt
#import re
import os

In [None]:
# file paths

# every data folder lives under the same root, one level above the notebook
_data_root = '../data/'

input_path = _data_root + 'input/'
mid_path = _data_root + 'mid/'
output_path = _data_root + 'output/'

image_path = '../images/'

### Functions

In [None]:
# output scraped data to json files
def write_json_files(data):
    """Dump *data* as JSON into a timestamped file for the subreddit being scraped.

    The target is ``input_path/<subreddit>/<subreddit> <timestamp>.json`` where
    ``<subreddit>`` is ``subreddits[scape_index]``. The folder is created when
    it does not exist yet, and the name of the written file is printed.

    Args:
        data: JSON-serialisable object to save.
    """
    timestamp = dt.datetime.now().strftime(" %Y_%m_%d %H_%M_%S")

    subreddit = subreddits[scape_index]
    filepath = input_path + subreddit + '/'
    filename = subreddit + timestamp + '.json'

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(filepath, exist_ok=True)

    # the context manager guarantees the file is flushed and closed
    with open(filepath + filename, "w", encoding="utf-8") as file:
        json.dump(data, file)

    print('created', filename)

In [None]:
# create posts list from json files
def read_json_files(subreddit):
    """Collect the posts stored in every ``.json`` file saved for *subreddit*.

    Walks ``input_path + subreddit`` recursively, prints each file path as it
    is loaded, and concatenates the ``['data']['children']`` list of every
    file.

    Args:
        subreddit: folder name under ``input_path`` holding the saved files.

    Returns:
        A list with the posts of all files. It is empty when the folder does
        not exist or holds no ``.json`` file.
    """
    filepath = input_path + subreddit

    files = []
    for root, dirs, names in os.walk(filepath):
        # os.walk order depends on the filesystem; sort so that the order of
        # the returned posts is reproducible from run to run
        dirs.sort()
        for name in sorted(names):
            # endswith() skips names that merely contain '.json' (e.g. 'x.json.bak')
            if name.endswith('.json'):
                files.append(os.path.join(root, name))

    posts = []

    for path in files:
        print(path)
        # the context manager closes each file as soon as it has been parsed
        with open(path, encoding="utf-8") as file:
            json_data = json.load(file)

        posts.extend(json_data['data']['children'])

    return posts

### Scrape or Import Data

In [None]:
# old method (not used)
# get json from reddit boardgames

# if scape_data == True:

#     response = requests.get(url,headers=headers)
#     print(response.status_code)

#     json_data = response.json()
#     sorted(json_data.keys())

In [None]:
# old method (not used)
# print number of posts in json_data

# if scape_data == True:

#     print(len(json_data['data']['children']))

#     # print info in kind
#     print(json_data['kind'])
#     print('')

#     # print data keys
#     print(sorted(json_data['data'].keys()))
#     print('')

#     # print 1st post
#     print(json_data['data']['children'][0]['data'])
#     print('')

#     # print id of of last post
#     print(json_data['data']['after'])

In [None]:
# old method (not used)
# print ids of all posts

# if scape_data == True:

#     [post['data']['name'] for post in json_data['data']['children']]

In [None]:
# get json_data with multiple requests
# each successful response is appended to posts and also saved to a json file

if scape_data:

    posts = []
    after = None

    # the listing url does not change between requests
    json_url = url + subreddits[scape_index] + '.json'

    for i in range(1,num_requests+1):

        # first request has no cursor; later requests continue after the last post seen
        params = {} if after is None else {'after':after}

        # timeout keeps a stalled connection from hanging the notebook forever
        response = requests.get(json_url,params=params,headers=headers,timeout=30)

        if response.status_code != 200:
            print(response.status_code)
            break

        print('process request',i)

        json_data = response.json()
        posts.extend(json_data['data']['children'])
        after = json_data['data']['after']

        # save scraped data to files
        write_json_files(json_data)

        # without a cursor the next request would fetch the first page again
        # and duplicate posts, so stop here
        if after is None:
            break

        # pause between requests
        time.sleep(3)

In [None]:
# load json data from files (saved previously in input folder)
# posts_a <- subreddits[0], posts_b <- subreddits[1]

if not scape_data:

    posts_a,posts_b = read_json_files(subreddits[0]),read_json_files(subreddits[1])

In [None]:
# posts count info
# report total vs. unique (by post name) counts for each loaded subreddit

if not scape_data:

    for position,loaded_posts in enumerate((posts_a,posts_b)):

        # blank line between the two reports
        if position > 0:
            print('')

        print('total number of posts:',len(loaded_posts))
        unique_names = {p['data']['name'] for p in loaded_posts}
        print('unique number of posts:',len(unique_names))

### Inspect Data

In [None]:
# only inspect posts from subreddits[0] (posts_a / df_a)

In [None]:
# print some info from posts_a (subreddit[0])

# for p in posts_a:
    
#     title = p['data']['title']
#     #subreddit = p['data']['subreddit']
#     score = p['data']['score']
#     pinned = p['data']['pinned'] 
    
#     print(title)
#     print(subreddit)
#     print(score)
#     print(pinned)
#     print('')
    
#     keys = p['data'].keys()    
#     print(len(keys))   

In [None]:
# create empty df

#cols = list(posts_a[0]['data'].keys())
cols = ['title','score','selftext']

# three independent empty frames sharing the same columns:
# df (combined), df_a (subreddits[0]) and df_b (subreddits[1])
df,df_a,df_b = (pd.DataFrame(columns=cols) for _ in range(3))

In [None]:
# convert posts -> df
# posts_a fills df_a first, then posts_b fills df_b; a key missing from a post
# is stored as nan and reported

for frame,source_posts in ((df_a,posts_a),(df_b,posts_b)):

    for idx,p in enumerate(source_posts):

        p_keys = p['data'].keys()

        for col in cols:

            if col not in p_keys:
                frame.at[idx,col] = np.nan
                print(idx,col,'nan')
                continue

            frame.at[idx,col] = p['data'][col]

In [None]:
# list all columns of df_a

print(df_a.columns)

In [None]:
# output 1st 5 records in df_a

df_a.head()

In [None]:
# summary statistics of df_a (kept in df_a_desc, then displayed)
df_a_desc = df_a.describe()
df_a_desc

In [None]:
# Check for nulls in columns
# (number of null cells per column, showing only columns that have any, largest first)

null_cols = df_a.isna().sum(axis=0)
mask_null = null_cols.gt(0)
null_cols.loc[mask_null].sort_values(ascending=False)

In [None]:
# Check for nulls in rows
# (number of null cells per row, showing only rows that have any, largest first)

null_rows = df_a.isna().sum(axis='columns')
mask_null = null_rows.gt(0)
null_rows.loc[mask_null].sort_values(ascending=False)

In [None]:
# show column summary

# NOTE: DataFrame.info() prints the summary itself and returns None,
# so df_a_info holds None and the last line displays nothing
df_a_info = df_a.info()
df_a_info

In [None]:
# find columns with the most empty cells
# (count() tallies the non-null cells per column; the ascending sort lists the emptiest columns first)

df_a.count(axis=0).sort_values()

### Clean Data

In [None]:
# rename columns

In [None]:
# convert all string in cells to lowercase -> prevent duplicates when creating dummies

#df_train = df_train.applymap(lambda s:s.lower() if type(s) == str else s)
#df_test = df_test.applymap(lambda s:s.lower() if type(s) == str else s)

In [None]:
# fill nan/empty cells with na

#cols = ['pool_qual','bsmt_qual','fireplace_score','garage_qual','garage_cond','bsmt_type1_score','bsmt_type2_score']

#for col in cols:
    #df_train[col] = df_train[col].fillna(value='na')
    #df_test[col] = df_test[col].fillna(value='na')

In [None]:
# # fill nan/empty cells with 0

# cols = ['bath_half_bsmt_num','bath_full_bsmt_num','garage_area','garage_car_num','bsmt_total_area',
#         'bsmt_unfinish_area','bsmt_type2_area','bsmt_type1_area']

# for col in cols:
#     df_train[col] = df_train[col].fillna(value='0')
#     df_test[col] = df_test[col].fillna(value='0')

### Output Clean Data

In [None]:
# output to csv

#df_a.to_csv(mid_path + 'df_a.csv')

### EDA

In [None]:
# df_train heatmap (staircase)

#corr = df_train.corr()
#mask = np.zeros_like(corr)
#mask[np.triu_indices_from(mask)] = True

#fig, ax = plt.subplots(figsize=(20, 10))
#sns.heatmap(corr, mask=mask, vmax=.3, square=True,cmap="coolwarm_r");

In [None]:
#create_scatterplot('Sale Price vs House Quality',df_train,x ='house_qual',y ='sale_price',hue ='lot_subclass')

#print("sale price tends to increase as house quality increases.")

In [None]:
# create histograms for all numeric columns
#df_train.hist(figsize=(15, 15));

#print('plot histograms for all numeric columns to check for zeros and abnormalities.')

In [None]:
#create_boxplot(df_train,x='lot_subclass',y='sale_price',title='Sale Price vs Lot Subclass')

#print("Most subclasses has sale price between 100K to 200K.")
#print("4 subclasses have sale price above 200k and they have more outliers.")
#print("I will convert lot_subclass to dummy variables for model predictions.")

### Logistic Regression Model

In [None]:
# use post title for classification

In [None]:
# (rows, columns) of each subreddit frame: df_a first, then df_b
for frame in (df_a,df_b):
    print(frame.shape)

In [None]:
# preview the first rows of df_a (posts loaded for subreddits[0])
df_a.head()

In [None]:
# preview the first rows of df_b (posts loaded for subreddits[1])
df_b.head()

In [None]:
# stack both subreddits into one frame: df_a rows first, then df_b rows
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0
# - ignore_index=True gives a fresh 0..n-1 index (the result of reset_index
#   used to be discarded, leaving duplicate index labels)
# - building df from df_a/df_b directly keeps the cell safe to re-run
df = pd.concat([df_a,df_b],ignore_index=True)

df

In [None]:
# create target (binary label): 1 = row came from df_a, 0 = row came from df_b
# rows are labelled by position, relying on df holding the df_a rows first, followed by the df_b rows
# (assigning the whole column creates 'is_a'; indexing df['is_a'] before it exists raises KeyError)
df['is_a'] = (np.arange(len(df)) < len(df_a)).astype(int)

In [None]:
# create X (feature matrix) and y (target)
feature_cols = ['title']
X,y = df[feature_cols],df['is_a'].values

# 70/30 split, stratified on the target, with a fixed seed
split_options = {'train_size':0.7,'test_size':0.3,'stratify':y,'random_state':3050}
X_train,X_test,y_train,y_test = train_test_split(X,y,**split_options)

In [None]:
# check train and test data
# shapes are printed in the order X_train, X_test, y_train, y_test

for part in (X_train,X_test,y_train,y_test):
    print(part.shape)

In [None]:
# use CountVectorizer
# the vocabulary is learned from the training titles only, then applied to both splits
cvec = CountVectorizer(stop_words='english')
cvec.fit(X_train['title'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
if hasattr(cvec,'get_feature_names_out'):
    feature_names = list(cvec.get_feature_names_out())
else:
    feature_names = cvec.get_feature_names()

len_features = len(feature_names)
print(len_features)

# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix type)
X_train_cv = pd.DataFrame(cvec.transform(X_train['title']).toarray(),columns=feature_names)
X_test_cv = pd.DataFrame(cvec.transform(X_test['title']).toarray(),columns=feature_names)

In [None]:
# use LogisticRegression
# fit() returns the estimator itself, so it can be chained onto the constructor
lr = LogisticRegression().fit(X_train_cv,y_train)

# mean accuracy on the held-out test split
lr_score = lr.score(X_test_cv,y_test)
print(lr_score)

### Bayes Classifier

### Output Model Predictions

In [None]:
# output to csv

#timestamp = dt.datetime.now()
#timestamp = timestamp.strftime(" %Y_%m_%d %H_%M_%S ")
    
# contains selected columns for feature matrix
#df_cols.to_csv(output_path + 'columns' + timestamp + '.csv')

### Descriptive and Inferential Statistics

### Outside Research

### Conclusions and Recommendations