# Project 3: Web APIs & Classification

### Contents

- [Problem Statement](#Problem-Statement)
- [Functions](#Functions)
- [Importing](#Importing)


- [Inspect Data](#Inspect-Data)
- [Clean Data](#Clean-Data)
- [Output Clean Data](#Output-Clean-Data)


- [EDA](#EDA)


- [LR Model Exploratory](#LR-Model-Exploratory)
- [Create Feature Matrix and Target](#Create-Feature-Matrix-and-Target)
- [LR Model](#LR-Model)
- [Ridge Model](#Ridge-Model)
- [Lasso Model](#Lasso-Model)


- [Output Model Predictions](#Output-Model-Predictions)


- [Descriptive and Inferential Statistics](#Descriptive-and-Inferential-Statistics)
- [Outside Research](#Outside-Research)
- [Conclusions and Recommendations](#Conclusions-and-Recommendations)

### Problem Statement

- Blank

### Functions

In [1]:
# user configuration

# True: scape data from website -> save as json files
# False: load json files
scape_data = False

subreddit = ['boardgames','mobilegames']

# scape data for subreddit[0] if scape_data is True
scape_index = 0
# scape data for subreddit[1] if scape_data is True
#scape_index = 1

url = "http://www.reddit.com/r/"
headers = {'User-agent':'Bleep blorp bot 0.1'}

num_requests = 40

In [2]:
# output scaped data to json files
def output_json(data):

    timestamp = dt.datetime.now()
    timestamp = timestamp.strftime(" %Y_%m_%d %H_%M_%S")
    
    filepath = input_path + subreddit[scape_index] + '/'
    filename = subreddit[scape_index] + timestamp + '.json'    
    
    file = open(filepath + filename,"w+")    
    json.dump(data,file)
    
    print('created',filename)

### Importing

In [3]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression,Ridge,RidgeCV,Lasso,LassoCV,ElasticNet 
from sklearn import linear_model
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score,mean_squared_error

# web
import requests
import json
from IPython.display import Image
from IPython.core.display import HTML

# others
import time
import datetime as dt
#import re
import os

In [4]:
# file paths

input_path = '../data/input/'
mid_path = '../data/mid/'
output_path = '../data/output/'

image_path = '../images/'

In [5]:
# old method (not used)
# get json from reddit boardgames

# if scape_data == True:

#     response = requests.get(url,headers=headers)
#     print(response.status_code)

#     json_data = response.json()
#     sorted(json_data.keys())

In [6]:
# old method (not used)
# print number of posts in json_data

# if scape_data == True:

#     print(len(json_data['data']['children']))

#     # print info in kind
#     print(json_data['kind'])
#     print('')

#     # print data keys
#     print(sorted(json_data['data'].keys()))
#     print('')

#     # print 1st post
#     print(json_data['data']['children'][0]['data'])
#     print('')

#     # print id of of last post
#     print(json_data['data']['after'])

In [7]:
# old method (not used)
# print ids of all posts

# if scape_data == True:

#     [post['data']['name'] for post in json_data['data']['children']]

In [8]:
# get json_data with multiple requests

if scape_data == True:

    posts = []
    after = None

    for i in range(1,num_requests+1):

        if after == None:
            params = {}
        else:
            params = {'after':after}

        json_url = url + subreddit[scape_index] + '.json'
        response = requests.get(json_url,params=params,headers=headers)

        if response.status_code == 200:       

            print('process request',i)

            json_data = response.json()
            posts.extend(json_data['data']['children'])
            after = json_data['data']['after']

            # save scaped data to files
            output_json(json_data)        

        else:
            print(response.status_code)
            break

        time.sleep(3)

In [9]:
# load json data from files (saved previously in input folder)

if scape_data == False:

    posts = []
    
    files = []
    
    filepath = input_path + subreddit[scape_index]

    # r=root, d=directories, f = files
    for r, d, f in os.walk(filepath):
        for file in f:
            if '.json' in file:
                files.append(os.path.join(r, file))

    for f in files:
        print(f)
        file_data = open(f).read()
        json_data = json.loads(file_data)
        
        posts.extend(json_data['data']['children'])
        after = json_data['data']['after']       

../data/input/boardgames 2019_07_15 21_02_41.json
../data/input/boardgames 2019_07_15 21_02_44.json
../data/input/boardgames 2019_07_15 21_02_48.json
../data/input/boardgames 2019_07_15 21_02_52.json
../data/input/boardgames 2019_07_15 21_02_56.json
../data/input/boardgames 2019_07_15 21_03_00.json
../data/input/boardgames 2019_07_15 21_03_04.json
../data/input/boardgames 2019_07_15 21_03_08.json
../data/input/boardgames 2019_07_15 21_03_12.json
../data/input/boardgames 2019_07_15 21_03_15.json
../data/input/boardgames 2019_07_15 21_03_19.json
../data/input/boardgames 2019_07_15 21_03_23.json
../data/input/boardgames 2019_07_15 21_03_27.json
../data/input/boardgames 2019_07_15 21_03_31.json
../data/input/boardgames 2019_07_15 21_03_35.json
../data/input/boardgames 2019_07_15 21_03_39.json
../data/input/boardgames 2019_07_15 21_03_43.json
../data/input/boardgames 2019_07_15 21_03_47.json
../data/input/boardgames 2019_07_15 21_03_50.json
../data/input/boardgames 2019_07_15 21_03_54.json


In [10]:
# print total number of posts
print(len(posts))

# check for duplicates
print(len(set(p['data']['name'] for p in posts)))

989
963


### Inspect Data

In [11]:
for p in posts:
    
    title = p['data']['title']
    #subreddit = p['data']['subreddit']
    score = p['data']['score']
    pinned = p['data']['pinned'] 
    
    #print(title)
    #print(subreddit)
    #print(score)
    #print(pinned)
    #print('')
    
    #keys = p['data'].keys()    
    #print(len(keys))   

In [12]:
#cols = list(posts[0]['data'].keys())
cols = ['title','score','selftext']

df = pd.DataFrame(columns=cols)

In [13]:
# convert json_data -> df

for idx,p in enumerate(posts):
    
    p_keys = p['data'].keys()
    
    for col in cols:
        
        if col in p_keys:
            df.at[idx,col] = p['data'][col]
        else:
            df.at[idx,col] = np.nan
            print(idx,col,'nan')

In [14]:
# list all columns in df_train

print(df.columns)

Index(['title', 'score', 'selftext'], dtype='object')


In [15]:
# output 1st 5 records in df_train

df.head()

Unnamed: 0,title,score,selftext
0,/r/boardgames Daily Discussion and Game Recomm...,4,**Welcome to /r/boardgames Daily Discussion an...
1,My game just won Best in Show at the Southern ...,788,The game I've been working on for about 5 year...
2,Card games for Software Developers - our Print...,12,"Hello r/boardgames,\n\nme (Mateusz) and my fri..."
3,Games with great design lessons for accessibil...,13,
4,"Kickstarter Roundup: Jul 14, 2019 | 11 Ending ...",234,"## What this is:\n\nThis is a weekly, curated ..."


In [16]:
df_desc = df.describe()
df_desc

Unnamed: 0,title,score,selftext
count,989,989,989.0
unique,963,193,820.0
top,/r/boardgames Daily Discussion and Game Recomm...,0,
freq,2,67,114.0


In [17]:
# Check for nulls in columns

null_cols = df.isnull().sum()
mask_null = null_cols > 0
null_cols[mask_null].sort_values(ascending=False)

Series([], dtype: int64)

In [18]:
# Check for nulls in rows

null_rows = df.isnull().sum(axis=1)
mask_null = null_rows > 0
null_rows[mask_null].sort_values(ascending=False)

Series([], dtype: int64)

In [19]:
# show column summary

df_info = df.info()
df_info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 989 entries, 0 to 988
Data columns (total 3 columns):
title       989 non-null object
score       989 non-null object
selftext    989 non-null object
dtypes: object(3)
memory usage: 70.9+ KB


In [20]:
# find columns with the most empty cells

df.count(axis=0).sort_values()

title       989
score       989
selftext    989
dtype: int64

### Clean Data

In [21]:
# rename columns

In [22]:
# convert all string in cells to lowercase -> prevent duplicates when creating dummies

#df_train = df_train.applymap(lambda s:s.lower() if type(s) == str else s)
#df_test = df_test.applymap(lambda s:s.lower() if type(s) == str else s)

In [23]:
# fill nan/empty cells with na

#cols = ['pool_qual','bsmt_qual','fireplace_score','garage_qual','garage_cond','bsmt_type1_score','bsmt_type2_score']

#for col in cols:
    #df_train[col] = df_train[col].fillna(value='na')
    #df_test[col] = df_test[col].fillna(value='na')

In [24]:
# # fill nan/empty cells with 0

# cols = ['bath_half_bsmt_num','bath_full_bsmt_num','garage_area','garage_car_num','bsmt_total_area',
#         'bsmt_unfinish_area','bsmt_type2_area','bsmt_type1_area']

# for col in cols:
#     df_train[col] = df_train[col].fillna(value='0')
#     df_test[col] = df_test[col].fillna(value='0')

### Output Clean Data

In [25]:
# output to csv

df.to_csv(mid_path + 'df.csv')

### EDA

In [26]:
# df_train heatmap (staircase)

#corr = df_train.corr()
#mask = np.zeros_like(corr)
#mask[np.triu_indices_from(mask)] = True

#fig, ax = plt.subplots(figsize=(20, 10))
#sns.heatmap(corr, mask=mask, vmax=.3, square=True,cmap="coolwarm_r");

In [27]:
#create_scatterplot('Sale Price vs House Quality',df_train,x ='house_qual',y ='sale_price',hue ='lot_subclass')

#print("sale price tends to increase as house quality increases.")

In [28]:
# create histograms for all numeric columns
#df_train.hist(figsize=(15, 15));

#print('plot histograms for all numeric columns to check for zeros and abnormalities.')

In [29]:
#create_boxplot(df_train,x='lot_subclass',y='sale_price',title='Sale Price vs Lot Subclass')

#print("Most subclasses has sale price between 100K to 200K.")
#print("4 subclasses have sale price above 200k and they have more outliers.")
#print("I will convert lot_subclass to dummy variables for model predictions.")

### Create Feature Matrix and Target

### LR Model

### Ridge Model

### Lasso Model

### Output Model Predictions

In [30]:
# output to csv

#timestamp = dt.datetime.now()
#timestamp = timestamp.strftime(" %Y_%m_%d %H_%M_%S ")
    
# contains selected columns for feature matrix
#df_cols.to_csv(output_path + 'columns' + timestamp + '.csv')

### Descriptive and Inferential Statistics

### Outside Research

### Conclusions and Recommendations