In [1]:
import datetime
import glob
import ipywidgets
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV

import json
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier

import urllib.request
import urllib.parse

import time
import warnings
import xgboost as xgb

%matplotlib inline
%run ./plugins/widgets.py

Widget Loaded


## Global Parameters

In [2]:
# Matplotlib defaults shared by every figure in this notebook:
# 16x9 canvas, larger font, grid on, white figure background.
plt.rcParams.update({
    'figure.figsize': [16, 9],
    'font.size': 14,
    'axes.grid': True,
    'figure.facecolor': 'white',
})

# Let pandas render wide frames without truncating columns or wrapping early.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

## Load Data

In [3]:
# Read the training survey data from the CSV that sits next to the notebook.
full_train_df = pd.read_csv(filepath_or_buffer='./devengers_train.csv')

# Report the (rows, columns) shape, then preview two randomly drawn rows.
print("Shape : ", full_train_df.shape)
full_train_df.sample(n=2)

Shape :  (1000, 28)


Unnamed: 0,s.no,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
90,91,2014-08-27 12:12:47,31,Male,United States,NY,No,No,No,Never,500-1000,No,Yes,Yes,Yes,Yes,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,No,Yes,No,
594,595,2014-08-27 22:12:55,29,Male,United States,OR,No,Yes,No,Never,100-500,No,Yes,Yes,Yes,Yes,Yes,Don't know,Don't know,No,No,Some of them,Yes,No,Maybe,Yes,No,


In [4]:
# Missing 'self_employed' answers are treated as 'No'.
self_employed_missing = full_train_df['self_employed'].isna()
full_train_df.loc[self_employed_missing, 'self_employed'] = 'No'

## Test API

In [5]:
def convert(o):
    """Fallback serializer for ``json.dumps(..., default=convert)``.

    ``json`` calls this for any object it cannot encode itself. NumPy scalar
    types are mapped to the matching built-in Python type so that values
    pulled out of a DataFrame can be serialized.

    Parameters
    ----------
    o : object
        The value ``json`` could not serialize.

    Returns
    -------
    int, float or bool
        The plain-Python equivalent of a NumPy integer, floating or boolean
        scalar.

    Raises
    ------
    TypeError
        If ``o`` is not one of the supported NumPy scalar types.
    """
    # np.integer covers every NumPy integer width (int8 ... int64, uint*),
    # not only int64, so the result does not depend on the column's dtype.
    if isinstance(o, np.integer):
        return int(o)
    # np.float64 is already a float subclass, but float16/float32 are not.
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    raise TypeError(
        "Object of type %s is not JSON serializable" % type(o).__name__
    )

In [6]:
# Columns this notebook appends to full_train_df in later cells. They are
# excluded from the request so that re-running this cell sends the same
# payload as the first run instead of feeding earlier predictions back in.
derived_cols = ('treat_pred', 'accurate_pred')
payload_cols = [col for col in full_train_df.columns if col not in derived_cols]

# NOTE(review): the payload still includes the ground-truth 'treatment'
# column, exactly as before. Presumably the service ignores it -- confirm.
url = 'http://localhost:5000/predict'

treat_pred = []
for i in log_progress(range(len(full_train_df))):
    # Fetch the row once by position; values are then looked up by column
    # label rather than by integer position on the row Series.
    row = full_train_df.iloc[i]
    json_req = {col: row[col] for col in payload_cols}

    # Supplying a request body makes urllib issue a POST.
    req = urllib.request.Request(url, json.dumps(json_req, default=convert).encode('utf8'))
    req.add_header('Content-Type', 'application/json')
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req) as response:
        data = response.read().decode("utf-8")
    json_data = json.loads(data)
    # Map the service's 'treatment' value onto the dataset's "Yes"/"No" labels.
    treat_pred.append("Yes" if json_data['treatment']==True else "No")

full_train_df['treat_pred'] = treat_pred

VBox(children=(HTML(value=''), IntProgress(value=0, max=1000)))

In [7]:
# 1 where the API prediction matches the recorded 'treatment' label, else 0.
# The element-wise comparison replaces a row-by-row apply: it yields the same
# 0/1 values and still produces a Series when the frame has no rows.
full_train_df['accurate_pred'] = (
    full_train_df['treatment'] == full_train_df['treat_pred']
).astype('int64')

In [8]:
# Share of rows where the API prediction matched the label.
full_train_df['accurate_pred'].mean()

0.842

In [9]:
# Preview the first rows with the new prediction columns rather than
# rendering the entire frame.
full_train_df.head(10)

Unnamed: 0,s.no,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,treat_pred,accurate_pred
0,1,2014-08-27 11:29:31,37,Female,United States,IL,No,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,,Yes,1
1,2,2014-08-27 11:29:37,44,M,United States,IN,No,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No,,No,1
2,3,2014-08-27 11:29:44,32,Male,Canada,,No,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,,No,1
3,4,2014-08-27 11:29:46,31,Male,United Kingdom,,No,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,,Yes,1
4,5,2014-08-27 11:30:22,31,Male,United States,TX,No,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,,No,1
5,6,2014-08-27 11:31:22,33,Male,United States,TN,No,Yes,No,Sometimes,6-25,No,Yes,Yes,Not sure,No,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No,,Yes,0
6,7,2014-08-27 11:31:50,35,Female,United States,MI,No,Yes,Yes,Sometimes,1-5,Yes,Yes,No,No,No,No,No,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No,,Yes,1
7,8,2014-08-27 11:32:05,39,M,Canada,,No,No,No,Never,1-5,Yes,Yes,No,Yes,No,No,Yes,Don't know,No,No,No,No,No,No,No,No,,No,1
8,9,2014-08-27 11:32:39,42,Female,United States,IL,No,Yes,Yes,Sometimes,100-500,No,Yes,Yes,Yes,No,No,No,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No,,Yes,1
9,10,2014-08-27 11:32:43,23,Male,Canada,,No,No,No,Never,26-100,No,Yes,Don't know,No,Don't know,Don't know,Don't know,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No,,No,1


In [10]:
# Write the frame, including the prediction columns, to CSV without the
# DataFrame index.
full_train_df.to_csv(path_or_buf='./devengers_api_predict.csv', index=False)