# StackPros

## Data Scientists Assessment - Coding Test

<br/>

<div style="text-align: right"> Author: Haohan (David) Li</div>
<div style="text-align: right"> Data Source: StackPros </div>

### Dependencies

In [1]:
#TODO comment recheck
#TODO dependencies to top
#TODO markdown explanation
#TODO class reconstruction and feature complementation - especially cleaner
#TODO merge into class
#TODO plot unfinished
#TODO feature selection, add more, pandas feature name
#TODO model add more, imbalance, hyper tuning, ROC, other metrics

#Dependencies Check
!pip install sklearn --upgrade
!pip install xgboost --upgrade

#General
import numpy as np
import scipy as sp
import pandas as pd
import os
import html

#Feature Selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

#Preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import binarize

#Model Selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
from xgboost import XGBClassifier as XG

#Metrics
from sklearn.metrics import fbeta_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")

#Jupyter Notebook Magic
%matplotlib inline

Requirement already up-to-date: sklearn in /Users/leehh/anaconda/lib/python3.6/site-packages
Requirement already up-to-date: scikit-learn in /Users/leehh/anaconda/lib/python3.6/site-packages (from sklearn)
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Requirement already up-to-date: xgboost in /Users/leehh/anaconda/lib/python3.6/site-packages
Requirement already up-to-date: scipy in /Users/leehh/anaconda/lib/python3.6/site-packages (from xgboost)
Requirement already up-to-date: numpy in /Users/leehh/anaconda/lib/python3.6/site-packages (from xgboost)
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### Data Loading

In [None]:
class Data_Loader(object):
    '''
    Locates the assessment CSV file inside a directory and reads it into a
    pandas data frame.
    '''

    #which column to use as index
    indCol = 0

    #file name
    fileName = 'StackPros_Assessment_DataScientist_S_file.csv'

    def __init__(self, dir:str=None):
        '''
        dir: the directory keeping the data source; when omitted, the working
             directory at the time the loader is created is used
        '''
        #resolve the default lazily: os.getcwd() written in the signature is
        #evaluated once, when the class is defined, and ignores later os.chdir()
        if dir is None:
            dir = os.getcwd()
        #os.path.join uses the platform separator and never doubles it
        self.path = os.path.join(dir, Data_Loader.fileName)

    def load(self)->pd.DataFrame:
        '''
        return: the data frame read from self.path, indexed by column indCol
        '''
        return pd.read_csv(self.path,index_col=Data_Loader.indCol)

In [None]:
#load the data: no directory is passed, so the loader's default directory is used
loader = Data_Loader()
df = loader.load()

In [None]:
df.head()

In [None]:
#number of rows (samples) and number of columns (features) of the loaded frame
sampleSize, featureSize = df.shape

In [None]:
print(sampleSize)

### Data Cleaning

In [None]:
# a decorator factory: the decorated method first refreshes the variable
# bookkeeping stored on `self`, then runs as usual
def updateVarInfo(labelCol='Action',abandon=None):
    '''
    Build a method decorator that, before every call, classifies the columns of
    self.df and stores the result on the instance.

    labelCol: name of the label column; it is left out of both variable lists.
              Pass None when there is no label column (no labelCounts is set).
    abandon:  column names to ignore entirely; defaults to ['ID','RealTime']

    Attributes written on the instance at call time:
      self.varCategories - dict with keys 'ContinousVars', 'CategoricalVars'
                           (lists of column names) and 'LabelVars' (labelCol)
      self.labelCounts   - number of rows per label value (only if labelCol)

    A column counts as continuous when its dtype is float or it is listed in
    ForceToContinous, unless it is listed in ForceToCat; every other column is
    categorical.
    '''
    import functools

    #None stands in for the default so no list object is shared between calls
    if abandon is None:
        abandon = ['ID','RealTime']

    def wrapper(func):

        ContinousTypes = [np.dtype('float')]
        ForceToContinous = ['ActionTime']
        ForceToCat = ['BannerArea','BannerRatio']

        #keep the wrapped method's __name__ and docstring visible to help()/debuggers
        @functools.wraps(func)
        def _wrapper(self, *args, **kwargs):
            continuous = list()
            categorical = list()
            for col in self.df.columns:
                if col == labelCol or col in abandon:
                    continue
                looksContinuous = self.df[col].dtype in ContinousTypes or col in ForceToContinous
                if looksContinuous and col not in ForceToCat:
                    continuous.append(col)
                else:
                    categorical.append(col)
            self.varCategories = {'ContinousVars':continuous,
                                  'CategoricalVars':categorical,
                                  'LabelVars':labelCol}
            if labelCol:
                self.labelCounts = self.df.groupby(labelCol).size()

            return func(self, *args, **kwargs)
        return _wrapper

    return wrapper

##### Analysis on Attributes

In [None]:
class General_Analyzer(object):
    '''
    Builds a per-column summary of a data frame: data type, share of null
    values, and either the value range (continuous columns) or the number of
    distinct values (all other columns).
    '''

    #headers of the combined report, in the order the parts are concatenated
    colNames = ['Data Type','Null Percentage','Variable Range or Value Count']

    def __init__(self,df:pd.DataFrame):
        '''
        df: the data frame to analyze
        '''
        self.df = df

    def null_check(self)->pd.Series:
        '''
        return: a series indexed by column name with the share of null values
                in each column, as text with five decimals (e.g. '12.34567%')
        '''
        #the mean of the boolean null mask is the null fraction of each column
        nullPct = self.df.isnull().mean()*100
        #fixed-point formatting: slicing str(float) to 8 characters cuts the
        #exponent off values that Python prints in scientific notation
        return nullPct.map('{:.5f}%'.format)

    def dtype_check(self)->pd.Series:
        '''
        return: a series indexed by column name with the dtype of each column
        '''
        return self.df.dtypes

    #no label column and nothing abandoned: every column gets classified
    @updateVarInfo(None,[])
    def unique_count_or_range(self)->pd.Series:
        '''
        return: a series indexed by column name holding 'min -- max' (5
                significant digits) for continuous columns and the number of
                distinct values for every other column
        '''
        continuous = self.varCategories['ContinousVars']
        variables = {}
        for col in self.df.columns:
            if col in continuous:
                variables[col] = '{} -- {}'.format('%.5g'% self.df[col].min(),'%.5g'% self.df[col].max())
            else:
                variables[col] = len(self.df[col].unique())
        return pd.Series(variables)

    def generate_report(self)->pd.DataFrame:
        '''
        return: a data frame with one row per column of self.df and the three
                checks side by side, headed by colNames
        '''
        parts = (self.dtype_check(),self.null_check(),self.unique_count_or_range())
        newDf = pd.concat(parts,axis=1)
        newDf.columns = General_Analyzer.colNames
        return newDf

In [None]:
#summarise every column of df: data type, null percentage, value range or distinct count
generalAnalyzer = General_Analyzer(df)
generalAnalyzer.generate_report()

##### Missing Values

In [None]:
class Nan_Filler(object):
    '''
    Fills missing values of a column with a statistic computed separately for
    each value of the label column.
    '''

    #column whose values define the groups
    labelCol = 'Action'

    def __init__(self, df:pd.DataFrame):
        '''
        df: Data frame to operate
        '''
        self.df = df

    def getFillValByLabel(self, col:str, mode:str='mean')->dict:
        '''
        col:  column to compute the fill values for
        mode: name of a pandas Series method taking no arguments
              ('mean', 'median', 'mode', ...)
        return: dict mapping each label value to the statistic of `col` over
                the rows carrying that label
        '''
        labels = self.df[Nan_Filler.labelCol]
        fillVal = dict()
        for label in labels.unique():
            value = getattr(self.df.loc[labels==label, col],mode)()
            #Series.mode() returns a Series of candidates; keep the first one so
            #the result is usable as a scalar fill value
            if isinstance(value, pd.Series):
                value = value.iloc[0] if len(value) else np.nan
            fillVal[label] = value
        return fillVal

    def fillByLabel(self, col:str, mode:str='mean', inplace:bool=True)->'Optional[pd.Series]':
        '''
        col:     column whose missing values are filled
        mode:    statistic used as fill value, see getFillValByLabel
        inplace: write the result back to self.df[col] (returns None) or
                 return the filled column and leave self.df untouched
        '''
        fillVal = self.getFillValByLabel(col,mode)

        #vectorised: look the fill value up per row from its label, then use it
        #only where the column is missing; works for non-numeric columns too
        perRowFill = self.df[Nan_Filler.labelCol].map(fillVal)
        filledCol = self.df[col].fillna(perRowFill)

        if not inplace:
            return filledCol
        self.df[col] = filledCol

In [None]:
#fill missing InteractionTime values per label, using fillByLabel's defaults (mean, in place)
nan_filler = Nan_Filler(df)
nan_filler.fillByLabel('InteractionTime')

In [None]:
generalAnalyzer.null_check()

##### Lower Cases

In [None]:
#the vectorised string accessor leaves missing values as NaN, where calling .lower() on them would raise
df['BannerSize'] = df['BannerSize'].str.lower()

### Exploratory Data Analysis

In [None]:
class Plotter(object):
    '''
    Distribution plots for the columns of a data frame, either pooled over all
    rows ("single" plots) or split by the value of the label column.

    Every plotting method is wrapped by updateVarInfo, which fills
    self.varCategories (and self.labelCounts when a label column is given)
    from the current columns of self.df right before the method body runs.
    '''

    labelCol = 'Action'

    def __init__(self,df):
        '''
        df: the data frame whose columns are plotted
        '''
        self.df = df

    def canvas(self,varCat,cols=2):
        '''
        Close the current figure and open a new one sized for one subplot per
        variable of the given category.

        varCat: key into self.varCategories ('ContinousVars' or 'CategoricalVars')
        cols:   number of subplot columns
        return: (figure, cols, rows)
        '''
        plt.close()
        #ceiling division: a completely filled last row must not add an empty
        #extra row; keep at least one row so the figure height stays positive
        rows = max(1, -(-len(self.varCategories[varCat])//cols))
        fig = plt.figure(figsize=(10,10*rows//cols))
        return fig,cols,rows

    @updateVarInfo()
    def continousPlot(self):
        '''
        One density plot per continuous variable with a separate curve for
        each value of the label column.
        '''
        fig,cols,rows = self.canvas('ContinousVars')
        labels = self.df[Plotter.labelCol].unique()
        for n,col in enumerate(self.varCategories['ContinousVars']):
            ax = fig.add_subplot(rows,cols,n+1)
            ax.set_title(col)
            #kernel bandwidth: one twentieth of the observed value range
            #NOTE(review): newer seaborn releases appear to have renamed `bw` - confirm against the installed version
            bw = (self.df[col].max()-self.df[col].min())/20
            for label in labels:
                sns.kdeplot(self.df[self.df[Plotter.labelCol]==label][col], bw=bw ,label=label)
            plt.legend(loc='upper right')
            plt.xticks(fontsize=10,rotation=90)
        plt.tight_layout()

    #no label column here, so the label is not excluded from the variable lists
    @updateVarInfo(None)
    def continousSinglePlot(self):
        '''
        One density plot per continuous variable over all rows, annotated with
        the variable's mean and variance.
        '''
        fig,cols,rows = self.canvas('ContinousVars')
        for n,col in enumerate(self.varCategories['ContinousVars']):
            ax = fig.add_subplot(rows,cols,n+1)
            ax.set_title(col)
            bw = (self.df[col].max()-self.df[col].min())/20
            sns.kdeplot(self.df[col], bw=bw, legend=False)
            text = 'Mean: {}\nVariance: {}'.format('%.5g'% self.df[col].mean(),'%.5g'% self.df[col].var())
            #axes-relative coordinates: the note sits in the upper middle of each subplot
            ax.text(0.5, 0.95, text,transform=ax.transAxes, fontsize=14,verticalalignment='top')
            plt.xticks(fontsize=10,rotation=90)
        plt.tight_layout()

    @updateVarInfo()
    def catgoricalPlot(self):
        '''
        One grouped bar chart per categorical variable: for every label value,
        the share of that label's rows falling into each category.
        '''
        fig,cols,rows = self.canvas('CategoricalVars')
        labels = self.df[Plotter.labelCol].unique()
        for n,col in enumerate(self.varCategories['CategoricalVars']):
            ax = fig.add_subplot(rows,cols,n+1)
            ax.set_title(col)
            seriesAll = []
            colNames = []
            for label in labels:
                #category counts within this label, normalised by the label's row count
                seriesAll.append(self.df[self.df[Plotter.labelCol]==label].groupby([col]).size()/
                                 self.labelCounts[label])
                colNames.append(label)
            newDf = pd.concat(seriesAll,axis=1)
            newDf.columns = colNames
            newDf.plot(kind = 'bar',ax=ax)
            plt.legend(loc='upper right')
            plt.xticks(fontsize=10,rotation=90)
        plt.tight_layout()

    #no label column here, so the label itself is plotted as one of the categoricals
    @updateVarInfo(None)
    def catgoricalSinglePlot(self,mergeThreshold=0.021):
        '''
        One ring (donut) chart per categorical variable over all rows.

        mergeThreshold: categories whose share is below this fraction, and the
                        category literally named 'other', are pooled into a
                        single 'others' slice; the label column is never pooled
        '''
        fig,cols,rows = self.canvas('CategoricalVars')
        for n,col in enumerate(self.varCategories['CategoricalVars']):
            ax = fig.add_subplot(rows,cols,n+1)
            ax.set_title(col)
            counts = self.df[col].value_counts()
            if col != Plotter.labelCol:
                each = counts.to_dict()
                SUM = sum(each.values())
                new = {'others':0}
                for key,count in each.items():
                    if count/SUM < mergeThreshold or key == 'other':
                        new['others'] += count
                    else:
                        new[key] = count
                if new['others'] == 0:
                    new.pop('others')
                #hand matplotlib real sequences: it converts x with numpy, which
                #cannot interpret a dict view as a sequence of numbers
                plt.pie(x=list(new.values()),labels=list(new.keys()),autopct='%1.1f%%')
            else:
                plt.pie(x=counts.values,labels=counts.index,autopct='%1.1f%%')
            #a white disc over the centre turns the pie into a ring
            centre_circle=plt.Circle((0,0),0.7,color='white',fc='white',linewidth=1)
            ax.add_artist(centre_circle)
        plt.tight_layout()

In [None]:
#plotter bound to the working frame; start with the pooled density of each continuous variable
plotter = Plotter(df)
plotter.continousSinglePlot()

In [None]:
plotter.catgoricalSinglePlot()

In [None]:
plotter.continousPlot()

In [None]:
plotter.catgoricalPlot()

In [None]:
#remove the 'colour' column from the working frame, in place
df.drop(['colour'],axis=1,inplace=True)

### Feature Engineering

##### Feature Creation

In [None]:
def getArea(size):
    '''
    Area encoded in a banner size string.

    size: text of the form 'WIDTHxHEIGHT' such as '300x250' (case and
          surrounding blanks are ignored), the placeholder 'other', or a
          missing value (NaN/None)
    return: WIDTH*HEIGHT as an int; np.nan for 'other' and for missing values
    raise: ValueError when a part of the text is not an integer
    '''
    #missing cells arrive as float NaN or None and have no string methods
    if not isinstance(size, str):
        return np.nan
    size = size.strip().lower()
    if size == 'other':
        return np.nan
    sizeInInt = [int(num) for num in size.split('x')]
    return sizeInInt[0]*sizeInInt[1]

In [None]:
#derive the banner area from the 'WIDTHxHEIGHT' text in BannerSize (NaN where getArea returns NaN)
df['BannerArea'] = df['BannerSize'].map(getArea)

In [None]:
def getRatio(size):
    '''
    Width-to-height ratio encoded in a banner size string.

    size: text of the form 'WIDTHxHEIGHT' such as '300x250' (case and
          surrounding blanks are ignored), the placeholder 'other', or a
          missing value (NaN/None)
    return: WIDTH/HEIGHT as a float; np.nan for 'other', for missing values
            and when HEIGHT is 0
    raise: ValueError when a part of the text is not an integer
    '''
    #missing cells arrive as float NaN or None and have no string methods
    if not isinstance(size, str):
        return np.nan
    size = size.strip().lower()
    if size == 'other':
        return np.nan
    sizeInInt = [int(num) for num in size.split('x')]
    #a zero height has no defined ratio; report it as missing instead of raising
    if sizeInInt[1] == 0:
        return np.nan
    return sizeInInt[0]/sizeInInt[1]

In [None]:
#derive the width/height ratio from the 'WIDTHxHEIGHT' text in BannerSize (NaN where getRatio returns NaN)
df['BannerRatio'] = df['BannerSize'].map(getRatio)

In [None]:
generalAnalyzer.generate_report()

In [None]:
#the derived columns are NaN wherever BannerSize was 'other'; fill them per label with fillByLabel's defaults
nan_filler.fillByLabel('BannerArea')
nan_filler.fillByLabel('BannerRatio')

In [None]:
generalAnalyzer.null_check()

In [None]:
df.head()

In [None]:
#read ActionTime as a count of microseconds and convert it to timestamps
df['RealTime'] = pd.to_datetime(df['ActionTime'], unit='us')

#split the timestamp into the calendar parts Year, Month, Day and Hour
for part in ('year','month','day','hour'):
    df[part.capitalize()] = getattr(df['RealTime'].dt, part)


In [None]:
df.head()

In [None]:
#ActionTime/RealTime and BannerSize have been expanded into derived columns above; drop them together with ID
df.drop(['ID','ActionTime','BannerSize','RealTime'],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
plotter.catgoricalPlot()

In [None]:
def mapHour():
    '''
    Build the hour-of-day to part-of-day lookup.

    return: dict mapping every hour 0..23 to 'Late Night' (0-6),
            'Morning' (7-11), 'Afternoon' (12-17) or 'Night' (18-23)
    '''
    #(name, first hour, one past the last hour)
    spans = (('Late Night',0,7),('Morning',7,12),('Afternoon',12,18),('Night',18,24))
    return {hour: name for name,start,stop in spans for hour in range(start,stop)}

hourMap = mapHour()

#Series.map takes the dict directly; an hour absent from the lookup (e.g. NaN)
#becomes NaN instead of raising KeyError from a per-row lambda
df['DayRange'] = df['Hour'].map(hourMap)

#Hour is now represented by DayRange
df.drop(['Hour'],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
plotter.catgoricalPlot()

##### Encoding

In [None]:
#columns to one-hot encode
colToDummy = ['Website','Brand','DayRange','BannerArea','BannerRatio','Year','Month','Day']

#one indicator frame per entry of colToDummy, in the same order
dummies = list()

def getAllDummies(df):
    '''
    One-hot encode every column listed in colToDummy and store the resulting
    indicator frames (columns prefixed with the source column name) in the
    module-level list `dummies`.

    df: data frame containing all colToDummy columns
    '''
    #replace the contents instead of appending, so a second call does not
    #leave duplicated indicator frames behind
    dummies[:] = [pd.get_dummies(df[col],prefix=col) for col in colToDummy]

getAllDummies(df)

In [None]:
#the untouched columns first, followed by the indicator columns of every encoded variable
encodedDf = pd.concat([df.drop(colToDummy,axis=1)] + dummies,axis=1)

In [None]:
encodedDf.head()

In [None]:
#binary target: 1 for 'Click', 0 for every other action
encodedDf['Action'] = (encodedDf['Action'] == 'Click').astype('int64')

In [None]:
encodedDf.head()

In [None]:
#target vector: the 0/1 encoded Action column
labels = encodedDf['Action']

In [None]:
#feature matrix: every encoded column except the target
features = encodedDf.drop(['Action'],axis=1)

In [None]:
features.shape

In [None]:
labels.shape

##### Feature Selection

In [None]:
# p*(1-p) is the variance of a 0/1 column that takes one value 99.9% of the time;
# it is used below as the cut-off for near-constant columns
threshold = .999 * (1 - .999)

scaler = MinMaxScaler()

# InteractionTime is not a 0/1 column, so rescale it before comparing variances
temp = scaler.fit_transform(features['InteractionTime'].values.reshape(-1,1))

# displayed result: does the rescaled InteractionTime clear the variance threshold?
temp.var() > (threshold)

In [None]:
# fit the selector to obtain the variance of every feature column (read back via sel.variances_)
sel = VarianceThreshold(threshold=(threshold))

sel.fit(features)

In [None]:
# low-variance columns as {name: variance}; InteractionTime is exempt from this filter
unselected = {
    name: variance
    for name,variance in zip(features.columns,sel.variances_)
    if variance <= threshold and name != 'InteractionTime'
}

unselected

In [None]:
# remove the low-variance columns collected in unselected, in place
features.drop(list(unselected.keys()),axis=1,inplace=True)

In [None]:
features.head()

In [None]:
features.shape

In [None]:
# significance level for the chi-squared test of each feature against the label
alpha = 0.05

# chi2 returns (statistics, p-values); only the p-values are kept
pValues = chi2(features,labels)[1]

In [None]:
# columns whose chi-squared p-value is not below alpha, as {name: p-value}.
# The p-value is recorded here: sel.variances_ was computed before the
# low-variance columns were dropped, so its positions no longer match features.columns.
unselected = {col: p for col,p in zip(features.columns,pValues) if p >= alpha}

unselected

In [None]:
# remove the columns whose chi-squared p-value was not below alpha, in place
features.drop(list(unselected.keys()),axis=1,inplace=True)

In [None]:
features.head()

In [None]:
features.shape

##### Dimensionality Reduction

In [None]:
# project the selected features onto 18 principal components
pca = PCA(n_components = 18)
features_pca = pca.fit_transform(features)

In [None]:
pca.explained_variance_ratio_.sum()

##### Normalization

In [None]:
# sklearn's normalize with default arguments: each sample (row) is rescaled to unit L2 norm
final_features = normalize(features_pca)

In [None]:
final_features.shape

### Modelling

##### Dataset Split

In [None]:
# hold out 10% of the samples for testing; random_state fixes the shuffle so the split is repeatable
# NOTE(review): the split is not stratified on labels - confirm that is intended
X_train, X_test, y_train, y_test = train_test_split(final_features, labels, test_size=0.1, random_state=255)

##### Baseline Model - Logistic Regression

In [None]:
# candidate values of C: 10 log-spaced points from 10**-2 to 10**1.5
lrHyperParamCandidates = {'C' : np.logspace(-2,1.5,10)}

In [None]:
# L1-penalised logistic regression with class weights balanced against class frequency
lr = LR(penalty='l1', solver='liblinear', class_weight='balanced', n_jobs=1)

In [None]:
# cross-validated grid search over the C candidates, ranked by ROC AUC
clf = GridSearchCV(lr,lrHyperParamCandidates,scoring='roc_auc',n_jobs=1,verbose=1)

In [None]:
# run the search on the training split only
clf.fit(X_train,y_train)

In [None]:
# the estimator for the best-scoring C found by the search
lr_best = clf.best_estimator_

In [None]:
# 100 log-spaced probability cut-offs between 1e-4 and 1e-1
pred_thresholds = np.logspace(-4,-1,100)

In [None]:
def findThreshold(model,threshold):
    '''
    Score a probability cut-off on the training split.

    model:     fitted classifier exposing predict_proba
    threshold: probabilities strictly above this value are predicted as class 1
    return:    F-beta score with beta=10 of the thresholded predictions against
               y_train (reads the module-level X_train and y_train)
    '''
    #threshold and beta are passed by keyword: recent scikit-learn releases
    #declare them keyword-only and reject the positional form
    pred = binarize(model.predict_proba(X_train)[:,1].reshape(-1,1),threshold=threshold)
    f10 = fbeta_score(y_train,pred,beta=10)
    return f10

In [None]:
best_threshold = 0.5 # fixed cut-off; the F10-based search is disabled: list(pred_thresholds)[np.argmax(np.vectorize(findThreshold)(lr_best,pred_thresholds))]

In [None]:
# column 1 of predict_proba: predicted probability of the second class (label 1) on the test split
proba = lr_best.predict_proba(X_test)[:,1]

In [None]:
# threshold is passed by keyword: recent scikit-learn releases reject it positionally
y_pred = binarize(proba.reshape(-1,1),threshold=best_threshold)

In [None]:
# beta is passed by keyword: recent scikit-learn releases reject it positionally
f10_lr = fbeta_score(y_test,y_pred,beta=10)
f10_lr

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
def roc_plot(fpr,tpr,name,auc):
    '''
    Draw a ROC curve on a new figure and show it.

    fpr:  false positive rates (x values)
    tpr:  true positive rates (y values), same length as fpr
    name: model name shown in the title
    auc:  area under the curve, shown in the title
    '''
    plt.figure(figsize=(20,5))
    plt.plot(fpr,tpr)
    #both rates live in [0, 1]; pin both axes (ylim used to be set twice and xlim never)
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.0])
    plt.title('ROC of {}      AUC: {}'.format(name,auc))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.show()

In [None]:
# test-split ROC: area under the curve, the curve points, and the plot
auc = roc_auc_score(y_test,proba)
fpr, tpr, thresholds = roc_curve(y_test,proba,pos_label=1)
roc_plot(fpr,tpr,'Logistic Regression',auc)

##### XGBoost