In [1]:
import sys

!conda install --yes -c intel scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model.logistic import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

Collecting package metdone
done

## Package Plan ##

  environment location: /Users/mm06682/.prefix/sw/miniconda

  added / updated specs:
    - scikit-learn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    blas-1.0                   |              mkl           6 KB
    ca-certificates-2019.11.27 |                0         131 KB
    certifi-2019.11.28         |           py37_0         156 KB
    conda-4.7.12               |           py37_0         3.0 MB
    intel-openmp-2019.5        |        intel_281         1.1 MB  intel
    joblib-0.14.0              |             py_0         201 KB
    libgfortran-3.0.1          |       h93005f0_2         426 KB
    llvm-openmp-4.0.1          |       hcfea43d_1         409 KB
    mkl-2019.5                 |        intel_281       157.8 MB  intel
    mkl-service-2.3.0          |   py37hfbe908c_0         201 KB

    mkl_random-1.1.0          

In [ ]:
## Read Data

In [11]:
# Load the processed survey data and replace every missing value with the
# literal string "NA" (so that missing entries become their own category when
# the categorical columns are label-encoded later).
# A single vectorized fillna replaces the previous per-cell .iloc assignment
# loop, which performed one Python-level write for each missing cell.
# NOTE(review): the path is absolute and machine-specific; consider reading it
# relative to a configurable data directory.
df = pd.read_csv(
    '/Users/mm06682/projects/school_projects/fall_2019/software_engineering/google-ad-bias-research/data/processed/final_long.csv'
).fillna("NA")

## Create data set

In [12]:
# Columns used as model inputs. 'City' is not part of this list (an earlier
# variant of the list included it).
X_column = [
    'age', 'gender', 'education', 'occupation', 'Hispanic', 'race', 'Political',
    'feelAboutAd', 'image', 'State', 'Region', 'Division', 'page_type', 'relevant',
]

# Columns copied through unchanged; every other column is label-encoded.
num_column = ['age', 'feelAboutAd']

# Build the feature frame column by column, preserving the order of X_column.
encoded_columns = {}
for name in X_column:
    if name in num_column:
        encoded_columns[name] = df[name]
    else:
        # Relabel the column's categories as the integers 0..k-1.
        categorical = pd.Categorical(df[name])
        n_categories = len(categorical.categories)
        encoded_columns[name] = categorical.rename_categories(range(n_categories))

X_df = pd.DataFrame(encoded_columns)
y_df = df['rating']

# Plain int32 arrays for scikit-learn.
X = X_df.values.astype(np.int32)
y = y_df.values.astype(np.int32)

In [13]:
# First carve off 20% of the data as a hold-out set, then split the remaining
# "model" portion 80/20 into train and test. Both splits use fixed seeds.
X_model, X_holdout, y_model, y_holdout = train_test_split(
    X, y,
    test_size=0.2,
    random_state=100,
)
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y_model,
    test_size=0.2,
    random_state=0,
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to both the training and the test features.
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)
# NOTE(review): X_holdout is not transformed in this cell; it must go through
# feature_scaler.transform before being scored with models fitted on X_train.

## Models


In [18]:
def getModel(modelName):
    """Return an unfitted classifier together with its hyper-parameter grid.

    Parameters
    ----------
    modelName : str
        One of ``"lr"`` (logistic regression), ``"randomforest"``,
        ``"knn"`` or ``"svm"``.

    Returns
    -------
    tuple
        ``(model, grid_param)``: the estimator instance and a dict mapping
        parameter names to the list of candidate values to search over.

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported names. (Previously an
        unknown name fell through to the ``return`` and raised
        ``UnboundLocalError``.)
    """
    if modelName == "lr":
        model = LogisticRegression(dual=False,
                       fit_intercept=True,intercept_scaling=1,class_weight=None,
                       random_state=None,solver='liblinear',max_iter=100, 
                       multi_class='auto',verbose=0,warm_start=False,
                       n_jobs=None)
        grid_param = {
            'penalty':['l1', 'l2'],
            'C': [0.01, 0.10, 1.0],
        }
    elif modelName == "randomforest":
        # ``min_impurity_split`` is intentionally not passed: it was only ever
        # given its default (None) here, and the keyword has been removed from
        # scikit-learn, where passing it raises a TypeError.
        model = RandomForestClassifier(n_estimators=100,criterion='gini',max_depth=None,
                            bootstrap=False, oob_score=False, n_jobs=-1, 
                            random_state=None, verbose=0, warm_start=False, class_weight=None)
        grid_param = {
            'n_estimators': [100, 300, 500, 800, 1000],
            'min_samples_leaf': [1, 3, 5, 7]
        }
    elif modelName == "knn":
        model = KNeighborsClassifier(weights='uniform',algorithm='auto',p=2,metric='minkowski',metric_params=None,
                         n_jobs=None)
        grid_param ={
            'n_neighbors' :[2,5,10,20,30]
        }
    elif modelName == "svm":
        model = SVC(kernel='rbf', shrinking=True, 
         probability=False, verbose=False, 
         max_iter=-1, decision_function_shape='ovr', random_state=None)
        
        grid_param = {
            'C' : [0.001, 0.01, 0.1, 1, 10],
            'gamma': [0.001, 0.01, 0.1, 1]
        }
    else:
        # Fail loudly and descriptively instead of hitting an unbound local.
        raise ValueError(
            "Unknown model name {!r}; expected one of "
            "'lr', 'randomforest', 'knn', 'svm'".format(modelName)
        )
        
    return model, grid_param

## Model Grid Search

In [19]:
# Run a 5-fold cross-validated grid search (scored on accuracy) for each model
# type, then report the best parameters, the best mean CV score, and the
# accuracy of the selected estimator on the X_test / y_test split.
models = ["lr","randomforest","knn","svm"]
for m in models:
    model, param = getModel(m)
    gd_sr = GridSearchCV(estimator=model,param_grid=param,scoring='accuracy',cv=5,n_jobs=-1)
    print ("Start grid search for {}".format(m))
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_
    best_result = gd_sr.best_score_
    # GridSearchCV is used with its default refit=True, so best_estimator_ has
    # already been refitted on all of X_train with the best parameters. The
    # extra best_estimator.fit(X_train, y_train) call was therefore redundant
    # and only repeated the most expensive fit.
    best_estimator = gd_sr.best_estimator_
    print ("Best Param for {} is {} with {} accuracy".format(m,best_parameters,best_result))
    print("Validation Accuracy = {}".format(best_estimator.score(X_test,y_test)))
    print("=============================")

Start grid search for lr
Best Param for lr is {'C': 0.01, 'penalty': 'l1'} with 0.4025096525096525 accuracy
Validation Accuracy = 0.42901234567901236
Start grid search for randomforest
Best Param for randomforest is {'min_samples_leaf': 3, 'n_estimators': 500} with 0.4797297297297297 accuracy
Validation Accuracy = 0.4976851851851852
Start grid search for knn
Best Param for knn is {'n_neighbors': 20} with 0.41235521235521233 accuracy
Validation Accuracy = 0.44830246913580246
Start grid search for svm
Best Param for svm is {'C': 1, 'gamma': 0.1} with 0.4333976833976834 accuracy
Validation Accuracy = 0.4591049382716049
