In [19]:
import sys

#!conda install --yes -c intel scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model.logistic import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

In [ ]:
## Read Data

In [12]:
# Load the processed long-format dataset.
# NOTE(review): this is an absolute, machine-specific path; the cell will only run
# where this exact file exists. Consider a configurable data directory.
df = pd.read_csv('/Users/mm06682/projects/school_projects/fall_2019/software_engineering/google-ad-bias-research/data/processed/final_long.csv')
# Replace every missing value with the literal string "NA" in a single vectorized
# pass, instead of writing each missing cell individually through .iloc.
df = df.fillna("NA")

## Create data set

In [73]:
# Earlier, wider feature sets that were tried (kept for reference):
# X_column = ['age', 'gender', 'education', 'occupation', 'Hispanic', 'race','Political',
#             'feelAboutAd', 'image','City', 'State', 'Region','Division', 'page_type', 'relevant']
# X_column = ['age', 'gender', 'education', 'occupation', 'Hispanic', 'race','Political',
#             'feelAboutAd', 'State', 'Region','Division', 'page_type', 'relevant']

# Columns of df used as model features, in the order they appear in X.
X_column = ['page_type', 'relevant', 'age', 'feelAboutAd', 'occupation', 'education']

# Feature columns copied over unchanged; every other feature column is
# label-encoded below.
num_column = ['age', 'feelAboutAd']

# y_df starts as an empty placeholder and is replaced by the 'rating' column below.
X_df, y_df = pd.DataFrame(), pd.DataFrame()

for col in X_column:
    if col in num_column:
        # Pass-through column: use the values exactly as they are in df.
        X_df[col] = df[col]
        continue
    # Label-encode: relabel the column's categories as 0..k-1.
    t = pd.Categorical(df[col])
    n_categories = len(t.categories)
    X_df[col] = t.rename_categories(range(n_categories))

# Prediction target.
y_df = df['rating']

# Plain int32 arrays handed to scikit-learn.
X = X_df.values.astype('int32')
y = y_df.values.astype('int32')

In [74]:
# First split: set aside 20% of all rows as a hold-out set.
X_model, X_holdout, y_model, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=100
)
# Second split: divide the remaining rows 80/20 into train / test.
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y_model, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both train and test. X_holdout is not scaled in this cell.
feature_scaler = StandardScaler()
feature_scaler.fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

## Models


In [56]:
def getModel(modelName):
    """Build an unfitted classifier and its grid-search parameter grid.

    Parameters
    ----------
    modelName : str
        One of "lr" (logistic regression), "randomforest", "knn" or "svm".

    Returns
    -------
    tuple
        ``(model, grid_param)`` where ``model`` is the unfitted estimator and
        ``grid_param`` is a dict mapping hyper-parameter names to the list of
        candidate values to search over.

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported names. (Previously this
        fell through to the ``return`` and failed with an UnboundLocalError.)
    """
    if modelName == "lr":
        model = LogisticRegression(dual=False,
                       fit_intercept=True,intercept_scaling=1,class_weight=None,
                       random_state=None,solver='liblinear',max_iter=100, 
                       multi_class='auto',verbose=0,warm_start=False,
                       n_jobs=None)
        grid_param = {
            'penalty':['l1', 'l2'],
            'C': [0.01, 0.10, 1.0],
        }
    elif modelName == "randomforest":
        # min_impurity_split=None was dropped: it only restated the default and
        # the parameter no longer exists in scikit-learn >= 1.0.
        model = RandomForestClassifier(n_estimators=100,criterion='gini',max_depth=None,
                            bootstrap=False, oob_score=False, n_jobs=-1, 
                            random_state=None, verbose=0, warm_start=False, class_weight=None)
        grid_param = {
            'n_estimators': [100, 300, 500, 800, 1000],
            'min_samples_leaf': [1, 3, 5, 7]
        }
    elif modelName == "knn":
        model = KNeighborsClassifier(weights='uniform',algorithm='auto',p=2,metric='minkowski',metric_params=None,
                         n_jobs=None)
        grid_param ={
            'n_neighbors' :[2,5,10,20,30]
        }
    elif modelName == "svm":
        model = SVC(kernel='rbf', shrinking=True, 
         probability=False, verbose=False, 
         max_iter=-1, decision_function_shape='ovr', random_state=None)
        
        grid_param = {
            'C' : [0.001, 0.01, 0.1, 1, 10],
            'gamma': [0.001, 0.01, 0.1, 1]
        }
    else:
        # Fail loudly with the offending name instead of an UnboundLocalError.
        raise ValueError(
            "Unknown model name {!r}; expected one of "
            "'lr', 'randomforest', 'knn', 'svm'".format(modelName)
        )

    return model, grid_param

## Model Grid Search

In [49]:
# Grid-search each candidate model with 5-fold CV on the training split, then
# report the accuracy of the selected configuration on the test split.
models = ["lr","randomforest","knn","svm"]
for m in models:
    model, param = getModel(m)
    gd_sr = GridSearchCV(estimator=model,param_grid=param,scoring='accuracy',cv=5,n_jobs=-1)
    print ("Start grid search for {}".format(m))
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_
    best_result = gd_sr.best_score_
    # With GridSearchCV's default refit=True, best_estimator_ has already been
    # refit on all of X_train with the best parameters, so it is scored as-is.
    # Fitting it a second time only repeated the work (and, for models built
    # with random_state=None, scored a different model than the one selected).
    best_estimator = gd_sr.best_estimator_
    print ("Best Param for {} is {} with {} accuracy".format(m,best_parameters,best_result))
    print("Validation Accuracy = {}".format(best_estimator.score(X_test,y_test)))
    print("=============================")

Start grid search for lr
Best Param for lr is {'C': 0.1, 'penalty': 'l1'} with 0.40366795366795366 accuracy
Validation Accuracy = 0.42746913580246915
Start grid search for randomforest
Best Param for randomforest is {'min_samples_leaf': 7, 'n_estimators': 800} with 0.4723938223938224 accuracy
Validation Accuracy = 0.4976851851851852
Start grid search for knn
Best Param for knn is {'n_neighbors': 2} with 0.41563706563706565 accuracy
Validation Accuracy = 0.43132716049382713
Start grid search for svm
Best Param for svm is {'C': 10, 'gamma': 0.1} with 0.44324324324324327 accuracy
Validation Accuracy = 0.466820987654321


In [75]:
# Fit one random forest with fixed hyper-parameters and inspect which features
# it relies on most.
# min_impurity_split=None was dropped: it only restated the default and the
# parameter no longer exists in scikit-learn >= 1.0.
forest = RandomForestClassifier(n_estimators=1000,criterion='gini',max_depth=None,
                            bootstrap=False, oob_score=False, n_jobs=-1, 
                            random_state=None, verbose=0, warm_start=False, class_weight=None, 
                            min_samples_leaf=7)
# Label the scaled training features with their column names so the
# importances below can be indexed by feature name.
X_train = pd.DataFrame(X_train, columns = X_df.columns)
# Give the test features the same column labels for prediction, so the model
# is fit and evaluated on identically labelled inputs. X_test itself is left
# untouched.
X_test_named = pd.DataFrame(X_test, columns = X_df.columns)
forest.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
print("Accuracy: {}".format(accuracy_score(y_test, forest.predict(X_test_named))))
feature_importances = pd.DataFrame(forest.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)

# TODO: subset the data frames by gender here. Goal: make predictions and check accuracy separately for each gender in the test set. See https://stackoverflow.com/questions/45257635/how-to-subset-data-based-on-another-column-in-python
# It is not clear whether the fair-ML tools can do this. Time-box this work: writing the report sections is the higher priority for tomorrow.


Accuracy: 0.46296296296296297
             importance
age            0.331928
feelAboutAd    0.229282
occupation     0.139350
education      0.115322
page_type      0.094404
relevant       0.089715


In [10]:
#!pip install aif360==0.2.0
#!pip install fairkit-learn


from fklearn.metric_library import UnifiedMetricLibrary, classifier_quality_score

Collecting aif360==0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/cf/ec/6497d4ee752611e80a6d5cd51ae89ae8ded02fe6c48a5424db2db7c252af/aif360-0.2.0-py3-none-any.whl (23.4MB)
[K     |████████████████████████████████| 23.4MB 1.2MB/s 
[?25hCollecting numpy<1.16,>=1.14 (from aif360==0.2.0)
[?25l  Downloading https://files.pythonhosted.org/packages/3d/c3/a69406093c9a780a74964f41cd56b06c0346d686a9b3f392d123a663f5e0/numpy-1.15.4-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.5MB)
[K     |████████████████████████████████| 24.5MB 1.1MB/s 
Collecting pandas==0.23.3 (from aif360==0.2.0)
[?25l  Downloading https://files.pythonhosted.org/packages/5d/40/a87f29155cebd25c345e71bd9251f591258f888d553ee42210528546cee8/pandas-0.23.3-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (14.0MB)
[K     |████████████████████████████████| 14.0MB 1.4MB/s 
Installing c

In [None]:
# TODO: generate metrics here that show whether the model performs (or would perform) differently based on gender. Review Brittney's examples and the Fair AI examples more closely first. Writing the code will be simple; get conceptually clear on what you are trying to show before doing more here.

