In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model.logistic import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Read Data

In [2]:
# Load the processed long-format table and replace every missing entry with
# the literal string "NA".  A single vectorised fillna() does what the former
# per-cell `df.iloc[i, j] = "NA"` loop did, without one Python-level write per
# missing cell.
# NOTE(review): presumably "NA" is meant to act as its own category when the
# categorical columns are encoded further down — confirm that the numeric
# columns never contain missing values, otherwise the int cast will fail.
df = pd.read_csv('../data/processed/final_long.csv').fillna("NA")

## Create data set

In [3]:
# Columns used as model inputs.  An earlier variant of this list also
# contained 'City'; it is not part of the current feature set.
X_column = [
    'age', 'gender', 'education', 'occupation', 'Hispanic', 'race', 'Political',
    'feelAboutAd', 'image', 'State', 'Region', 'Division', 'page_type', 'relevant',
]

# Columns copied over unchanged; every other feature is handled as categorical.
num_column = ['age', 'feelAboutAd']

# Assemble the feature columns first, then build the frame in one step.
feature_columns = {}
for name in X_column:
    if name in num_column:
        feature_columns[name] = df[name]
        continue
    # Relabel the categories of this column as 0 .. n_categories - 1.
    as_category = pd.Categorical(df[name])
    feature_columns[name] = as_category.rename_categories(
        range(len(as_category.categories))
    )

X_df = pd.DataFrame(feature_columns, index=df.index, columns=X_column)
y_df = df['rating']

# Plain int32 arrays for scikit-learn.
X = X_df.values.astype('int32')
y = y_df.values.astype('int32')

In [4]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both partitions.
feature_scaler = StandardScaler().fit(X_train)
X_train, X_test = (
    feature_scaler.transform(X_train),
    feature_scaler.transform(X_test),
)

## Models


In [6]:
def getModel(modelName, random_state=None):
    """Return an unfitted classifier and the hyper-parameter grid to search for it.

    Parameters
    ----------
    modelName : str
        One of "lr" (logistic regression), "randomforest", "knn" or "svm".
    random_state : int or None, optional
        Seed forwarded to the estimators that accept one (all but k-NN).
        The default ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    tuple
        ``(model, grid_param)`` where ``model`` is the estimator instance and
        ``grid_param`` is a dict mapping parameter names to candidate values,
        in the form expected by ``GridSearchCV(param_grid=...)``.

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported names.

    Notes
    -----
    scikit-learn's internal deprecation sentinels ('warn', 'auto_deprecated')
    and the removed ``min_impurity_split`` argument are intentionally NOT
    passed: they are placeholders for "argument not given" and are rejected
    by later releases.  Leaving the arguments out selects the library default.
    """
    if modelName == "lr":
        # liblinear handles both penalties ('l1' and 'l2') searched below.
        model = LogisticRegression(
            penalty='l2', dual=False, tol=0.0001, C=1.0,
            fit_intercept=True, intercept_scaling=1, class_weight=None,
            random_state=random_state, solver='liblinear', max_iter=100,
            verbose=0, warm_start=False, n_jobs=None,
        )
        grid_param = {
            'penalty': ['l1', 'l2'],
            'C': np.logspace(-4, 4, 10),
        }
    elif modelName == "randomforest":
        # n_estimators is always supplied by the grid, so no value is fixed here.
        model = RandomForestClassifier(
            criterion='gini', max_depth=None,
            min_samples_leaf=1, min_weight_fraction_leaf=0.0,
            bootstrap=True, oob_score=False, n_jobs=None,
            random_state=random_state, verbose=0, warm_start=False,
            class_weight=None,
        )
        grid_param = {
            'n_estimators': [100, 300, 500, 800, 1000],
            'criterion': ['gini', 'entropy'],
            'bootstrap': [True, False],
        }
    elif modelName == "knn":
        model = KNeighborsClassifier(
            n_neighbors=5, weights='uniform', algorithm='auto',
            leaf_size=30, p=2, metric='minkowski', metric_params=None,
            n_jobs=None,
        )
        grid_param = {
            'n_neighbors': [2, 5, 10, 20, 30],
            'leaf_size': [10, 20, 30, 40],
        }
    elif modelName == "svm":
        # gamma is always supplied by the grid, so no value is fixed here.
        model = SVC(
            C=1.0, kernel='rbf', degree=3, coef0=0.0, shrinking=True,
            probability=False, tol=0.001, cache_size=200, class_weight=None,
            verbose=False, max_iter=-1, decision_function_shape='ovr',
            random_state=random_state,
        )
        grid_param = {
            'C': [0.001, 0.01, 0.1, 1, 10],
            'gamma': [0.001, 0.01, 0.1, 1],
        }
    else:
        # Previously an unknown name fell through to the return statement and
        # surfaced as an UnboundLocalError on `model`.
        raise ValueError(
            "Unknown modelName {!r}; expected one of "
            "'lr', 'randomforest', 'knn', 'svm'".format(modelName)
        )

    return model, grid_param

## Model Grid Search

In [10]:
# Tune every candidate model with a 5-fold cross-validated grid search on the
# training split, then report its accuracy on the held-out split.
models = ["lr", "randomforest", "knn", "svm"]
for m in models:
    model, param = getModel(m)
    gd_sr = GridSearchCV(estimator=model, param_grid=param, scoring='accuracy', cv=5, n_jobs=-1)
    print("Start grid search for {}".format(m))
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_
    best_result = gd_sr.best_score_
    # GridSearchCV uses refit=True by default, so best_estimator_ has already
    # been fitted on all of X_train with the winning parameters.  The manual
    # second fit that used to follow only repeated that work and, for unseeded
    # estimators, re-drew the model before it was scored.
    best_estimator = gd_sr.best_estimator_
    print("Best Param for {} is {} with {} accuracy".format(m, best_parameters, best_result))
    # The score below is computed on X_test / y_test (the held-out 20%).
    # The label text is kept byte-for-byte so it matches the recorded output.
    print("Valdiation Accurtacy = {}".format(best_estimator.score(X_test, y_test)))
    print("=============================")

Start grid search for lr




Best Param for lr is {'C': 0.046415888336127774, 'penalty': 'l1'} with 0.40873996294008647 accuracy
Valdiation Accurtacy = 0.40185185185185185
Start grid search for randomforest




Best Param for randomforest is {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 800} with 0.4567634342186535 accuracy
Valdiation Accurtacy = 0.4617283950617284
Start grid search for knn
Best Param for knn is {'leaf_size': 10, 'n_neighbors': 10} with 0.4298949969116739 accuracy
Valdiation Accurtacy = 0.4302469135802469
Start grid search for svm
Best Param for svm is {'C': 1, 'gamma': 0.1} with 0.45213094502779494 accuracy
Valdiation Accurtacy = 0.4506172839506173
