In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [2]:
# Read the separately stored train/test feature tables and label tables.
X_train, X_test = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_test.csv')
y_train, y_test = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_test.csv')

In [3]:
# Discard the 'dec_o' column and flatten what is left into a 1-D array,
# which is the label format handed to every fit()/score() call below.
# NOTE(review): this rebinds y_train/y_test from DataFrame to ndarray, so the
# cell cannot be run a second time without reloading the CSVs first.
y_train = y_train.drop('dec_o', axis=1).to_numpy().ravel()
y_test = y_test.drop('dec_o', axis=1).to_numpy().ravel()

# Baseline

## XGBoost
XGBoost was the best performer in the earlier classification trials.

In [4]:
# Baseline boosted-tree classifier on the full feature set, depth capped at 5.
xgb_0 = XGBClassifier(max_depth=5)
xgb_0.fit(X_train, y_train)
# Accuracy on the held-out split.
xgb_0.score(X_test, y_test)

0.8353221957040573

In [5]:
# Pair every feature name with the importance the baseline booster gave it.
# feature_importances_ is already 1-D, so it becomes a single column as-is.
importances = pd.concat(
    [pd.DataFrame(X_train.columns), pd.DataFrame(xgb_0.feature_importances_)],
    axis=1,
)
importances

Unnamed: 0,0,0.1
0,gender,0.045164
1,age,0.027742
2,age_o,0.0
3,int_corr,0.02829
4,samerace,0.028672
5,date,0.027892
6,exphappy,0.029873
7,attr3_1,0.027217
8,sinc3_1,0.027132
9,fun3_1,0.030985


## Logistic Regression

In [6]:
# Baseline L2-penalised logistic regression on the full feature set.
# 'sag' shuffles the samples while optimising, so the seed is pinned to make
# the fit repeatable across kernel restarts.
lr_0 = LogisticRegression(max_iter=3000, penalty='l2', solver='sag', random_state=0)
lr_0.fit(X_train, y_train)
# NOTE(review): this is accuracy on the training split; the XGBoost baseline
# above is scored on X_test/y_test — confirm which figure is intended here.
lr_0.score(X_train, y_train)

0.8352730528200537

# Manual Trials
## Without Preferences

In [7]:
# The preference features are the contiguous column run 'attr1_1' .. 'pf_o_sha'
# (label slices with .loc include both endpoints).
# Resolve the names once from the training frame and drop that same list from
# both splits, so train and test are guaranteed to lose identical columns.
preference_cols = X_train.loc[:, 'attr1_1':'pf_o_sha'].columns
X_train_2 = X_train.drop(columns=preference_cols)
X_test_2 = X_test.drop(columns=preference_cols)

In [8]:
# Same booster settings as the baseline, trained without the preference columns.
xgb = XGBClassifier(max_depth=5)
xgb.fit(X_train_2, y_train)
# Accuracy on the matching reduced test frame.
xgb.score(X_test_2, y_test)

0.8359188544152745

In [9]:
# L2 logistic regression trained without the preference columns.
# 'sag' shuffles the samples while optimising, so the seed is pinned to make
# the fit repeatable across kernel restarts.
lr = LogisticRegression(max_iter=3000, penalty='l2', solver='sag', random_state=0)
lr.fit(X_train_2, y_train)
# Accuracy on the matching reduced test frame.
lr.score(X_test_2, y_test)

0.8353221957040573

## Without Gender

In [10]:
# Feature frames with the 'gender' column removed; the originals stay untouched.
X_train_3 = X_train.drop('gender', axis=1)
X_test_3 = X_test.drop('gender', axis=1)

In [11]:
# Same booster settings as the baseline, trained without 'gender'.
xgb = XGBClassifier(max_depth=5)
xgb.fit(X_train_3, y_train)
# Accuracy on the matching reduced test frame.
xgb.score(X_test_3, y_test)

0.834128878281623

In [12]:
# L2 logistic regression trained without 'gender'.
# 'sag' shuffles the samples while optimising, so the seed is pinned to make
# the fit repeatable across kernel restarts.
lr = LogisticRegression(max_iter=3000, penalty='l2', solver='sag', random_state=0)
lr.fit(X_train_3, y_train)
# Accuracy on the matching reduced test frame.
lr.score(X_test_3, y_test)

0.8353221957040573

# L1 Regularization
## Logistic Regression

In [13]:
# NOTE(review): solver search left disabled — presumably kept for reference
# after 'saga' was picked for the next cell; confirm before deleting or
# re-enabling it.
# params = {'penalty': ['l1'],
#           'solver': ['liblinear', 'saga'],
#           'max_iter': [3000]}

# gscv = GridSearchCV(LogisticRegression(), params, cv=3)
# gscv.fit(X_train, y_train)
# gscv.best_params_

In [14]:
# L1-penalised logistic regression fitted with the 'saga' solver.
# 'saga' shuffles the samples while optimising, so the seed is pinned to make
# the coefficients inspected later repeatable across kernel restarts.
lr = LogisticRegression(max_iter=3000, penalty='l1', solver='saga', random_state=0)
lr.fit(X_train, y_train)
# Training-split accuracy; the held-out figure is computed in the next cell.
lr.score(X_train, y_train)

0.8352730528200537

In [15]:
# Held-out accuracy of the L1 model fitted in the previous cell.
lr.score(X=X_test, y=y_test)

0.8353221957040573

In [16]:
# One row per feature: its name next to the fitted L1 coefficient.
# coef_ is transposed so the weights form a column alongside the name column.
coefficients = pd.concat(
    [pd.DataFrame(X_train.columns), pd.DataFrame(lr.coef_.T)],
    axis=1,
)
coefficients

Unnamed: 0,0,0.1
0,gender,-0.179258
1,age,-0.001264
2,age_o,-0.001264
3,int_corr,0.039848
4,samerace,0.001974
5,date,0.010847
6,exphappy,0.010908
7,attr3_1,-0.028286
8,sinc3_1,0.061943
9,fun3_1,-0.010194


## XGBoost

In [17]:
# Search tree count and depth with the L1 term (reg_alpha) held at 1,
# using 3-fold cross-validation on the training split only.
params = dict(
    n_estimators=[100, 150, 200],
    max_depth=[3, 5, 10],
    reg_alpha=[1],
)

gscv = GridSearchCV(estimator=XGBClassifier(), param_grid=params, cv=3)
gscv.fit(X_train, y_train)
gscv.best_params_

{'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 1}

In [20]:
# Refit with the winning grid-search configuration instead of re-typing it, so
# this cell cannot drift from the search result if the grid is ever changed.
xgb = XGBClassifier(**gscv.best_params_)
xgb.fit(X_train, y_train)
# Accuracy on the held-out split.
xgb.score(X_test, y_test)

0.8359188544152745

In [21]:
# Pair every feature name with the importance from the regularised booster.
# feature_importances_ is already 1-D, so it becomes a single column as-is.
importances = pd.concat(
    [pd.DataFrame(X_train.columns), pd.DataFrame(xgb.feature_importances_)],
    axis=1,
)
importances

Unnamed: 0,0,0.1
0,gender,0.029581
1,age,0.029631
2,age_o,0.0
3,int_corr,0.034293
4,samerace,0.038866
5,date,0.034737
6,exphappy,0.032175
7,attr3_1,0.03415
8,sinc3_1,0.029973
9,fun3_1,0.038111
