In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [2]:
X_train = pd.read_csv('data/X_train.csv')
y_train = pd.read_csv('data/y_train.csv').values.ravel()
X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv').values.ravel()

# Baseline

## XGBoosting
Best performer from classification trials

In [20]:
xgb_0 = XGBClassifier(max_depth=5)
xgb_0.fit(X_train,y_train).score(X_test, y_test)

0.8484486873508353

In [22]:
importances = pd.concat([pd.DataFrame(X_train.columns),
                         pd.DataFrame(np.transpose(xgb_0.feature_importances_))], 
                        axis = 1)
importances

Unnamed: 0,0,0.1
0,gender,0.023205
1,age,0.03458
2,age_o,0.0
3,int_corr,0.019367
4,samerace,0.020384
5,date,0.028621
6,exphappy,0.019852
7,attr3_1,0.025507
8,sinc3_1,0.025202
9,fun3_1,0.023922


## Logistic Regression

In [23]:
lr_0 = LogisticRegression(max_iter=3000, penalty='l2', solver='sag')
lr_0.fit(X_train,y_train).score(X_train, y_train)

0.8390032826022082

# Manual Trials
## Without Preferences

In [5]:
X_train_2 = X_train.drop(X_train.loc[:,'attr1_1':'pf_o_sha'].head(0).columns, axis=1)
X_test_2 = X_test.drop(X_test.loc[:,'attr1_1':'pf_o_sha'].head(0).columns, axis=1)

In [6]:
xgb = XGBClassifier(max_depth=5)
xgb.fit(X_train_2,y_train).score(X_test_2, y_test)

0.8430787589498807

In [7]:
lr = LogisticRegression(max_iter=3000, penalty='l2', solver='sag')
lr.fit(X_train_2,y_train).score(X_test_2, y_test)

0.8347255369928401

## Without Gender

In [8]:
X_train_3 = X_train.drop(columns=['gender'])
X_test_3 = X_test.drop(columns=['gender'])

In [9]:
xgb = XGBClassifier(max_depth=5)
xgb.fit(X_train_3,y_train).score(X_test_3, y_test)

0.8496420047732697

In [10]:
lr = LogisticRegression(max_iter=3000, penalty='l2', solver='sag')
lr.fit(X_train_3,y_train).score(X_test_3, y_test)

0.8359188544152745

# L1 Regularization
## Logistic Regression

In [11]:
# params = {'penalty': ['l1'],
#           'solver': ['liblinear', 'saga'],
#           'max_iter': [3000]}

# gscv = GridSearchCV(LogisticRegression(), params, cv=3)
# gscv.fit(X_train, y_train)
# gscv.best_params_

In [12]:
lr = LogisticRegression(max_iter=3000, penalty='l1', solver='saga')
lr.fit(X_train,y_train).score(X_train, y_train)

0.838704864219636

In [13]:
lr.score(X_test, y_test)

0.8347255369928401

In [16]:
coefficients = pd.concat([pd.DataFrame(X_train.columns),
                          pd.DataFrame(np.transpose(lr.coef_))], 
                         axis = 1)
coefficients

Unnamed: 0,0,0.1
0,gender,-0.227981
1,age,-0.018837
2,age_o,-0.018837
3,int_corr,0.196187
4,samerace,-0.054904
5,date,-0.117895
6,exphappy,-0.023227
7,attr3_1,0.076634
8,sinc3_1,0.013839
9,fun3_1,-0.032163


## XGBoost

In [19]:
params = {'n_estimators': [100, 150, 200],
          'max_depth': [3, 5, 10],
          'reg_alpha': [1]}

gscv = GridSearchCV(XGBClassifier(), params, cv=3)
gscv.fit(X_train, y_train)
gscv.best_params_

{'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 1}

In [24]:
xgb = XGBClassifier(max_depth=5, reg_alpha=1)
xgb.fit(X_train,y_train).score(X_test, y_test)

0.8490453460620525

In [26]:
importances = pd.concat([pd.DataFrame(X_train.columns),
                         pd.DataFrame(np.transpose(xgb.feature_importances_))], 
                        axis = 1,
                        )
importances

Unnamed: 0,0,0.1
0,gender,0.037612
1,age,0.027325
2,age_o,0.0
3,int_corr,0.01786
4,samerace,0.019479
5,date,0.031868
6,exphappy,0.018051
7,attr3_1,0.023444
8,sinc3_1,0.021474
9,fun3_1,0.020442
