In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# notebook-wide plot defaults: 8x6 figures and a 12pt base font
plt.rcParams.update({'figure.figsize': (8, 6), 'font.size': 12})

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2018-06-15 09:32:07 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.4
pandas 0.23.0
sklearn 0.19.1
matplotlib 2.2.2


In [32]:
# Seed NumPy's global RNG for the notebook; make_data() below seeds it again
# right before it draws the data.
np.random.seed(0)

# NOTE(review): commented-out leftover, apparently from an earlier and much smaller
# experiment (100 rows x 10000 columns) -- nothing below uses it; consider deleting.
# size = 1
# X = np.random.randint(0,10,size=[100,10000])
# y = np.random.randint(0,2,size=100)

def make_data(N=50000, n_vars=50,
              n_classes=2, seed=0):
    """
    Generate a purely random classification dataset.

    Features and labels are drawn independently of each other, so no
    column carries any real information about the label.

    Parameters
    ----------
    N : int, default 50000
        Number of rows (samples).

    n_vars : int, default 50
        Number of feature columns.

    n_classes : int, default 2
        Number of distinct labels; labels are drawn uniformly from
        ``range(n_classes)``.

    seed : int or None, default 0
        Value handed to ``np.random.seed`` before sampling. Note that this
        re-seeds NumPy's *global* random state as a side effect. ``None``
        lets NumPy pick a fresh, non-reproducible seed.

    Returns
    -------
    X : ndarray, shape (N, n_vars)
        Integer features drawn uniformly from 0..9.

    y : ndarray, shape (N,)
        Integer class labels.
    """
    # seeding inside the function makes repeated calls return the same data
    np.random.seed(seed)
    X = np.random.randint(10, size=(N, n_vars))
    y = np.random.choice(n_classes, N)
    return X, y

# Draw the experiment data: 1000 random feature columns plus labels that
# make_data samples independently of the features.
X, y = make_data(n_vars=1000)
# Wrap the features in a DataFrame and append the labels as a 'target' column,
# so the frame has one more column than X.
df = pd.DataFrame(X)
df['target'] = y
print('dimension: ', df.shape)
df.head()

dimension:  (50000, 1001)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,target
0,5,0,3,3,7,9,3,5,2,4,...,7,5,7,2,4,1,1,1,7,1
1,9,0,9,0,2,1,4,6,0,6,...,4,7,3,6,7,0,6,4,2,0
2,3,2,0,9,5,8,1,0,9,5,...,0,0,7,9,0,6,9,4,6,0
3,7,6,6,6,0,7,5,2,2,4,...,3,9,5,6,3,6,7,5,6,0
4,9,9,5,7,5,3,3,4,0,1,...,3,9,5,5,2,1,6,2,9,0


In [33]:
from sklearn.feature_selection import SelectKBest, chi2

# Rank every column against y with the chi-squared statistic and keep the top 10.
# The selector is fitted on the complete X, y -- i.e. before any train/test split
# or cross-validation fold has been carved out.
select_best = SelectKBest(score_func=chi2, k=10)
select_best.fit(X, y)
X_selected = select_best.transform(X)
print(X_selected.shape)

(50000, 10)


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated ROC AUC of a logistic regression on the pre-selected columns.
# X_selected comes from a selector that was fitted on every row of X, y, so each fold's
# validation rows already took part in choosing the columns; the scores below are
# therefore optimistic compared with running the selection inside each fold.
logreg = LogisticRegression()
# one ROC AUC value per fold (these are AUC scores, not mean squared errors)
auc_scores = cross_val_score(logreg, X_selected, y, cv=10, scoring='roc_auc')
auc_scores

array([0.51637649, 0.52185299, 0.51631556, 0.50796974, 0.52757843,
       0.52495771, 0.51599955, 0.51904828, 0.5207096 , 0.51358231])

In [14]:
# keep 20% of the rows aside as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.metrics import roc_auc_score

# Reduce the training rows to the columns kept by the already-fitted selector
# (it is only applied here, not re-fitted), train a fresh logistic regression on
# them and report the ROC AUC on those same training rows.
X_train_selected = select_best.transform(X_train)
logreg = LogisticRegression().fit(X_train_selected, y_train)

# column 1 of predict_proba is the probability of the positive class
y_pred_proba = logreg.predict_proba(X_train_selected)[:, 1]
roc_auc_score(y_true=y_train, y_score=y_pred_proba)

0.8121744791666667

In [16]:
# same column selection applied to the held-out rows, then ROC AUC of the
# positive-class probability on the test set
X_test_selected = select_best.transform(X_test)
y_pred_proba = logreg.predict_proba(X_test_selected)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_proba)

0.7474747474747475

In [17]:
from sklearn.pipeline import Pipeline

# Chain the chi-squared selection (top 2 columns) and the classifier in a single
# Pipeline, so that fitting the pipeline fits the selector on the training rows only.
select_best = SelectKBest(score_func=chi2, k=2)
logreg = LogisticRegression()
pipeline = Pipeline(steps=[('chi2_select', select_best), ('logistic', logreg)])
pipeline.fit(X_train, y_train)

# ROC AUC on the rows the pipeline was trained on
y_pred_proba = pipeline.predict_proba(X_train)[:, 1]
roc_auc_score(y_true=y_train, y_score=y_pred_proba)

0.8606770833333333

In [18]:
# ROC AUC of the fitted pipeline on the held-out test rows
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_proba)

0.6111111111111112

In [None]:
# Hold out 30 random rows, choose the two features most correlated with the target
# using ONLY the remaining rows, fit a logistic regression on those two features and
# score it on the held-out rows.
from sklearn.metrics import mean_squared_error

# df.sample replaces random.sample + the deprecated .ix indexer; the fixed
# random_state keeps the held-out rows the same on every run
take_out_set = df.sample(n=30, random_state=0)
# everything that was not held out (rows are dropped by index label)
training_set = df.drop(take_out_set.index)

# absolute correlation of every feature column with the target, computed on the
# training rows only so the held-out rows play no part in the feature choice
corr = training_set.drop(columns='target').corrwith(training_set['target']).abs()
# Series.sort no longer exists; sort_values returns the sorted copy
corr = corr.sort_values(ascending=False)
features = corr.index[[0, 1]].values

training_input = training_set[features].values
training_output = training_set['target']

logreg = LogisticRegression()
logreg.fit(training_input, training_output)

# predictions and labels are both 0/1 here, so the mean squared error equals the
# misclassification rate on the held-out rows
y_take_out = logreg.predict(take_out_set[features].values)
mean_squared_error(take_out_set['target'], y_take_out)
# value noted from an earlier run (different random hold-out): 0.53333333333333333