In [54]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.grid_search import GridSearchCV
import time
import sys
#reload(sys)
#sys.setdefaultencoding("utf-8")

In [55]:
train_fname = '../data/train_transformed_3.csv'
test_fname = '../data/test_transformed_2.csv'
df = pd.read_csv(train_fname, sep=';')
df_test = pd.read_csv(test_fname, sep=';')
f_con = ['INV_NB',
        'INV_NB_PAYS','INV_NB_TYPE','cited_n','cited_age_min','cited_age_median','cited_age_max','cited_age_mean',
        'cited_age_std','NB_BACKWARD_NPL','NB_BACKWARD_XY','NB_BACKWARD_I','NB_BACKWARD_AUTRE','NB_BACKWARD_PL',
        'NB_BACKWARD','pct_NB_IPC','pct_NB_IPC_LY','oecd_NB_ROOT_CLASSES','oecd_NB_BACKWARD_PL','oecd_NB_BACKWARD_NPL',
        'IDX_ORIGIN','IDX_RADIC','PRIORITY_MONTH','FILING_MONTH','PUBLICATION_MONTH','BEGIN_MONTH']
(n_samples,n_variables) = (df.shape[0],len(f_con))

#df = df.reindex(np.random.permutation(df.index));
X_train1 = df[f_con].values
y_train = df.VARIABLE_CIBLE == 'GRANTED'
X_test1 = df_test[f_con].values
imputer = Imputer()
# Imputer permet de combler les trous quand des données manquent. Par défaut il prend la moyenne de la dernière donnée vue
# et de la prochaine. Ces données ne sont donc pas normalisées.
X_train1 = imputer.fit_transform(X_train1)
X_test1 = imputer.fit_transform(X_test1)
# Normalisation des features.
scale(X_train1,copy=False);
scale(X_test1,copy=False);
print(X_train1.shape)

(259431, 26)


In [56]:
X = np.concatenate((X_train1,X_test1), axis=0)
scale(X, copy=False);
pca = PCA(n_components=26)
X = pca.fit_transform(X)
X_train = X[0:X_train1.shape[0]]
X_test = X[X_train1.shape[0]:]

In [57]:
n_taken = n_samples
start = time.time()
model = make_pipeline(PolynomialFeatures(degree=2, include_bias = False), LogisticRegression(C=0.014))
model.fit(X_train[0:n_taken], y_train[0:n_taken])
print(time.time() - start)

233.025444031


In [58]:
y_pred_train = model.predict_proba(X_train)[:, 1]
print('Score (optimiste) sur le train : %s' % roc_auc_score(y_train, y_pred_train))
print(model.score(X_train[0:n_taken], y_train[0:n_taken]))

Score (optimiste) sur le train : 0.620966543456
0.606955221234


In [None]:
# last : 0.623626227995
# 0.572773523603

In [None]:
#n_taken = n_samples
#nb_thread = 4
#model = make_pipeline(PolynomialFeatures(degree=2, include_bias = False), LogisticRegression())
#pf = PolynomialFeatures(degree = 2, include_bias = False)
#X_train_poly = pf.fit_transform(X_train)
#X_test_poly = pf.fit_transform(X_test)
#model.fit(X_train[0:n_taken], y_train[0:n_taken]);
#parameters = {'C':[0.1,0.2,0.3, 0.4,0.5,0.6,0.7,0.8,0.9,1.]}
#def scorer(estimator, X, y):
#    return roc_auc_score(y, estimator.predict_proba(X)[:,1])
#lr = LogisticRegression()
#clf = GridSearchCV(lr, parameters, scorer, n_jobs = nb_thread)
#clf.fit(X_train_poly[0:n_taken],y_train[0:n_taken])
#y_pred_train = clf.predict_proba(X_train_poly)[:, 1]
#print('Score (optimiste) sur le train : %s' % roc_auc_score(y_train, y_pred_train))
#print(clf.score(X_train_poly[0:n_taken], y_train_poly[0:n_taken]))

In [41]:
pd.DataFrame(data=y_pred_train, columns=['VARIABLE_CIBLE']).to_csv(path_or_buf='../y_trained/cont_reglogpoly_2_27.02.26.txt', sep=';')

In [42]:
from sklearn.externals import joblib
joblib.dump(model, '../trained_models/cont_relogpoly_2_27_02_16.pkl')

['../trained_models/cont_relogpoly_2_27_02_16.pkl',
 '../trained_models/cont_relogpoly_2_27_02_16.pkl_01.npy',
 '../trained_models/cont_relogpoly_2_27_02_16.pkl_02.npy',
 '../trained_models/cont_relogpoly_2_27_02_16.pkl_03.npy']

In [None]:
print('Score (optimiste) sur le train : %s' % roc_auc_score(y_train, y_pred_train))
print(model.score(X_train[0:n_taken], y_train[0:n_taken]))

In [None]:
# 0.591120640085
# C = 0.014

### Démarche :
#### Le kernel explose si on utilise les 28 dimensions avec tout les samples :
#### 259431*29*30/2 = 112,852,485 floats codés sur 64 bits = 8 bytes.
#### On a donc réduit la dimension via PCA jusqu'à ce que le kernel n'explose plus, ce qui nous amené à 25 composantes :
####  259431*25*26/2 = 84,315,075.
#### Ensuite on a cherché le meilleur C pour 25 dimensions. Il faudrait faire une propre CV avec tout les samples et attendre un moment.


In [43]:
# Application au test set et sauvegarde de la soumission correspondante.
y_pred = model.predict_proba(X_test)[:, 1]
np.savetxt('../subs/continue_polylogreg3.txt', y_pred, fmt='%s')

In [None]:
from sklearn.metrics import roc_curve

In [None]:
print(roc_auc_score(y_train, y_pred_train))

In [None]:
fpr, tpr, thresholds = roc_curve(y_train,y_pred_train)
plt.plot([0,1],[0,1])
plt.plot(fpr, tpr, lw=1)

In [None]:
plt.hist(y_pred_train)

In [None]:
plt.hist(y_train - y_pred_train)

In [None]:
plt.hist(abs(y_train - y_pred_train) > 0.5)