In [267]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
import time

In [268]:
df = pd.read_csv('data-challenge-winners/data/train_better.csv', sep=';')
X = df['message'].values
y = df['y'].values
X_vec = CountVectorizer(binary=True, ngram_range=(1, 1)).fit_transform(X)
n_samples, n_features = X_vec.shape
n = (n_samples*8)//10
X_train, y_train, X_valid, y_valid = X_vec[:n,:], y[:n], X_vec[n:,:], y[n:]
C = 1.

In [279]:
def g(X, Y, w, C):
    n,p = X.shape
    exp_vect = np.exp(-Y*(X.dot(w)))
    return C*np.sum(np.log(1. + exp_vect)) + np.sum(w**2)/2.

def gradg(X, Y, w, C):
    n,p = X.shape
    exp_vect = np.exp(-Y*(X.dot(w)))
    coefs = (-Y*exp_vect/(1. + exp_vect))
    return C*((X.T).dot(coefs)) + w

def gsafe(X, Y, w, C):
    n,p = X.shape
    val = 0.
    
    for i in range(n):
        val += C*np.log(1. + np.exp(Y[i]*(X[i].dot(w))))
    for i in range(p):
        val += (w[i]**2)/2.
    return val[0]

def gradgsafe(X, Y, w, C):
    n,p = X.shape
    grad = 0.
    for i in range(n):
        grad -= C*Y[i]*np.exp(Y[i]*(X[i].dot(w)))/(1. + np.exp(Y[i]*(X[i].dot(w))))*X[i]
    return grad + w

In [280]:
y_train[y_train == 0.] = -1.
print(g(X_train, y_train, np.zeros((n_features)), C))
print(gsafe(X_train, y_train, np.zeros((n_features,)), C))

print(gradg(X_train, y_train, np.zeros((n_features,)), C))
print(gradgsafe(X_train, y_train, np.zeros((n_features,)), C)[:10])

2448.19584174
2448.19584174
[ 1.   0.5  1.  ..., -0.5  0.5  0. ]
[ 1.   0.5  1.   1.5  0.   0.   0.5  0.   0.5  0.5]


In [281]:
alpha = 0.002
C = 1.
eps = 5e-3
w = np.zeros((n_features,))
cur_grad = gradg(X_train, y_train, w, C)
norm_grad = np.linalg.norm(cur_grad)
cmp = 0
while(norm_grad > eps):
    cmp += 1
    w -= alpha*cur_grad
    cur_grad = gradg(X_train, y_train, w, C)
    norm_grad = np.linalg.norm(cur_grad)
print(cmp)

3229


In [282]:
print(w)

[ -1.04833331e-02   3.50558352e-01  -1.04833331e-02 ...,   1.65503660e-01
  -2.15429687e-06   0.00000000e+00]


In [275]:
C = 1.
eps = 5e-5
y_train[y_train == -1.] = 0.
clf = LogisticRegression(tol=eps,C=C,fit_intercept=False) # On gagne en accuracy avec le paramètre class_weight={1 : np.sum(y_train)/n, 0 : 1 - np.sum(y_train)/n}
clf.fit(X_train, y_train);

In [276]:
print(clf.score(X_valid, y_valid))
print(np.sum((X_valid.dot(w) > 0) == y_valid)/y_valid.shape[0])
print(np.sum((X_valid.dot(clf.coef_[0]) > 0) == y_valid)/y_valid.shape[0])

0.759909399773
0.759909399773
0.759909399773


In [257]:
print(w[:10])
print(clf.coef_[0][:10]*(w[0]/clf.coef_[0][0]))

[-0.01048333  0.35055835 -0.01048333 -0.1096141   0.22257449  0.
 -0.07012405  0.23371962 -0.10144089 -0.07012405]
[-0.01048333  0.35048289 -0.01048333 -0.10955681  0.2225475   0.
 -0.07012537  0.23374659 -0.10143925 -0.07012537]


In [199]:
print(X_valid.dot(w)[:10])
print(X_valid.dot(clf.coef_[0])[:10])

[-2.73948078 -0.86833897  1.6978246  -0.56910457 -1.93739927 -5.00348261
 -3.08222253 -2.8249492   2.07081694 -6.29770315]
[-2.83830254 -0.89321167  1.67202746 -0.6717057  -1.96469071 -4.96725079
 -2.93569254 -2.49598114  2.00957457 -5.87275062]


In [200]:
print(g(X_train, y_train, w, C))
print(g(X_train, y_train, clf.coef_[0], C))

2288.16871174
2287.00519216


In [201]:
def g(X, Y, w, w0,C):
    n,p = X.shape
    exp_vect = np.exp(-Y*(X.dot(w) + w0))
    return C*np.sum(np.log(1. + exp_vect)) + (np.sum(w**2) + w0**2)/2.

def gradg(X, Y, w, w0, C):
    n,p = X.shape
    exp_vect = np.exp(-Y*(X.dot(w) + w0))
    coefs = (-Y*exp_vect/(1. + exp_vect))
    return [np.sum(coefs) + w0,C*((X.T).dot(coefs)) + w]

In [206]:
y_train[y_train == 0.] = -1.
print(g(X_train, y_train, np.zeros((n_features)), 0., C))
print(gradg(X_train, y_train, np.zeros((n_features,)), 0., C))

2448.19584174
[681.0, array([ 1. ,  0.5,  1. , ..., -0.5,  0.5,  0. ])]


In [210]:
alpha = 0.002
C = 1.
eps = 5e-2
w = np.zeros((n_features,))
w0 = 0.
cur_grad = gradg(X_train, y_train, w, w0, C)
norm_grad = np.sqrt(cur_grad[0]**2 + np.sum(cur_grad[1]**2))
cmp = 0
while(norm_grad > eps):
    cmp += 1
    w0 -= alpha*cur_grad[0]
    w -= alpha*cur_grad[1]
    cur_grad = gradg(X_train, y_train, w, w0, C)
    norm_grad = np.sqrt(cur_grad[0]**2 + np.sum(cur_grad[1]**2))
print(cmp)

2228


In [211]:
C = 1.
eps = 5e-2
y_train[y_train == -1.] = 0.
clf = LogisticRegression(tol=eps,C=C,fit_intercept=True) # On gagne en accuracy avec le paramètre class_weight={1 : np.sum(y_train)/n, 0 : 1 - np.sum(y_train)/n}
clf.fit(X_train, y_train);

In [215]:
print(clf.score(X_valid, y_valid))
print(np.sum((X_valid.dot(w) + w0 > 0) == y_valid)/y_valid.shape[0])
print(np.sum((X_valid.dot(clf.coef_[0]) + clf.intercept_[0] > 0) == y_valid)/y_valid.shape[0])

0.796149490374
0.799546998867
0.796149490374


In [216]:
print(w[:10])
print(clf.coef_[0][:10]*(w[0]/clf.coef_[0][0]))

[-0.01937315  0.31482182 -0.01937315 -0.06755514  0.17557353  0.         -0.0832368
  0.20073568 -0.08540067 -0.0832368 ]
[-0.01937315  0.47299238 -0.01937315 -0.12366426  0.25532929  0.
 -0.12573309  0.30210009 -0.1300292  -0.12573309]


In [219]:
print(g(X_train, y_train, w, w0, C))
print(g(X_train, y_train, clf.coef_[0], clf.intercept_[0], C))

2341.72468755
2341.12531461
