In [3]:
#Importation des librairies utilisées
import unicodedata 
import time
import pandas as pd
import numpy as np
import random
import nltk
import collections
import itertools
import csv
import warnings
import pickle
import scipy

import os
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Classification targets for the training and validation sets.
# Only the label column is used, so only that column is read (usecols);
# missing labels are replaced by the empty string, as before.
LABEL_COLUMN = "Categorie1"

Y_train = pd.read_csv("data/cdiscount_train_subset.csv", usecols=[LABEL_COLUMN])[LABEL_COLUMN].fillna("")
Y_valid = pd.read_csv("data/cdiscount_valid.csv", usecols=[LABEL_COLUMN])[LABEL_COLUMN].fillna("")

# Régression Logistique

In [5]:
# Directory containing the pre-computed feature files loaded by the benchmark loops.
DATA_DIR = "data/features"

from scipy import sparse

# One result record (dict) per logistic-regression run.
metadata_list_lr = []

# [nb_hash, vectorizer] pairs. Both values are substituted into the feature
# file names, so a train/valid pair of .npz files must exist for each entry.
parameters = [[None, "count"],
              [300, "count"],
              [10000, "count"],
              [None, "tfidf"],
              [300, "tfidf"],
              [10000, "tfidf"],]

for nb_hash, vectorizer in parameters:
    print("nb_hash : " + str(nb_hash) + ", vectorizer : " + str(vectorizer))

    # The train and validation files share the same name suffix.
    file_suffix = "_nb_hash_" + str(nb_hash) + "_vectorizer_" + str(vectorizer) + ".npz"
    X_train = sparse.load_npz(DATA_DIR + "/vec_train" + file_suffix)
    X_valid = sparse.load_npz(DATA_DIR + "/vec_valid" + file_suffix)

    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # it cannot jump when the system clock is adjusted during a long fit.
    ts = time.perf_counter()
    cla = LogisticRegression()
    cla.fit(X_train, Y_train.values)
    t_learning = time.perf_counter() - ts

    # "predict_time" covers scoring on both the training and validation sets.
    ts = time.perf_counter()
    score_train = cla.score(X_train, Y_train)
    score_valid = cla.score(X_valid, Y_valid)
    t_predict = time.perf_counter() - ts

    metadata = {"typeW2V": None, "nb_hash": nb_hash, "vectorizer": vectorizer,
                "learning_time": t_learning, "predict_time": t_predict,
                "score_train": score_train, "score_valid": score_valid}
    print(metadata)
    metadata_list_lr.append(metadata)
       

nb_hash : None, vectorizer : count
{'nb_hash': None, 'score_train': 0.97930526315789479, 'typeW2V': None, 'learning_predict': 0.2245628833770752, 'vectorizer': 'count', 'score_valid': 0.90980000000000005, 'learning_time': 51.61705780029297}
nb_hash : 300, vectorizer : count
{'nb_hash': 300, 'score_train': 0.72312631578947373, 'typeW2V': None, 'learning_predict': 0.19028806686401367, 'vectorizer': 'count', 'score_valid': 0.70499999999999996, 'learning_time': 32.592347145080566}
nb_hash : 10000, vectorizer : count
{'nb_hash': 10000, 'score_train': 0.96926315789473683, 'typeW2V': None, 'learning_predict': 0.21926307678222656, 'vectorizer': 'count', 'score_valid': 0.88819999999999999, 'learning_time': 44.44723200798035}
nb_hash : None, vectorizer : tfidf
{'nb_hash': None, 'score_train': 0.91381052631578952, 'typeW2V': None, 'learning_predict': 0.22490191459655762, 'vectorizer': 'tfidf', 'score_valid': 0.87519999999999998, 'learning_time': 23.655981063842773}
nb_hash : 300, vectorizer : tfi

In [6]:
# Logistic regression on the pre-computed Word2Vec embeddings.

# Remove Word2Vec records left by a previous execution of this cell so that
# re-running it does not add duplicate rows (no-op on a fresh run).
metadata_list_lr[:] = [m for m in metadata_list_lr if m["vectorizer"] != "word2vec"]

for model_name in ["CBOW", "skip-gram", "online"]:
    print("Word2Vec :" + model_name)

    X_train = np.load(DATA_DIR + "/embedded_train_nb_hash_" + model_name + ".npy")
    X_valid = np.load(DATA_DIR + "/embedded_valid_nb_hash_" + model_name + ".npy")

    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # it cannot jump when the system clock is adjusted during a long fit.
    ts = time.perf_counter()
    cla = LogisticRegression()
    cla.fit(X_train, Y_train.values)
    t_learning = time.perf_counter() - ts

    # "predict_time" covers scoring on both the training and validation sets.
    ts = time.perf_counter()
    score_train = cla.score(X_train, Y_train)
    score_valid = cla.score(X_valid, Y_valid)
    t_predict = time.perf_counter() - ts

    metadata = {"typeW2V": model_name, "nb_hash": None, "vectorizer": "word2vec",
                "learning_time": t_learning, "predict_time": t_predict,
                "score_train": score_train, "score_valid": score_valid}
    print(metadata)
    metadata_list_lr.append(metadata)



Word2Vec :CBOW
{'nb_hash': None, 'score_train': 0.80271578947368416, 'typeW2V': 'CBOW', 'learning_predict': 0.4588019847869873, 'vectorizer': 'word2vec', 'score_valid': 0.79600000000000004, 'learning_time': 806.843672990799}
Word2Vec :skip-gram
{'nb_hash': None, 'score_train': 0.84796842105263159, 'typeW2V': 'skip-gram', 'learning_predict': 0.41788697242736816, 'vectorizer': 'word2vec', 'score_valid': 0.84440000000000004, 'learning_time': 409.39480996131897}
Word2Vec :online
{'nb_hash': None, 'score_train': 0.77548421052631578, 'typeW2V': 'online', 'learning_predict': 0.24233794212341309, 'vectorizer': 'word2vec', 'score_valid': 0.75280000000000002, 'learning_time': 790.7946720123291}


In [8]:
# Tabulate every logistic-regression record collected so far (one row per run).
pd.DataFrame(data=metadata_list_lr)

Unnamed: 0,learning_predict,learning_time,nb_hash,score_train,score_valid,typeW2V,vectorizer
0,0.224563,51.617058,,0.979305,0.9098,,count
1,0.190288,32.592347,300.0,0.723126,0.705,,count
2,0.219263,44.447232,10000.0,0.969263,0.8882,,count
3,0.224902,23.655981,,0.913811,0.8752,,tfidf
4,0.19734,21.833364,300.0,0.715979,0.6994,,tfidf
5,0.23105,22.137742,10000.0,0.902053,0.861,,tfidf
6,0.458802,806.843673,,0.802716,0.796,CBOW,word2vec
7,0.417887,409.39481,,0.847968,0.8444,skip-gram,word2vec
8,0.242338,790.794672,,0.775484,0.7528,online,word2vec


# Random Forest

In [10]:
# Random forest on the count / tfidf feature matrices.
# RandomForestClassifier is already imported in the notebook's import cell.

# One result record (dict) per random-forest run.
metadata_list_rf = []

# [nb_hash, vectorizer] pairs. Both values are substituted into the feature
# file names, so a train/valid pair of .npz files must exist for each entry.
parameters = [[None, "count"],
              [300, "count"],
              [10000, "count"],
              [None, "tfidf"],
              [300, "tfidf"],
              [10000, "tfidf"],]

for nb_hash, vectorizer in parameters:
    print("nb_hash : " + str(nb_hash) + ", vectorizer : " + str(vectorizer))

    # The train and validation files share the same name suffix.
    file_suffix = "_nb_hash_" + str(nb_hash) + "_vectorizer_" + str(vectorizer) + ".npz"
    X_train = sparse.load_npz(DATA_DIR + "/vec_train" + file_suffix)
    X_valid = sparse.load_npz(DATA_DIR + "/vec_valid" + file_suffix)

    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # it cannot jump when the system clock is adjusted during a long fit.
    ts = time.perf_counter()
    # Fixed seed: the forest is randomised, so without it the scores
    # change from one execution of the notebook to the next.
    cla = RandomForestClassifier(n_estimators=100, random_state=42)
    cla.fit(X_train, Y_train.values)
    t_learning = time.perf_counter() - ts

    # "predict_time" covers scoring on both the training and validation sets.
    ts = time.perf_counter()
    score_train = cla.score(X_train, Y_train)
    score_valid = cla.score(X_valid, Y_valid)
    t_predict = time.perf_counter() - ts

    metadata = {"typeW2V": None, "nb_hash": nb_hash, "vectorizer": vectorizer,
                "learning_time": t_learning, "predict_time": t_predict,
                "score_train": score_train, "score_valid": score_valid}
    print(metadata)
    metadata_list_rf.append(metadata)

nb_hash : None, vectorizer : count
{'nb_hash': None, 'score_train': 0.9986105263157895, 'typeW2V': None, 'vectorizer': 'count', 'score_valid': 0.86260000000000003, 'learning_time': 429.4773950576782, 'predict_time': 23.091992139816284}
nb_hash : 300, vectorizer : count
{'nb_hash': 300, 'score_train': 0.99836842105263157, 'typeW2V': None, 'vectorizer': 'count', 'score_valid': 0.7702, 'learning_time': 309.0696289539337, 'predict_time': 10.80103588104248}
nb_hash : 10000, vectorizer : count
{'nb_hash': 10000, 'score_train': 0.99860000000000004, 'typeW2V': None, 'vectorizer': 'count', 'score_valid': 0.85360000000000003, 'learning_time': 227.18568587303162, 'predict_time': 20.715173959732056}
nb_hash : None, vectorizer : tfidf
{'nb_hash': None, 'score_train': 0.9986105263157895, 'typeW2V': None, 'vectorizer': 'tfidf', 'score_valid': 0.85160000000000002, 'learning_time': 356.13234305381775, 'predict_time': 21.244513034820557}
nb_hash : 300, vectorizer : tfidf
{'nb_hash': 300, 'score_train': 

In [11]:
# Random forest on the pre-computed Word2Vec embeddings.

# Remove Word2Vec records left by a previous execution of this cell so that
# re-running it does not add duplicate rows (no-op on a fresh run).
metadata_list_rf[:] = [m for m in metadata_list_rf if m["vectorizer"] != "word2vec"]

for model_name in ["CBOW", "skip-gram", "online"]:
    print("Word2Vec :" + model_name)

    X_train = np.load(DATA_DIR + "/embedded_train_nb_hash_" + model_name + ".npy")
    X_valid = np.load(DATA_DIR + "/embedded_valid_nb_hash_" + model_name + ".npy")

    # perf_counter() is a monotonic, high-resolution clock: unlike time.time()
    # it cannot jump when the system clock is adjusted during a long fit.
    ts = time.perf_counter()
    # Fixed seed: the forest is randomised, so without it the scores
    # change from one execution of the notebook to the next.
    cla = RandomForestClassifier(n_estimators=100, random_state=42)
    cla.fit(X_train, Y_train.values)
    t_learning = time.perf_counter() - ts

    # "predict_time" covers scoring on both the training and validation sets.
    ts = time.perf_counter()
    score_train = cla.score(X_train, Y_train)
    score_valid = cla.score(X_valid, Y_valid)
    t_predict = time.perf_counter() - ts

    metadata = {"typeW2V": model_name, "nb_hash": None, "vectorizer": "word2vec",
                "learning_time": t_learning, "predict_time": t_predict,
                "score_train": score_train, "score_valid": score_valid}
    print(metadata)
    metadata_list_rf.append(metadata)



Word2Vec :CBOW
{'nb_hash': None, 'score_train': 0.9986105263157895, 'typeW2V': 'CBOW', 'vectorizer': 'word2vec', 'score_valid': 0.8206, 'learning_time': 339.57674407958984, 'predict_time': 5.852169990539551}
Word2Vec :skip-gram
{'nb_hash': None, 'score_train': 0.99860000000000004, 'typeW2V': 'skip-gram', 'vectorizer': 'word2vec', 'score_valid': 0.85019999999999996, 'learning_time': 362.4189829826355, 'predict_time': 6.2934041023254395}
Word2Vec :online
{'nb_hash': None, 'score_train': 0.99106315789473687, 'typeW2V': 'online', 'vectorizer': 'word2vec', 'score_valid': 0.73680000000000001, 'learning_time': 316.4473259449005, 'predict_time': 7.3000171184539795}


In [None]:
# Tabulate the random-forest records (one row per run). The logistic-regression
# table is already displayed at the end of the logistic-regression section.
pd.DataFrame(metadata_list_rf)