In [1]:
import os
import io
import nltk
import time
import csv
import re
import pandas as pd
import numpy as np
from nltk.tokenize import ToktokTokenizer
from string import punctuation
import matplotlib.pyplot as plt
import tarfile,os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from gensim.sklearn_api import W2VTransformer
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ParameterGrid
from gensim.models import Word2Vec
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [39]:
np.set_printoptions(threshold=sys.maxsize)

In [3]:
PATH_TRAINING = "/content/drive/My Drive/IFT6285/IFT6285 Projet/clean_trainset.pkl"
PATH_TEST = "/content/drive/My Drive/IFT6285/IFT6285 Projet/clean_testset.pkl"
trainSet = pd.read_pickle(PATH_TRAINING)
testSet = pd.read_pickle(PATH_TEST)

In [4]:
trainX = trainSet['tokenized_text_string']
trainY = trainSet['age']

testX = testSet['tokenized_text_string']
testY = testSet['age']

### TFIDF

In [41]:
model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=1000), RandomForestClassifier())
model.fit(trainX, trainY)

pred = model.predict(testX)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.5840207760464405


In [42]:
model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=1000), XGBClassifier())
model.fit(trainX, trainY)

pred = model.predict(testX)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.5859150626336694


In [44]:
model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=1000), MLPClassifier())
model.fit(trainX, trainY)

pred = model.predict(testX)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.5209288114879316


In [45]:
model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=100000), ComplementNB())
model.fit(trainX, trainY)

pred = model.predict(testX)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.6219981668194318


In [57]:
from sklearn.dummy import DummyClassifier
model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=100000), DummyClassifier(strategy='most_frequent'))
model.fit(trainX, trainY)

pred = model.predict(testX)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.40152765047357164


In [72]:
# A more complete hyper-parameter search
tfidf_XGB = make_pipeline(TfidfVectorizer(stop_words='english'), XGBClassifier())
hyperParams = {
        'xgbclassifier__min_child_weight': [1, 5, 10],
        'xgbclassifier__gamma': [0.5, 1, 1.5, 2, 5],
        'xgbclassifier__subsample': [0.6, 0.8, 1.0],
        'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],
        'xgbclassifier__max_depth': [2,3,5],
        'xgbclassifier__n_estim': [50, 100, 200],
        'tfidfvectorizer__max_features': [100, 500, 1000]
        }

rs_tfidf_XGB = RandomizedSearchCV(tfidf_XGB, param_distributions=hyperParams, scoring='accuracy', n_iter=40, verbose=2, n_jobs=-1)
rs_tfidf_XGB.fit(trainX[:1000], trainY[:1000])
print(rs_tfidf_XGB.best_params_)
y_pred = rs_tfidf_XGB.predict(testX)
test_acc = round(accuracy_score(testY, y_pred), 3)
print(test_acc)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 20.5min finished


{'xgbclassifier__subsample': 0.6, 'xgbclassifier__n_estim': 100, 'xgbclassifier__min_child_weight': 1, 'xgbclassifier__max_depth': 5, 'xgbclassifier__gamma': 1, 'xgbclassifier__colsample_bytree': 0.6, 'tfidfvectorizer__max_features': 500}
0.55


In [41]:
# A more complete hyper-parameter search
tfidf_CNB = make_pipeline(TfidfVectorizer(stop_words='english'), ComplementNB())
hyperParams = {
        'complementnb__alpha': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1],
        'complementnb__norm': [False, True],
        'tfidfvectorizer__max_features': [100, 500, 1000, 2000]
        }

rs_tfidf_CNB = RandomizedSearchCV(tfidf_CNB, param_distributions=hyperParams, scoring='accuracy', n_iter=40, verbose=2, n_jobs=-1)
rs_tfidf_CNB.fit(trainX, trainY)
print(rs_tfidf_CNB.best_params_)
y_pred = rs_tfidf_CNB.predict(testX)
test_acc = round(accuracy_score(testY, y_pred), 3)
print(test_acc)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 126.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 159.5min finished


{'tfidfvectorizer__max_features': 2000, 'complementnb__norm': False, 'complementnb__alpha': 1}
0.631


In [85]:
# A more complete hyper-parameter search
tfidf_RF = make_pipeline(TfidfVectorizer(stop_words='english'), RandomForestClassifier())
hyperParams = {
        'randomforestclassifier__n_estimators': [50, 100, 200],
        'randomforestclassifier__criterion': ['gini', 'entropy'],
        'randomforestclassifier__min_samples_split': [1, 2, 3],
        'tfidfvectorizer__max_features': [100, 500, 1000]
        }

rs_tfidf_RF = RandomizedSearchCV(tfidf_RF, param_distributions=hyperParams, scoring='accuracy', n_iter=40, verbose=2, n_jobs=-1)
rs_tfidf_RF.fit(trainX[:1000], trainY[:1000])
print(rs_tfidf_RF.best_params_)
y_pred = rs_tfidf_RF.predict(testX)
test_acc = round(accuracy_score(testY, y_pred), 3)
print(test_acc)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 11.8min finished


{'tfidfvectorizer__max_features': 500, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__criterion': 'entropy'}
0.545


In [86]:
# A more complete hyper-parameter search
tfidf_MLP = make_pipeline(TfidfVectorizer(stop_words='english'), MLPClassifier())
hyperParams = {
        'mlpclassifier__hidden_layer_sizes': [50, 100, 200],
        'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'mlpclassifier__learning_rate_init': [0.001, 0.0001, 0.0005],
        'tfidfvectorizer__max_features': [100, 500, 1000, 2000]
        }

rs_tfidf_MLP = RandomizedSearchCV(tfidf_MLP, param_distributions=hyperParams, scoring='accuracy', n_iter=40, verbose=2, n_jobs=-1)
rs_tfidf_MLP.fit(trainX[:1000], trainY[:1000])
print(rs_tfidf_MLP.best_params_)
y_pred = rs_tfidf_MLP.predict(testX)
test_acc = round(accuracy_score(testY, y_pred), 3)
print(test_acc)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 39.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 53.1min finished


{'tfidfvectorizer__max_features': 1000, 'mlpclassifier__learning_rate_init': 0.0001, 'mlpclassifier__hidden_layer_sizes': 200, 'mlpclassifier__activation': 'tanh'}
0.565


### Word2Vec

In [13]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [12]:
w2v_model = Word2Vec.load("/content/drive/My Drive/IFT6285/IFT6285 Projet/w2vmodel.model")

In [20]:
testX_word2vec = averaged_word_vectorizer(corpus=testX.values, model=w2v_model, num_features=100)

  if __name__ == '__main__':


In [21]:
trainX_word2vec = averaged_word_vectorizer(corpus=trainX.values, model=w2v_model, num_features=100)

  if __name__ == '__main__':


In [26]:
model = RandomForestClassifier()
model.fit(trainX_word2vec, trainY)

pred = model.predict(testX_word2vec)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.5021998166819431


In [27]:
model = MLPClassifier()
model.fit(trainX_word2vec, trainY)

pred = model.predict(testX_word2vec)
accuracy = accuracy_score(pred, testY)
print(accuracy)



0.5376718606782768


In [28]:
model = XGBClassifier()
model.fit(trainX_word2vec, trainY)

pred = model.predict(testX_word2vec)
accuracy = accuracy_score(pred, testY)
print(accuracy)

0.537610754659334


In [37]:
model = BernoulliNB()
model.fit(trainX_word2vec, trainY)

pred = model.predict(testX_word2vec)
accuracy = accuracy_score(pred, testY)
print(accuracy)


0.48594561564314087


### Best Models

In [58]:
model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=100000), ComplementNB(alpha=1, norm=False))
start = time.time()
model.fit(trainX, trainY)
print(time.time() - start)
pred = model.predict(testX)
accuracy = accuracy_score(pred, testY)
print(accuracy)

temps: 49.702991008758545
accuracy: 0.6673388328750381


### Save the predictions on blind test

In [46]:
blindSet = pd.read_pickle('/content/drive/My Drive/IFT6285/IFT6285 Projet/blind_test.pkl')

In [48]:
blindSetX = blindSet['tokenized_text_string']

In [49]:
blindPred = model.predict(blindSetX)

In [51]:
blindSet['age'] = blindPred

In [55]:
blindSet.to_pickle('blindTest_age.pkl')