In [1]:
import sys
import os

sys.path.append('~/your_path_to_project/adp_chatbot_assistant/src')

import numpy as np
import pandas as pd

from core.model.estimator import Estimator
from core.utils.data_modification import read_data
from datetime import datetime
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import remove_stopwords
from scipy.stats import uniform
from scipy.stats import randint


In [121]:
param_distributions_logit = {
    'classifier__penalty': ['l1', 'l2'],  # L1 and L2 regularization
    'classifier__C': uniform(loc=0, scale=10),  # Sample C values from a uniform distribution between 0 and 10
    'classifier__solver': ['saga'],  # Testing different solvers
    'classifier__max_iter': [15000]  # Testing different max iterations
}

param_distributions_rf = {
    'classifier__n_estimators': randint(100, 200),  # Randomly sample from 100 to 1000 trees
    'classifier__max_depth': [None, 10],     # Test None (no limit) and different tree depths
    'classifier__min_samples_split': randint(2, 20), # Randomly sample min samples for split
    'classifier__min_samples_leaf': randint(1, 10),  # Randomly sample min samples at leaf
    'classifier__max_features': ['sqrt', 'log2']     # sqrt or log2 features
}

param_distributions_xbg = {
    'classifier__n_estimators': randint(300, 1500),  # Number of trees in the forest
    'classifier__max_depth': randint(3, 10),         # Maximum depth of each tree
    'classifier__learning_rate': uniform(0.01, 0.3), # Learning rate for boosting
    'classifier__subsample': uniform(0.5, 0.5),      # Subsample ratio of the training instances
    'classifier__colsample_bytree': uniform(0.5, 0.5),# Subsample ratio of columns when constructing each tree
    'classifier__gamma': uniform(0, 10),             # Minimum loss reduction
    'classifier__reg_alpha': uniform(0, 10),         # L1 regularization term
    'classifier__reg_lambda': uniform(0, 10)         # L2 regularization term
}

In [122]:
settings ={
    'logit': [LogisticRegression(random_state=1506,  class_weight='balanced'), param_distributions_logit, None], 
    'random_forest': [RandomForestClassifier(random_state=1506, warm_start=True), param_distributions_rf, None],
    'xgb': [xgb.XGBClassifier(random_state=1506, eval_metric='mlogloss'), param_distributions_xbg, None]
}

# TfidfVectorizer

In [123]:
for model_name in settings.keys():
    
    print(model_name)
    model = Estimator()
    
    model.set_model(classifier=settings[model_name][0])

    df, topics = read_data() 
    
    X_train, X_test, y_train, y_test = model.train_test_split(df)

    model.param_distributions = settings[model_name][1]
    
    model.train_with_random_search(X_train['message'], y_train, n_iter=25)
    
    y_train_preds = model.model_pipeline.predict_proba(X_train.message)
    y_test_preds = model.model_pipeline.predict_proba(X_test.message)

    settings[model_name][2] = model.model_pipeline
    
    # model.print_results_report_full(y_train_preds, y_test_preds, y_train, y_test)
    print(pd.DataFrame(model.get_metrics(y_train_preds, y_test_preds, y_train, y_test)).rename(index={0: 'train_error',
                                                                                          1: 'test_error'}), '\n')
    # print(settings[model_name][2][1].get_params())


logit




             accuracy  f1_score  precision  recall
train_error     0.984  0.983957   0.984061   0.984
test_error      0.950  0.950647   0.952417   0.950 

random_forest
             accuracy  f1_score  precision  recall
train_error     0.992  0.991897   0.992123   0.992
test_error      0.966  0.965566   0.966218   0.966 

xgb
             accuracy  f1_score  precision  recall
train_error     0.932  0.930981   0.932083   0.932
test_error      0.912  0.911404   0.912493   0.912 



# Word2vec

In [124]:
settings_w2v ={
    'logit': [LogisticRegression(random_state=1506,  class_weight='balanced'), param_distributions_logit, None], 
    'random_forest': [RandomForestClassifier(random_state=1506, warm_start=True), param_distributions_rf, None],
    'xgb': [xgb.XGBClassifier(random_state=1506, eval_metric='mlogloss'), param_distributions_xbg, None]
}

In [125]:
# Download the required 'punkt' tokenizer data
nltk.download('punkt')

# word2vec_model = api.load('word2vec-google-news-300')
word2vec = api.load('glove-wiki-gigaword-50')


[nltk_data] Downloading package punkt to /home/eduardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [126]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec)

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in remove_stopwords(words) if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [127]:
for model_name in settings.keys():
    
    print(model_name)
    model = Estimator()
    
    model.set_model(classifier=settings_w2v[model_name][0], word_model=MeanEmbeddingVectorizer(word2vec))
    
    df, topics = read_data()
    
    X_train, X_test, y_train, y_test = model.train_test_split(df)

    model.param_distributions = settings[model_name][1]
    
    model.train_with_random_search(X_train['message'], y_train, n_iter=5)
    
    y_train_preds = model.model_pipeline.predict_proba(X_train.message)
    y_test_preds = model.model_pipeline.predict_proba(X_test.message)

    settings[model_name][2] = model.model_pipeline
    
     # model.print_results_report_full(y_train_preds, y_test_preds, y_train, y_test)
    print(pd.DataFrame(model.get_metrics(y_train_preds, y_test_preds, y_train, y_test)).rename(index={0: 'train_error',
                                                                                          1: 'test_error'}), '\n')
    # print(settings[model_name][2][1].get_params())
    

logit
             accuracy  f1_score  precision  recall
train_error     0.697  0.698185   0.701214   0.697
test_error      0.678  0.680977   0.685236   0.678 

random_forest
             accuracy  f1_score  precision  recall
train_error     0.932  0.930128   0.933189   0.932
test_error      0.698  0.685601   0.695954   0.698 

xgb
             accuracy  f1_score  precision  recall
train_error    0.7355  0.727957   0.739336  0.7355
test_error     0.6420  0.624648   0.627075  0.6420 

