In [1]:
import os
import re

import sys
sys.path.append("..")

import time
import pickle
import graphviz
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression

from scripts.plotting_functions import plot_feature_importance, plot_coefficients_significance
from scripts.plotting_functions import export_tree_graph, render_tree_graph

from scripts.analyzers import simple_stemmer, regexp_stemmer

from bokeh.plotting import output_notebook, show

# Route Bokeh output to the notebook so that show() renders figures inline.
output_notebook()


# Single seed shared by the notebook: it seeds NumPy's global RNG here and is
# also passed as random_state to the decision tree further down.
SEED = 99
np.random.seed(SEED)


# Data locations, resolved relative to the current working directory
# (the notebook is expected to be launched from its own folder).
DATA_FOLDER = os.path.join(os.getcwd(), "../data")
RESULTS_FOLDER = os.path.join(DATA_FOLDER, "results")
CV_RESULTS_FOLDER = os.path.join(RESULTS_FOLDER, "csv")

<h3>Caricamento Dati</h3>

In [2]:
# Training split, indexed by the review identifier column.
train_fp = os.path.join(os.getcwd(), "../data/csv/train.csv")
train = pd.read_csv(train_fp, index_col=['review_id'])

In [3]:
# Held-out split, indexed by the review identifier column.
# This must not point at train.csv: doing so makes X_test / Y_test an exact
# copy of the training data, so any evaluation on it is meaningless.
# NOTE(review): assumes the held-out file is stored as test.csv next to
# train.csv — confirm the file name.
test_fp = os.path.join(os.getcwd(), "../data/csv/test.csv")
test = pd.read_csv(
    test_fp,
    index_col=['review_id']
)

<h3>Split $X$, $Y$</h3>

In [4]:
# Input is the 'text' column, target is the 'stars' column, for both splits.
X_train = train['text']
Y_train = train['stars']

X_test = test['text']
Y_test = test['stars']

<h3>Addestramento miglior Albero di decisione</h3>

In [5]:
# Bag-of-words (tokenised by the project's simple_stemmer analyzer) -> TF-IDF
# -> decision tree regressor with fixed hyper-parameters.
dtr_pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=simple_stemmer)),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False)),
    (
        'model',
        DecisionTreeRegressor(
            splitter='random',
            max_depth=100,
            min_samples_leaf=500,
            min_samples_split=1000,
            max_features=5000,
            random_state=SEED,  # same seed as the global NumPy RNG
        ),
    ),
])

In [6]:
# Fit the vectorizer, the TF-IDF weighting and the tree on the training split.
dtr_pipeline.fit(X_train, Y_train)

Pipeline(steps=[('cv',
                 CountVectorizer(analyzer=<function simple_stemmer at 0x11957fee0>)),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
                ('model',
                 DecisionTreeRegressor(max_depth=100, max_features=5000,
                                       min_samples_leaf=500,
                                       min_samples_split=1000, random_state=99,
                                       splitter='random'))])

In [7]:
# Term -> column-index mapping learned by the fitted CountVectorizer step.
dtr_features_names = dtr_pipeline.named_steps['cv'].vocabulary_

In [8]:
# Vocabulary size, i.e. the number of feature columns fed to the tree.
len(dtr_features_names)

46377

In [9]:
# Per-column importance scores exposed by the fitted tree regressor.
dtr_features_importance = dtr_pipeline.named_steps['model'].feature_importances_

In [10]:
# Sanity check: there should be one importance score per vocabulary entry.
len(dtr_features_importance)

46377

<h3>Visualizzazione Albero di Decisione</h3>

In [11]:
# Hand the fitted tree and the vectorizer vocabulary to the project helper;
# presumably it writes a Graphviz description to "tree.dot" — see
# scripts.plotting_functions for the exact behaviour.
export_tree_graph(
    decision_tree=dtr_pipeline.named_steps['model'],
    vocabulary=dtr_pipeline.named_steps['cv'].vocabulary_,
    out_file="tree.dot",
)

In [12]:
# Presumably renders the "tree.dot" file produced by the export cell —
# helper is defined in scripts.plotting_functions; confirm there.
render_tree_graph("tree.dot")

<img src="./imgs/dtr_1.png" style="width:800px;height:600px">

<h3>Mapping <code>features_names</code> - <code>features_importance</code></h3>

In [13]:
# (term, importance) pairs for every term with a strictly positive importance,
# ordered from the most to the least important.
dtr_importance_names = sorted(
    (
        (term, dtr_features_importance[column])
        for term, column in dtr_features_names.items()
        if dtr_features_importance[column] > 0
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

In [14]:
# Number of top-ranked features to plot; also read by the SGD coefficient plot.
n = 25

# Plot the n highest-ranked (term, importance) pairs.
dtr_top_features = dtr_importance_names[:n]
dtr_importance_fig = plot_feature_importance(
    [term for term, _ in dtr_top_features],
    [importance for _, importance in dtr_top_features],
    title="Importanza migliori {} features Decision Tree Regressor".format(n),
)
show(dtr_importance_fig)

<h3>Addestramento miglior modello lineare addestrato con Stochastic Gradient Descent</h3>

In [15]:
# Bag-of-words (default CountVectorizer tokenisation) -> TF-IDF -> linear
# model trained with stochastic gradient descent and L2 regularisation.
pipeline_sgd = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False)),
    (
        'model',
        SGDRegressor(
            learning_rate='constant',
            alpha=0.00001,
            penalty='l2',
            max_iter=2500,
            # Pin the estimator's own RNG, as done for the decision tree.
            # Without it the fit draws from NumPy's global RNG, so the
            # coefficients change whenever this cell is re-run or cells are
            # executed in a different order.
            random_state=SEED,
        ),
    ),
])

In [16]:
# Fit the vectorizer, the TF-IDF weighting and the SGD regressor on the training split.
pipeline_sgd.fit(X_train, Y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
                ('model',
                 SGDRegressor(alpha=1e-05, learning_rate='constant',
                              max_iter=2500))])

<h3>Mapping <code>features_names</code> - <code>coefficients</code></h3>

In [17]:
# Learned weights of the fitted SGD regressor.
sgd_coefficients = pipeline_sgd.named_steps['model'].coef_

In [18]:
# Number of learned coefficients.
len(sgd_coefficients)

65230

In [19]:
# Term -> column-index mapping learned by the fitted CountVectorizer step.
sgd_features_names = pipeline_sgd.named_steps['cv'].vocabulary_

In [20]:
# Sanity check: the vocabulary size should match the number of coefficients.
len(sgd_features_names)

65230

In [21]:
# One (term, coefficient, |coefficient|) triple per vocabulary entry.
sgd_coeff_names = [
    (term, sgd_coefficients[column], abs(sgd_coefficients[column]))
    for term, column in sgd_features_names.items()
]

In [22]:
# Order in place by absolute coefficient value, largest first.
sgd_coeff_names.sort(key=lambda triple: triple[2], reverse=True)

In [23]:
# `n` is the top-feature count assigned in the Decision Tree importance
# plotting cell (25); that cell must have been executed before this one.
sgd_importance_fig = plot_coefficients_significance(sgd_coeff_names, n)

In [24]:
# Display the SGD coefficient figure with Bokeh's show().
show(sgd_importance_fig)

<h3>Addestramento miglior modello lineare in forma chiusa</h3>

In [26]:
# Bag-of-words (tokenised by the project's regexp_stemmer analyzer) -> TF-IDF
# -> ordinary least squares linear regression.
lr_pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=regexp_stemmer)),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False)),
    # NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
    # and removed in 1.2, so this line raises TypeError on current releases.
    # Pin the scikit-learn version or replace it with an explicit scaling step
    # (results would need to be re-validated).
    ('model', LinearRegression(normalize=True))
])

In [27]:
# Fit the vectorizer, the TF-IDF weighting and the linear regression on the training split.
lr_pipeline.fit(X_train, Y_train)

Pipeline(steps=[('cv',
                 CountVectorizer(analyzer=<function regexp_stemmer at 0x11b321b80>)),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
                ('model', LinearRegression(normalize=True))])

In [28]:
# Term -> column-index mapping learned by the fitted CountVectorizer step.
lr_features_names = lr_pipeline.named_steps['cv'].vocabulary_

In [29]:
# Vocabulary size, i.e. the number of feature columns fed to the regression.
len(lr_features_names)

43803

In [30]:
# Learned weights of the fitted linear regression.
lr_coefficients = lr_pipeline.named_steps['model'].coef_

In [31]:
# Sanity check: there should be one coefficient per vocabulary entry.
len(lr_coefficients)

43803

In [32]:
# One (term, coefficient, |coefficient|) triple per vocabulary entry.
lr_coeff_names = [
    (term, lr_coefficients[column], abs(lr_coefficients[column]))
    for term, column in lr_features_names.items()
]

In [33]:
# Order in place by absolute coefficient value, largest first.
lr_coeff_names.sort(key=lambda triple: triple[2], reverse=True)

In [34]:
# Build the coefficient plot from the sorted triples; 25 is passed as the
# second argument (the SGD plot passes `n`, which is also 25, in that position).
lin_importance_fig = plot_coefficients_significance(lr_coeff_names, 25)

In [35]:
# Display the linear-regression coefficient figure with Bokeh's show().
show(lin_importance_fig)