In [1]:
import os
import time
import re
import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor, LinearRegression


from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, LabelSet, FactorRange, HoverTool


# Send Bokeh output to the notebook so `show(...)` renders inline.
output_notebook()


# Single seed for the run: seeds NumPy's global RNG here and is reused as
# `random_state` by the decision tree further down.
SEED = 99
np.random.seed(SEED)


# Folder layout, resolved relative to the current working directory.
DATA_FOLDER = os.path.join(os.getcwd(), "../data")
RESULTS_FOLDER = os.path.join(DATA_FOLDER, "results")
CV_RESULTS_FOLDER = os.path.join(RESULTS_FOLDER, "csv")


<h3>Caricamento Dati</h3>

In [2]:
# Training split, indexed by the `review_id` column.
train_fp = os.path.join(os.getcwd(), "../data/csv/train.csv")
train = pd.read_csv(train_fp, index_col=['review_id'])

In [3]:
# Held-out split, indexed by the `review_id` column.
# Must point at test.csv: reading train.csv here would make X_test / Y_test
# an exact copy of the training data and any evaluation on them meaningless.
test_fp = os.path.join(os.getcwd(), "../data/csv/test.csv")
test = pd.read_csv(test_fp, index_col=['review_id'])

<h3>Split $X$, $Y$</h3>

In [4]:
# `text` column is the model input, `stars` column the regression target.
X_train = train['text']
Y_train = train['stars']

X_test = test['text']
Y_test = test['stars']


<h3>Analizzatori</h3>

In [5]:
# Default CountVectorizer analyzer, reused so the custom analyzers below
# tokenise exactly like a plain CountVectorizer would.
analyzer = CountVectorizer().build_analyzer()

eng_stopwords = stopwords.words('english')
# Frozen copy for O(1) membership tests in the per-token filters below;
# `eng_stopwords` itself is left as the original list.
_eng_stopwords_set = frozenset(eng_stopwords)

porter_stemmer = PorterStemmer()

# A run of digits plus any alphanumerics immediately following it
# (e.g. "2", "10pm", "3rd").
rex = re.compile("[0-9]+[a-zA-Z0-9]*")


def stem(doc):
    """Lazily yield the Porter stem of every non-stopword token of ``doc``.

    Args:
        doc: the document text to analyse.

    Returns:
        A generator of stemmed tokens (stopwords are dropped before stemming).
    """
    return (
        porter_stemmer.stem(token)
        for token in analyzer(doc)
        if token not in _eng_stopwords_set
    )


def stem_re(doc):
    """Same as ``stem``, but first removes every match of ``rex`` from ``doc``.

    Args:
        doc: the document text to analyse.

    Returns:
        A list of stemmed, non-stopword tokens.
    """
    cleaned = rex.sub("", doc)
    return [
        porter_stemmer.stem(token)
        for token in analyzer(cleaned)
        if token not in _eng_stopwords_set
    ]


<h3>Funzioni Grafiche</h3>

In [6]:
def plot_feature_importance(features_names, features_importance, **kwargs):
    """Build a vertical bar chart with one bar per feature.

    Args:
        features_names: categorical x values (also used as the figure's x range).
        features_importance: bar heights, aligned with ``features_names``.
        **kwargs: optional overrides -- ``title``, ``x_axis_label``,
            ``y_axis_label``, ``width``, ``height``, ``x_mlo`` (x tick label
            orientation), ``bars_width``, ``bars_color``.

    Returns:
        The Bokeh figure; it is not shown here.
    """
    opt = kwargs.get

    fig = figure(
        title=opt('title', 'Default Title'),
        x_axis_label=opt('x_axis_label', 'Feature'),
        y_axis_label=opt('y_axis_label', 'Importanza'),
        width=opt('width', 950),
        height=opt('height', 600),
        x_range=features_names,
    )
    fig.xaxis.major_label_orientation = opt('x_mlo', 1)

    bars = ColumnDataSource(data={
        'xs': features_names,
        'hs': features_importance,
    })

    fig.vbar(
        x='xs',
        top='hs',
        width=opt('bars_width', 0.25),
        fill_color=opt('bars_color', "#12ab39"),
        source=bars,
    )

    return fig

def plot_coefficients_significance(features_coeff, n=25, **kwargs):
    """Bar chart of the absolute value of the first ``n`` coefficients, split by sign.

    Args:
        features_coeff: sequence of tuples whose first item is the feature name
            and whose second item is the signed coefficient. Only the first
            ``n`` entries are drawn, in the order given.
        n: how many leading entries of ``features_coeff`` to plot.
        **kwargs: optional overrides -- ``title``, ``x_axis_label``,
            ``y_axis_label``, ``width``, ``height``, ``x_mlo`` (x tick label
            orientation), ``bars_width``, ``bars_alpha``.

    Returns:
        The Bokeh figure; it is not shown here.
    """
    # Use the caller's ``n``: it must not be overwritten with a constant here.
    top = features_coeff[:n]

    # Bars show |coefficient|; the sign is conveyed by colour / legend entry.
    pos_source = ColumnDataSource(data=dict(
        xs=[t[0] for t in top if t[1] >= 0],
        hs=[abs(t[1]) for t in top if t[1] >= 0],
    ))

    neg_source = ColumnDataSource(data=dict(
        xs=[t[0] for t in top if t[1] < 0],
        hs=[abs(t[1]) for t in top if t[1] < 0],
    ))

    tools = 'hover,box_zoom,pan,save,reset,wheel_zoom'

    fig = figure(
        title=kwargs.get('title', 'Default Title'),
        x_axis_label=kwargs.get('x_axis_label', 'Coefficiente'),
        y_axis_label=kwargs.get('y_axis_label', 'Valore Assoluto'),
        width=kwargs.get('width', 900),
        height=kwargs.get('height', 600),
        # Keep positives and negatives interleaved in the input order.
        x_range=[t[0] for t in top],
        tools=tools
    )

    fig.xaxis.major_label_orientation = kwargs.get('x_mlo', 1)

    fig.vbar(
        x='xs',
        top='hs',
        color='#cc2c11',
        width=kwargs.get('bars_width', 0.2),
        alpha=kwargs.get('bars_alpha', 0.7),
        legend_label='Positivi',
        source=pos_source
    )
    fig.vbar(
        x='xs',
        top='hs',
        color='#145b9c',
        width=kwargs.get('bars_width', 0.2),
        alpha=kwargs.get('bars_alpha', 0.5),
        legend_label='Negativi',
        source=neg_source
    )

    hover = fig.select(dict(type=HoverTool))
    hover.tooltips = [("nome", "@xs"), ("valore", "@hs")]
    hover.mode = 'mouse'

    return fig

<h3>Addestramento miglior Albero di decisione</h3>

In [7]:
# Decision-tree pipeline: `stem` bag-of-words -> TF-IDF -> regression tree.
dtr_pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=stem)),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False)),
    ('model', DecisionTreeRegressor(
        splitter='random',
        max_depth=100,
        min_samples_leaf=500,
        min_samples_split=1000,
        max_features=5000,
        random_state=SEED,
    )),
])

In [8]:
# Fits all three steps (vectorizer, TF-IDF, tree) on the training data.
dtr_pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer=<function stem at 0x7fec40c30290>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokeniz...
                 TfidfTransformer(norm='l2', smooth_idf=False,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                       max_depth=100, max_features=5000,
                               

In [9]:
# Fitted vocabulary of the 'cv' step; indexed below as term -> column index.
dtr_features_names = dtr_pipeline.named_steps['cv'].vocabulary_

In [10]:
# Vocabulary size; should match the length of the importance vector inspected below.
len(dtr_features_names)

46377

In [11]:
# Importances from the fitted tree ('model' step), indexed by feature column.
dtr_features_importance = dtr_pipeline.named_steps['model'].feature_importances_

In [12]:
# Sanity check: one importance value per vocabulary term.
len(dtr_features_importance)

46377

<h3>Mapping <code>features_names</code> - <code>features_importance</code></h3>

In [13]:
# (term, importance) pairs for the features with non-zero importance,
# ordered from most to least important.
dtr_importance_names = sorted(
    (
        (term, dtr_features_importance[column])
        for term, column in dtr_features_names.items()
        if dtr_features_importance[column] > 0
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

In [14]:
# `n` is also read by the SGD coefficient plot further down.
n = 25
dtr_top = dtr_importance_names[:n]

dtr_importance_fig = plot_feature_importance(
    [pair[0] for pair in dtr_top],
    [pair[1] for pair in dtr_top],
    title="Importanza migliori {} features Decision Tree Regressor".format(n),
)
show(dtr_importance_fig)

<h3>Addestramento miglior modello lineare addestrato con Stochastic Gradient Descent</h3>

In [15]:
# SGD pipeline: default bag-of-words -> TF-IDF -> linear model trained by SGD.
# `random_state=SEED` pins the estimator's own RNG (as the decision tree does),
# so re-running the fit cell gives the same coefficients regardless of how much
# of NumPy's global random state earlier cells have consumed.
pipeline_sgd = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False)),
    ('model', SGDRegressor(
        learning_rate='constant',
        alpha=0.00001,
        penalty='l2',
        max_iter=2500,
        random_state=SEED,
    )),
])

In [16]:
# Fits all three steps (vectorizer, TF-IDF, SGD regressor) on the training data.
pipeline_sgd.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)...
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 SGDRegressor(alpha=1e-05, average=False, early_stopping=False,
                              epsilon=0.1, eta0=0.01, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='constant',
               

<h3>Mapping <code>features_names</code> - <code>coefficients</code></h3>

In [17]:
# Learned weights of the 'model' step, indexed by feature column.
sgd_coefficients = pipeline_sgd.named_steps['model'].coef_

In [18]:
# Number of weights; should match the vocabulary size inspected below.
len(sgd_coefficients)

65230

In [19]:
# Fitted vocabulary of the 'cv' step; indexed below as term -> column index.
sgd_features_names = pipeline_sgd.named_steps['cv'].vocabulary_

In [20]:
# Sanity check: one vocabulary term per coefficient.
len(sgd_features_names)

65230

In [21]:
# One (term, coefficient, |coefficient|) triple per vocabulary entry.
sgd_coeff_names = [
    (term, sgd_coefficients[column], abs(sgd_coefficients[column]))
    for term, column in sgd_features_names.items()
]

In [22]:
# Largest-magnitude coefficients first (third tuple field is |coefficient|).
sgd_coeff_names.sort(key=lambda triple: triple[2], reverse=True)

In [23]:
# `n` is the global set in the decision-tree plotting cell above (25).
sgd_importance_fig = plot_coefficients_significance(sgd_coeff_names, n)

In [24]:
# Render the SGD coefficient chart.
show(sgd_importance_fig)

<h3>Addestramento miglior modello lineare in forma chiusa</h3>

In [25]:
# Closed-form linear pipeline: `stem_re` bag-of-words -> TF-IDF -> LinearRegression.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell presumably targets an older release -- confirm before upgrading.
lr_pipeline = Pipeline([
    ('cv', CountVectorizer(analyzer=stem_re)),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False)),
    ('model', LinearRegression(normalize=True))
])

In [26]:
# Fits all three steps (vectorizer, TF-IDF, linear regression) on the training data.
lr_pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer=<function stem_re at 0x7fec40c30440>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=False,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normaliz

In [28]:
# Fitted vocabulary of the 'cv' step; indexed below as term -> column index.
lr_features_names = lr_pipeline.named_steps['cv'].vocabulary_

In [29]:
# Vocabulary size; should match the number of coefficients inspected below.
len(lr_features_names)

43803

In [30]:
# Learned weights of the 'model' step, indexed by feature column.
lr_coefficients = lr_pipeline.named_steps['model'].coef_

In [31]:
# Sanity check: one coefficient per vocabulary term.
len(lr_coefficients)

43803

In [32]:
# One (term, coefficient, |coefficient|) triple per vocabulary entry.
lr_coeff_names = [
    (term, lr_coefficients[column], abs(lr_coefficients[column]))
    for term, column in lr_features_names.items()
]

In [33]:
# Largest-magnitude coefficients first (third tuple field is |coefficient|).
lr_coeff_names.sort(key=lambda triple: triple[2], reverse=True)

In [34]:
# Chart for the closed-form linear model, requesting the first 25 coefficients.
lin_importance_fig = plot_coefficients_significance(lr_coeff_names, 25)

In [35]:
# Render the linear-regression coefficient chart.
show(lin_importance_fig)