# TRAIN TREND CLASSIFIER

Takes the articles DB as input and outputs a trained model.

In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Raise pandas' displayed column width limit to 300 characters.
pd.set_option('display.max_colwidth', 300)

## Load data

In [3]:
# Path of the articles database; the file is read as semicolon-separated text.
input_filename = 'temp/db'
# Load every article into a DataFrame.
# NOTE(review): the column listing printed further down includes "Unnamed: 0",
# which suggests the file was written together with its index — confirm whether
# that column is needed.
articoli = pd.read_csv(input_filename, sep=";")

In [4]:
#articoli.head()

In [5]:
# Non-null value count per column: a quick check for missing data.
articoli.count()

Unnamed: 0      2170
ID              2170
Titolo          2170
Data            2170
Testo           2170
Trend           2170
Txt             2170
Anno            2170
Mese            2170
Day             2170
Week            2170
Keywords        2170
Words           2125
ComputedTags    2125
NumberOfTags    2170
Company          542
Tag             1522
Vertical        1596
Tech             977
Application      759
Location         571
dtype: int64

## Train classification model

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pandas import Series
import pickle

In [7]:
# Fit a multi-label binarizer on the 'Trend' column (the labels, not the text).
# NOTE(review): MultiLabelBinarizer expects one collection of labels per sample;
# if 'Trend' holds plain strings, each string would be iterated character by
# character — confirm the intended input format.
# NOTE(review): the fitted binarizer is not used by any other cell visible in
# this notebook — confirm whether it is still needed.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(articoli['Trend'])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [8]:
# Restore the stop-word collection that was pickled to the 'stop_words' file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
stopwords_path = 'stop_words'
with open(stopwords_path, 'rb') as stopwords_file:
    stop_words = pickle.load(stopwords_file)

In [9]:
# TF-IDF vectorizer configured with document-frequency bounds (min_df=5,
# max_df=0.7) and the stop words loaded above.
tfidfconverter = TfidfVectorizer(
    min_df=5,
    max_df=0.7,
    stop_words=stop_words,
)

# Fit on the 'Txt' column and turn the result into a dense feature matrix.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed in the next cell, so held-out documents contribute to the
# vocabulary and IDF weights — consider fitting on the training rows only.
tfidf_matrix = tfidfconverter.fit_transform(articoli['Txt'])
X = tfidf_matrix.toarray()

# Target vector: the 'Trend' value of each article.
y = articoli['Trend'].values

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [11]:
# Nice article on random forests: https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
# Random forest with 1000 trees, seeded so that training is repeatable.
forest_options = {
    'n_estimators': 1000,
    'random_state': 0,
}
classifier = RandomForestClassifier(**forest_options)

# Train on the training split only; the fitted estimator is the cell's output.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
# Predict the 'Trend' label of every held-out test row.
y_pred = classifier.predict(X_test)

In [13]:
#print(confusion_matrix(y_test,y_pred))

In [14]:
#print(classification_report(y_test,y_pred))

In [15]:
# Reference scores noted by the author (presumably earlier runs — confirm):
#   SAVED:   0.555556
#   CURRENT: 0.536817
# Share of test rows whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of model: %f" % accuracy)

Accuracy of model: 0.518433


## Save classification model

In [None]:
# Reference on saving a trained model:
# https://stackabuse.com/text-classification-with-python-and-scikit-learn/
import pickle

# Persist the trained classifier and the fitted TF-IDF vectorizer, each in its
# own pickle file; both are written here so they stay in sync.
artifacts = {
    'trend_classifier': classifier,
    'trend_tfidfconverter': tfidfconverter,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as picklefile:
        pickle.dump(artifact, picklefile)