# Data science lab: process and methods #
Winter project, A.A. 2019/2020  
Gabriele Degola, s273479

In [None]:
import string
import re
import csv
import pandas as pd
import numpy as np
import spacy
from spacy_langdetect import LanguageDetector
from wordcloud import WordCloud
from googletrans import Translator
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [None]:
# Load the development set and the evaluation set, printing the shape of each.
dataset = pd.read_csv(r"./data/development.csv")
print(dataset.shape)
test = pd.read_csv(r"./data/evaluation.csv")
print(test.shape)
# NOTE(review): `other` is not referenced again in the visible part of the notebook — confirm it is still needed
other = pd.read_csv(r"./data/external.csv") # additional reviews are loaded
dataset.head()

### Data exploration ###

In [None]:
# Show the distinct labels, their histogram and the summary statistics of the 'class' column.
print(dataset['class'].unique())
dataset['class'].hist(bins=3) # plot labels distribution
plt.show()
dataset['class'].describe()

WordCloud generation

In [None]:
# Build a word cloud from the 200 most frequent words of the development
# and evaluation reviews taken together.
wordList = []
sw = stopwords.words('italian')
sw.append('molto')  # treated as an extra stopword for the cloud
# Series.append was removed in pandas 2.0; pd.concat gives the same
# concatenated Series on old and new pandas releases.
for rev in pd.concat([dataset['text'], test['text']], ignore_index=True):
    rev = rev.lower()
    # everything that is not a lowercase (possibly accented) letter becomes a space
    rev = re.sub('[^a-zèàìùòé]', ' ', rev)
    for word in rev.split():
        # keep only words longer than three characters that are not stopwords
        if len(word) > 3 and word not in sw:
            wordList.append(word)
fdist = FreqDist(wordList)

wc = WordCloud(width=1200, height=800, background_color='white')
plt.figure(figsize=(10,6))
plt.imshow(wc.generate_from_frequencies(dict(fdist.most_common(200))), interpolation='bilinear')
plt.axis('off')
plt.show()

Replace three dots with ellipsis

In [None]:
# Collapse every run of three dots into the single ellipsis character,
# in both the development and the evaluation reviews.
for frame in (dataset, test):
    frame['text'] = frame['text'].apply(lambda review: review.replace('...','…'))

Analyse most frequent characters

In [None]:
def find_most_occ_char(input_list):
    """Print a Counter of the characters found in the strings of ``input_list``.

    The strings are scanned in order, so characters with equal counts are
    listed in order of first appearance. Nothing is returned.
    """
    char_counts = Counter()
    for text in input_list:
        char_counts.update(text)
    print(char_counts)

# Character frequencies of positive, negative and evaluation reviews,
# each printed on one line after its caption.
char_report = (
    ("positive training reviews:", dataset.loc[dataset['class']=='pos', 'text']),
    ("negative training reviews:", dataset.loc[dataset['class']=='neg', 'text']),
    ("test reviews:", test['text']),
)
for caption, reviews in char_report:
    print(caption, end=' ')
    find_most_occ_char(reviews)

Plot punctuation distribution

In [None]:
def count_occurrencies(chars, string):
    """Return the total number of occurrences in ``string`` of every entry of ``chars``.

    Each entry is counted with ``str.count`` (non-overlapping matches) and
    the individual counts are summed.
    """
    return sum(string.count(symbol) for symbol in chars)

# Grouped bar chart: average number of '…', '!' and '?' per review,
# with one bar per class for each punctuation mark.
marks = ['…','!','?']
fig, ax = plt.subplots()
width = 0.4
x = np.arange(len(marks))
# 'pos' bars are drawn to the left of each tick, 'neg' bars to the right
for label, offset in (('pos', -width/2), ('neg', width/2)):
    reviews = dataset.loc[dataset['class'] == label, 'text']
    # mean count of every mark over the reviews of this class
    y = [np.mean([count_occurrencies([mark], rev) for rev in reviews]) for mark in marks]
    ax.bar(x + offset, y, width=width, label=label)
ax.set_xticks(x)
ax.set_xticklabels(marks)
plt.title("Average occurrences per review")
plt.legend()
plt.show()

Find and translate non-Italian reviews with spaCy's language detector

In [None]:
# Load the small Italian spaCy model with the NER component disabled and
# append the language detector as the last pipeline component.
# NOTE(review): add_pipe is called with a component instance, which looks like the spaCy v2 API — confirm the installed version
nlp = spacy.load('it_core_news_sm', disable=['ner'])
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [None]:
# Positions of the reviews whose detected language is not Italian,
# first in the development set, then in the evaluation set.
wrong = [
    pos for pos, review in enumerate(dataset['text'])
    if nlp(review)._.language['language'] != 'it'
]

wrong_test = [
    pos for pos, review in enumerate(test['text'])
    if nlp(review)._.language['language'] != 'it'
]

dataset.iloc[wrong]['text']

In the development set, a review that contains no Italian sentence is removed; in the evaluation set it is translated instead. If a review is multilingual, only its Italian part is kept.

In [None]:
def rev_translate(df, ind, label, action):
    """Reduce the reviews at row labels ``ind`` to their Italian content, in place.

    For every label in ``ind`` the text in column ``label`` is split into
    sentences by the ``nlp`` pipeline. If at least one sentence is detected
    as Italian, the text is replaced by the Italian sentences joined with
    spaces. Otherwise the behaviour depends on ``action``:

    - ``'translate'``: the text is replaced by its translation into Italian,
      using the language detected for the whole review as the source;
    - ``'remove'``: the row is dropped from ``df``;
    - any other value: the row is left untouched.

    Nothing is returned; ``df`` is modified in place.
    """
    translator = None  # created on first use and shared by all translations
    for i in ind:
        # parse once: the same document supplies both the sentences and the
        # review-level language
        doc = nlp(df.loc[i, label])
        it = [str(x) for x in doc.sents if x._.language['language'] == 'it'] # italian sentences
        if it:
            df.loc[i, label] = " ".join(it)
        elif action == 'translate':
            if translator is None:
                translator = Translator()
            df.loc[i, label] = translator.translate(df.loc[i, label], src=doc._.language['language'], dest='it').text
        elif action == 'remove':
            df.drop(i, inplace=True)

# Development reviews without any Italian sentence are removed;
# evaluation reviews without any Italian sentence are translated instead.
rev_translate(dataset, wrong, 'text', 'remove')
rev_translate(test, wrong_test, 'text', 'translate')

Clean reviews, lowercase, remove useless punctuation and keep emojis. Emojis and !?… are treated as separate tokens.

In [None]:
def add_spaces(x):
    """Return ``x`` with every '?', '!' and '…' surrounded by single spaces."""
    padding = str.maketrans({mark: ' ' + mark + ' ' for mark in '?!…'})
    return x.translate(padding)

# Any character outside this whitelist (lowercase letters, accented vowels,
# '?', '!', '…' and the listed emojis) is replaced by a space.
unwanted_chars = re.compile("[^a-zèàìùòé?!…👍👎🔝😊❤☺😉😱😳😘😍🙈🤔😡👏😅😠😂]")

for frame in (dataset, test):
    lowered = frame['text'].apply(lambda x: x.lower())
    filtered = lowered.apply(lambda x: unwanted_chars.sub(" ", x))
    frame['cleaned_text'] = filtered.apply(add_spaces)
    # review length measured in whitespace-separated tokens
    frame['size'] = frame['cleaned_text'].apply(lambda x: len(x.split()))
dataset.head()

In [None]:
# Character frequencies after cleaning: positive, negative and evaluation reviews.
find_most_occ_char(dataset.loc[dataset['class']=='pos', 'cleaned_text'])
find_most_occ_char(dataset.loc[dataset['class']=='neg', 'cleaned_text'])
find_most_occ_char(test['cleaned_text'])

Analyse reviews' length

In [None]:
# Summary statistics of the review length (number of tokens of the cleaned text).
dataset['size'].describe()

In [None]:
# Inspect the original text of very short (< 15 tokens) and very long (> 1000 tokens) reviews.
print(dataset.loc[(dataset['size'] < 15), 'text'])
print(dataset.loc[(dataset['size'] > 1000), 'text'])

In [None]:
# Length histograms of positive, evaluation and negative reviews.
# The first call sets the figure size and title; the later calls pass neither.
# NOTE(review): presumably all three are drawn on the same axes — confirm
dataset.loc[(dataset['class'] == 'pos'), 'size'].plot(kind='hist', bins=500, figsize=(10,5), title="Reviews length distribution", label='pos')
test['size'].plot(kind='hist', bins=500, label='eval')
dataset.loc[(dataset['class'] == 'neg'), 'size'].plot(kind='hist', bins=500, label='neg')
plt.legend()
plt.show()

Check correlation with proper names

In [None]:
# Load the 'Name' column of nomi.csv as a Series.
# read_csv's squeeze=True was removed in pandas 2.0; selecting the single
# column explicitly yields the same Series on old and new pandas releases.
names = pd.read_csv(r'nomi.csv', usecols=['Name'])['Name']
# lowercase and de-duplicate the names
names = pd.Series(names.apply(lambda x: x.lower()).unique())
# the bare .isin(...).sum() expressions are exploratory checks whose value is not stored
names.isin(['mia']).sum()
names = names[names != 'mia']  # 'mia' is excluded from the list of names
names.isin(["nicolo'"]).sum()
# drop apostrophes from the names (the cleaned reviews contain none)
names = names.apply(lambda x: x.replace("'",''))
names.isin(["nicolo'"]).sum()

def has_names(x):
    """Return True if any entry of ``names`` is a whitespace-separated token of ``x``."""
    # split once and use a set: the original re-split the review for every name
    tokens = set(x.split())
    return any(name in tokens for name in names)

# Flag the reviews containing at least one proper name and plot the
# label distribution of the flagged reviews.
dataset['names'] = dataset['cleaned_text'].apply(has_names)
dataset[dataset['names'] == True]['class'].hist(bins=3)
plt.show()
dataset.head()

### Data preprocessing ###
Convert text to bag-of-words model with tf-idf weighting scheme

In [None]:
class StemTokenizer(object):
    """Callable tokenizer: NLTK word tokenization followed by Italian Snowball stemming.

    Stems of one or two characters are discarded unless they are one of the
    punctuation marks or emojis listed in ``__call__``.
    """

    def __init__(self):
        self.stemmer = SnowballStemmer('italian')

    def __call__(self, document):
        """Return the list of kept stems of ``document``, in order."""
        # short tokens that are kept despite the length filter
        kept_symbols = ('?','!','…','👍','👎','🔝','😊','❤','☺','😉','😱','😳','😘','😍','🙈','🤔','😡','👏','😅','😠','😂')
        stems = (self.stemmer.stem(token).strip() for token in word_tokenize(document))
        return [stem for stem in stems if len(stem) > 2 or stem in kept_symbols]
    
# process stopwords
# 'non' is taken out of the stopword list so that it is kept as a feature
sw = stopwords.words('italian')
sw.remove('non')
stemmer = SnowballStemmer('italian')
# the tokenizer emits stems, so the stopwords are stemmed too
new_sw = {stemmer.stem(t) for t in sw}

tokenizer = StemTokenizer()
# stop_words is passed as a list: recent scikit-learn releases validate the
# parameter type and reject a set (sorted only to make the order deterministic)
vect = TfidfVectorizer(tokenizer=tokenizer, max_df=0.6, min_df=4, ngram_range=(1,3), stop_words=sorted(new_sw))
# The vocabulary is fitted on development and evaluation reviews together.
# Series.append was removed in pandas 2.0; pd.concat is the equivalent call.
data = pd.concat([dataset['cleaned_text'], test['cleaned_text']], ignore_index=True)
vect.fit(data)

In [None]:
# Tf-idf matrix of the development reviews. The fitted vectorizer is named
# `vect`; the original referenced an undefined name `tfidf`.
X = vect.transform(dataset['cleaned_text'])

Dimensionality reduction was tried, but it gave low performance.

In [None]:
# svd = TruncatedSVD(n_components=2000)
# X_svd = svd.fit_transform(X)
# print(svd.explained_variance_ratio_.sum())

### Algorithm choice ###

Multinomial Naive Bayes classifier

In [None]:
# Multinomial Naive Bayes: weighted F1 over 5 cross-validation folds.
clf = MultinomialNB()
f1 = cross_val_score(clf, X, dataset['class'], cv=5, scoring='f1_weighted')
print('MultinomialNB:', 'F1 scores:', f1, f1.mean(), sep='\n')

# cross-validated predictions, reused later for the ROC comparison
y_pred_nb = cross_val_predict(clf, X, dataset['class'], cv=5)
print('Accuracy:', accuracy_score(dataset['class'], y_pred_nb), sep='\n')
print('Precision recall fscore support', precision_recall_fscore_support(dataset['class'], y_pred_nb), sep='\n')

Support Vector Machines classifier with linear kernel

In [None]:
# Linear SVC: weighted F1 over 5 cross-validation folds.
clf = LinearSVC(max_iter=2000)
f1 = cross_val_score(clf, X, dataset['class'], cv=5, scoring='f1_weighted')
print('LinearSVC:', 'F1 scores:', f1, f1.mean(), sep='\n')

# cross-validated predictions, reused later for the ROC comparison
y_pred_svc = cross_val_predict(clf, X, dataset['class'], cv=5)
print('Accuracy:', accuracy_score(dataset['class'], y_pred_svc), sep='\n')
print('Precision recall fscore support', precision_recall_fscore_support(dataset['class'], y_pred_svc), sep='\n')

Random Forest classifier

In [None]:
# Random forest with 100 trees: weighted F1 over 5 cross-validation folds.
clf = RandomForestClassifier(n_estimators=100)
f1 = cross_val_score(clf, X, dataset['class'], cv=5, scoring='f1_weighted')
print('RandomForestClassifier:', 'F1 scores:', f1, f1.mean(), sep='\n')

# cross-validated predictions, reused later for the ROC comparison
y_pred_rf = cross_val_predict(clf, X, dataset['class'], cv=5)
print('Accuracy:', accuracy_score(dataset['class'], y_pred_rf), sep='\n')
print('Precision recall fscore support', precision_recall_fscore_support(dataset['class'], y_pred_rf), sep='\n')

Plot ROC curves

In [None]:
# Encode the labels as 1 ('pos') and 0 ('neg').
mapp = {'pos':1, 'neg':0}
class_bin = dataset['class'].replace(mapp)

# NOTE(review): the scores given to roc_curve are the predicted labels mapped
# to 0/1, not continuous decision scores — confirm this is intended
fpr_svc, tpr_svc, _ = roc_curve(class_bin, pd.Series(y_pred_svc).replace(mapp))
fpr_nb, tpr_nb, _ = roc_curve(class_bin, pd.Series(y_pred_nb).replace(mapp))
fpr_rf, tpr_rf, _ = roc_curve(class_bin, pd.Series(y_pred_rf).replace(mapp))

plt.figure(figsize=(10,10))
plt.title('ROC curve comparison')
plt.plot([0, 1], [0, 1],linestyle='--')  # dashed diagonal as reference
roc_curves = (
    (fpr_svc, tpr_svc, 'Linear SVC'),
    (fpr_nb, tpr_nb, 'Multinomial NB'),
    (fpr_rf, tpr_rf, 'Random Forest'),
)
for false_pos, true_pos, model_name in roc_curves:
    plt.plot(false_pos, true_pos, label=model_name)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend()
plt.show()

### Tuning and validation ###
Grid search to find best hyperparameters

In [None]:
# Exhaustive search over LinearSVC hyperparameters, scored by weighted F1
# with 3-fold cross-validation; a failing fit raises instead of being scored.
clf = LinearSVC(max_iter=2000)
grid = {
    'dual': [True, False],
    'tol': [1e-3, 1e-4, 1e-5],
    'C': [0.1, 1, 10, 25, 50, 100],
    'fit_intercept': [True, False],
    'intercept_scaling': [0.1, 1, 10],
    'class_weight': [None, 'balanced'],
}
gridsearch = GridSearchCV(clf, grid, scoring='f1_weighted', cv=3, verbose=2, error_score='raise')
gridsearch.fit(X, dataset['class'])

print(gridsearch.best_params_)

In [None]:
# Weighted F1 of the best estimator found by the grid search, over 5 folds.
clf = gridsearch.best_estimator_
f1 = cross_val_score(clf, X, dataset['class'], cv=5, scoring='f1_weighted')
print('F1 scores for the best estimator:', f1, f1.mean(), sep='\n')

In [None]:
# 5-fold cross-validated predictions of the tuned classifier, used for the confusion matrices.
y_pred_val = cross_val_predict(clf, X, dataset['class'], cv=5)

Plot confusion matrix and normalized confusion matrix

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14,5))
class_names = ['neg','pos']

# build the confusion matrix from the 0/1 encoded actual and predicted labels
conf_mat = confusion_matrix(class_bin, pd.Series(y_pred_val).replace(mapp))

# plot the result
conf_mat_df = pd.DataFrame(conf_mat, index=class_names, columns=class_names)
conf_mat_df = conf_mat_df.rename_axis(index='Actual', columns='Predicted')
sns.heatmap(conf_mat_df, annot=True, cmap='GnBu', annot_kws={"size": 16}, fmt='g', cbar=True, ax=ax[0])
ax[0].set_title('Confusion matrix')

# count positive and negative labels in the development set
label_counts = class_bin.value_counts()
y_pos = label_counts[1]
y_neg = label_counts[0]

# plot normalized confusion matrix with recall on the diagonal:
# the 'neg' row is scaled by 1/y_neg and the 'pos' row by 1/y_pos
norm_conf_mat = conf_mat * np.array([[1.0 / y_neg], [1.0 / y_pos]])
norm_conf_mat_df = pd.DataFrame(norm_conf_mat, index=class_names, columns=class_names)
norm_conf_mat_df = norm_conf_mat_df.rename_axis(index='Actual', columns='Predicted')
sns.heatmap(norm_conf_mat_df, annot=True, cmap='GnBu', annot_kws={"size": 16}, fmt='g', cbar=True, ax=ax[1])
ax[1].set_title('Normalized confusion matrix')
plt.show()

### Evaluation set prediction ###

In [None]:
# Tf-idf matrix of the evaluation reviews, built with the already fitted vectorizer.
X_test = vect.transform(test['cleaned_text'])

In [None]:
# Train the tuned classifier on the whole development set, then label the evaluation set.
clf = gridsearch.best_estimator_
y_pred = clf.fit(X, dataset['class']).predict(X_test)

In [None]:
# Histogram of the labels predicted for the evaluation set.
pd.Series(y_pred).hist(bins=3) # predicted labels distribution
plt.show()

Export the results

In [None]:
# Write the predictions to out.csv: a header row, then one
# (position, predicted label) row per evaluation review.
with open("out.csv", mode='w', newline="", encoding='UTF-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['Id', 'Predicted'])
    writer.writerows(enumerate(y_pred))