In [14]:
import pandas as pd
from stop_words import get_stop_words
from nltk.corpus import stopwords
import re
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.snowball import ItalianStemmer
import nltk
from num2words import num2words
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
import numpy as np

In [6]:
dev = pd.read_csv('exam_development.csv')
eval = pd.read_csv('exam_evaluation.csv')

print("Development dataset:\n")
print("Shape: " + str(dev.shape) + "\n")
print("Values counts:\n" + str(dev.loc[:, "class"].value_counts()) + "\n\n\n")

print("Evaluation dataset:\n")
print("Shape: " + str(eval.shape))

dict = dev.loc[:, "class"].value_counts().to_dict()

Development dataset:

Shape: (28754, 2)

Values counts:
pos    19532
neg     9222
Name: class, dtype: int64



Evaluation dataset:

Shape: (12323, 1)


In [17]:
# Stemmer
stemmer = ItalianStemmer()

print("Development dataset")
print("-  -  -  -  -  -  -  -  -")
print("Lower case text")
dev.loc[:, "text"] = dev.text.apply(lambda x: str.lower(x).replace("'", " "))

print("Remove punctuation")
dev.loc[:, "text"] = dev.text.apply(lambda x: " ".join(re.findall('[\w]+', x)))

print("Remove italian stopwords")
dev.loc[:, "text"] = dev.text.apply(lambda x: removeStopWords(x))

print("Stemming process")
dev.loc[:, "text"] = dev.text.apply(lambda x: stem(x))

print("- - - - - - - - - - - -")

print("Evaluation dataset")
print("-  -  -  -  -  -  -  -  -")

print("Lower case text")
eval.loc[:, "text"] = eval.text.apply(lambda x: str.lower(x).replace("'", " "))

print("Remove punctuation")
eval.loc[:, "text"] = eval.text.apply(lambda x: " ".join(re.findall('[\w]+', x)))

print("Remove italian stopwords")
eval.loc[:, "text"] = eval.text.apply(lambda x: removeStopWords(x))

print("Stemming process")
eval.loc[:, "text"] = eval.text.apply(lambda x: stem(x))

Development dataset
-  -  -  -  -  -  -  -  -
Lower case text
Remove punctuation
Remove italian stopwords
Stemming process
- - - - - - - - - - - -
Evaluation dataset
-  -  -  -  -  -  -  -  -
Lower case text
Remove punctuation
Remove italian stopwords
Stemming process


In [22]:
# Local training
cv = TfidfVectorizer(ngram_range=(1, 2), binary=True, max_df=0.3)
cv.fit(dev.loc[:, "text"])
X = cv.transform(dev.loc[:, "text"])

print("Fit and Predict:")

X_train, X_test, y_train, y_test = train_test_split(X, dev.loc[:, "class"], test_size=0.2, random_state=0)

for c in [0.5, 1, 5, 10]:
    lr = svm.LinearSVC(class_weight=dict, C=c, max_iter=15000)
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    print("Accuracy for C:%s \n(accuracy_score):%s"
          % (c, accuracy_score(y_test, predictions)))
    print("(f1_score):", f1_score(y_test, predictions, average='weighted'))

feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), lr.coef_[0]
    )
}

print("\n\nBest positive words:")
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1],
        reverse=True)[:5]:
    print(best_positive)
print("\nBest negative words:")
for best_negative in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1])[:5]:
    print(best_negative)

cv = TfidfVectorizer(ngram_range=(1, 2), max_df=0.3)
cv.fit(dev.loc[:, "text"])
X = cv.transform(dev.loc[:, "text"])
X_test = cv.transform(eval.loc[:, "text"])

lr = svm.LinearSVC(class_weight=dict, max_iter=15000)
lr.fit(X, dev.loc[:, "class"])

predictions = lr.predict(X_test)

with open('exam_export.csv', 'w') as file:
    file.write("Id,Predicted\n")
    for index in eval.index:
        s = predictions[index]
        file.write(str(index) + "," + s + "\n")

Fit and Predict:
Accuracy for C:0.5 
(accuracy_score):0.9680055642496957
(f1_score): 0.9679298239468118
Accuracy for C:1 
(accuracy_score):0.9680055642496957
(f1_score): 0.9679298239468118
Accuracy for C:5 
(accuracy_score):0.9680055642496957
(f1_score): 0.9679298239468118
Accuracy for C:10 
(accuracy_score):0.9680055642496957
(f1_score): 0.9679298239468118


Best positive words:
('perfett', 4.315061287108056)
('eccellent', 4.122722642589093)
('fantast', 3.511618121202611)
('confortevol', 3.191626867207938)
('po', 3.187816105687944)

Best negative words:
('pessim', -4.904545308063406)
('sporc', -4.551094328473539)
('scars', -3.9961653985313177)
('scortes', -3.7267099986243837)
('vecc', -3.3512247498507683)


In [16]:
stopWords = stopwords.words('italian')

def removeStopWords(s):
    s = ' '.join(word for word in s.split() if word not in stopWords)
    return s

def stem(s):
    global stemmer
    s = ' '.join(stemmer.stem(word) for word in s.split())
    return s