In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords as sw
from nltk.stem import SnowballStemmer
from sklearn.svm import SVC
import it_core_news_sm
nlp = it_core_news_sm.load(disable=['tagger','textcat','ner','parser'])
import string
import re
import progressbar
import matplotlib.pyplot as plt
from unicodedata import name
import itertools
%matplotlib inline
import math


In [23]:
mysw = sw.words("italian")

class LemmaTokenizer(object):
    def __init__(self,total_docs=0,stop_words=[],common_words=[]):
        self.lemmatizer = nlp
        self.stemmer = SnowballStemmer("italian")
        self.total_docs = total_docs
        self.num_doc = 0
        self.bar = None
        stop_words = " ".join(stop_words)
        stop_words = ''.join(c[0] for c in itertools.groupby(stop_words))
        self.stop_words = ["esser","eser","essere","esere"]

        for sw in nlp(stop_words):
            self.stop_words.append(self.stemmer.stem(sw.lemma_))
        self.common_words = common_words
    
    
    def __call__(self, document):
 
        if(self.num_doc == 0):
            self.bar = progressbar.ProgressBar(maxval=self.total_docs, \
                   widgets=[progressbar.Bar('≡', '[', ']'), ' ', progressbar.Percentage()])
            self.bar.start()
        self.num_doc += 1
        document = ''.join(c[0] for c in itertools.groupby(document))
        try:
            document = re.sub('[\U0001F602-\U0001F64F]', lambda m: " "+name(m.group())+ " ", document)
        except:
            i=0
        try:
            document = re.sub('?', ' interogative ', document)
        except:
            i=0
        try:
            document = re.sub('!', ' esclamative ', document)
        except:
            i=0
        document = re.sub('[^A-Za-zéèòçàù\s]+', ' ', document)
        document = re.sub('k', 'ch', document)
        document = re.sub('wi fi', 'wifi', document)
        
        lemmas = []
        for tt in self.lemmatizer(document):
            if tt.text.isalpha():
                t = tt.lemma_.strip()
                t = self.stemmer.stem(t)
                if(t == "no" or t == "non" or t == "not"):
                    lemmas.append("no")
                elif(t.startswith('molt') or t.startswith('stel')):
                    lemmas.append(t)
                elif len(t) >= 2 and len(t)<16 and t not in self.stop_words:
                    t = self.stemmer.stem(t)
                    lemmas.append(t)
        if(self.num_doc >= self.total_docs):
            self.bar.finish()
        else:
            self.bar.update(self.num_doc)   
        return lemmas

    
    def __call2__(self,document):
        #print("\n\n\n\n\n\n"+document)
        if(self.num_doc == 0):
            self.bar = progressbar.ProgressBar(maxval=self.total_docs, \
                   widgets=[progressbar.Bar('≡', '[', ']'), ' ', progressbar.Percentage()])
            self.bar.start()
        self.num_doc += 1
        lemmas = []
        document = ''.join(c[0] for c in itertools.groupby(document))
        document = re.sub('[\,\;\.\:\-\(\)\"\!\?]+', ' | ', document)
        document = re.sub('[^A-Za-z0-9éèòçàù|]+', ' ', document)
        for subdoc in sent_tokenize(document):
            ds = document.split("|")
            for sents in ds:
                current_sents = []
                for t in nlp(sents): 
                    pt = self.stemmer.stem(t.lemma_)
                    if(t.lemma_ == "no" or t.lemma_ == "non"):
                        #lemmas.append(pt)
                        current_sents.append(pt)
                    elif(not t.is_alpha or len(pt) < 2 or len(pt) > 16 or pt in self.stop_words):
                        continue
                    elif(t.is_stop or pt in self.common_words):
                        current_sents.append(pt)
                    else:
                        lemmas.append(pt)
                        current_sents.append(pt)
                
                ran = len(current_sents)-3
                if(ran>-1):
                    current_sents.sort()
                    for s in range(ran): #ngram_range optimization
                        lemmas.append(current_sents[s]+" "+current_sents[s+1])
                        lemmas.append(current_sents[s]+" "+current_sents[s+1] + " " + current_sents[s+2])
                    lemmas.append(current_sents[-1]+" "+current_sents[-2])
                elif(ran > -2):
                    current_sents.sort()
                    lemmas.append(current_sents[0]+" "+current_sents[1])
                            
        if(self.num_doc >= self.total_docs):
            self.bar.finish()
        else:
            self.bar.update(self.num_doc)
        return lemmas
    
    def clear_bar(self,total_docs=0):
        self.num_doc = 0
        self.total_docs = total_docs
    
    



In [24]:
datadir = "./datasrc/dataset_winter_2020/"

#dataset used in program
datadev = pd.read_csv(datadir+"development.csv")
dataeva = pd.read_csv(datadir+"evaluation.csv")

tokenizer = LemmaTokenizer(total_docs=datadev['text'].count(),stop_words=mysw)
vectorizer = TfidfVectorizer(input='content',encoding="utf-8",tokenizer=tokenizer,ngram_range = (1,3),max_df=0.90,min_df=0.0003)#, use_idf=False)#,ngram_range=(3,4))#,max_df=0.62)#,strip_accents='unicode',max_df=1.0)#,min_df=0.01)
X_tfidf = vectorizer.fit_transform(datadev['text'])


[≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡] 100%


In [26]:
model = SVC(kernel="rbf", C=2.2 , gamma=0.725 , class_weight='balanced')
tokenizer.clear_bar(total_docs=dataeva['text'].count())
x_test = vectorizer.transform(dataeva['text'])
model.fit(X_tfidf,datadev['class'].to_numpy())
y_pred = model.predict(x_test)

[≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡] 100%


In [27]:
def print_solution(y_pred):
    fff1 = np.asarray([["Id","Predicted"]])
    fff2 = np.column_stack((dataeva.index.values,y_pred))
    fff3 = np.concatenate((fff1,fff2))
    np.savetxt(datadir+"sample_submission.csv", fff3,fmt='%s', delimiter=",")
print_solution(y_pred)

In [28]:
model = MultinomialNB() 
score_type = "f1_weighted"
cvs = cross_val_score(model,X_tfidf,datadev['class'],cv=5,scoring = score_type, n_jobs = 7)
print(f"{score_type} for each iteration:{cvs}")
print(f"{score_type} (statistics): {cvs.mean():.3f} (+/- {cvs.std() * 2:.3f})")

f1_weighted for each iteration:[0.96074725 0.95941106 0.9538043  0.95639121 0.95980215]
f1_weighted (statistics): 0.958 (+/- 0.005)


In [25]:
#model = MultinomialNB() #used just for fast testing lemmatizer and vectorizer -> svc too slow
model = SVC(kernel="rbf", C=2.2 , gamma=0.725 , class_weight='balanced')
score_type = "f1_weighted"
cvs = cross_val_score(model,X_tfidf,datadev['class'],cv=5,scoring = score_type, n_jobs = 7)
print(f"{score_type} for each iteration:{cvs}")
print(f"{score_type} (statistics): {cvs.mean():.3f} (+/- {cvs.std() * 2:.3f})")

f1_weighted for each iteration:[0.96913932 0.97098886 0.9642998  0.96747226 0.96614143]
f1_weighted (statistics): 0.968 (+/- 0.005)


In [9]:
# C10   gamma 1     - Cs = [0.01,0.1,1,10]                         gammas = [0.001,0.01,0.1,1]
# C8    gamma 0.8   - Cs = [8,9,10,11,12]                          gammas = [0.6,0.8,1,1.2,1.4]
# C5    gamma 0.7   - Cs = [5,6,7,8]                               gammas = [0.7,0,75,0.8,0.85,0.9]
# C2    gamma 0.73  - Cs = [2,3,4,5,5.5]                           gammas = [0.65,0.67,0.69,0.71,0.73]
# C2.4  gamma 0.72  - Cs = [1.4,1.6,1.8,2,2.2,2.4]                 gammas = [0.72,0.73,0.74]
# C2.2  gamma 0.725 - Cs = [2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9]   gammas = [0.725,0.73,0.735]

Cs = [2.15,2.2,2.25]
gammas = [0.712,0.723,0.725,0.727,0.729]
param_grid = {'C': Cs, 'gamma' : gammas }
grid_search = GridSearchCV(SVC(kernel="rbf",class_weight='balanced'), param_grid,n_jobs=6, cv=4, scoring="f1_weighted")
grid_search.fit(X_tfidf, datadev['class'])
grid_search.best_params_


{'C': 2.2, 'gamma': 0.725}