In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import ComplementNB
#from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
#from nltk.tokenize import sent_tokenize
#from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw
from nltk.stem import SnowballStemmer
from sklearn.svm import SVC
import it_core_news_sm
nlp = it_core_news_sm.load(disable=['tagger','textcat','ner','parser'])
import string
import re
import progressbar
import matplotlib.pyplot as plt
import itertools
from unicodedata import name
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline as mp
from sklearn.pipeline import FeatureUnion as fu

%matplotlib inline
url = 'https://raw.githubusercontent.com/mrblasco/genderNamesITA/master/gender_firstnames_ITA.csv'
itanames = pd.read_csv(url, error_bad_lines=False)['nome']
itanames = itanames[~itanames.str.isalpha()==False]                                                                  
mysw = sw.words("italian")
mysw = itanames.values.tolist() +['milano','venezia']

class LemmaTokenizer(object):
    def __init__(self,total_docs=0,stop_words=[],common_words=[]):
        self.lemmatizer = nlp
        self.stemmer = SnowballStemmer("italian")
        self.total_docs = total_docs
        self.num_doc = 0
        self.bar = None
        stop_words = " ".join(stop_words)
        stop_words = ''.join(c[0] for c in itertools.groupby(stop_words))
        self.stop_words = ["esser","eser","essere","esere","il"]

        for sw in nlp(stop_words):
            self.stop_words.append(self.stemmer.stem(sw.lemma_))
        self.common_words = common_words
    
    def __call__(self, document):
 
        if(self.num_doc == 0):
            self.bar = progressbar.ProgressBar(maxval=self.total_docs, \
                   widgets=[progressbar.Bar('≡', '[', ']'), ' ', progressbar.Percentage()])
            self.bar.start()
        self.num_doc += 1
        document = ''.join(c[0] for c in itertools.groupby(document))

        document = re.sub('[^A-Za-zéèòçàù\s]+', ' ', document)
        document = re.sub('k', 'ch', document)
        document = re.sub('wi fi', 'wifi', document)
        document = re.sub('isim', '', document)
        document = re.sub('albergo', 'hotel', document)
        document = re.sub('hotel', 'strutur', document)
        document = re.sub('cordiale', 'gentile', document)

        
        lemmas = []
        for tt in self.lemmatizer(document):
            if tt.text.isalpha():
                t = tt.lemma_.strip()
                if(t == "no" or t == "non" or t == "not"):
                    lemmas.append("no")
                    continue
                if t == "stella":
                    lemmas.append(t)
                    continue
                t = self.stemmer.stem(t)
                if len(t) >= 2 and not tt.is_stop and t not in self.stop_words or t.startswith('molt'):
                    lemmas.append(t)
        if(self.num_doc >= self.total_docs):
            self.bar.finish()
        else:
            self.bar.update(self.num_doc)   
        return lemmas


    def clear_bar(self,total_docs=0):
        self.num_doc = 0
        self.total_docs = total_docs
    
datadir = "./datasrc/dataset_winter_2020/"

#dataset used in program
datadev = pd.read_csv(datadir+"development.csv").to_numpy()
dataeva = pd.read_csv(datadir+"evaluation.csv")

tokenizer = LemmaTokenizer(total_docs=datadev[:,0].size,stop_words=mysw)
vectorizer = TfidfVectorizer(input='content',tokenizer=tokenizer,ngram_range = (1,3),max_df=0.9,min_df=0.0003,encoding="utf-8")
X_tfidf = vectorizer.fit_transform(datadev[:,0])

[≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡] 100%


In [2]:
svd = TruncatedSVD(n_components=70, random_state=42, algorithm = 'arpack',tol=0)
X_svd = svd.fit_transform(X_tfidf)

In [16]:
#from sklearn.ensemble import RandomForestClassifier
model = SVC()
score_type = "f1_weighted"
cvs = cross_val_score(model,X_svd,datadev[:,1],cv=5,scoring = score_type, n_jobs = 7)
print(f"{score_type} for each iteration:{cvs}")
print(f"{score_type} (statistics): {cvs.mean():.3f} (+/- {cvs.std() * 2:.3f})")


f1_weighted for each iteration:[0.94714691 0.94915492 0.94016083 0.94397755 0.94367385]
f1_weighted (statistics): 0.945 (+/- 0.006)


In [12]:
model = SVC(kernel="rbf")
tokenizer.clear_bar(total_docs=dataeva['text'].count())
x_test = vectorizer.transform(dataeva['text'])
x_test_svd = svd.transform(x_test)
model.fit(X_svd,datadev[:,1])
y_pred = model.predict(x_test_svd)

[≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡] 100%


In [13]:
def print_solution(y_pred):
    fff1 = np.asarray([["Id","Predicted"]])
    fff2 = np.column_stack((dataeva.index.values,y_pred))
    fff3 = np.concatenate((fff1,fff2))
    np.savetxt(datadir+"sample_submission.csv", fff3,fmt='%s', delimiter=",")
print_solution(y_pred)

In [None]:
#from sklearn.manifold import TSNE
#from sklearn.pipeline import make_pipeline as mp
#tsne = TSNE(n_components=50, perplexity=50, verbose=1, init='random', learning_rate=2000,n_iter=1000,early_exaggeration=12, method='exact')
#X_svd = svd.fit_transform(X_tfidf)
#pipeline = mp(svd, tsne)
#red_X = pipeline.fit_transform(X_tfidf)

#tsne = TSNE(n_components=50, perplexity=50, verbose=1, init='random', learning_rate=2000,n_iter=1000,early_exaggeration=12, method='exact')
#X_svd = svd.fit_transform(X_tfidf)
#pipeline = make_pipeline(svd, tsne)
#red_X = pipeline.fit_transform(X_tfidf)

#model = LogisticRegression(n_jobs=6) #used just for fast testing lemmatizer and vectorizer -> svc too slow but higher accuracy
model = SVC(kernel="rbf",C=2.4,gamma=0.72,class_weight='balanced')
score_type = "f1_weighted"
cvs = cross_val_score(model,X_svd,datadev['class'],cv=5,scoring = score_type, n_jobs = 7)
print(f"{score_type} for each iteration:{cvs}")
print(f"{score_type} (statistics): {cvs.mean():.3f} (+/- {cvs.std() * 2:.3f})")

In [None]:
from yellowbrick.text import FreqDistVisualizer
features   = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(X_tfidf)
visualizer.show()

visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(X_tfidf[datadev[:,1]=='pos'])
visualizer.show()

visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(X_tfidf[datadev[:,1]=='neg'])
visualizer.show()


from yellowbrick.text import TSNEVisualizer
tsne = TSNEVisualizer()
tsne.fit(X_svd,datadev[:,1],alphafloat = 0.01,decomposestring=None)
tsne.show()

In [None]:
tsne = TSNE()
red_X = tsne.fit_transform(X_svd)


In [None]:
#from scipy.sparse import hstack

X_pos = X_tfidf[datadev[:,1]=='pos']
X_neg = X_tfidf[datadev[:,1]=='neg']

svd = TruncatedSVD(n_components=50, random_state=42)
pos_svd = svd.fit_transform(X_pos)
neg_svd = svd.transform(X_neg)

combined_svd = np.concatenate((pos_svd, neg_svd), axis=0)

print(combined_svd.shape)

In [None]:
X_pos = datadev[datadev[:,1]=='pos'][:,0]
X_neg = datadev[datadev[:,1]=='neg'][:,0]
pos_docs = X_pos.shape
neg_docs = X_neg.shape
total_docs = pos_docs + neg_docs
evadocs = []

num_words = [] 
num_p_words = []
num_n_words = []

for doc in X_pos:
    size = len(doc)
    num_p_words.append(size)
    
for doc in X_neg:
    size = len(doc)
    num_n_words.append(size)
    
num_words = num_p_words + num_n_words

for doc in dataeva[:,0]:
    evadocs.append(len(doc))
    

In [None]:
def count_elements(seq) -> dict:
    hist = {}
    for i in seq:
        hist[i] = hist.get(i, 0) + 1
    hist = {np.log(k): np.log(v) for k, v in hist.items()}
    hist = np.array(list(hist.items()))
    return hist

pos_dict = count_elements(num_p_words)
neg_dict = count_elements(num_n_words)
all_dict = count_elements(num_words)
eva_dict = count_elements(evadocs)


import matplotlib.pyplot as plt

def plot_histogram(p = {}, n = {}, a = {},eva = {}):
    kwargs = dict(histtype='stepfilled', alpha=1,bins=30)
    #n, bins, patches = plt.hist(x=dic.keys(), bins='auto', color='#0504aa',
    #                            alpha=0.7, rwidth=0.85)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Length (log)')
    plt.ylabel('Documents (log)')
    plt.title('development')
    #plt.text(23, 45, r'$\mu=15, b=3$')
    #maxfreq = 10000
    # Set a clean upper y-axis limit.
    #plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
    plt.hist(a[:,0], **kwargs,label='all',weights=a[:,1])
    plt.hist(p[:,0], **kwargs,label='pos',weights=p[:,1])
    plt.hist(n[:,0], **kwargs,label='neg',weights=n[:,1])
    plt.legend(loc='upper right')
    plt.show()
    plt.xlabel('Length (log)')
    plt.ylabel('Documents (log)')
    plt.title('evaluation')
    plt.hist(eva[:,0], **kwargs,label='eva',weights=eva[:,1])
    plt.legend(loc='upper right')
    plt.show()
    #plt.hist2d(pt.values(), p.values(), bins=30, cmap='Blues')
    #cb = plt.colorbar()
    #cb.set_label('counts in bin')
    #plt.show()
    #plt.hist2d(nt.values(), n.values(), bins=30, cmap='Greens')
    #cb = plt.colorbar()
    #cb.set_label('counts in bin')
    #plt.show()
    #plt.hist2d(at.values(), a.values(), bins=30, cmap='Reds')
    #cb = plt.colorbar()
    #cb.set_label('counts in bin')
    #plt.show()


    
plot_histogram(pos_dict,neg_dict,all_dict,eva_dict)
print("development statistics:")
print(max(num_words),min(num_words),sum(num_words)/len(num_words))
print("development statistics: pos")
print(max(num_p_words),min(num_p_words),sum(num_p_words)/len(num_p_words))
print("development statistics: neg")
print(max(num_n_words),min(num_n_words),sum(num_n_words)/len(num_n_words))
print("evaluation statistics:")
print(max(evadocs),min(evadocs),sum(evadocs)/len(evadocs))

In [None]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=5500, random_state=42)
print(f"Total variance explained: {np.sum(svd.explained_variance_ratio_):.2f}")
cum_variance = np.cumsum(svd.explained_variance_ratio_)
idx = np.argmax(cum_variance > .85)
print(idx)
tsne = TSNE(n_components=50, perplexity=50, verbose=1, init='random', learning_rate=2000,n_iter=1000,early_exaggeration=12, method='exact')
#X_svd = svd.fit_transform(X_tfidf)
pipeline = make_pipeline(svd, tsne)
red_X = pipeline.fit_transform(X_tfidf)

In [None]:
vectorizer.get_feature_names()

In [None]:
word_positions = {v: k for k, v in vectorizer.vocabulary_.items()}
tfidf = X_tfidf[datadev[:,1]=='pos']
tfidf_sum = np.sum(tfidf, axis=0) # numpy.matrix
tfidf_sum = np.asarray(tfidf_sum).reshape(-1) # numpy.array of shape (1, X.shape[1])
top_indices = tfidf_sum.argsort()

top_indices = top_indices[-50:]
p = {word_positions[idx]: tfidf_sum[idx] for idx in top_indices}.keys()
#print(p)
tfidf = X_tfidf[datadev[:,1]=='neg']
tfidf_sum = np.sum(tfidf, axis=0) # numpy.matrix
tfidf_sum = np.asarray(tfidf_sum).reshape(-1) # numpy.array of shape (1, X.shape[1])
top_indices = tfidf_sum.argsort()

top_indices = top_indices[-50:]
n = {word_positions[idx]: tfidf_sum[idx] for idx in top_indices}.keys()
#print(n)

#print(n.keys() - (p.keys() & n.keys()))

word_positions = {v: k for k, v in vectorizer.vocabulary_.items()}
tfidf = X_tfidf
tfidf_sum = np.sum(tfidf, axis=0) # numpy.matrix
tfidf_sum = np.asarray(tfidf_sum).reshape(-1) # numpy.array of shape (1, X.shape[1])
top_indices = tfidf_sum.argsort()
top_indices = top_indices[-50:]
a = {word_positions[idx]: tfidf_sum[idx] for idx in top_indices}
a = a.keys() & set(list(n)+list(p))
common = p & n
print(p-common)
print(n - common)
print(common)
#print(a-common)

In [None]:
"uppercase" in vectorizer.stop_words_

In [None]:
word_positions = {v: k for k, v in vectorizer.vocabulary_.items()}
labels = ['pos','neg']
_words = []
for label in labels:

    # compute the total tfidf for each label
    tfidf = X_tfidf[datadev['class'].to_numpy() == label]
    tfidf_sum = np.sum(tfidf, axis=0) # numpy.matrix
    tfidf_sum = np.asarray(tfidf_sum).reshape(-1) # numpy.array of shape (1, X.shape[1])
    top_indices = tfidf_sum.argsort()

    top_indices = top_indices[-10:]
    _words.append({word_positions[idx]: tfidf_sum[idx] for idx in top_indices})

In [None]:
common_words_polarity = {}
for pword in _words[0].keys():
    for nword in _words[1].keys():
        if(pword == nword):
            common_words_polarity[pword] = _words[0][pword]-_words[1][pword]

only_pos = set(_words[1].keys()) - set(set(_words[0].keys()) & set(_words[1].keys()))
only_pos

In [None]:
_words

In [None]:
X_pos = datadev[datadev['class']=='pos']
X_neg = datadev[datadev['class']=='neg']
Xp_tfidf = vectorizer.fit_transform(X_pos)
Xn_tfidf = vectorizer.fit_transform(X_neg)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(datadev['text'].to_numpy(), datadev['class'].to_numpy(), test_size=0.33, random_state=0)

#train preprocessing
tokenizer2 = LemmaTokenizer(total_docs = datadev['text'].count(),stop_words=mysw)
vectorizer2 = TfidfVectorizer(input='content',tokenizer=tokenizer,ngram_range = (1,3),max_df=0.9,min_df=0.0003,encoding="utf-8",use_idf=False)#use_idf=False)#,ngram_range=(3,4))#,max_df=0.62)#,strip_accents='unicode',max_df=1.0)#,min_df=0.01)
X_train_tfidf = vectorizer2.fit_transform(X_train)
print("vectorization train done...")

#test preprocessing

X_test_tfidf = vectorizer2.transform(X_test)
#X_test_svd = svd.fit_transform(X_test_tfidf)
print("vectorization test done...")

testclsf = SVC(kernel="rbf",C=2,gamma=0.73,class_weight='balanced')
testclsf.fit(X_train_tfidf, y_train)
y_pred_test = testclsf.predict(X_test_tfidf)



In [None]:
#X_pred_wrong_tfidf = X_test_tfidf[y_test != y_pred_test]
def generate_wordclouds(X_tfidf, y_lab ,word_positions,title = ""):
    
    labels = ['pos','neg']
    top_count = 8
    min_support = 0.3
    dist_words = sorted(v for k, v in word_positions.items())
    _words = []

    for label in labels:
        
        # compute the total tfidf for each label
        tfidf = X_tfidf[y_lab == label]
        tfidf_sum = np.sum(tfidf, axis=0) # numpy.matrix
        tfidf_sum = np.asarray(tfidf_sum).reshape(-1) # numpy.array of shape (1, X.shape[1])
        top_indices = tfidf_sum.argsort()
        
        top_indices = top_indices[-top_count:]
        _words.append({word_positions[idx]: tfidf_sum[idx] for idx in top_indices})
        
        term_weights = {word_positions[idx]: tfidf_sum[idx] for idx in top_indices}
        wc = WordCloud(width=1200, height=800, background_color="white")
        wordcloud = wc.generate_from_frequencies(term_weights)

        fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        fig.suptitle(f"{title} sentiment {label}") 
    
    return _words


word_positions = {v: k for k, v in vectorizer2.vocabulary_.items()}
shouldbe = generate_wordclouds(X_tfidf, y_test[y_test != y_pred_test],word_positions)
reallyis = generate_wordclouds(X_tfidf, y_pred_test[y_test != y_pred_test],word_positions)

for i in range(2):
    common = set(set(shouldbe[i]) & set(reallyis[i]))
    print(str(i)+"-shouldbe:")
    print(set(shouldbe[i])-common)
    print(str(i)+"-reallyis:")
    print(set(reallyis[i])-common)

In [None]:
np.dot((datadev=='pos'),rapporto)

In [None]:
X_tfidf[2]