# Classification Tests

spiegazione test

### Imports

In [1]:
import re
import xml.etree.ElementTree as ET
import nltk
from nltk.stem.snowball import ItalianStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
import pandas as pd
import numpy as np
import collections
from sklearn.model_selection import cross_val_score
# Skopt functions
from sklearn.metrics import make_scorer
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

In [2]:
#nltk.download()

## Classes

In [3]:
class CarWordsHandler:
    # https://github.com/n8barr/automotive-model-year-data
    def __init__(self, cars_file):
        self.brands_list = set()
        self.models_list = set()
        f = open(cars_file, "r")
        cars_list = f.read().splitlines()
        for i in range(len(cars_list)):
            brand = cars_list[i].split(',')[1][2:-1].lower()
            model = cars_list[i].split(',')[2][2:-2].lower()
            self.brands_list.add(brand)
            self.models_list.add(model)
        # remove some useless models
        self.models_list.remove('i')
        self.models_list.remove('gli')
        self.models_list.remove('estate')
        self.brands_list = list(self.brands_list)
        self.models_list = list(self.models_list)
        self.brands_list.sort()
        self.models_list.sort()
    
    # binary search to get if a word is a brand 
    def isBrand(self, word):
        word = word.lower()
        first = 0
        last = len(self.brands_list) -1
        while first < last:
            mid = int((last + first) / 2)
            if word == self.brands_list[mid]:
                return True
            elif word < self.brands_list[mid]:
                last = mid
            elif word > self.brands_list[mid]:
                first = mid
            if last-first == 1:
                if self.brands_list[first] == word or self.brands_list[last] == word:
                    return True
                else:
                    return False
    # binary search to get if a word is a brand 
    def isModel(self, word):
        word = word.lower()
        first = 0
        last = len(self.models_list) -1
        while first < last:
            mid = int((last + first) / 2)
            if word == self.models_list[mid]:
                return True
            elif word < self.models_list[mid]:
                last = mid
            elif word > self.models_list[mid]:
                first = mid
            if last-first == 1:
                if self.models_list[first] == word or self.models_list[last] == word:
                    return True
                else:
                    return False

## Functions

In [4]:
# encoding issues
def correctEncodings(comment):
    fin_comment = comment
    fin_comment = re.sub('Ã¨', 'è', fin_comment)
    fin_comment = re.sub('Ã©', 'é', fin_comment)
    fin_comment = re.sub('Ã¬', 'ì', fin_comment)
    fin_comment = re.sub('Ã²', 'ò', fin_comment)
    fin_comment = re.sub('Ã¹', 'ù', fin_comment)
    fin_comment = re.sub('Ã', 'à', fin_comment)
    return fin_comment
# recognize an URL
def isURL(word):
    # http://forum.rusconi.it/gentemotori/viewtopic.php ? t=434&sid=57c88f1b507d8f57717ea18e74e25324Â 
    return len(re.findall("^((http(s){0,1}://)|(www.))\S+$", word)) > 0
# recognize an image tag
def isPicture(word):
    return len(re.findall("^<img.*>$", word)) > 0
# remove punctation
def removePunctation(comment):
    return re.sub(r'\s{2,}', ' ', str(re.sub(r'[\'\"\,\.,\:\-]', ' ', comment)))
# fix issues on urls
def replaceURLs(comment):
    return str(re.sub(r'(http(s){0,1}://|www.)(([^\s]+)|/)+((\s\?\s)[^\s]+){0,1}', 'URL', comment)).replace(u'\xa0', u' ')
# replace images
def replaceIMGs(comment):
    return str(re.sub(r'<img.+>', 'IMG', comment))
# replace brands
def replaceBrands(cwhandler, comment):
    tokens = comment.split(' ')
    for i in range(len(tokens)):
        if cwhandler.isBrand(tokens[i]):
            tokens[i] = 'BRAND'
    return ' '.join(tokens)
# replace models
def replaceModels(cwhandler, comment):
    tokens = comment.split(' ')
    for i in range(len(tokens)):
        if cwhandler.isModel(tokens[i]):
            tokens[i] = 'MODEL'
    return ' '.join(tokens)
# replace question marks
def replaceQMarks(comment):
    comment = re.sub(r'\?{2,}', ' MULTI_QMARK', comment)
    comment = re.sub(r'\?', ' QMARK', comment)
    return comment
# replace esclamation marks
def replaceEMarks(comment):
    comment = re.sub(r'\!{2,}', ' MULTI_EMARK', comment)
    comment = re.sub(r'\!', ' EMARK', comment)
    return comment
# remove character repetitions
def removeRepeat(comment):
    return re.sub(r'(a-zA-Z)\1{2,}', r'\1\1\1', comment)
# replace speed
def replaceSpeed(comment):
    return re.sub(r'([0-9\.*]+(\s*(\-|\/|\s)\s*)+){0,1}[0-9\.*]+(\s*)(km\/h|mph)', 'SPEED', comment)
# replace consumption
def replaceConsumption(comment):
    return re.sub(r'([0-9\.*]+(\s*(\-|\/|\s)\s*)+){0,1}[0-9\.*]+(\s*)(km\/l|mpg)', 'CONSUMPTION', comment)
# replace weight
def replaceWeight(comment):
    return re.sub(r'[0-9\.*]+(\s*)(kg|tonnellate|ton|chili|kili)', 'WEIGHT', comment)

In [5]:
class ItalianWordsHandler:
    # https://dspace-clarin-it.ilc.cnr.it/repository/xmlui/handle/20.500.11752/ILC-73
    def __init__(self, words_file):
        # words information
        self.words_dict = dict()
        root = ET.parse(words_file).getroot()
        for entry in root.findall('Lexicon/LexicalEntry'):
            word = entry.find('Lemma').get('writtenForm')
            pos = entry.get('partOfSpeech')
            senti = entry.find('Sense/Sentiment').get('polarity')
            conf = entry.find('Sense/Confidence').get('score')
            self.words_dict[word] = {'POS': pos, 'Sentiment': senti, 'Confidence': conf}
        # stemmer
        self.it_stem = ItalianStemmer()
        
    # get word info. None if not exists
    def getWordInfo(self, word):
        # fields: POS, Sentiment, Confidence
        return self.words_dict.get(word)
    
    # italian stemmer http://snowball.tartarus.org/algorithms/italian/stemmer.html
    def stem(self, word):
        return self.it_stem.stem(word)
    
    # correct words
    def correctWords(self, text):
        # not yet implemented
        return text

In [6]:
class Preprocessor:
    
    def __init__(self):
        self.cwh = CarWordsHandler('resources/cars_data.sql')
        self.iwh = ItalianWordsHandler('resources/ita_opeNER.xml')
    # preprocess text
    # allowed methods: word, swnt, pos
    # ner (named entity recognition), replacing for instance 100 km/h with SPEED
    def preprocessText(self, text, method='word', use_stemmer=False, ner=False):
        if method not in ['word', 'pos', 'swnt']:
            raise ValueError('Method not recognized. Select from word, swnt, pos')
        # correct encodings (not yet implemented)
        fin_text = correctEncodings(text)
        # remove punctation
        fin_text = removePunctation(fin_text)
        # some basic preprocessing
        fin_text = fin_text.lower()
        # correct words (not yet)
        fin_text = self.iwh.correctWords(fin_text)
        # manage repetitions
        fin_text = removeRepeat(fin_text)
        # manage punctation
        fin_text = replaceQMarks(fin_text)
        fin_text = replaceEMarks(fin_text)
        # manage URLs
        fin_text = replaceURLs(fin_text)
        # manage Images
        fin_text = replaceIMGs(fin_text)
        # NOW DEPENDS ON NER
        if ner:
            # manage brands and models
            fin_text = replaceBrands(self.cwh, fin_text)
            fin_text = replaceModels(self.cwh, fin_text)
            # manage speed consumption and weight
            fin_text = replaceSpeed(fin_text)
            fin_text = replaceConsumption(fin_text)
            fin_text = replaceWeight(fin_text)
        # NOW DEPENDS ON METHOD
        if method == 'word':
            # just do nothing except eventually stemming
            if use_stemmer:
                tokens = fin_text.split(' ')
                fin_text = ' '.join([t if t.isupper() else self.iwh.stem(t) for t in tokens]) 
        elif method == 'swnt':
            tokens = fin_text.split(' ')
            swnt_tokens = []
            for t in tokens:
                info = self.iwh.getWordInfo(t)
                if info == None or info['Sentiment'] == None:
                    swnt_tokens.append(t)
                else:
                    # confidence 0-100
                    swnt_tokens.append(str(info['Sentiment'])[:3].upper() + '_' + str(int(float(info['Confidence'])*10)))
            fin_text = ' '.join(swnt_tokens)
            # stemmer
            if use_stemmer:
                tokens = fin_text.split(' ')
                fin_text = ' '.join([t if t.isupper() else self.iwh.stem(t) for t in tokens]) 
        elif method == 'pos':
            tokens = fin_text.split(' ')
            pos_tokens = []
            # pos
            for t in tokens:
                info = self.iwh.getWordInfo(t)
                if info == None or info['POS'] == None:
                    # unknown tag
                    pos_tokens.append('UNK')
                else:
                    pos_tokens.append(str(info['POS']).upper())
            # pos_word
            for t in tokens:
                info = self.iwh.getWordInfo(t)
                if info == None or info['POS'] == None:
                    pos_tokens.append('UNK_' + str(t))
                else:
                    pos_tokens.append(str(info['POS']).upper() + '_' + str(t))
                    
            fin_text = ' '.join(pos_tokens)
            # stemmer
            if use_stemmer:
                tokens = fin_text.split(' ')
                fin_text = ' '.join([t if t.isupper() else self.iwh.stem(t) for t in tokens]) 
        
        return str(re.sub(r'\s{2,}', ' ', fin_text))

In [7]:
class Vectorizer:
    
    def __init__(self, list_comments, method='bow', max_features=1000, ngrams=2, just_presence=False):
        if method not in ['bow', 'tfidf']:
            raise ValueError('Method not recognized. Select from bow, tfidf')
        if method == 'bow':
            self.vectorizer = CountVectorizer(ngram_range=(1,ngrams), binary=just_presence, lowercase=False, max_features=max_features)
        elif method == 'tfidf':
            self.vectorizer = TfidfVectorizer(ngram_range=(1,ngrams), lowercase=False, max_features=max_features)
        # fit vectorizer
        self.vectorizer.fit(list_comments)          
        
        
        '''
        self.list_comments = list_comments
        # initialize tfidf weights
        self.idf_dict = {}
        idf_dict_uni = {}
        idf_dict_big = {}
        # unigrams
        for comment in list_comments:
            tokens = list(set(comment.split()))
            for t in tokens:
                if idf_dict_uni.get(t) != None:
                    idf_dict_uni[t] = idf_dict_uni[t] + 1
                else:
                    idf_dict_uni[t] = 1
        # bigrams
        for comment in list_comments:
            tokens = comment.split()
            for i in range(len(tokens) -2):
                big = (tokens[i], tokens[i+1])
                if idf_dict_big.get(big) != None:
                    idf_dict_big[big] = idf_dict_big[big] + 1
                else:
                    idf_dict_big[big] = 1
        # cut most frequent
        idf_dict_uni = Counter(idf_dict_uni).most_common(most_common_unigrams)
        idf_dict_big = Counter(idf_dict_big).most_common(most_common_bigrams)
        self.idf_dict.update(idf_dict_uni)
        self.idf_dict.update(idf_dict_big)
        '''
        
    def vectorize(self, comment):
        
        return self.vectorizer.transform([comment])
        
        '''
        if method not in ['bow', 'tfidf']:
            raise ValueError('Method not recognized. Select from bow, tfidf')
        unigrams = comment.split(' ')
        bigrams = []
        for i in range(len(unigrams) -2):
            bigrams.append((unigrams[i], unigrams[i+1]))
        if method == 'bow':
            bow_dict = dict.fromkeys(self.idf_dict, 0)
            for u in unigrams + bigrams:
                if bow_dict.get(u) != None:
                    if just_presence:
                        bow_dict[u] = 1
                    else:
                        bow_dict[u] = bow_dict[u] +1
            return list(bow_dict.values())
        elif method == 'tfidf':
            tf_dict = dict.fromkeys(self.idf_dict, 0)
            for u in unigrams + bigrams:
                if tf_dict.get(u) != None:
                    tf_dict[u] = tf_dict[u] +1
        '''
        
    def get_feature_names(self):
        return self.vectorizer.get_feature_names()

Vectorization Example

In [8]:
v = Vectorizer(list_comments=['ciao come va ?', 'vediamo come scrive ciao questa tastiera', 'non lo so, per me è falso'], method='tfidf', max_features=100, ngrams=2, just_presence=True)
print(v.get_feature_names())
print(v.vectorize('ciao ciao come va ?'))

['ciao', 'ciao come', 'ciao questa', 'come', 'come scrive', 'come va', 'falso', 'lo', 'lo so', 'me', 'me falso', 'non', 'non lo', 'per', 'per me', 'questa', 'questa tastiera', 'scrive', 'scrive ciao', 'so', 'so per', 'tastiera', 'va', 'vediamo', 'vediamo come']
  (0, 22)	0.41197297843389025
  (0, 5)	0.41197297843389025
  (0, 3)	0.3133160688892059
  (0, 1)	0.41197297843389025
  (0, 0)	0.6266321377784118


Preprocessing Example

In [9]:
text = 'Sono reali calcolati nel arco del tutto anno nel estate qualcosa in piÃ¹ causa gomme di 17" e climatizzatore nel inverno un po di meno. Per quanto riguarda le autostrade quelle che percorro io principalmente la A4 e molto congestionata cosi spesso la media e 110-115 km/h che ovviamente influisce positivamente a i consumi. Ma quello che mi piace di piÃ¹ Ã¨ assenza dei guasti. Sulla vecchia Accord il primo guasto lo ho avuto a 200000 km si Ã¨ rotto il termostato della clima. Ogni tanto faccio giro di altri forum e leggo delle turbine rotte catene di distribuzione progettate male iniettori fatti male mah nel 2015 per me sono le cose incomprensibili . Con tutti gli difetti che puÃ² avere preferisco la Honda. '
print(text)
print('##########################################################################################')
p = Preprocessor()
print(p.preprocessText(text, ner=False, use_stemmer=False, method='word'))
print('##########################################################################################')
print(p.preprocessText(text, ner=False, use_stemmer=False, method='swnt'))
print('##########################################################################################')
print(p.preprocessText(text, ner=False, use_stemmer=False, method='pos'))


Sono reali calcolati nel arco del tutto anno nel estate qualcosa in piÃ¹ causa gomme di 17" e climatizzatore nel inverno un po di meno. Per quanto riguarda le autostrade quelle che percorro io principalmente la A4 e molto congestionata cosi spesso la media e 110-115 km/h che ovviamente influisce positivamente a i consumi. Ma quello che mi piace di piÃ¹ Ã¨ assenza dei guasti. Sulla vecchia Accord il primo guasto lo ho avuto a 200000 km si Ã¨ rotto il termostato della clima. Ogni tanto faccio giro di altri forum e leggo delle turbine rotte catene di distribuzione progettate male iniettori fatti male mah nel 2015 per me sono le cose incomprensibili . Con tutti gli difetti che puÃ² avere preferisco la Honda. 
##########################################################################################
sono reali calcolati nel arco del tutto anno nel estate qualcosa in più causa gomme di 17 e climatizzatore nel inverno un po di meno per quanto riguarda le autostrade quelle che percorro io prin

For each comment divide Text from Quote by setting _TEXT or _QUOTE at the end of each word. This after preprocessing

In [10]:
def combine_text_quote(text, quote):
    text_tokens = text.split(' ')
    quote_tokens = quote.split(' ')
    combined_tokens = []
    for tt in text_tokens:
        combined_tokens.append(str(tt) + '_TEXT')
    for qt in quote_tokens:
        combined_tokens.append(str(qt) + '_QUOTE')
    return ' '.join(combined_tokens)

In [11]:
print(combine_text_quote('Bene grazie', 'ciao come va'))

Bene_TEXT grazie_TEXT ciao_QUOTE come_QUOTE va_QUOTE


# Dataset

In [12]:
dataset = pd.read_csv('dataset.csv').fillna('')[['TESTO', '(Testo Citato)', 'Brand']]
dataset

Unnamed: 0,TESTO,(Testo Citato),Brand
0,Allora il problema è che non sono aggiornati i...,,irrilevante
1,E' virgolettato appositamente.... E soprattutt...,,irrilevante
2,Mah io sulla mappa ev-way ho visto solo tipo 2...,'Inferiore non s ma probabilmente uguale (il m...,irrilevante
3,Ah però.... uno pensa di averne viste tante su...,Sinceramente una differenza di 9.800? non mi p...,irrilevante
4,Basta darsi delle regole e per questo tipo di ...,,irrilevante
5,Personalmente posso ritenermi un possessore di...,,positivo
6,Confermo che il posteriore è riuscitissimo e a...,"""sono stato anch'io in conce per altri motivi ...",irrilevante
7,Mi sa che l'unica cosa apprezzabile del rst è ...,,irrilevante
8,grazie ! ! ! ! !,,irrilevante
9,Cosimo hai centrato completamente,,irrilevante


Preprocessing

In [13]:
pp = Preprocessor()
# TESTO
for i in range(len(dataset)):
    dataset['TESTO'][i] = pp.preprocessText(dataset['TESTO'][i], method='word', use_stemmer=True, ner=True)
# (Testo Citato)
for i in range(len(dataset)):
    dataset['(Testo Citato)'][i] = pp.preprocessText(dataset['(Testo Citato)'][i], method='word', use_stemmer=True, ner=True)    

In [14]:
# combine labels "molto positivo" = "positivo" and so on
dataset['Brand'] = dataset['Brand'].replace('molto positivo', 'positivo')
dataset['Brand'] = dataset['Brand'].replace('molto negativo', 'negativo')

Vectorization

In [15]:
preprocessed_dataset = []
for i in range(len(dataset)):
    preprocessed_dataset.append((combine_text_quote(dataset['TESTO'][i], dataset['(Testo Citato)'][i]), dataset['Brand'][i]))

In [16]:
preprocessed_dataset[10]

('bisogn_TEXT distingu_TEXT tra_TEXT le_TEXT garanz_TEXT supplementar_TEXT cio_TEXT quell_TEXT acquist_TEXT success_TEXT in_TEXT cui_TEXT firm_TEXT un_TEXT ver_TEXT e_TEXT pror_TEXT contratt_TEXT un_TEXT cui_TEXT esul_TEXT l_TEXT applic_TEXT del_TEXT decret_TEXT mont_TEXT e_TEXT la_TEXT garanz_TEXT tip_TEXT quell_TEXT corean_TEXT che_TEXT son_TEXT dirett_TEXT di_TEXT 5/7_TEXT anni_TEXT EMARK_TEXT _TEXT _QUOTE',
 'irrilevante')

Split into training and test

In [17]:
numpy_dataset = np.array(preprocessed_dataset)
print('Length: ' + str(numpy_dataset.shape[0]))

Length: 7183


In [18]:
np.random.seed(26)
np.random.shuffle(numpy_dataset)
train_dataset, test_dataset = numpy_dataset[:6000,:], numpy_dataset[6000:,:]

Some statistics

In [19]:
collections.Counter(train_dataset[:,1])

Counter({'irrilevante': 4867, 'negativo': 349, 'positivo': 543, 'neutro': 241})

In [20]:
# fit vectorizer
MAX_FEATURES = 10000
vec = Vectorizer(train_dataset[:,0], method='bow', max_features=MAX_FEATURES, ngrams=2, just_presence=False)

In [21]:
# try
vec.vectorize('bisogn_TEXT distingu_TEXT tra_TEXT le_TEXT').toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

Finally replace text with vector, and replace label with numerical representation

In [22]:
# final dataset for the classification
train_X = np.empty([train_dataset.shape[0], MAX_FEATURES])
train_y = np.empty([train_dataset.shape[0], 1])

for i in range(train_dataset.shape[0]):
    #train_X[i] = vec.vectorize(train_dataset[i, 0]).toarray()[0].copy()
    # copy vectorized elements in final dataset
    vectorized = vec.vectorize(train_dataset[i, 0]).toarray()[0]
    for j in range(train_X.shape[1]):
        train_X[i,j] = vectorized[j]
    if train_dataset[i, 1] == 'irrilevante':
        train_y[i] = 0
    elif train_dataset[i, 1] == 'positivo':
        train_y[i] = 1
    elif train_dataset[i, 1] == 'neutro':
        train_y[i] = 2
    elif train_dataset[i, 1] == 'negativo':
        train_y[i] = 3

## SVM _TFIDF Classifier

https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a

In [23]:
svm_clf = svm.SVC()

# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# scorer
my_scorer = make_scorer(f1_score)

# Define search space for SVM;
search_space_SVM = {"C": Real(0.01, 10),
                    "kernel": Categorical(categories=['linear', 'rbf', 'poly', 'sigmoid']),
                    "degree": Categorical(categories=[3, 5]),
                    "gamma": Categorical(categories=['scale']),
                    "coef0": Real(0.01, 10),
                    "class_weight": Categorical(categories=['balanced']),
                    "probability": Categorical(categories=[True, False]),
                    "decision_function_shape": Categorical(categories=['ovo', 'ovr'])
                   }

opt = BayesSearchCV(svm_clf,
                    search_spaces=search_space_SVM,
                    scoring=my_scorer,
                    n_iter=40,
                    cv=skf,
                    n_jobs=-1,
                    return_train_score=True,
                    random_state=4,
                    verbose=10,
                   )

In [24]:
# fit model
opt.fit(train_X, train_y)

best_score = opt.best_score_
best_score_std = opt.cv_results_['std_test_score'][opt.best_index_]
best_params = opt.best_params_

print('Best score: ' + str(best_score) + " std: " + str(best_score_std))
print('Best params: ' + str(best_params))
#preds = opt.predict(x_test)
#print('f1-score: ' + str(f1_score(y_test, preds)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] C=8.216234727026476, class_weight=balanced, coef0=2.6829639217572407, decision_function_shape=ovr, degree=3, gamma=scale, kernel=rbf, probability=False 
[CV] C=8.216234727026476, class_weight=balanced, coef0=2.6829639217572407, decision_function_shape=ovr, degree=3, gamma=scale, kernel=rbf, probability=False 


  y = column_or_1d(y, warn=True)


JoblibTypeError: JoblibTypeError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7f904aa005d0, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/giuseppe/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/home/giusep.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7f904aa005d0, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/giuseppe/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/home/giusep.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    122         except (RuntimeError, AssertionError):
    123             old_loop = None
    124         try:
    125             self._setup_logging()
    126             asyncio.set_event_loop(self.asyncio_loop)
--> 127             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    128         finally:
    129             asyncio.set_event_loop(old_loop)
    130 
    131     def stop(self):

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    433             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    434                                    finalizer=self._asyncgen_finalizer_hook)
    435         try:
    436             events._set_running_loop(self)
    437             while True:
--> 438                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    439                 if self._stopping:
    440                     break
    441         finally:
    442             self._stopping = False

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1446                         logger.warning('Executing %s took %.3f seconds',
   1447                                        _format_handle(handle), dt)
   1448                 finally:
   1449                     self._current_handle = None
   1450             else:
-> 1451                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(11, 1)>>
   1452         handle = None  # Needed to break cycles when an exception occurs.
   1453 
   1454     def _set_coroutine_wrapper(self, enabled):
   1455         try:

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/asyncio/events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(11, 1)>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (11, 1)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=11, events=1)
    112             self.writers.remove(fd)
    113         del self.handlers[fd]
    114 
    115     def _handle_events(self, fd, events):
    116         fileobj, handler_func = self.handlers[fd]
--> 117         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    118 
    119     def start(self):
    120         try:
    121             old_loop = asyncio.get_event_loop()

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 7, 15, 9, 57, 24, 939341, tzinfo=tzutc()), 'msg_id': 'f4061d9d08dc41148a39e2289f5c494b', 'msg_type': 'execute_request', 'session': '4494e62eb7694f53b2f24b6d1ffda52f', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'f4061d9d08dc41148a39e2289f5c494b', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'4494e62eb7694f53b2f24b6d1ffda52f']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 7, 15, 9, 57, 24, 939341, tzinfo=tzutc()), 'msg_id': 'f4061d9d08dc41148a39e2289f5c494b', 'msg_type': 'execute_request', 'session': '4494e62eb7694f53b2f24b6d1ffda52f', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'f4061d9d08dc41148a39e2289f5c494b', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'4494e62eb7694f53b2f24b6d1ffda52f'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 7, 15, 9, 57, 24, 939341, tzinfo=tzutc()), 'msg_id': 'f4061d9d08dc41148a39e2289f5c494b', 'msg_type': 'execute_request', 'session': '4494e62eb7694f53b2f24b6d1ffda52f', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'f4061d9d08dc41148a39e2289f5c494b', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))"
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))",), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))",)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))", store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = "# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))"
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="# fit model\nopt.fit(train_X, train_y)\n\nbest_scor...rint('f1-score: ' + str(f1_score(y_test, preds)))", store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Expr object>], cell_name='<ipython-input-24-13c01577410d>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7f900536af28, executi...rue silent=False shell_futures=True> result=None>)
   2898 
   2899         try:
   2900             for i, node in enumerate(to_run_exec):
   2901                 mod = ast.Module([node])
   2902                 code = compiler(mod, cell_name, "exec")
-> 2903                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f9009ddb540, file "<ipython-input-24-13c01577410d>", line 2>
        result = <ExecutionResult object at 7f900536af28, executi...rue silent=False shell_futures=True> result=None>
   2904                     return True
   2905 
   2906             for i, node in enumerate(to_run_interactive):
   2907                 mod = ast.Interactive([node])

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f9009ddb540, file "<ipython-input-24-13c01577410d>", line 2>, result=<ExecutionResult object at 7f900536af28, executi...rue silent=False shell_futures=True> result=None>)
   2958         outflag = True  # happens in more places, so it's easier as default
   2959         try:
   2960             try:
   2961                 self.hooks.pre_run_code_hook()
   2962                 #rprint('Running code', repr(code_obj)) # dbg
-> 2963                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f9009ddb540, file "<ipython-input-24-13c01577410d>", line 2>
        self.user_global_ns = {'BayesSearchCV': <class 'skopt.searchcv.BayesSearchCV'>, 'CarWordsHandler': <class '__main__.CarWordsHandler'>, 'Categorical': <class 'skopt.space.space.Categorical'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ET': <module 'xml.etree.ElementTree' from '/home/gius...naconda3/lib/python3.6/xml/etree/ElementTree.py'>, 'In': ['', 'import re\nimport xml.etree.ElementTree as ET\nimp..., classification_report, accuracy_score, f1_score', '#nltk.download()', 'class CarWordsHandler:\n    # https://github.com/...           else:\n                    return False', "# encoding issues\ndef correctEncodings(comment):...g|tonnellate|ton|chili|kili)', 'WEIGHT', comment)", 'class ItalianWordsHandler:\n    # https://dspace-...        # not yet implemented\n        return text', "class Preprocessor:\n    \n    def __init__(self):...     return str(re.sub(r'\\s{2,}', ' ', fin_text))", 'class Vectorizer:\n    \n    def __init__(self, li...       return self.vectorizer.get_feature_names()', "v = Vectorizer(list_comments=['ciao come va ?', ...ames())\nprint(v.vectorize('ciao ciao come va ?'))", "text = 'Sono reali calcolati nel arco del tutto ...ext, ner=False, use_stemmer=False, method='pos'))", "def combine_text_quote(text, quote):\n    text_to... + '_QUOTE')\n    return ' '.join(combined_tokens)", "print(combine_text_quote('Bene grazie', 'ciao come va'))", "dataset = pd.read_csv('dataset.csv').fillna('')[['TESTO', '(Testo Citato)', 'Brand']]\ndataset", "pp = Preprocessor()\n# TESTO\nfor i in range(len(d...], method='word', use_stemmer=True, ner=True)    ", '# combine labels "molto positivo" = "positivo" a...et[\'Brand\'].replace(\'molto negativo\', \'negativo\')', "preprocessed_dataset = []\nfor i in range(len(dat...aset['(Testo Citato)'][i]), dataset['Brand'][i]))", 'preprocessed_dataset[10]', "numpy_dataset = np.array(preprocessed_dataset)\nprint('Length: ' + str(numpy_dataset.shape[0]))", 'np.random.seed(26)\nnp.random.shuffle(numpy_datas... = numpy_dataset[:6000,:], numpy_dataset[6000:,:]', 'collections.Counter(train_dataset[:,1])', ...], 'Integer': <class 'skopt.space.space.Integer'>, 'ItalianStemmer': <class 'nltk.stem.snowball.ItalianStemmer'>, 'ItalianWordsHandler': <class '__main__.ItalianWordsHandler'>, 'MAX_FEATURES': 10000, ...}
        self.user_ns = {'BayesSearchCV': <class 'skopt.searchcv.BayesSearchCV'>, 'CarWordsHandler': <class '__main__.CarWordsHandler'>, 'Categorical': <class 'skopt.space.space.Categorical'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ET': <module 'xml.etree.ElementTree' from '/home/gius...naconda3/lib/python3.6/xml/etree/ElementTree.py'>, 'In': ['', 'import re\nimport xml.etree.ElementTree as ET\nimp..., classification_report, accuracy_score, f1_score', '#nltk.download()', 'class CarWordsHandler:\n    # https://github.com/...           else:\n                    return False', "# encoding issues\ndef correctEncodings(comment):...g|tonnellate|ton|chili|kili)', 'WEIGHT', comment)", 'class ItalianWordsHandler:\n    # https://dspace-...        # not yet implemented\n        return text', "class Preprocessor:\n    \n    def __init__(self):...     return str(re.sub(r'\\s{2,}', ' ', fin_text))", 'class Vectorizer:\n    \n    def __init__(self, li...       return self.vectorizer.get_feature_names()', "v = Vectorizer(list_comments=['ciao come va ?', ...ames())\nprint(v.vectorize('ciao ciao come va ?'))", "text = 'Sono reali calcolati nel arco del tutto ...ext, ner=False, use_stemmer=False, method='pos'))", "def combine_text_quote(text, quote):\n    text_to... + '_QUOTE')\n    return ' '.join(combined_tokens)", "print(combine_text_quote('Bene grazie', 'ciao come va'))", "dataset = pd.read_csv('dataset.csv').fillna('')[['TESTO', '(Testo Citato)', 'Brand']]\ndataset", "pp = Preprocessor()\n# TESTO\nfor i in range(len(d...], method='word', use_stemmer=True, ner=True)    ", '# combine labels "molto positivo" = "positivo" a...et[\'Brand\'].replace(\'molto negativo\', \'negativo\')', "preprocessed_dataset = []\nfor i in range(len(dat...aset['(Testo Citato)'][i]), dataset['Brand'][i]))", 'preprocessed_dataset[10]', "numpy_dataset = np.array(preprocessed_dataset)\nprint('Length: ' + str(numpy_dataset.shape[0]))", 'np.random.seed(26)\nnp.random.shuffle(numpy_datas... = numpy_dataset[:6000,:], numpy_dataset[6000:,:]', 'collections.Counter(train_dataset[:,1])', ...], 'Integer': <class 'skopt.space.space.Integer'>, 'ItalianStemmer': <class 'nltk.stem.snowball.ItalianStemmer'>, 'ItalianWordsHandler': <class '__main__.ItalianWordsHandler'>, 'MAX_FEATURES': 10000, ...}
   2964             finally:
   2965                 # Reset our crash handler in place
   2966                 sys.excepthook = old_excepthook
   2967         except SystemExit as e:

...........................................................................
/home/giuseppe/Desktop/tesi/src/<ipython-input-24-13c01577410d> in <module>()
      1 # fit model
----> 2 opt.fit(train_X, train_y)
      3 
      4 best_score = opt.best_score_
      5 best_score_std = opt.cv_results_['std_test_score'][opt.best_index_]
      6 best_params = opt.best_params_
      7 
      8 print('Best score: ' + str(best_score) + " std: " + str(best_score_std))
      9 print('Best params: ' + str(best_params))
     10 #preds = opt.predict(x_test)

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/skopt/searchcv.py in fit(self=BayesSearchCV(cv=StratifiedKFold(n_splits=5, ran...=('ovo', 'ovr'), prior=None)},
       verbose=10), X=array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), y=array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), groups=None, callback=None)
    657                 # when n_iter < n_points points left for evaluation
    658                 n_points_adjusted = min(n_iter, n_points)
    659 
    660                 optim_result = self._step(
    661                     X, y, search_space, optimizer,
--> 662                     groups=groups, n_points=n_points_adjusted
        groups = None
        n_points = 1
        n_points_adjusted = 1
    663                 )
    664                 n_iter -= n_points
    665 
    666                 if eval_callbacks(callbacks, optim_result):

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/skopt/searchcv.py in _step(self=BayesSearchCV(cv=StratifiedKFold(n_splits=5, ran...=('ovo', 'ovr'), prior=None)},
       verbose=10), X=array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), y=array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), search_space={'C': Real(low=0.01, high=10, prior='uniform', transform='identity'), 'class_weight': Categorical(categories=('balanced',), prior=None), 'coef0': Real(low=0.01, high=10, prior='uniform', transform='identity'), 'decision_function_shape': Categorical(categories=('ovo', 'ovr'), prior=None), 'degree': Categorical(categories=(3, 5), prior=None), 'gamma': Categorical(categories=('scale',), prior=None), 'kernel': Categorical(categories=('linear', 'rbf', 'poly', 'sigmoid'), prior=None), 'probability': Categorical(categories=(True, False), prior=None)}, optimizer=<skopt.optimizer.optimizer.Optimizer object>, groups=None, n_points=1)
    550         all_cv_results = self.cv_results_
    551 
    552         # HACK: this adds compatibility with different versions of sklearn
    553         refit = self.refit
    554         self.refit = False
--> 555         self._fit(X, y, groups, params_dict)
        self._fit = <bound method BayesSearchCV._fit of BayesSearchC...('ovo', 'ovr'), prior=None)},
       verbose=10)>
        X = array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
        y = array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])
        groups = None
        params_dict = [{'C': 8.216234727026476, 'class_weight': 'balanced', 'coef0': 2.6829639217572407, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'probability': False}]
    556         self.refit = refit
    557 
    558         # merge existing and new cv_results_
    559         for k in self.cv_results_:

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/skopt/searchcv.py in _fit(self=BayesSearchCV(cv=StratifiedKFold(n_splits=5, ran...=('ovo', 'ovr'), prior=None)},
       verbose=10), X=array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), y=array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), groups=None, parameter_iterable=[{'C': 8.216234727026476, 'class_weight': 'balanced', 'coef0': 2.6829639217572407, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'probability': False}])
    398                 return_train_score=self.return_train_score,
    399                 return_n_test_samples=True,
    400                 return_times=True, return_parameters=True,
    401                 error_score=self.error_score
    402             )
--> 403             for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = [{'C': 8.216234727026476, 'class_weight': 'balanced', 'coef0': 2.6829639217572407, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'probability': False}]
    404             for train, test in cv_iter)
    405 
    406         # if one choose to see train score, "out" will contain train score info
    407         if self.return_train_score:

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BayesSearchCV._fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
TypeError                                          Mon Jul 15 11:57:27 2019
PID: 16500                Python 3.6.8: /home/giuseppe/anaconda3/bin/python
...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (SVC(C=8.216234727026476, cache_size=200, class_w...e=None, shrinking=True, tol=0.001, verbose=False), memmap([[0., 0., 1., ..., 0., 0., 0.],
        [... 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), make_scorer(f1_score), array([   0,    1,    2, ..., 5997, 5998, 5999]), array([  10,   19,   27, ..., 5989, 5993, 5994]), 10, {'C': 8.216234727026476, 'class_weight': 'balanced', 'coef0': 2.6829639217572407, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'probability': False}), {'error_score': 'raise', 'fit_params': None, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (SVC(C=8.216234727026476, cache_size=200, class_w...e=None, shrinking=True, tol=0.001, verbose=False), memmap([[0., 0., 1., ..., 0., 0., 0.],
        [... 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), make_scorer(f1_score), array([   0,    1,    2, ..., 5997, 5998, 5999]), array([  10,   19,   27, ..., 5989, 5993, 5994]), 10, {'C': 8.216234727026476, 'class_weight': 'balanced', 'coef0': 2.6829639217572407, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'probability': False})
        kwargs = {'error_score': 'raise', 'fit_params': None, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=SVC(C=8.216234727026476, cache_size=200, class_w...e=None, shrinking=True, tol=0.001, verbose=False), X=memmap([[0., 0., 1., ..., 0., 0., 0.],
        [... 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), y=array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), scorer=make_scorer(f1_score), train=array([   0,    1,    2, ..., 5997, 5998, 5999]), test=array([  10,   19,   27, ..., 5989, 5993, 5994]), verbose=10, parameters={'C': 8.216234727026476, 'class_weight': 'balanced', 'coef0': 2.6829639217572407, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'probability': False}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method BaseLibSVM.fit of SVC(C=8.21623472...=None, shrinking=True, tol=0.001, verbose=False)>
        X_train = memmap([[0., 0., 1., ..., 0., 0., 0.],
        [... 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])
        y_train = array([[0.],
       [3.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])
        fit_params = {}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self=SVC(C=8.216234727026476, cache_size=200, class_w...e=None, shrinking=True, tol=0.001, verbose=False), X=array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), y=array([0., 3., 0., ..., 0., 0., 0.]), sample_weight=array([], dtype=float64))
    182         fit = self._sparse_fit if self._sparse else self._dense_fit
    183         if self.verbose:  # pragma: no cover
    184             print('[LibSVM]', end='')
    185 
    186         seed = rnd.randint(np.iinfo('i').max)
--> 187         fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
        fit = <bound method BaseLibSVM._dense_fit of SVC(C=8.2...=None, shrinking=True, tol=0.001, verbose=False)>
        X = array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
        y = array([0., 3., 0., ..., 0., 0., 0.])
        sample_weight = array([], dtype=float64)
        solver_type = 0
        kernel = 'rbf'
        seed = 872969141
    188         # see comment on the other call to np.iinfo in this file
    189 
    190         self.shape_fit_ = X.shape
    191 

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py in _dense_fit(self=SVC(C=8.216234727026476, cache_size=200, class_w...e=None, shrinking=True, tol=0.001, verbose=False), X=array([[0., 0., 1., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), y=array([0., 3., 0., ..., 0., 0., 0.]), sample_weight=array([], dtype=float64), solver_type=0, kernel='rbf', random_seed=872969141)
    249                 class_weight=self.class_weight_, kernel=kernel, C=self.C,
    250                 nu=self.nu, probability=self.probability, degree=self.degree,
    251                 shrinking=self.shrinking, tol=self.tol,
    252                 cache_size=self.cache_size, coef0=self.coef0,
    253                 gamma=self._gamma, epsilon=self.epsilon,
--> 254                 max_iter=self.max_iter, random_seed=random_seed)
        self.max_iter = -1
        random_seed = 872969141
    255 
    256         self._warn_from_fit_status()
    257 
    258     def _sparse_fit(self, X, y, sample_weight, solver_type, kernel,

...........................................................................
/home/giuseppe/anaconda3/lib/python3.6/site-packages/sklearn/svm/libsvm.cpython-36m-x86_64-linux-gnu.so in sklearn.svm.libsvm.fit()

TypeError: must be real number, not str
___________________________________________________________________________

In [25]:
import sklearn
print(sklearn.__version__)

0.19.1
