# Classification Tests

spiegazione test

### Imports

In [121]:
import re
import xml.etree.ElementTree as ET
import nltk
from nltk.stem.snowball import ItalianStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
#nltk.download()

## Classes

In [5]:
class CarWordsHandler:
    # https://github.com/n8barr/automotive-model-year-data
    def __init__(self, cars_file):
        self.brands_list = set()
        self.models_list = set()
        f = open(cars_file, "r")
        cars_list = f.read().splitlines()
        for i in range(len(cars_list)):
            brand = cars_list[i].split(',')[1][2:-1].lower()
            model = cars_list[i].split(',')[2][2:-2].lower()
            self.brands_list.add(brand)
            self.models_list.add(model)
        # remove some useless models
        self.models_list.remove('i')
        self.models_list.remove('gli')
        self.models_list.remove('estate')
        self.brands_list = list(self.brands_list)
        self.models_list = list(self.models_list)
        self.brands_list.sort()
        self.models_list.sort()
    
    # binary search to get if a word is a brand 
    def isBrand(self, word):
        word = word.lower()
        first = 0
        last = len(self.brands_list) -1
        while first < last:
            mid = int((last + first) / 2)
            if word == self.brands_list[mid]:
                return True
            elif word < self.brands_list[mid]:
                last = mid
            elif word > self.brands_list[mid]:
                first = mid
            if last-first == 1:
                if self.brands_list[first] == word or self.brands_list[last] == word:
                    return True
                else:
                    return False
    # binary search to get if a word is a brand 
    def isModel(self, word):
        word = word.lower()
        first = 0
        last = len(self.models_list) -1
        while first < last:
            mid = int((last + first) / 2)
            if word == self.models_list[mid]:
                return True
            elif word < self.models_list[mid]:
                last = mid
            elif word > self.models_list[mid]:
                first = mid
            if last-first == 1:
                if self.models_list[first] == word or self.models_list[last] == word:
                    return True
                else:
                    return False

## Functions

In [6]:
# encoding issues
def correctEncodings(comment):
    fin_comment = comment
    fin_comment = re.sub('Ã¨', 'è', fin_comment)
    fin_comment = re.sub('Ã©', 'é', fin_comment)
    fin_comment = re.sub('Ã¬', 'ì', fin_comment)
    fin_comment = re.sub('Ã²', 'ò', fin_comment)
    fin_comment = re.sub('Ã¹', 'ù', fin_comment)
    fin_comment = re.sub('Ã', 'à', fin_comment)
    return fin_comment
# recognize an URL
def isURL(word):
    # http://forum.rusconi.it/gentemotori/viewtopic.php ? t=434&sid=57c88f1b507d8f57717ea18e74e25324Â 
    return len(re.findall("^((http(s){0,1}://)|(www.))\S+$", word)) > 0
# recognize an image tag
def isPicture(word):
    return len(re.findall("^<img.*>$", word)) > 0
# fix issues on urls
def replaceURLs(comment):
    return str(re.sub(r'(http(s){0,1}://|www.)(([^\s]+)|/)+((\s\?\s)[^\s]+){0,1}', 'URL', comment)).replace(u'\xa0', u' ')
# replace images
def replaceIMGs(comment):
    return str(re.sub(r'<img.+>', 'IMG', comment))
# replace brands
def replaceBrands(cwhandler, comment):
    tokens = comment.split(' ')
    for i in range(len(tokens)):
        if cwhandler.isBrand(tokens[i]):
            tokens[i] = 'BRAND'
    return ' '.join(tokens)
# replace models
def replaceModels(cwhandler, comment):
    tokens = comment.split(' ')
    for i in range(len(tokens)):
        if cwhandler.isModel(tokens[i]):
            tokens[i] = 'MODEL'
    return ' '.join(tokens)
# replace question marks
def replaceQMarks(comment):
    comment = re.sub(r'\?{2,}', ' MULTI_QMARK', comment)
    comment = re.sub(r'\?', ' QMARK', comment)
    return comment
# replace esclamation marks
def replaceEMarks(comment):
    comment = re.sub(r'\!{2,}', ' MULTI_EMARK', comment)
    comment = re.sub(r'\!', ' EMARK', comment)
    return comment
# remove character repetitions
def removeRepeat(comment):
    return re.sub(r'(a-zA-Z)\1{2,}', r'\1\1\1', comment)
# replace speed
def replaceSpeed(comment):
    return re.sub(r'([0-9\.*]+(\s*(\-|\/|\s)\s*)+){0,1}[0-9\.*]+(\s*)(km/h|mph)', 'SPEED', comment)
# replace consumption
def replaceConsumption(comment):
    return re.sub(r'([0-9\.*]+(\s*(\-|\/|\s)\s*)+){0,1}[0-9\.*]+(\s*)(km/l|mpg)', 'CONSUMPTION', comment)
# replace weight
def replaceWeight(comment):
    return re.sub(r'[0-9\.*]+(\s*)(kg|tonnellate|ton|chili|kili)', 'WEIGHT', comment)

In [7]:
class ItalianWordsHandler:
    # https://dspace-clarin-it.ilc.cnr.it/repository/xmlui/handle/20.500.11752/ILC-73
    def __init__(self, words_file):
        # words information
        self.words_dict = dict()
        root = ET.parse(words_file).getroot()
        for entry in root.findall('Lexicon/LexicalEntry'):
            word = entry.find('Lemma').get('writtenForm')
            pos = entry.get('partOfSpeech')
            senti = entry.find('Sense/Sentiment').get('polarity')
            conf = entry.find('Sense/Confidence').get('score')
            self.words_dict[word] = {'POS': pos, 'Sentiment': senti, 'Confidence': conf}
        # stemmer
        self.it_stem = ItalianStemmer()
        
    # get word info. None if not exists
    def getWordInfo(self, word):
        # fields: POS, Sentiment, Confidence
        return self.words_dict.get(word)
    
    # italian stemmer http://snowball.tartarus.org/algorithms/italian/stemmer.html
    def stem(self, word):
        return self.it_stem.stem(word)
    
    # correct words
    def correctWords(self, text):
        # not yet implemented
        return text

In [8]:
class Preprocessor:
    
    def __init__(self):
        self.cwh = CarWordsHandler('resources/cars_data.sql')
        self.iwh = ItalianWordsHandler('resources/ita_opeNER.xml')
    # preprocess text
    # allowed methods: word, swnt, pos
    # ner (named entity recognition), replacing for instance 100 km/h with SPEED
    def preprocessText(self, text, method='word', use_stemmer=False, ner=False):
        if method not in ['word', 'pos', 'swnt']:
            raise ValueError('Method not recognized. Select from word, swnt, pos')
        # correct encodings (not yet implemented)
        fin_text = correctEncodings(text)
        # some basic preprocessing
        fin_text = fin_text.lower()
        # correct words (not yet)
        fin_text = self.iwh.correctWords(fin_text)
        # manage repetitions
        fin_text = removeRepeat(fin_text)
        # manage punctation
        fin_text = replaceQMarks(fin_text)
        fin_text = replaceEMarks(fin_text)
        # manage URLs
        fin_text = replaceURLs(fin_text)
        # manage Images
        fin_text = replaceIMGs(fin_text)
        # NOW DEPENDS ON NER
        if ner:
            # manage brands and models
            fin_text = replaceBrands(self.cwh, fin_text)
            fin_text = replaceModels(self.cwh, fin_text)
            # manage speed consumption and weight
            fin_text = replaceSpeed(fin_text)
            fin_text = replaceConsumption(fin_text)
            fin_text = replaceWeight(fin_text)
        # NOW DEPENDS ON METHOD
        if method == 'word':
            # just do nothing except eventually stemming
            if use_stemmer:
                tokens = fin_text.split(' ')
                fin_text = ' '.join([t if t.isupper() else self.iwh.stem(t) for t in tokens]) 
        elif method == 'swnt':
            tokens = fin_text.split(' ')
            swnt_tokens = []
            for t in tokens:
                info = self.iwh.getWordInfo(t)
                if info == None or info['Sentiment'] == None:
                    swnt_tokens.append(t)
                else:
                    # confidence 0-100
                    swnt_tokens.append(str(info['Sentiment'])[:3].upper() + '_' + str(int(float(info['Confidence'])*10)))
            fin_text = ' '.join(swnt_tokens)
            # stemmer
            if use_stemmer:
                tokens = fin_text.split(' ')
                fin_text = ' '.join([t if t.isupper() else self.iwh.stem(t) for t in tokens]) 
        elif method == 'pos':
            tokens = fin_text.split(' ')
            pos_tokens = []
            # pos
            for t in tokens:
                info = self.iwh.getWordInfo(t)
                if info == None or info['POS'] == None:
                    # unknown tag
                    pos_tokens.append('UNK')
                else:
                    pos_tokens.append(str(info['POS']).upper())
            # pos_word
            for t in tokens:
                info = self.iwh.getWordInfo(t)
                if info == None or info['POS'] == None:
                    pos_tokens.append('UNK_' + str(t))
                else:
                    pos_tokens.append(str(info['POS']).upper() + '_' + str(t))
                    
            fin_text = ' '.join(pos_tokens)
            # stemmer
            if use_stemmer:
                tokens = fin_text.split(' ')
                fin_text = ' '.join([t if t.isupper() else self.iwh.stem(t) for t in tokens]) 
        
        return fin_text

In [139]:
class Vectorizer:
    
    def __init__(self, list_comments, method='bow', max_features=1000, ngrams=2, just_presence=False):
        if method not in ['bow', 'tfidf']:
            raise ValueError('Method not recognized. Select from bow, tfidf')
        if method == 'bow':
            self.vectorizer = CountVectorizer(ngram_range=(1,ngrams), binary=just_presence, lowercase=False, max_features=max_features)
        elif method == 'tfidf':
            self.vectorizer = TfidfVectorizer(ngram_range=(1,ngrams), lowercase=False, max_features=max_features)
        # fit vectorizer
        self.vectorizer.fit(list_comments)          
        
        
        '''
        self.list_comments = list_comments
        # initialize tfidf weights
        self.idf_dict = {}
        idf_dict_uni = {}
        idf_dict_big = {}
        # unigrams
        for comment in list_comments:
            tokens = list(set(comment.split()))
            for t in tokens:
                if idf_dict_uni.get(t) != None:
                    idf_dict_uni[t] = idf_dict_uni[t] + 1
                else:
                    idf_dict_uni[t] = 1
        # bigrams
        for comment in list_comments:
            tokens = comment.split()
            for i in range(len(tokens) -2):
                big = (tokens[i], tokens[i+1])
                if idf_dict_big.get(big) != None:
                    idf_dict_big[big] = idf_dict_big[big] + 1
                else:
                    idf_dict_big[big] = 1
        # cut most frequent
        idf_dict_uni = Counter(idf_dict_uni).most_common(most_common_unigrams)
        idf_dict_big = Counter(idf_dict_big).most_common(most_common_bigrams)
        self.idf_dict.update(idf_dict_uni)
        self.idf_dict.update(idf_dict_big)
        '''
        
    def vectorize(self, comment):
        
        return self.vectorizer.transform([comment])
        
        '''
        if method not in ['bow', 'tfidf']:
            raise ValueError('Method not recognized. Select from bow, tfidf')
        unigrams = comment.split(' ')
        bigrams = []
        for i in range(len(unigrams) -2):
            bigrams.append((unigrams[i], unigrams[i+1]))
        if method == 'bow':
            bow_dict = dict.fromkeys(self.idf_dict, 0)
            for u in unigrams + bigrams:
                if bow_dict.get(u) != None:
                    if just_presence:
                        bow_dict[u] = 1
                    else:
                        bow_dict[u] = bow_dict[u] +1
            return list(bow_dict.values())
        elif method == 'tfidf':
            tf_dict = dict.fromkeys(self.idf_dict, 0)
            for u in unigrams + bigrams:
                if tf_dict.get(u) != None:
                    tf_dict[u] = tf_dict[u] +1
        '''
        
    def get_feature_names(self):
        return self.vectorizer.get_feature_names()

In [144]:
v = Vectorizer(list_comments=['ciao come va ?', 'vediamo come scrive ciao questa tastiera', 'non lo so, per me è falso'], method='tfidf', max_features=100, ngrams=2, just_presence=True)
print(v.get_feature_names())
print(v.vectorize('ciao ciao come va ?').toarray())

['ciao', 'ciao come', 'ciao questa', 'come', 'come scrive', 'come va', 'falso', 'lo', 'lo so', 'me', 'me falso', 'non', 'non lo', 'per', 'per me', 'questa', 'questa tastiera', 'scrive', 'scrive ciao', 'so', 'so per', 'tastiera', 'va', 'vediamo', 'vediamo come']
[[0.62663214 0.41197298 0.         0.31331607 0.         0.41197298
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.41197298 0.
  0.        ]]


Just an example

In [11]:
text = 'Sono reali calcolati nel arco del tutto anno nel estate qualcosa in piÃ¹ causa gomme di 17" e climatizzatore nel inverno un po di meno. Per quanto riguarda le autostrade quelle che percorro io principalmente la A4 e molto congestionata cosi spesso la media e 110-115 km/h che ovviamente influisce positivamente a i consumi. Ma quello che mi piace di piÃ¹ Ã¨ assenza dei guasti. Sulla vecchia Accord il primo guasto lo ho avuto a 200000 km si Ã¨ rotto il termostato della clima. Ogni tanto faccio giro di altri forum e leggo delle turbine rotte catene di distribuzione progettate male iniettori fatti male mah nel 2015 per me sono le cose incomprensibili . Con tutti gli difetti che puÃ² avere preferisco la Honda. '
print(text)
print('##########################################################################################')
p = Preprocessor()
print(p.preprocessText(text, ner=False, use_stemmer=False, method='word'))
print('##########################################################################################')
print(p.preprocessText(text, ner=False, use_stemmer=False, method='swnt'))
print('##########################################################################################')
print(p.preprocessText(text, ner=False, use_stemmer=False, method='pos'))


Sono reali calcolati nel arco del tutto anno nel estate qualcosa in piÃ¹ causa gomme di 17" e climatizzatore nel inverno un po di meno. Per quanto riguarda le autostrade quelle che percorro io principalmente la A4 e molto congestionata cosi spesso la media e 110-115 km/h che ovviamente influisce positivamente a i consumi. Ma quello che mi piace di piÃ¹ Ã¨ assenza dei guasti. Sulla vecchia Accord il primo guasto lo ho avuto a 200000 km si Ã¨ rotto il termostato della clima. Ogni tanto faccio giro di altri forum e leggo delle turbine rotte catene di distribuzione progettate male iniettori fatti male mah nel 2015 per me sono le cose incomprensibili . Con tutti gli difetti che puÃ² avere preferisco la Honda. 
##########################################################################################
sono reali calcolati nel arco del tutto anno nel estate qualcosa in più causa gomme di 17" e climatizzatore nel inverno un po di meno. per quanto riguarda le autostrade quelle che percorro io pr