In [1]:
from collections import Counter
import json
import os
import re
import string

import joblib
import nltk
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from unidecode import unidecode

# Load dataset

In [2]:
ds_path = '../data/ready/full_spanish_dataset.json'

with open(ds_path, 'r') as f:
    dataset_raw = json.load(f)

c = Counter([k['klass'] for k in dataset_raw])
c

Counter({'neutral': 89782, 'negative': 26272, 'positive': 107252})

In [3]:
dataset_raw[0]

{'text': '4.Denme una mano con RT para exigir q @lanacioncom retire la foto y respete el derecho a la intimidad de @MariaviicToriia su mamá y hermano.',
 'klass': 'neutral',
 'id_annotator': '87',
 'id': '332473940712751104'}

# Clean dataset

In [4]:
class Dataset():
    @staticmethod 
    def _remove_users(speech):
        return re.sub(r'\w*(@)\w*', '', speech)
    
    @staticmethod 
    def _remove_url(speech):
        return re.sub(r"\S*(\.com|\.ly|\.co|\.net|\.org|\.me|\.gl)\S*", "", speech)
    
    @staticmethod 
    def _remove_punctuation(speech):
        return re.sub(r'[^\w\s]', '', speech)
    
    @staticmethod 
    def _remove_hashtag(speech):
        return re.sub(r'\w*(#)\w*', '', speech)
    
    @staticmethod 
    def _reduce_laugh(speech):
        return re.sub(r'\w*(jaja|kaka|jeje|jiji|juju|jojo|ajaj|jaaj)\w*','jaja',speech)
    
    def _processed(self, speech):
        tt = speech.lower()
        tt = self._remove_users(tt)
        tt = self._remove_url(tt)
        tt = self._remove_hashtag(tt)
        tt = self._reduce_laugh(tt)
        tt = self._remove_punctuation(tt)
        tt = unidecode(tt)
        return tt
    
    def binary_class(self, dataset, processed=True):
        corpus = [self._processed(k['text']) if processed else k['text'] for k in dataset if k['klass'] != 'neutral']
        mapper = {'negative': 0, 'positive': 1}
        target = [mapper[k['klass']] for k in dataset if k['klass'] != 'neutral']
        
        return corpus, target
        

    def multi_class(self, dataset, processed=True):
        corpus = [self._processed(k['text']) if processed else k['text'] for k in dataset]
        mapper = {'neutral': 0, 'negative': -1, 'positive': 1}
        target = [mapper[k['klass']] for k in dataset]
        return corpus, target

In [5]:
def cleaner(speech):
    tt = speech.lower()
    tt = re.sub(r'\w*(@)\w*', '', tt)
    tt = re.sub(r'\w*(RT)\w*', '', tt, )
    tt = re.sub(r'\w*(#)\w*', '', tt, )
    tt = re.sub(r"\S*(\.com|\.ly|\.co|\.net|\.org|\.me|\.gl)\S*", "", tt)
    tt = re.sub(r'\w*(jaja|kaka|jeje|jiji|juju|jojo|ajaj|jaaj)\w*','jaja',tt)
    tt = re.sub(r'[^\w\s]', '', tt)
    return tt

In [6]:
ds = Dataset()
corpus, target = ds.binary_class(dataset_raw, processed=False)
corpus_multi, target_multi = ds.multi_class(dataset_raw, processed=False)


## Aditional cleaning steps (not implemented yet)

In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SpanishStemmer
from nltk.stem.porter import PorterStemmer

my_tokenizer = RegexpTokenizer("[\w']+")
stemmer_es = SpanishStemmer('spanish')
stemmer_porter = PorterStemmer()
stopwords_es = stopwords.words('spanish')

#punctuation to remove
non_words = list(string.punctuation) + ['¿', '¡']
# non_words.extend(map(str,range(10)))

def tokenizer(document):
    return [token for token in my_tokenizer.tokenize(document) if token.isalpha()]

def stemmer(tokens):
    return [stemmer_es.stem(token) for token in tokens]

def stemmer_tokenizer(speech):
    return stemmer(tokenizer(tweet))

def rm_stopwords(tokens):
    return [k for k in tokens if k not in stopwords_es]

In [8]:
tweet = corpus[1].lower()
print(tweet)
print(tokenizer(tweet))
print(stemmer(tokenizer(tweet)))
print(rm_stopwords(tokenizer(tweet)))

lo que me hizo reir gastón, no tiene nombre...
['lo', 'que', 'me', 'hizo', 'reir', 'gastón', 'no', 'tiene', 'nombre']
['lo', 'que', 'me', 'hiz', 'reir', 'gaston', 'no', 'tiene', 'nombr']
['hizo', 'reir', 'gastón', 'nombre']


In [9]:
print(tokenizer('en el 20001 los hombres iran a la luna'))


['en', 'el', 'los', 'hombres', 'iran', 'a', 'la', 'luna']


# Modeling

## Text to num vector
BOW = Bag of word

In [10]:
hashing = HashingVectorizer(
    analyzer = "word",
    n_features=1000,
#     tokenizer=tokenizer_stemmer,
    preprocessor=None,
    #  stop_words=stopwords.words("spanish"),
    binary=True,
    strip_accents='ascii',
    encoding='utf-8',
    ngram_range=(1,3), )

vectorizer = CountVectorizer(  
    analyzer = 'word',
    tokenizer = tokenizer,
    strip_accents='unicode',
    preprocessor=cleaner,
    lowercase = True,
    stop_words = stopwords_es,
    max_features=5000,
#     min_df = 0.,
#     max_df = 1.9,
    ngram_range=(1, 3),
    binary=True,
)

## Binary classification


In [11]:
x_train, x_test, y_train, y_test =  \
    train_test_split(corpus, target, stratify=target, random_state=42, test_size=0.2)

In [12]:
log_bin_pipeline = Pipeline(
    steps=[('processor', clone(vectorizer)),
           ("clf",  LogisticRegressionCV(class_weight='balanced', n_jobs=-1))])

In [13]:
log_bin_pipeline.fit(x_train, y_train);



In [14]:
y_train_pred = log_bin_pipeline.predict(x_train)
y_test_pred = log_bin_pipeline.predict(x_test)

In [15]:
acc_train = accuracy_score(y_train, y_train_pred)
acc_train

0.6893249328303017

In [16]:
acc_test = accuracy_score(y_test, y_test_pred)
acc_test

0.6486051301254446

In [17]:
confusion_matrix(y_train, y_train_pred )

array([[13821,  7197],
       [25989, 59812]])

In [18]:
confusion_matrix(y_test, y_test_pred, )

array([[ 2914,  2340],
       [ 7044, 14407]])

In [19]:
y_test[:10]

[1, 1, 1, 1, 0, 0, 1, 1, 1, 1]

In [20]:
tweet = x_test[2]
print(tweet)
print(f'probability: {log_bin_pipeline.predict_proba([tweet])}, class: {log_bin_pipeline.predict([tweet])}')

¡Vamos #APorLaDécima! Conoce la historia de las victorias del Real Madrid en la UCL: bit.ly/1i10lZRpic.twitter.com/omTDc1chu5
probability: [[0.01770089 0.98229911]], class: [1]


## Multiclass classification

In [21]:
x_train, x_test, y_train, y_test =  \
    train_test_split(corpus_multi, target_multi, stratify=target_multi, random_state=42, test_size=0.2)

In [22]:
log_multi_pipeline = Pipeline(
    steps=[('processor', clone(vectorizer)),
           ('clf', LogisticRegressionCV(class_weight='balanced', n_jobs=-1))])

In [23]:
log_multi_pipeline.fit(x_train, y_train);



In [24]:
y_train_pred = log_multi_pipeline.predict(x_train)
y_test_pred = log_multi_pipeline.predict(x_test)

In [25]:
acc_train = accuracy_score(y_train, y_train_pred)
acc_train

0.5585969861848145

In [26]:
acc_test = accuracy_score(y_test, y_test_pred)
acc_test

0.5253907124624961

In [27]:
confusion_matrix(y_train, y_train_pred )

array([[ 9680,  4670,  6668],
       [10516, 41217, 20092],
       [15856, 21052, 48893]])

In [28]:
confusion_matrix(y_train, y_train_pred )

array([[ 9680,  4670,  6668],
       [10516, 41217, 20092],
       [15856, 21052, 48893]])

In [36]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

          -1       0.27      0.46      0.34     21018
           0       0.62      0.57      0.59     71825
           1       0.65      0.57      0.61     85801

    accuracy                           0.56    178644
   macro avg       0.51      0.53      0.51    178644
weighted avg       0.59      0.56      0.57    178644



In [29]:
tweet = x_test[3]
print(tweet)
print(f'probability: {log_multi_pipeline.predict_proba([tweet])}, class: {log_multi_pipeline.predict([tweet])}')

@SetteSettamen @Conpdepau ya veo ya.... 😡😡😡😡 a ti tambien sette que te tiro un botellin a la cabeza eh ajajjajaja
probability: [[0.51373624 0.1025064  0.38375735]], class: [-1]


**ATTENTION**: We need to pay mora atention to vocabulary to improve the performance. there are a lot of word with the same meaning, pluran and singular, etc

In [30]:
log_multi_pipeline.steps[0][1].get_feature_names()[:10]

['abajo',
 'abandona',
 'abc',
 'abierta',
 'abierta capilla',
 'abierta capilla ardiente',
 'abiertas',
 'abierto',
 'abogado',
 'aborto']

In [31]:
# # Define the hiperparameters (Very simple because of memory issues)
# hiperparam_pipeline_log_reg = {  
#    "regresion__fit_intercept":[True], # Data is not centered
#    "regresion__cv":[10, 20],
#    "regresion__max_iter": [500, 600]
#     }

In [32]:
# log_grid_search = GridSearchCV(estimator=log_pipeline,
#                               param_grid=hiperparam_pipeline_log_reg,
#                               scoring="roc_auc",
#                               cv=10,
#                               n_jobs=-1
#                              )
        

# #We train the model
# log_grid_search.fit( x_train, y_train)

# #This will take a very long time

In [33]:
# log_grid_search.score(x_test, y_test)

# Save best model

In [34]:
model_path = '../models/linear/'

binary_file = os.path.join(model_path, 'log_bin_pipeline.joblib')
joblib.dump(log_bin_pipeline, binary_file)

multi_file = os.path.join(model_path, 'log_multi_pipeline.joblib')
joblib.dump(log_multi_pipeline, multi_file)

['../models/linear/log_multi_pipeline.joblib']