In [1]:
from collections import Counter
import json
import os
import re
import string

import joblib
import nltk
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load dataset

In [2]:
ds_path = '../data/ready/full_spanish_dataset.json'

with open(ds_path, 'r') as f:
    dataset_raw = json.load(f)

c = Counter([k['klass'] for k in dataset_raw])
c

Counter({'neutral': 89782, 'negative': 26272, 'positive': 107252})

# Clean dataset

In [3]:
class Dataset():

    @staticmethod 
    def _processed(word):
        tt = word.lower()
        tt = re.sub(r'\w*(@)\w*', '', tt)
        tt = re.sub(r'\w*(RT)\w*', '', tt, )
        tt = re.sub(r'\w*(#)\w*', '', tt, )
        tt = re.sub(r"\S*(\.com|\.ly|\.co|\.net|\.org|\.me|\.gl)\S*", "", tt)
        tt = re.sub(r'\w*(jaja|kaka|jeje|jiji|juju|jojo|ajaj|jaaj)\w*','jaja',tt)
    #     tt = tt.translate(None, string.punctuation)
        return tt
    
    def binary_class(self, dataset):
        corpus = [self._processed(k['text']) for k in dataset if k['klass'] != 'neutral']
        mapper = {'negative': 0, 'positive': 1}
        target = [mapper[k['klass']] for k in dataset if k['klass'] != 'neutral']
        
        return corpus, target
        

    def multi_class(self, dataset):
        corpus = [self._processed(k['text']) for k in dataset]
        mapper = {'neutral': 0, 'negative': -1, 'positive': 1}
        target = [mapper[k['klass']] for k in dataset]
        return corpus, target

In [4]:
ds = Dataset()
corpus, target = ds.binary_class(dataset_raw)
corpus_multi, target_multi = ds.multi_class(dataset_raw)


## Aditional cleaning steps (not implemented yet)

In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SpanishStemmer
from nltk.stem.porter import PorterStemmer

my_tokenizer = RegexpTokenizer("[\w']+")
stemmer_es = SpanishStemmer('spanish')
stemmer_porter = PorterStemmer()
stopwords_es = stopwords.words('spanish')

#punctuation to remove
non_words = list(string.punctuation) + ['¿', '¡']
# non_words.extend(map(str,range(10)))

def tokenizer(document):
    return [token for token in my_tokenizer.tokenize(document)]

def stemmer(tokens):
    return [stemmer_es.stem(token) for token in tokens]

def rm_stopwords(tokens):
    return [k for k in tokens if k not in stopwords_es]

In [6]:
tweet = corpus[1].lower()
print(tweet)
print(tokenizer(tweet))
print(stemmer(tokenizer(tweet)))
print(rm_stopwords(tokenizer(tweet)))

lo que me hizo reir gastón, no tiene nombre...
['lo', 'que', 'me', 'hizo', 'reir', 'gastón', 'no', 'tiene', 'nombre']
['lo', 'que', 'me', 'hiz', 'reir', 'gaston', 'no', 'tiene', 'nombr']
['hizo', 'reir', 'gastón', 'nombre']


# Modeling

## Text to num vector

In [8]:
hashing = HashingVectorizer(
    analyzer = "word",
    n_features=1000,
#     tokenizer=tokenizer_stemmer,
    preprocessor=None,
    #  stop_words=stopwords.words("spanish"),
    binary=True,
    strip_accents='ascii',
    encoding='utf-8',
    ngram_range=(1,3), )

vectorizer = CountVectorizer(  
    analyzer = 'word',
#     tokenizer = tokenizer,
    lowercase = True,
#     stop_words = stopwords_es,
    max_features=1000,
    min_df = 50,
    max_df = 1.9,
    ngram_range=(1, 2),
    binary=True,
)

## Binary classification


In [9]:
x_train, x_test, y_train, y_test =  \
    train_test_split(corpus, target, stratify=target, random_state=42, test_size=0.2)

In [10]:
log_bin_pipeline = Pipeline(
    steps=[('processor', clone(vectorizer)),
           ("clf",  LogisticRegressionCV(class_weight='balanced', n_jobs=-1))])

In [11]:
log_bin_pipeline.fit(x_train, y_train);

In [12]:
y_train_pred = log_bin_pipeline.predict(x_train)
y_test_pred = log_bin_pipeline.predict(x_test)

In [13]:
acc_train = accuracy_score(y_train, y_train_pred)
acc_train

0.645428247783634

In [14]:
acc_test = accuracy_score(y_test, y_test_pred)
acc_test

0.636285339824003

In [15]:
y_test[:10]

[1, 1, 1, 1, 0, 0, 1, 1, 1, 1]

In [16]:
tweet = x_test[2]
print(tweet)
print(f'probability: {log_bin_pipeline.predict_proba([tweet])}, class: {log_bin_pipeline.predict([tweet])}')

¡vamos ! conoce la historia de las victorias del real madrid en la ucl: 
probability: [[0.32236145 0.67763855]], class: [1]


## Multiclass classification

In [17]:
x_train, x_test, y_train, y_test =  \
    train_test_split(corpus_multi, target_multi, stratify=target_multi, random_state=42, test_size=0.2)

In [18]:
log_multi_pipeline = Pipeline(
    steps=[('processor', vectorizer),
           ('clf', LogisticRegressionCV(class_weight='balanced', n_jobs=-1))])

In [19]:
log_multi_pipeline.fit(x_train, y_train);

In [20]:
y_train_pred = log_multi_pipeline.predict(x_train)
y_test_pred = log_multi_pipeline.predict(x_test)

In [21]:
acc_train = accuracy_score(y_train, y_train_pred)
acc_train

0.5148059828485704

In [22]:
acc_test = accuracy_score(y_test, y_test_pred)
acc_test

0.5044332990013882

In [23]:
tweet = x_test[2]
print(tweet)
print(f'probability: {log_multi_pipeline.predict_proba([tweet])}, class: {log_multi_pipeline.predict([tweet])}')

he aceptado el reto de  y reto a     aceptáis? 
probability: [[0.20936523 0.51258017 0.27805461]], class: [0]


In [24]:
# # Define the hiperparameters (Very simple because of memory issues)
# hiperparam_pipeline_log_reg = {  
#    "regresion__fit_intercept":[True], # Data is not centered
#    "regresion__cv":[10, 20],
#    "regresion__max_iter": [500, 600]
#     }

In [25]:
# log_grid_search = GridSearchCV(estimator=log_pipeline,
#                               param_grid=hiperparam_pipeline_log_reg,
#                               scoring="roc_auc",
#                               cv=10,
#                               n_jobs=-1
#                              )
        

# #We train the model
# log_grid_search.fit( x_train, y_train)

# #This will take a very long time

In [26]:
# log_grid_search.score(x_test, y_test)

# Save best model

In [27]:
model_path = '../models/linear/'

binary_file = os.path.join(model_path, 'log_bin_pipeline.joblib')
joblib.dump(log_bin_pipeline, binary_file)

multi_file = os.path.join(model_path, 'log_multi_pipeline.joblib')
joblib.dump(log_multi_pipeline, multi_file)

['../models/linear/log_multi_pipeline.joblib']