# Baseline tarea 1

-----------------------------




## Importar librerías y utiles

In [1]:
import os
import copy
import random
import shutil
import numpy as np
import pandas as pd
import gzip

import gensim

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, cross_validate
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.casual import TweetTokenizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix,
                             cohen_kappa_score,
                             classification_report,
                             accuracy_score,
                             roc_auc_score,
                             make_scorer)
from sklearn.model_selection import train_test_split

from utils import auc 




In [2]:
# Set the seeds of Python's `random` module and of NumPy's global RNG so that
# code drawing from those generators gives the same results on every run.
SEED = 8080
random.seed(SEED)
np.random.seed(SEED)

## Datos

### Obtener los datasets desde el github del curso

In [3]:
base_url = 'https://raw.githubusercontent.com/dccuchile/CC6205/master/assignments/assignment_1/data'
col_names = ['id', 'tweet', 'class', 'sentiment_intensity']
sentiments = ['anger', 'fear', 'joy', 'sadness']
split_names = ['train', 'target']


def _read_split(split_name):
    """Download one split and return it as ``{sentiment: DataFrame}``.

    Every file is read as tab-separated text without a header row; the
    columns are named after ``col_names``.
    """
    frames = {}
    for sentiment in sentiments:
        url = f"{base_url}/{split_name}/{sentiment}-{split_name}.txt"
        frames[sentiment] = pd.read_csv(url, sep='\t', names=col_names)
    return frames


# One dict per split, in the order given by `split_names`.
train, target = map(_read_split, split_names)

### Analizar los datos 

Imprimir la cantidad de tweets de cada dataset, según su intensidad de sentimiento

In [4]:
def get_group_dist(group_name, train):
    """Print how many rows of one dataset fall in each sentiment intensity.

    Parameters
    ----------
    group_name : key of the dataset to inspect inside ``train``.
    train : mapping from dataset name to a DataFrame that has a
        'sentiment_intensity' column.
    """
    counts_by_intensity = train[group_name].groupby('sentiment_intensity').count()
    print(group_name, "\n", counts_by_intensity)


# Show the per-intensity row counts of every training dataset.
for key in train:
    get_group_dist(key, train)

anger 
                       id  tweet  class
sentiment_intensity                   
high                 163    163    163
low                  161    161    161
medium               617    617    617
fear 
                       id  tweet  class
sentiment_intensity                   
high                 270    270    270
low                  288    288    288
medium               699    699    699
joy 
                       id  tweet  class
sentiment_intensity                   
high                 195    195    195
low                  219    219    219
medium               488    488    488
sadness 
                       id  tweet  class
sentiment_intensity                   
high                 197    197    197
low                  210    210    210
medium               453    453    453


In [5]:
def fair_sampling(train, rs):
    """Balance each dataset by down-sampling its most frequent intensity.

    For every dataset, the largest of the 'high' / 'medium' / 'low' classes
    is randomly reduced to the (floored) mean size of the other two classes.
    The two smaller classes are kept untouched.

    Parameters
    ----------
    train : dict
        Maps a dataset name to a DataFrame with at least the columns 'id'
        and 'sentiment_intensity'. Neither the dict nor its frames are
        modified.
    rs : int
        Random state forwarded to ``DataFrame.sample`` so the selection is
        repeatable.

    Returns
    -------
    dict
        New dict with the same keys. Each value is a new DataFrame with a
        fresh 0..n-1 index: the untouched rows first, then the sampled rows
        of the reduced class.
    """
    intensities = ['high', 'medium', 'low']
    balanced = {}
    for key, frame in train.items():
        counts = np.array([
            frame['id'][frame.sentiment_intensity == level].count()
            for level in intensities
        ])
        # argmax returns the first maximum, so ties resolve in the order of
        # `intensities`.
        largest = intensities[int(counts.argmax())]
        # Mean size of the two remaining classes, in exact integer
        # arithmetic (a float formula can round just below the integer).
        target_size = int((counts.sum() - counts.max()) // 2)

        is_largest = frame.sentiment_intensity == largest
        reduced = frame[is_largest].sample(target_size, random_state=rs)
        untouched = frame[~is_largest]
        # DataFrame.append was removed in pandas 2.0; concat is the
        # replacement and also exists in older releases.
        balanced[key] = pd.concat([untouched, reduced], ignore_index=True)
    return balanced

In [6]:
# Balance every training dataset using the notebook-wide seed (instead of
# repeating its literal value), then show the resulting class distribution.
new_train = fair_sampling(train, SEED)
for key in new_train:
    get_group_dist(key, new_train)

anger 
                       id  tweet  class
sentiment_intensity                   
high                 163    163    163
low                  161    161    161
medium               162    162    162
fear 
                       id  tweet  class
sentiment_intensity                   
high                 270    270    270
low                  288    288    288
medium               279    279    279
joy 
                       id  tweet  class
sentiment_intensity                   
high                 195    195    195
low                  219    219    219
medium               207    207    207
sadness 
                       id  tweet  class
sentiment_intensity                   
high                 197    197    197
low                  210    210    210
medium               203    203    203


## Clasificar

### Dividir el dataset en entrenamiento y prueba

In [7]:
def split_dataset(dataset):
    """Split one dataset into shuffled train and test portions.

    The tweets are the inputs and the sentiment intensities the labels;
    33% of the rows go to the test portion and the split is seeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    tweets = dataset.tweet
    intensities = dataset.sentiment_intensity
    tweets_train, tweets_test, labels_train, labels_test = train_test_split(
        tweets,
        intensities,
        test_size=0.33,
        shuffle=True,
        random_state=8080,
    )
    return tweets_train, tweets_test, labels_train, labels_test

### Definir el clasificador

Consejo para el vectorizador: investigar los módulos de `nltk`, en particular `TweetTokenizer` y `mark_negation`. Investigar también el parámetro `ngram_range` para clasificadores no bayesianos.

Consejo para el clasificador: investigar otros clasificadores más efectivos que Naive Bayes. Ojo: Naive Bayes no debería usarse con n-gramas, ya que estos rompen el supuesto de independencia.


### Armando un embedding vectorizer

In [8]:
# NOTE(review): these imports sit in the middle of the notebook; they would be
# easier to find in the import cell at the top. `Word2Vec` is not used in any
# of the cells visible here.
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
# Load the pretrained "glove-twitter-25" embeddings through gensim's downloader
# (presumably fetched over the network on first use -- confirm).
model_glove_twitter = api.load("glove-twitter-25")

In [12]:
class MeanEmbeddingVectorizer(object):
    """Represent each text by the mean embedding of its known tokens.

    The object exposes ``fit`` / ``transform`` so it can be used as the
    vectorizing step of a scikit-learn ``Pipeline``.

    Parameters
    ----------
    wordvector : object
        Word-embedding lookup. It must provide ``vector_size``, support
        ``word in wordvector`` and return the vector of a word through
        ``wordvector[word]``.
    tokenize : callable
        Takes one text and returns an iterable of tokens.
    """

    def __init__(self, wordvector, tokenize):
        self.word2vec = wordvector
        self.tokenize = tokenize
        # A text without any known token is mapped to a zero vector with
        # the same dimensionality as all the other vectors.
        self.dim = wordvector.vector_size

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def transform(self, X):
        """Return an array with one mean-embedding row per text in ``X``.

        ``X`` may be any iterable of texts (list, array, pandas Series).
        An empty ``X`` yields an array of shape ``(0, dim)``.
        """
        rows = [self._embed(phrase) for phrase in X]
        if not rows:
            return np.zeros((0, self.dim))
        return np.array(rows)

    def _embed(self, phrase):
        """Average the vectors of the tokens of ``phrase`` that are known."""
        # Membership is tested on the embedding object itself rather than
        # on its `index2word` list: scanning that list costs one pass over
        # the whole vocabulary per token, and the attribute no longer
        # exists in gensim 4.
        vectors = [
            self.word2vec[word]
            for word in self.tokenize(phrase)
            if word in self.word2vec
        ]
        if not vectors:
            return np.zeros(self.dim)
        return np.mean(vectors, axis=0)

In [13]:
# The object loaded from the gensim downloader is used directly as the word
# vectors. Going through its `.wv` attribute only emits a deprecation warning
# on gensim 3 and fails on gensim 4, where that attribute was removed.
wordvector = model_glove_twitter
# Tokenizer for the embedding pipeline, configured with preserve_case=False,
# reduce_len=True and strip_handles=True.
token = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
mev = MeanEmbeddingVectorizer(wordvector, token)

  """Entry point for launching an IPython kernel.


In [14]:
# Default bag-of-words vectorizer used by the get_* pipeline builders below:
# unigram counts (ngram_range=(1, 1)) over tokens produced by a TweetTokenizer
# configured with preserve_case=True, reduce_len=False, strip_handles=False.
mytokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
vectorizer = CountVectorizer(tokenizer=mytokenizer.tokenize, ngram_range=(1, 1))


def get_bagging(base = SVC(kernel = 'linear', probability = True), n_est = 10, vect = vectorizer):
    """Build a vectorizer + bagging-ensemble pipeline.

    Parameters
    ----------
    base : estimator replicated inside the ensemble (a linear SVC with
        probability estimates by default).
    n_est : number of estimators in the ensemble.
    vect : vectorizing step. NOTE: the default is the module-level
        ``vectorizer`` instance, so pipelines built with the default share
        that one object.

    Returns
    -------
    Pipeline with the steps 'vect' and 'clf'.
    """
    # The base estimator is passed positionally on purpose: its keyword was
    # `base_estimator` before scikit-learn 1.2 and `estimator` afterwards,
    # while the first positional slot is the same in both.
    classifier = BaggingClassifier(base,
                                   n_estimators = n_est,
                                   n_jobs = -1)

    return Pipeline([('vect', vect), ('clf', classifier)])


def get_ada(base = SVC(kernel = 'linear', probability = True), n_est = 50, vect = vectorizer):
    """Build a vectorizer + AdaBoost pipeline.

    Parameters
    ----------
    base : estimator boosted by AdaBoost (a linear SVC with probability
        estimates by default).
    n_est : number of boosting estimators.
    vect : vectorizing step. NOTE: the default is the module-level
        ``vectorizer`` instance, so pipelines built with the default share
        that one object.

    Returns
    -------
    Pipeline with the steps 'vect' and 'clf'.
    """
    # The base estimator is passed positionally on purpose: its keyword was
    # `base_estimator` before scikit-learn 1.2 and `estimator` afterwards,
    # while the first positional slot is the same in both.
    classifier = AdaBoostClassifier(base, n_estimators = n_est)

    return Pipeline([('vect', vect), ('clf', classifier)])


def get_svm_rbf(vect = vectorizer):
    """Build a pipeline made of ``vect`` followed by an RBF-kernel SVC.

    The SVC uses C=100, gamma='scale' and has probability estimates
    enabled. The steps are named 'vect' and 'clf'.
    """
    rbf_svc = SVC(C = 100,
                  kernel = 'rbf',
                  gamma = 'scale',
                  probability = True)
    steps = [('vect', vect), ('clf', rbf_svc)]
    return Pipeline(steps)


def get_svm_linear(vect = vectorizer):
    """Build a pipeline made of ``vect`` followed by a linear-kernel SVC.

    The SVC has probability estimates enabled. The steps are named 'vect'
    and 'clf'.
    """
    linear_svc = SVC(probability = True, kernel = 'linear')
    steps = [('vect', vect), ('clf', linear_svc)]
    return Pipeline(steps)


def get_baseline(vect = vectorizer):
    """Build the baseline pipeline: ``vect`` followed by MultinomialNB.

    The steps are named 'vect' and 'clf'.
    """
    steps = [('vect', vect), ('clf', MultinomialNB())]
    return Pipeline(steps)

### Definir evaluación

Esta función imprime las métricas usadas en la competencia (la matriz de confusión y el reporte de clasificación están desactivados en el código):


- `auc`
- `kappa`
- `accuracy`

In [15]:
def evaulate(predicted, y_test, labels):
    """Print AUC, Cohen's kappa and accuracy for a set of predictions.

    Parameters
    ----------
    predicted : per-sample score rows; position ``i`` of a row belongs to
        ``labels[i]``.
    y_test : true labels of the same samples.
    labels : class labels in the order learned by the classifier, which is
        commonly not ['low', 'medium', 'high'].
    """
    # Turn each score row into the label that holds its highest score.
    best_positions = (np.argmax(scores) for scores in predicted)
    predicted_labels = [labels[position] for position in best_positions]

    # AUC is computed by the `auc` helper imported from `utils`, on the raw
    # scores rather than on the chosen labels.
    auc_value = auc(y_test, predicted)
    print("auc: ", auc_value)

    kappa_value = cohen_kappa_score(y_test, predicted_labels)
    print("kappa:", kappa_value)

    accuracy_value = accuracy_score(y_test, predicted_labels)
    print("accuracy:", accuracy_value, "\n")

    print('------------------------------------------------------\n\n')

### Ejecutar el clasificador para cierto dataset

Clasifica un dataset. Retorna el modelo ya entrenado más sus labels asociadas.


In [16]:
def classify(text_clf ,dataset, key, get_best = False):
    """Cross-validate, train and evaluate a pipeline on one dataset.

    The dataset is split with ``split_dataset``. On the training part the
    pipeline is scored with a repeated stratified 5-fold cross validation
    (10 repeats, Cohen's kappa). It is then fitted on the whole training
    part and evaluated on the held-out part with ``evaulate``.

    Parameters
    ----------
    text_clf : pipeline to train; it is fitted in place.
    dataset : DataFrame of one sentiment, as expected by ``split_dataset``.
    key : dataset name, only used in the printed report.
    get_best : accepted for backward compatibility; not used.

    Returns
    -------
    tuple
        ``(text_clf, learned_labels)``: the fitted pipeline and its
        ``classes_``.
    """
    X_train, X_test, y_train, y_test = split_dataset(dataset)
    # Seeded so the folds, and therefore the reported scores, are the same
    # on every run.
    skf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state = SEED)
    print("Empieza Cross Validation")
    # The per-fold fitted estimators are not requested: nothing reads them
    # and keeping all 50 of them only costs memory.
    results = cross_validate(text_clf, X_train, y_train, cv = skf,
                             scoring = make_scorer(cohen_kappa_score),
                             error_score = 'raise',
                             n_jobs = 3)
    scores = results['test_score']
    # Report mean +/- standard deviation (not the variance).
    print(f"Resultados CV: Cohen-Kappa {scores.mean()} +/- {scores.std()}")

    # Train the classifier on the whole training part.
    text_clf.fit(X_train, y_train)

    # Predict the intensity probabilities of every element of the test part.
    predicted = text_clf.predict_proba(X_test)

    # Classes in the order learned by the classifier.
    learned_labels = text_clf.classes_

    # Evaluate.
    print(f"Resultados para {key} usando el modelo {text_clf.steps[1][1]}")
    evaulate(predicted, y_test, learned_labels)
    return text_clf, learned_labels

### Ejecutar el clasificador por cada dataset


In [None]:
classifiers = []
learned_labels_array = []

linear = get_svm_linear(vect = mev)
rbf = get_svm_rbf()
ada = get_ada()
bagg = get_bagging()

models = [linear]

# To evaluate on the full (unbalanced) data instead, iterate over `train`
# rather than `new_train` in the loop below.
print("Probando con el dataset recortado")
for model in models:
    # One entry per key of new_train ('anger', 'fear', 'joy', 'sadness'),
    # appended in that order.
    for key in new_train:
        # classify() fits the pipeline it receives in place. Each dataset
        # therefore gets its own copy; otherwise every entry of
        # `classifiers` would be the same object, fitted on the last
        # dataset only. The copy includes the pipeline's vectorizer.
        classifier, learned_labels = classify(copy.deepcopy(model), new_train[key], key, get_best = True)
        classifiers.append(classifier)
        learned_labels_array.append(learned_labels)

Probando con el dataset recortado
Empieza Cross Validation


## Predecir target set

In [None]:
def predict_target(dataset, classifier, labels):
    """Predict the intensity probabilities of every tweet of a target set.

    Parameters
    ----------
    dataset : DataFrame with the columns 'id' and 'tweet'.
    classifier : fitted model exposing ``predict_proba``.
    labels : class names matching the column order of ``predict_proba``;
        must contain 'low', 'medium' and 'high'.

    Returns
    -------
    DataFrame with the columns 'id', 'low', 'medium', 'high', in that order.
    """
    probabilities = classifier.predict_proba(dataset.tweet)
    table = pd.DataFrame(probabilities, columns=labels).assign(id=dataset.id.values)
    # Fixed column order, whatever order the classifier learned.
    return table[['id', 'low', 'medium', 'high']]

### Ejecutar la predicción y guardar archivos.

In [None]:
predicted_target = {}

# Start from an empty ./predictions directory: drop the output of any
# previous run, then (re)create the folder.
if os.path.isdir('./predictions'):
    shutil.rmtree('./predictions')
os.mkdir('./predictions')

# classifiers[idx] / learned_labels_array[idx] are paired with the idx-th key
# of `target`.
for idx, key in enumerate(target):
    # Predict the target set.
    predicted_target[key] = predict_target(
        target[key],
        classifiers[idx],
        learned_labels_array[idx],
    )
    # Save the predictions as tab-separated text without header or index.
    output_path = './predictions/{}-pred.txt'.format(key)
    predicted_target[key].to_csv(output_path,
                                 sep='\t',
                                 header=False,
                                 index=False)

# Bundle the prediction files into predictions.zip.
a = shutil.make_archive('predictions', 'zip', './predictions')