In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    confusion_matrix,
    cohen_kappa_score,
    classification_report,
    accuracy_score,
    roc_auc_score,
)
import nltk
nltk.download('stopwords')
from nltk.tokenize.casual import TweetTokenizer
import re

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
class CharsCountTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each tweet to five character counts.

    The output columns are, in order: number of ``#``, number of ``!``,
    number of ``?``, number of ``@`` and number of upper-case characters.
    Nothing is learned from the data, so ``fit`` is a no-op.
    """

    # Width of the row built by ``get_relevant_chars``; used to keep the
    # output two-dimensional even when there are no rows at all.
    _N_FEATURES = 5

    def get_relevant_chars(self, tweet):
        """Return the five counts for a single tweet as a list of ints.

        Parameters
        ----------
        tweet : str
            Raw tweet text.

        Returns
        -------
        list of int
            ``[hashtags, exclamations, interrogations, ats, uppercase]``.
        """
        num_hashtags = tweet.count("#")
        num_exclamations = tweet.count("!")
        num_interrogations = tweet.count("?")
        num_at = tweet.count("@")
        num_uc = sum(1 for c in tweet if c.isupper())
        return [num_hashtags, num_exclamations, num_interrogations, num_at, num_uc]

    def transform(self, X, y=None):
        """Build the count matrix for an iterable of tweets.

        Parameters
        ----------
        X : iterable of str
            Tweets to featurize.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(n_tweets, 5)``.
        """
        rows = [self.get_relevant_chars(tweet) for tweet in X]
        # np.array([]) on its own is 1-D with shape (0,); the reshape keeps
        # the result a (0, 5) matrix so it can still be stacked column-wise
        # with other feature blocks.
        return np.array(rows, dtype=int).reshape(-1, self._N_FEATURES)

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (there is no state to learn)."""
        return self

In [31]:
def auc_score(test_set, predicted_set):
    """Support-weighted average of the three one-vs-rest ROC AUC scores.

    Parameters
    ----------
    test_set : array-like of str
        True class names; compared element-wise against ``"normal"``,
        ``"odio"`` and ``"incivilidad"``.
    predicted_set : sequence of rows
        Per-sample scores whose positions 0, 1 and 2 are read as the scores
        for ``"normal"``, ``"odio"`` and ``"incivilidad"`` respectively.

    Returns
    -------
    float
        Sum of each class's AUC times its number of true samples, divided by
        the total number of true samples across the three classes.
    """
    weighted_auc_sum = 0.0
    total_support = 0.0
    # The position in this tuple is the column read from every prediction row.
    for column, class_name in enumerate(("normal", "odio", "incivilidad")):
        class_scores = np.array([row[column] for row in predicted_set])
        # Binary indicator (1.0 / 0.0) of membership in the current class.
        is_class = np.where(test_set == class_name, 1.0, 0.0)
        support = is_class.sum()
        weighted_auc_sum += support * roc_auc_score(is_class, class_scores)
        total_support += support
    return weighted_auc_sum / total_support


def evaluate(predicted_probabilities, y_test, labels):
    """Print a classification report and return AUC, kappa and accuracy.

    Parameters
    ----------
    predicted_probabilities : array-like of shape (n_samples, n_classes)
        Per-class scores, with columns in the order given by ``labels``.
    y_test : array-like of str
        True class names.
    labels : sequence of str
        Class names in the column order of ``predicted_probabilities``
        (typically the classifier's ``classes_``). A NumPy array, list or
        tuple are all accepted.

    Returns
    -------
    numpy.ndarray
        ``[auc, kappa, accuracy]``, each rounded to three decimals.

    Raises
    ------
    ValueError
        If ``labels`` lacks any of ``"normal"``, ``"odio"``, ``"incivilidad"``.
    """
    # Fixed class order used for the report and expected by ``auc_score``.
    class_order = ["normal", "odio", "incivilidad"]

    # Coerce once so plain lists/tuples work as well as NumPy arrays: the
    # column reordering below needs ndarray indexing and ``.index`` needs a
    # list.
    predicted_probabilities = np.asarray(predicted_probabilities)
    learned_labels = list(labels)

    # Important: when turning the probability rows into class names, use the
    # class array learned by the classifier (which is commonly different
    # from ['normal', 'odio', 'incivilidad']).
    predicted_labels = [
        learned_labels[np.argmax(item)] for item in predicted_probabilities
    ]

    print("Matriz de confusión")
    print(confusion_matrix(y_test, predicted_labels, labels=class_order))

    print("\nReporte de clasificación:\n")
    print(classification_report(y_test, predicted_labels, labels=class_order))

    # Reorder the probability columns into the fixed class order.
    column_order = [learned_labels.index(name) for name in class_order]
    predicted_probabilities = predicted_probabilities[:, column_order]

    auc = round(auc_score(y_test, predicted_probabilities), 3)
    print("Métricas:\n\nAUC: ", auc, end="\t")
    kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
    print("Kappa:", kappa, end="\t")
    accuracy = round(accuracy_score(y_test, predicted_labels), 3)
    print("Accuracy:", accuracy)
    print("------------------------------------------------------\n")
    return np.array([auc, kappa, accuracy])


In [32]:
# Training data is read straight from the course repository as a
# tab-separated file.
TRAIN_DATA_URL = "https://raw.githubusercontent.com/dccuchile/CC6205/master/assignments/new/assignment_1/train/train.tsv"
data = pd.read_csv(TRAIN_DATA_URL, sep="\t")

In [33]:
# Spanish stop-word list from the NLTK "stopwords" corpus (downloaded in the
# imports cell); it is handed to the TF-IDF vectorizer.
stopwords = nltk.corpus.stopwords.words("spanish")

In [34]:
# Shuffled hold-out split: 33% of the rows go to the test set, and the fixed
# seed keeps the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data["texto"], data["clase"], test_size=0.33, shuffle=True, random_state=42
)


In [35]:
# TF-IDF features over the tweet text, with the Spanish stop words removed.
vectorizer = TfidfVectorizer(stop_words=stopwords)

# The FeatureUnion places the TF-IDF columns next to the five character
# counts from CharsCountTransformer; the combined matrix feeds a random
# forest.
ml_pipeline = Pipeline(
    [
        (
            "features",
            FeatureUnion(
                [
                    ("vectorizer", vectorizer),
                    ("chars_count", CharsCountTransformer()),
                ]
            ),
        ),
        # random_state pins the forest's internal randomness; without it,
        # every fit of the same parameters yields different scores.
        ("clf", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)


In [36]:
# Shared NLTK tweet tokenizer, with length reduction and handle stripping
# both switched off.
t = TweetTokenizer(strip_handles=False, reduce_len=False)


def tweet_tokenizer(text):
    """Tokenize ``text`` with the module-level ``TweetTokenizer`` instance."""
    tokens = t.tokenize(text)
    return tokens

def preprocessor(text):
    """Strip every ``http://`` / ``https://`` URL from ``text``.

    A URL is matched from its scheme up to the next whitespace character, at
    any position in the text. The previous pattern was anchored to the start
    of a line, so links in the middle of a tweet were never removed, and a
    line that began with a link was deleted in full.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` with the URLs removed; surrounding whitespace and letter
        case are left untouched.

    Notes
    -----
    When this is passed as ``TfidfVectorizer(preprocessor=...)``,
    scikit-learn uses it in place of its built-in accent-stripping and
    lowercasing step, so the vectorizer's ``lowercase`` setting has no effect
    on text that goes through this function.
    """
    return re.sub(r"https?://\S+", "", text)

In [43]:
# Hyper-parameter space sampled by the randomized search. Keys use the
# pipeline's "<step>__<parameter>" addressing.
param_grid = {
    "clf__n_estimators": [800, 1000, 1200],
    "clf__criterion": ["gini", "entropy"],
    "clf__max_features": ["log2"],
    "clf__class_weight": [None, "balanced_subsample", "balanced"],
    "features__vectorizer__lowercase": [True],
    "features__vectorizer__ngram_range": [(1, 1)],
    "features__vectorizer__tokenizer": [tweet_tokenizer, None],
    "features__vectorizer__preprocessor": [preprocessor, None],
}
search = RandomizedSearchCV(
    ml_pipeline,
    param_grid,
    scoring="roc_auc_ovr_weighted",
    n_jobs=-1,
    verbose=2,
    n_iter=20,
    cv=3,
    # Fixed seed so the same 20 candidates are drawn on every run; without
    # it the sampled combinations change from one execution to the next.
    random_state=42,
)
search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END clf__class_weight=balanced, clf__criterion=entropy, clf__max_features=log2, clf__n_estimators=1000, features__vectorizer__lowercase=True, features__vectorizer__ngram_range=(1, 1), features__vectorizer__preprocessor=None, features__vectorizer__tokenizer=None; total time= 1.3min
[CV] END clf__class_weight=balanced_subsample, clf__criterion=gini, clf__max_features=log2, clf__n_estimators=1000, features__vectorizer__lowercase=True, features__vectorizer__ngram_range=(1, 1), features__vectorizer__preprocessor=None, features__vectorizer__tokenizer=None; total time= 1.4min
[CV] END clf__class_weight=balanced_subsample, clf__criterion=gini, clf__max_features=log2, clf__n_estimators=1000, features__vectorizer__lowercase=True, features__vectorizer__ngram_range=(1, 1), features__vectorizer__preprocessor=None, features__vectorizer__tokenizer=None; total time= 1.4min
[CV] END clf__class_weight=balanced, clf__criterion=entropy, clf

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('features',
                                              FeatureUnion(transformer_list=[('vectorizer',
                                                                              TfidfVectorizer(stop_words=['de',
                                                                                                          'la',
                                                                                                          'que',
                                                                                                          'el',
                                                                                                          'en',
                                                                                                          'y',
                                                                                                          'a',
                                          

In [44]:
# Show the parameter combination selected by the randomized search.
search.best_params_

{'features__vectorizer__tokenizer': None,
 'features__vectorizer__preprocessor': None,
 'features__vectorizer__ngram_range': (1, 1),
 'features__vectorizer__lowercase': True,
 'clf__n_estimators': 1000,
 'clf__max_features': 'log2',
 'clf__criterion': 'entropy',
 'clf__class_weight': 'balanced'}

In [45]:
# Hand-written parameter set, keyed with the pipeline's "<step>__<parameter>"
# names.
# NOTE(review): no visible cell reads this dict, and its n_estimators (1200)
# differs from the value shown by the recorded search output (1000) --
# confirm which one is intended.
best_params = dict(
    features__vectorizer__ngram_range=(1, 1),
    features__vectorizer__lowercase=True,
    clf__n_estimators=1200,
    clf__max_features="log2",
    clf__criterion="entropy",
    clf__class_weight="balanced",
)

In [46]:
# ml_pipeline.set_params(**search.best_params_)

In [47]:
# ml_pipeline.fit(X_train, y_train)

In [48]:
# Score the best pipeline found by the search on the held-out split. The
# classifier's own class order is passed along so that `evaluate` can map
# probability columns to class names.
best_model = search.best_estimator_
learned_labels = best_model.classes_
predicted_probabilities = best_model.predict_proba(X_test)
scores = evaluate(predicted_probabilities, y_test, learned_labels)

Matriz de confusión
[[1018   27  412]
 [ 204  322  311]
 [ 159   14 1564]]

Reporte de clasificación:

              precision    recall  f1-score   support

      normal       0.74      0.70      0.72      1457
        odio       0.89      0.38      0.54       837
 incivilidad       0.68      0.90      0.78      1737

    accuracy                           0.72      4031
   macro avg       0.77      0.66      0.68      4031
weighted avg       0.75      0.72      0.71      4031

Métricas:

AUC:  0.901	Kappa: 0.544	Accuracy: 0.72
------------------------------------------------------

