In [57]:
#!pip install --quiet sentence-transformers emoji 

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/MyDrive/nlp-tweets-classification/"
    ARTIFACTS_PATH = "/content/drive/MyDrive/nlp-tweets-classification/"
except ModuleNotFoundError:
    DATA_PATH = "../../data/"
    ARTIFACTS_PATH = "../../artifacts/"
    running_in_colab = False

## Imports

In [1]:
import os
import joblib
import random
import uuid

from emoji import demojize
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_samples, silhouette_score, \
                            cohen_kappa_score, precision_score, \
                            recall_score, average_precision_score, \
                            f1_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

pd.set_option("display.max_colwidth", None)

SEED = 42
DATA_PATH = '../../../../tweets_data/'
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

## Utility functions

In [2]:
def clean_text(text):
   
    tokens = text.split()
    tokens = [t for t in tokens if 'http' not in t]
    tokens = [demojize(t, language='es') for t in tokens]
    tokens = [t.replace('@', '') for t in tokens]
    
    return " ".join(tokens)


def get_hyperparams_space(model_name):
    if model_name == "log":
        hyperparams = dict(
            C=[0.001, 0.01, 0.1, 1, 10, 100], 
            penalty=["l1", "l2"],
            class_weight=["balanced", None]
        )
    elif model_name == "rf":
        hyperparams = dict(
            n_estimators=[120, 500, 1200],
            class_weight=["balanced", None]
        )
    elif model_name == "svc":
        hyperparams = dict(
            C=[0.001, 0.01, 0.1, 1, 10, 100],
            class_weight=["balanced", None]
        )
    elif model_name == "xgb":
        hyperparams = dict(
            eta=[0.01, 0.05, 0.1],
            gamma=[0.1, 0.5, 1],
            max_depth=[3, 12, 25],
            min_child_weight=[1, 3, 7],
            subsample=[0.6, 0.8, 1],
            colsample_bytree=[0.6, 0.8, 1],
        )
    elif model_name == "nb":
        hyperparams = dict()
    elif model_name == "lgbm":
        hyperparams = dict(
            learning_rate=[0.01, 0.05, 0.1],
        )
    else:
        raise Exception(f"No hyperparams for model {model_name}")
    return hyperparams

def get_model(model_name):
    if model_name == "lgbm":
        return LGBMClassifier()
    elif model_name == "hist":
        return HistGradientBoostingClassifier()
    elif model_name == "log":
        return LogisticRegression(class_weight="balanced", solver="liblinear")
    elif model_name == "rf":
        return RandomForestClassifier(
                  n_jobs=-1, 
                  n_estimators=1200, 
               )
    elif model_name == "nb":
        return GaussianNB()
    elif model_name == "xgb":
        return XGBClassifier(tree_method="hist", use_label_encoder=False, max_depth=10, eta=0.1)
    elif model_name == "svc":
        return LinearSVC(class_weight="balanced")
    else:
        raise ValueError(format)

def get_mode_rows(a):
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * np.prod(a.shape[1:])))
    _, ids, count = np.unique(a.view(void_dt).ravel(), return_index=1, return_counts=1)
    largest_count_id = ids[count.argmax()]
    most_frequent_row = a[largest_count_id]
    return most_frequent_row

## Set notebook parameters

In [3]:
col_target = "label"
model_name = "log"
n_splits = 10
use_precalculated_embeddings = True 
run_hyperparams_search = True 

## Read data

In [4]:
data = pd.read_csv(DATA_PATH + 'all_citizens_labeled_20221006.csv')


In [5]:
df = data[['label', 'id', 'text']]


In [6]:
df = df.dropna()


In [7]:
df.drop([670,1991], axis=0, inplace=True)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(df.drop('label', axis=1), df.label, random_state=42, stratify=df.label)

In [9]:
df = pd.concat([y_train, x_train], axis=1)
df_test = pd.concat([y_test, x_test], axis =1)

## Create embeddings

In [10]:
df_full= pd.concat([df,df_test], axis=0)

In [11]:
df_full.reset_index(drop=True, inplace=True)

In [12]:
if use_precalculated_embeddings:
    df_embeddings = pd.read_csv(DATA_PATH + "tweets_citizens_embeddings.csv")
else:
    from sentence_transformers import SentenceTransformer
    
    cleaned_tweets = [clean_text(tweet) for tweet in tqdm(df_full.text)]
    
    model_em = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')
    embeddings = model_em.encode(cleaned_tweets)

    df_embeddings = pd.concat([df_full["id"], pd.DataFrame(embeddings)], axis=1)
    
    df_embeddings.to_csv(DATA_PATH + 'tweets_citizens_embeddings.csv', index=False)

#assert df.id.isin(df_embeddings.id).all() 
#assert df_test.id.isin(df_embeddings.id).all() 

## Train model

In [13]:
df_results = pd.DataFrame()

x_test = df_test[["id"]].merge(df_embeddings, how="inner", on="id").drop(columns="id")
df_label = df[["id", col_target]].merge(df_embeddings, how="inner", on="id").drop(columns="id")

le = LabelEncoder()
le.fit(df_label[col_target])
#joblib.dump(le, ARTIFACTS_PATH + f"{col_target}/label_encoder.joblib")

df_label = df_label.sample(frac=1).reset_index(drop=True).dropna()
df_label["kfold"] = -1

kf = StratifiedKFold(n_splits=10)

for f, (t, v) in enumerate(kf.split(X=df_label, y=df_label[col_target])):
    df_label.loc[v, "kfold"] = f

f1_scores = []
kappa_scores = []
precision_scores = []
recall_scores = []
acc_scores = []
test_preds = []

print(f"Training {model_name}")
for f in range(n_splits):
    df_train = df_label[df_label.kfold != f].reset_index(drop=True)
    df_val = df_label[df_label.kfold == f].reset_index(drop=True)
    
    print(f"Split {f}: {df_train.shape[0]} observations for training / {df_val.shape[0]} observations for validation")
    x_train = df_train.iloc[:, 1:-1]
    x_val = df_val.iloc[:, 1:-1]
    y_train = le.transform(df_train[col_target])
    y_val = le.transform(df_val[col_target])

    if run_hyperparams_search:
        space = get_hyperparams_space(model_name)
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True)
        search = GridSearchCV(get_model(model_name), space, scoring='f1_macro', cv=cv_inner, refit=True)
        result = search.fit(x_train, y_train)
        model = result.best_estimator_
    else:
          model = get_model(model_name)
    
    #joblib.dump(model, ARTIFACTS_PATH + f"{col_target}/model_{model_name}_{f}.joblib")

    model.fit(x_train, y_train)
    preds = model.predict(x_val)
    test_preds.append(model.predict(x_test)) 

    f1_scores.append(f1_score(y_val, preds, average="macro"))
    kappa_scores.append(cohen_kappa_score(y_val, preds))
    precision_scores.append(precision_score(y_val, preds, average="macro"))
    recall_scores.append(recall_score(y_val, preds, average="macro"))
    print(f"f1: {np.mean(f1_scores):.2f}, kappa: {np.mean(kappa_scores):.2f}")
    
df_results = pd.concat(
    [df_results, 
      pd.DataFrame(
          dict(
              target=col_target,
              model=[model_name],
              f1=np.mean(f1_scores),
              kappa=np.mean(kappa_scores),
              precision=np.mean(precision_scores),
              recall=np.mean(recall_scores),
              f1_scores=str([round(s, 2) for s in f1_scores]),
              kappa_scores=str([round(s, 2) for s in kappa_scores]),
              precision_scores=str([round(s, 2) for s in precision_scores]),
              recall_scores=str([round(s, 2) for s in recall_scores]),
          )
        )
])

Training log
Split 0: 1683 observations for training / 188 observations for validation
f1: 0.41, kappa: 0.29
Split 1: 1684 observations for training / 187 observations for validation
f1: 0.43, kappa: 0.29
Split 2: 1684 observations for training / 187 observations for validation
f1: 0.43, kappa: 0.30
Split 3: 1684 observations for training / 187 observations for validation
f1: 0.44, kappa: 0.31
Split 4: 1684 observations for training / 187 observations for validation
f1: 0.45, kappa: 0.32
Split 5: 1684 observations for training / 187 observations for validation
f1: 0.46, kappa: 0.33
Split 6: 1684 observations for training / 187 observations for validation
f1: 0.46, kappa: 0.33
Split 7: 1684 observations for training / 187 observations for validation
f1: 0.47, kappa: 0.34
Split 8: 1684 observations for training / 187 observations for validation
f1: 0.49, kappa: 0.36
Split 9: 1684 observations for training / 187 observations for validation
f1: 0.48, kappa: 0.36


In [14]:
labels = list(df.label.unique())


In [15]:
labels

['expresivo',
 'entretenido',
 'informativo_demandante',
 'informativo_colaborador',
 'destructivo']

In [16]:
lab_enconde = le.fit_transform(labels)

In [17]:
dictio_lab_encond = dict(zip(lab_enconde,labels))
dictio_lab_encond

{2: 'expresivo',
 1: 'entretenido',
 4: 'informativo_demandante',
 3: 'informativo_colaborador',
 0: 'destructivo'}

In [18]:
df_results.T

Unnamed: 0,0
target,label
model,log
f1,0.484764
kappa,0.361125
precision,0.475917
recall,0.503399
f1_scores,"[0.41, 0.45, 0.44, 0.46, 0.5, 0.5, 0.49, 0.53, 0.6, 0.47]"
kappa_scores,"[0.29, 0.3, 0.32, 0.32, 0.38, 0.37, 0.37, 0.41, 0.49, 0.36]"
precision_scores,"[0.41, 0.46, 0.43, 0.45, 0.49, 0.49, 0.48, 0.52, 0.58, 0.47]"
recall_scores,"[0.43, 0.46, 0.45, 0.47, 0.53, 0.53, 0.51, 0.55, 0.63, 0.47]"


## Save results

In [20]:
results_path = DATA_PATH + f"results_citizens_{model_name}.csv"
if run_hyperparams_search:
    results_path = DATA_PATH + f"results_best_hyperparams_{model_name}.csv"
df_results.to_csv(results_path, mode='a', header=not os.path.exists(results_path), index=False)

In [21]:
df_train[col_target].value_counts(normalize=True)

expresivo                  0.480404
informativo_colaborador    0.198337
informativo_demandante     0.135392
destructivo                0.098575
entretenido                0.087292
Name: label, dtype: float64

In [22]:
df_test[col_target] = le.inverse_transform(get_mode_rows(test_preds))
df_test["label"] = df_test[col_target]
df_test[col_target].value_counts(normalize=True)

expresivo                  0.479167
informativo_colaborador    0.179487
informativo_demandante     0.165064
destructivo                0.092949
entretenido                0.083333
Name: label, dtype: float64

In [23]:
model.fit(df_embeddings.iloc[:,1:-1], df_full.label)

In [24]:
all_preds = model.predict(df_embeddings.iloc[:,1:-1])

In [25]:
f1_score(all_preds, df_full.label, average='macro')

0.6588809217140985

In [54]:
all_tweets = pd.read_csv('../../../../tweets_data/all_citizens_joined.csv')
all_tweets.shape

(584685, 30)

In [55]:
all_es = all_tweets.loc[all_tweets.lang=='es'][['id tweet', 'text']]

all_es.reset_index(drop=True, inplace=True)

In [56]:
all_es.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454748 entries, 0 to 454747
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id tweet  454748 non-null  int64 
 1   text      454748 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB


In [35]:
if use_precalculated_embeddings:
    df_embeddings_es = pd.read_csv('../../../../tweets_data/all_citizens_joined_embeddings.csv')
else:
    from sentence_transformers import SentenceTransformer
    cleaned_tweets = [clean_text(tweet) for tweet in tqdm(all_es.text)]
    
    model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')
    embeddings_es = model.encode(cleaned_tweets)

    df_embeddings_es = pd.concat([all_es["id tweet"], pd.DataFrame(embeddings)], axis=1)

In [37]:
df_embeddings_es.shape

(454748, 769)

In [39]:
df_embeddings_es.head()

Unnamed: 0,id tweet,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,1535395134686240774,0.906994,0.230015,0.775082,-0.244691,-0.149994,-0.698744,0.375754,-1.003181,0.230185,...,-0.178782,0.160151,0.261129,0.16388,0.320013,-0.562562,0.068671,0.154549,-0.761191,-0.299566
1,1535395568998129666,0.449592,-0.146677,-0.915937,-0.347452,-0.109174,-0.194892,1.335919,-0.014602,0.619942,...,0.066316,0.456723,-0.269764,0.193559,-0.508561,-0.582981,-0.373798,-0.039058,0.071571,0.508146
2,1535395648509460480,-0.202093,0.290053,-0.127502,-0.391537,-0.273759,0.441322,0.972044,0.107094,-0.077597,...,-0.218865,-0.934736,0.344583,-0.098832,-0.076999,-0.730442,-0.578819,-0.798397,1.176301,0.352648
3,1535395809054826496,0.264822,0.015052,0.155617,-0.09998,-0.097115,0.49296,0.780404,0.864955,-0.580162,...,0.200758,-0.258619,-0.177287,-0.012658,-0.394453,-0.580247,0.303021,-0.969682,-0.068528,0.265391
4,1535395920866689025,0.458823,-0.249472,-0.089871,-0.261616,-0.205998,-0.323839,0.071756,-0.636309,-0.12611,...,0.489663,0.061293,0.160667,0.264908,-0.199654,-0.452748,-0.497668,-0.234685,-0.442812,0.467232


In [40]:
all_preds_es = model.predict(df_embeddings_es.iloc[:,1:-1])

In [41]:
all_preds_es

array(['informativo_colaborador', 'informativo_demandante', 'expresivo',
       ..., 'expresivo', 'informativo_colaborador',
       'informativo_colaborador'], dtype=object)

In [42]:
all_preds_proba = model.predict_proba(df_embeddings_es.iloc[:,1:-1])

In [43]:
lab_proba = [np.where(all_preds_proba[i] == all_preds_proba[i].max())[0][0] for i in range(len(all_preds_proba))]

In [44]:
all_es[col_target] = all_preds_es

In [45]:
all_es['label_with_proba'] = lab_proba

In [46]:
all_es['label_with_proba'] = all_es['label_with_proba'].apply(lambda x : dictio_lab_encond[x])

In [47]:
all_es.label.value_counts(normalize=True)

expresivo                  0.388646
informativo_colaborador    0.320769
informativo_demandante     0.145320
destructivo                0.088141
entretenido                0.057124
Name: label, dtype: float64

In [48]:
all_es.label_with_proba.value_counts(normalize=True)

expresivo                  0.388646
informativo_colaborador    0.320769
informativo_demandante     0.145320
destructivo                0.088141
entretenido                0.057124
Name: label_with_proba, dtype: float64

In [49]:
dictio_lab_encond

{2: 'expresivo',
 1: 'entretenido',
 4: 'informativo_demandante',
 3: 'informativo_colaborador',
 0: 'destructivo'}

In [50]:
labels_prob = ['destructivo_proba', 'entretenido_proba', 'expresivo_proba', 'informativo_colaborador_proba','informativo_demandante_proba']

In [51]:
final_res = pd.concat([all_es,pd.DataFrame(all_preds_proba, columns=labels_prob)], axis=1)
final_res.head(5)

Unnamed: 0,id tweet,text,label,label_with_proba,destructivo_proba,entretenido_proba,expresivo_proba,informativo_colaborador_proba,informativo_demandante_proba
0,1535395134686240774,. @alicanteayto tendrá uno de los cuatro centros puestos en marcha por @CEOE_ES para ofrecer orientación de empleo a ucranianos @CEV_CV @Boronavarro ☛ https://t.co/u3Lk2c4XrX #mediterráneopress #tudiariodigital #comunitatvalenciana #política #economía #empresa #ciudades,informativo_colaborador,informativo_colaborador,0.183393,0.057162,0.147906,0.543856,0.067682
1,1535395568998129666,"RT @MCMVALVANERA: @policiademadrid @Lineamadrid @MADRID @begonavillacis @AlmeidaPP_ ALONSO CANO 55 ""El Cafecito"" , 20:30 de esta tarde. Ocupacion incluso del espacio del arbol. HAGAN ALGO!! https://t.co/p1DEMYnHDJ",informativo_demandante,informativo_demandante,0.108868,0.106246,0.174866,0.246129,0.36389
2,1535395648509460480,"@anavas14 @ComunidadMadrid @MADRID No hay derecho. Mas condena cruel. Sin escoletas, alguien se quedara en casa sin trabajar, menguaran los ingresos. Y la gente ya en la cuerda floja se precarizará mas.... ¿ Es esta vida la que defienden l@s Ayuso ?",expresivo,expresivo,0.157644,0.0345,0.495863,0.017097,0.294896
3,1535395809054826496,@Enfinqueridapi1 @_joelmar @ChanelTerrero @santacruz_ayto Dos horas que tendría al publico disfrutando!,expresivo,expresivo,0.081676,0.104396,0.726476,0.043409,0.044043
4,1535395920866689025,RT @19MarioCava93: En el @CDPuebladeSoto jugamos al escondite cada vez que un balón se pierde por línea de banda (contraria a la de los banquillos). ¿Os apetece jugar al escondite con nosotros o le ponemos solución a esto ya? @rex_pedro @murciadeportes @SerranoJAntonio @AytoMurcia https://t.co/p9VjwXNvMm,informativo_colaborador,informativo_colaborador,0.063183,0.122499,0.249059,0.354419,0.210841


In [52]:
final_res.shape

(454748, 9)

In [53]:
final_res.to_csv(f'../../../results/citizens_final_{model_name}_results.csv', index=False)