# Remix

In [155]:
import pickle
import re
import spacy
import torch
import emojilib
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('opinion_lexicon')
nltk.download("sentiwordnet")
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import opinion_lexicon
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from unidecode import unidecode
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import cohen_kappa_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from nltk import bigrams
from nltk import trigrams

!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()
tweet_tokenizer = TweetTokenizer()
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

[nltk_data] Downloading package punkt to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package opinion_lexicon to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\opinion_lexicon.zip.
[nltk_data] Downloading package sentiwordnet to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\sentiwordnet.zip.
[nltk_data] Downloading package omw-1.4 to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'F:\Desktop\Universidad\Lenguaje\NLP-Competition-1\venv\Scripts\python.exe -m pip install --upgrade pip' command.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoj

In [2]:
SENTIMENTS = "anger fear joy sadness".split()
DF_TRAIN = pickle.load(open("df_train.pickle", "rb"))
INTENSITIES = "low medium high".split()
VOWELS = "aeiou"

## Representaciones

In [158]:
LABELS = "lemma pos tag shape".split()

def get_emojilib_attrib(tweet: str) -> dict[str, int]:
    result = {}
    emo_list = emojilib.emoji_list(tweet)
    emo_names = list([d['name'] for d in emo_list if 'name' in d])
    for emo in emo_names:
        if emo not in result.keys():
            result["emoji<&>" + emo] = 0
        result["emoji<&>" + emo] += 1
    return result


def get_linguistics_attrib(tweet: str) -> dict[str, int]:
    result = {}
    nlp_tweet = nlp(tweet)
    for token in nlp_tweet:
        vals = [token.lemma_.lower(), token.pos_, token.tag_, token.shape_]
        for k, v in zip(LABELS, vals):
            ling = f"linguistics<&>{k}<&>{v}"
            if ling not in result.keys():
                result[ling] = 0
            result[ling] += 1
        vals = [token.ent_id_, token.ent_type_]
        for k, v in zip(["name", "label"], vals):
            ling = f"entities<&>{k}<&>{v}"
            if ling not in result.keys():
                result[ling] = 0
            result[ling] += 1

    def not_stop(tup: tuple) -> bool:
        for element in tup:
            if element.is_stop:
                return False
        return True

    bi_tokens = [(w[0].lemma_.lower(), w[1].lemma_.lower()) for w in bigrams(nlp_tweet) if not_stop(w)]
    for bigram in bi_tokens:
        ling = f"linguistics<&>bigram<&>{bigram}"
        if ling not in result.keys():
            result[ling] = 0
        result[ling] += 1
    tri_tokens = [(w[0].lemma_.lower(), w[1].lemma_.lower()) for w in trigrams(nlp_tweet) if not_stop(w)]
    for trigram in tri_tokens:
        ling = f"linguistics<&>trigram<&>{trigram}"
        if ling not in result.keys():
            result[ling] = 0
        result[ling] += 1
    return result


def get_punct_attrib(tweet: str) -> dict[str, int]:
    result = {
        "punct<&>[\.]{3}": len(re.findall(r"[.]{3}", tweet)), "punct<&>[!]": len(re.findall(r"[!]", tweet)),
        "punct<&>[#]": len(re.findall(r"[#]", tweet)), "punct<&>[#]{1}\S+": len(re.findall(r"[#]\S+", tweet)),
        "punct<&>[\*]": len(re.findall(r"[*]", tweet)), "punct<&>[@]{1}\S+": len(re.findall(r"[@]\S+", tweet)),
        "punct<&>\S*[?]{1}\S*": len(re.findall(r"\S*[?]\S*", tweet))
    }
    return result


def get_retro_attrib(tweet: str) -> dict[str, int]:
    result = {
        "retro<&>num_tokens": len(tweet.split()), "retro<&>length": len(" ".join(tweet.split())),
        "retro<&>num_numbs": len(re.findall(r"\d+", tweet)), "retro<&>num_alpha": len(re.findall(r"\w+", tweet)),
        "retro<&>num_with_uppercase": len(re.findall(r"\S*[A-Z]+\S*", tweet)),
        "retro<&>num_tokens_upper": sum(int(t.isupper()) for t in tweet.split())
    }

    def prop_vowels(text: str) -> float:
        length = len(text.replace(" ", ""))
        if length > 0:
            return len(re.findall(r"[aeiou]", text)) / length
        return 0

    def len_max_rep_char(text: str) -> int:
        extended_text = text + " "
        lens = [0]
        character_len = 1
        current= extended_text[0]
        for character in extended_text[1:]:
            if character == current:
                character_len += 1
                continue
            if current.isalpha():
                lens.append(character_len)
            current = character
            character_len = 1
        return max(lens)

    def max_char_fre_per_token(text: str, character="k") -> int:
        tokens = text.split()
        max_freq = 0
        for token in tokens:
            freq = sum(int(c == character) for c in token)
            if freq > max_freq:
                max_freq = freq
        return max_freq

    def max_type_rep_char_per_token(text: str, kind=VOWELS) -> int:
        extended_text = unidecode(text + " ")
        lens = [0]
        character_len = 1
        current = extended_text[0]
        for character in extended_text[1:]:
            current_is_vowel = current in VOWELS
            if current.isalpha() and character.isalpha() and ((character in VOWELS and current_is_vowel) or (character not in VOWELS and not current_is_vowel)):
                character_len += 1
                continue
            if kind == VOWELS:
                if current_is_vowel:
                    lens.append(character_len)
            elif not current_is_vowel:
                lens.append(character_len)
            current = character
            character_len = 1
        return max(lens)

    lower_tweet = tweet.lower()
    result["retro<&>prop_vowels"] = prop_vowels(lower_tweet)
    result["retro<&>len_max_rep_char"] = len_max_rep_char(lower_tweet)
    result["retro<&>max_char_fre_per_token(o)"] = max_char_fre_per_token(lower_tweet, character="o")
    result["retro<&>max_char_fre_per_token(s)"] = max_char_fre_per_token(lower_tweet, character="s")
    result["retro<&>max_char_fre_per_token(g)"] = max_char_fre_per_token(lower_tweet, character="g")
    result["retro<&>max_char_fre_per_token(l)"] = max_char_fre_per_token(lower_tweet, character="l")
    result["retro<&>max_type_rep_char_per_token(vowel)"] = max_type_rep_char_per_token(lower_tweet)

    return result


def get_sentiwordnet_sent(tweet: str):
    def penn_to_wn(tag: str):
        if tag.startswith('J'):
            return wn.ADJ
        elif tag.startswith('N'):
            return wn.NOUN
        elif tag.startswith('R'):
            return wn.ADV
        elif tag.startswith('V'):
            return wn.VERB
        return None

    def get_sentiment(word: str, tag: str):
        wn_tag = penn_to_wn(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            return [0, 0, 0]
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            return [0, 0, 0]
        synsets = wn.synsets(word, pos=wn_tag)
        if not synsets:
            return [0, 0, 0]
        swn_synset = swn.senti_synset(synsets[0].name())
        return [swn_synset.pos_score(), swn_synset.neg_score(), swn_synset.obj_score()]

    words_data = tweet_tokenizer.tokenize(tweet.lower())

    pos_val = nltk.pos_tag(words_data)
    senti_val = [get_sentiment(x, y) for (x, y) in pos_val]
    return dict(zip("+ - o".split(), np.sum(senti_val, axis=0)))


def get_lexicon_attrib(tweet: str) -> dict[str, any]:
    result = {}
    sentiwordnet = get_sentiwordnet_sent(tweet)
    result["lexicon<&>LiuHu<&>+"] = sum(int(t.lower() in opinion_lexicon.positive()) for t in tweet.split())
    result["lexicon<&>LiuHu<&>-"] = sum(int(t.lower() in opinion_lexicon.negative()) for t in tweet.split())
    result["lexicon<&>sentiwordnet<&>+"] = sentiwordnet["+"]
    result["lexicon<&>sentiwordnet<&>-"] = sentiwordnet["-"]
    result["lexicon<&>sentiwordnet<&>o"] = sentiwordnet["o"]
    return result

In [159]:
def get_first_representation(cut_below=5) -> pd.DataFrame:
    attributes = []
    for idx in DF_TRAIN.index:
        tweet = DF_TRAIN.loc[idx]["text"]
        o = {"id": idx}
        o = {**o, **get_retro_attrib(tweet)}
        o = {**o, **get_punct_attrib(tweet)}
        o = {**o, **get_emojilib_attrib(tweet)}
        o = {**o, **get_linguistics_attrib(tweet)}
        attributes.append(o)
    sparse_result = pd.DataFrame(attributes).set_index("id").fillna(0)
    to_drop = []
    for col in sparse_result.columns:
        try:
            count = sparse_result[col].sum()
            if count < cut_below and "linguistics" in col:
                to_drop.append(col)
        except Exception:
            continue
    return sparse_result.drop(columns=to_drop)

def get_second_representation() -> pd.DataFrame:
    data = []
    col_names = [f"BERTweet_{i}" for i in range(768)]
    for idx in DF_TRAIN.index:
        tweet = ' '.join([w.lemma_.lower() for w in nlp(DF_TRAIN.loc[idx]["text"]) if not w.is_stop])
        input_ids = torch.tensor([tokenizer.encode(tweet)])
        with torch.no_grad():
            outputs = bertweet(input_ids)
            hidden_states = outputs[0]
        token_embeddings = np.array([ll.numpy() for ll in hidden_states[0]])
        sentence_embedding = np.mean(token_embeddings, axis=0)
        o = {"id": idx}
        o = {**o, **dict(zip(col_names, sentence_embedding))}
        data.append(o)
    return pd.DataFrame(data).set_index("id").fillna(0)

def get_third_representation() -> pd.DataFrame:
    attributes = []
    for idx in DF_TRAIN.index:
        tweet = DF_TRAIN.loc[idx]["text"]
        o = {"id": idx}
        o = {**o, **get_lexicon_attrib(tweet)}
        attributes.append(o)
    return pd.DataFrame(attributes).set_index("id").fillna(0)

In [148]:
pickle.dump(get_first_representation(), open("revision/df_representation_v1.pickle", "wb"))
pickle.dump(get_second_representation(), open("revision/df_representation_v2.pickle", "wb"))
pickle.dump(get_third_representation(), open("revision/df_representation_v3.pickle", "wb"))

df_representation_v4 = pd.concat([
    pickle.load(open("revision/df_representation_v3.pickle", "rb")),
    pickle.load(open("revision/df_representation_v1.pickle", "rb"))
], axis=1)
pickle.dump(df_representation_v4, open("revision/df_representation_v4.pickle", "wb"))

## Seleccionar columnas

In [165]:
def get_best_f1(df: pd.DataFrame, indexes, ranked_cols, skip) -> int:
    f1_weight = []
    for num_cols in range(1, len(ranked_cols) + 1, skip):
        x = df.loc[indexes][ranked_cols[:num_cols+1]]
        y = DF_TRAIN.loc[x.index]["int"]
        clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
        cv_results = cross_validate(clf, x, y, cv=5, scoring="f1_weighted")
        test_score = cv_results["test_score"]
        f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    return sorted(f1_weight, key=lambda i: i[1], reverse=True)[0][0]


def get_fine_best_f1(df: pd.DataFrame, indexes, ranked_cols, skip) -> int:
    best_f1 = get_best_f1(df, indexes, ranked_cols, skip)
    fine_f1_weight = []
    for num_cols in range(best_f1 - skip, best_f1 + skip, 1):
        x = df.loc[indexes][ranked_cols[:num_cols + 1]]
        y = DF_TRAIN.loc[x.index]["int"]
        clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
        cv_results = cross_validate(clf, x, y, cv=5, scoring="f1_weighted")
        test_score = cv_results["test_score"]
        fine_f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    return sorted(fine_f1_weight, key=lambda i: i[1], reverse=True)[0][0]


def get_indexes_and_ranked_cols(df: pd.DataFrame, sen: str):
    indexes = DF_TRAIN[DF_TRAIN["sen"] == sen].index
    indexes_lh = DF_TRAIN.loc[indexes][DF_TRAIN.loc[indexes]["int"].isin(["low", "high"])].index

    x = df.loc[indexes_lh]
    y = DF_TRAIN.loc[x.index]["int"]

    dic_label_count = y.value_counts().to_dict()
    min_label = min(dic_label_count.items(), key=lambda i: i[1])[0]
    max_label = max(dic_label_count.items(), key=lambda i: i[1])[0]
    index_label_1 = y[y==min_label].index
    oversampling_steps = int(dic_label_count[max_label] / dic_label_count[min_label]) - 1
    x_res, y_res = x.copy(), y.copy()

    for step in range(oversampling_steps):
        new_indexes = [f"{idx}+{step + 1}" for idx in index_label_1]
        copied_sub_x = pd.DataFrame(x.loc[index_label_1].values, columns=x.columns, index=new_indexes)
        copied_sub_y = pd.Series(y.loc[index_label_1].values, index=new_indexes)
        x_res = pd.concat([x_res, copied_sub_x], axis=0)
        y_res = pd.concat([y_res, copied_sub_y], axis=0)

    x_res = pd.DataFrame(StandardScaler().fit_transform(x_res), columns=x_res.columns, index=x_res.index)
    selector = SelectKBest(chi2, k=x.shape[1])
    x_res_ = x_res - x_res.min()
    selector.fit(x_res_, y_res)
    scores_selector = {col: selector.scores_[i] if str(selector.scores_[i]) != "nan" else 0 for i, col in enumerate(x.columns.tolist())}
    ranked_cols = [x[0] for x in sorted(scores_selector.items(), key=lambda i: i[1], reverse=True)]
    return indexes, ranked_cols


def dump_cols_selected(df: pd.DataFrame, version: int, skip=10) -> None:
    for sen in SENTIMENTS:
        indexes, ranked_cols = get_indexes_and_ranked_cols(df, sen)
        pickle.dump(ranked_cols[:get_fine_best_f1(df, indexes, ranked_cols, skip) + 1], open(f"revision/cols_selected_{sen}_v{version}.pickle", "wb"))

In [151]:
dump_cols_selected(pickle.load(open("revision/df_representation_v1.pickle", "rb")), 1)
dump_cols_selected(pickle.load(open("revision/df_representation_v2.pickle", "rb")), 2)
dump_cols_selected(pickle.load(open("revision/df_representation_v3.pickle", "rb")), 3, 1)
dump_cols_selected(pickle.load(open("revision/df_representation_v4.pickle", "rb")), 4)

## Evaluación

In [167]:
METRICS = "auc kappa accuracy".split()
N_FITS = 100

summary_baseline = pd.DataFrame({
    "sen": SENTIMENTS,
    "auc": [0.62, 0.67, 0.65, 0.67],
    "kappa": [0.07, 0.15, 0.18, 0.19],
    "accuracy": [0.63, 0.57, 0.54, 0.55]
}).set_index("sen")

df_representation_v1 = pickle.load(open("revision/df_representation_v1.pickle", "rb"))
df_representation_v2 = pickle.load(open("revision/df_representation_v2.pickle", "rb"))
df_representation_v3 = pickle.load(open("revision/df_representation_v3.pickle", "rb"))
df_representation_v4 = pickle.load(open("revision/df_representation_v4.pickle", "rb"))

In [168]:
def auc_score(test_set, predicted_set) -> float:
    high_predicted = np.array([prediction[2] for prediction in predicted_set])
    medium_predicted = np.array([prediction[1] for prediction in predicted_set])
    low_predicted = np.array([prediction[0] for prediction in predicted_set])
    high_test = np.where(test_set == 'high', 1.0, 0.0)
    medium_test = np.where(test_set == 'medium', 1.0, 0.0)
    low_test = np.where(test_set == 'low', 1.0, 0.0)
    auc_high = roc_auc_score(high_test, high_predicted)
    auc_med = roc_auc_score(medium_test, medium_predicted)
    auc_low = roc_auc_score(low_test, low_predicted)
    auc_w = (low_test.sum() * auc_low + medium_test.sum() * auc_med +
             high_test.sum() * auc_high) / (
                 low_test.sum() + medium_test.sum() + high_test.sum())
    return auc_w

In [169]:
def get_summary(df: pd.DataFrame, version: int) -> dict:
    summary = {sen: {l: [] for l in METRICS} for sen in SENTIMENTS}
    for sen in SENTIMENTS:
        for _ in range(N_FITS):
            indexes = DF_TRAIN[DF_TRAIN["sen"] == sen].index
            cols_selected_sen = pickle.load(open(f"revision/cols_selected_{sen}_v{version}.pickle", "rb"))

            x = df.loc[indexes][cols_selected_sen]
            y = DF_TRAIN.loc[x.index]["int"]

            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=np.random.randint(1, x.shape[0]))
            y_train = y_train.replace({"low": 0, "medium": 1, "high": 2})
            clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight="balanced", probability=True))
            clf.fit(x_train, y_train)
            clf.score(x_test, y_test.replace({"low": 0, "medium": 1, "high": 2}))
            y_pred = clf.predict_proba(x_test)
            predicted_labels = [
                INTENSITIES[np.argmax(item)] for item in y_pred
            ]

            auc = round(auc_score(y_test, y_pred), 3)
            kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
            accuracy = round(accuracy_score(y_test, predicted_labels), 3)
            summary[sen]["auc"].append(auc)
            summary[sen]["kappa"].append(kappa)
            summary[sen]["accuracy"].append(accuracy)
    return summary


def get_evaluation(filename: str) -> pd.DataFrame:
    return pd.DataFrame({l: {sen: round(np.mean(pickle.load(open(filename, "rb"))[sen][l]), 3) for sen in SENTIMENTS} for l in METRICS})

In [153]:
pickle.dump(get_summary(df_representation_v1, 1), open("revision/summary_v1.pickle", "wb"))
pickle.dump(get_summary(df_representation_v2, 2), open("revision/summary_v2.pickle", "wb"))
pickle.dump(get_summary(df_representation_v3, 3), open("revision/summary_v3.pickle", "wb"))
pickle.dump(get_summary(df_representation_v4, 4), open("revision/summary_v4.pickle", "wb"))

In [12]:
get_evaluation("summary_v1.pickle")

Unnamed: 0,auc,kappa,accuracy
anger,0.721,0.282,0.68
fear,0.751,0.366,0.648
joy,0.785,0.409,0.659
sadness,0.717,0.286,0.59


In [13]:
get_evaluation("summary_v2.pickle")

Unnamed: 0,auc,kappa,accuracy
anger,0.764,0.309,0.698
fear,0.775,0.369,0.653
joy,0.796,0.427,0.673
sadness,0.779,0.376,0.639


In [128]:
get_evaluation("revision/summary_v1.pickle")

Unnamed: 0,auc,kappa,accuracy
anger,0.713,0.152,0.654
fear,0.703,0.306,0.618
joy,0.764,0.367,0.635
sadness,0.725,0.304,0.596


In [154]:
get_evaluation("revision/summary_v2.pickle")

Unnamed: 0,auc,kappa,accuracy
anger,0.747,0.286,0.695
fear,0.753,0.338,0.641
joy,0.763,0.347,0.634
sadness,0.738,0.296,0.603


In [88]:
summary_baseline

Unnamed: 0_level_0,auc,kappa,accuracy
sen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anger,0.62,0.07,0.63
fear,0.67,0.15,0.57
joy,0.65,0.18,0.54
sadness,0.67,0.19,0.55


In [171]:
get_evaluation("revision/summary_v3.pickle")

Unnamed: 0,auc,kappa,accuracy
anger,0.569,0.0,0.657
fear,0.598,0.009,0.558
joy,0.592,0.02,0.528
sadness,0.584,0.001,0.521


In [174]:
get_evaluation("revision/summary_v4.pickle")

Unnamed: 0,auc,kappa,accuracy
anger,0.63,0.093,0.65
fear,0.715,0.288,0.616
joy,0.767,0.369,0.635
sadness,0.713,0.294,0.597
