In [8]:
from pathlib import Path
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option("display.max_colwidth", None)

In [3]:
data_path = Path("../data")
emotions_1_path = data_path/"goemotions_1.csv"

df = pd.read_csv(emotions_1_path)

In [4]:
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [9]:
# Excitement examples
df[["text", "excitement"]].loc[lambda d: d["excitement"] == 1].sample(5)

Unnamed: 0,text,excitement
34496,"Wow. I have to say, I am relieved. Pretty incredible idea.",1
2395,You should check out my comment at the bottom of this thread of why I somewhat support antifa. This comic is obviously fearmongering conservatives.,1
33550,This is me and my wife. She squeals with delight whenever it snows and I start looking for property in Arizona.,1
15760,I'm so doing this!,1
13877,Simply amazing.,1


In [10]:
# No excitement examples
df[["text", "excitement"]].loc[lambda d: d["excitement"] == 0].sample(5)

Unnamed: 0,text,excitement
46482,I saw one that us taking Greedy William's at 6 with the Jaga taking [NAME] at 7 lol. Cllelin ferrel was at 4,0
53849,I legit wish you never talked to me,0
46034,I think she was trying to demonstrate that such solutions are practical even if politically difficult. But it was a sloppy analogy.,0
39474,Because killing people is wrong?,0
27614,Doesn't state strangers either. Assume some more.,0


In [11]:
df["excitement"].value_counts()

0    68100
1     1900
Name: excitement, dtype: int64

In [12]:
X, y = df["text"], df["excitement"]

pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(class_weight="balanced", max_iter=1000)
)

In [13]:
%%time

pipe.fit(X, y)

Wall time: 3.73 s


Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

## Trick 1: Model Uncertainty

This trick consists in building a model that outputs probabilities. To say that the model is uncertain of its predictions, we'll look at probabilities in the range ]0.45, 0.55[.

**Safouane Note**: Using this interval makes the assumption that the model is calibrated. As we're using logistic regression, there is no need to recalibrate the model. Had we been using Random forest for example, this assumption wouldn't have been valid😅

In [14]:
# Column at 0-th position => predictions for non-excitement
# Column at 1st position => predictions for excitement
pipe.predict_proba(X)

array([[0.81906852, 0.18093148],
       [0.87337871, 0.12662129],
       [0.99887474, 0.00112526],
       ...,
       [0.95766974, 0.04233026],
       [0.8940276 , 0.1059724 ],
       [0.97989241, 0.02010759]])

In [16]:
# The 0-th position corresponds to no excitment predictions
probas = pipe.predict_proba(X)[:, 0]

# See what examples the model is uncertain about
# when it comes to predicting excitement
(
    df
    .loc[(probas > 0.45) & (probas < 0.55)]
    [["text", "excitement"]]
    .head(7)
)

Unnamed: 0,text,excitement
8,that's adorable asf,0
46,"If there’s a pattern, yes.",0
107,My fans on patreon will be rewarded soon,0
154,"Ones with close ties to SA, anyway. An escaped apostate won't exactly be itching to run home.",0
158,I really like this ring so I’m glad to hear that.,0
262,OMG THOSE TINY SHOES! *desire to boop snoot intensifies*,0
362,This. I relate to this. So much. Almost too much.,0


Examples at rows 262 and 362 should have annotated as excitement examples.

## Trick 2: Model Disagreement

**Idea 1**: The starting idea consists in training a model to predict the label ***excitement*** and looking at the rows of the training set where the model gets the predictions wrong. This will likely return some bad labels.

In [18]:
df.loc[lambda d: d["excitement"] != pipe.predict(X)].shape

(5315, 37)

The problem with this method is that it will likely return many rows (all the rows of the training set that the model gets wrong)!

**Idea 2**: This method consists of two steps:
- ***Step 1***: Get the rows of the training set where the model gets the predictions wrong (= Idea 1)
- ***Step 2***: Limit the number of rows by sorting the probabilities of the true class in ascending order and taking only the K-top rows. The idea is that the not only the model got the class wrong but it also assigned very low probabilities to the true class.

In [19]:
def correct_class_confidence(X, y, mod):
    """
    Gives the predicted confidence (or proba) associated
    with the correct label `y` from a given model.
    """
    probas = mod.predict_proba(X)
    # values is a list that will contain the probabilty predictions
    # of the true class
    values = []
    for i, proba in enumerate(probas):
        # proba contains the probability predictions for non-excitement
        # and excitement.
        # proba_dict is dictionary with both classes and their corresponding
        # probability prediction
        proba_dict = {mod.classes_[j]: v for j, v in enumerate(proba)}
        values.append(proba_dict[y[i]])
    return values

In [20]:
# Might be wrongly annotated as excitement
(
    df
    .assign(confidence=correct_class_confidence(X, y, pipe)) # confidence is the proba of the true class
    .loc[lambda d: pipe.predict(d["text"]) != d["excitement"]]
    [["text", "excitement", "confidence"]]
    .sort_values("confidence")
    .loc[lambda d: d["excitement"] == 1]
    .head(20)
)

Unnamed: 0,text,excitement,confidence
5762,Thank you so much.,1,0.142695
28099,Get you some!!,1,0.162535
40598,There it is!,1,0.17249
20350,LOL [NAME] AND [NAME] JUST PLAYING WITH THEM,1,0.175862
44431,[NAME]? Is that you?,1,0.196345
15760,I'm so doing this!,1,0.225074
12012,i said first world :),1,0.241769
13356,> What? I second this,1,0.260638
31360,This! :-),1,0.270166
7233,This!!! 🐃 and 💍 for your hard work!,1,0.280482


In [21]:
# Might be wrongly annotated as non-excitement
(
    df
    .assign(confidence=correct_class_confidence(X, y, pipe)) # confidence is the proba of the true class
    .loc[lambda d: pipe.predict(d["text"]) != d["excitement"]]
    [["text", "excitement", "confidence"]]
    .sort_values("confidence")
    .loc[lambda d: d["excitement"] == 0]
    .head(20)
)

Unnamed: 0,text,excitement,confidence
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0,0.000148
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0,0.000262
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0,0.000595
20823,"Wow, your posting history is a real... interesting ride.",0,0.000719
69395,"Wow, your posting history is a real... interesting ride.",0,0.000719
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0,0.00074
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0,0.000813
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0,0.00113


## Trick 3: Use [Cleanlab](https://cleanlab.readthedocs.io) to discover bad labels!

In [22]:
from cleanlab.pruning import get_noise_indices

In [23]:
# get_noise_indices returns indices that are worth checking
ordered_label_errors = get_noise_indices(
    s=y,
    psx=pipe.predict_proba(X),
    sorted_index_method="prob_given_label"
)

In [24]:
df.iloc[ordered_label_errors][["text", "excitement"]].head(20)

Unnamed: 0,text,excitement
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0
20823,"Wow, your posting history is a real... interesting ride.",0
69395,"Wow, your posting history is a real... interesting ride.",0
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0


## Trick 4: Use [Cleanlab](https://cleanlab.readthedocs.io) to learn with noisy labels!

In [25]:
from cleanlab.classification import LearningWithNoisyLabels

In [26]:
# Wrap around any classifier that has `sample_weights`
fresh_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(class_weight="balanced", max_iter=1000)
)

# lnl should be better equipped to deal with noisy labels
lnl = LearningWithNoisyLabels(clf=fresh_pipe)
lnl.fit(X=X, s=y.values)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

The model from cleanlab should be better equipped to deal with bad labels. If one wants to also check for bad labels using this approach, cleanlab model can be compared to another model that is trained on noisy labels. The outputs that are different should yield a direction to look into.

In [27]:
new_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(class_weight="balanced", max_iter=1000)
)

new_pipe.fit(X=X, y=y)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [28]:
df[lnl.predict(X) != new_pipe.predict(X)][["text", "excitement"]].sample(5)

Unnamed: 0,text,excitement
19128,Did not expect it to work with a body on it..,1
14762,Tailed. Let's get it!,1
29680,I almost pissed myself waiting so long in the tunnel. Not a fun feeling,0
65385,"The Patriots dethroned the Rams and the greatest show on turf, it'd be cool for the Rams to do the same now.",0
3432,i'll be waiting for your post about having a crush on a straight guy tomorrow.,0
