In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

df = pd.read_csv("data/annotations.csv")[['text', 'excitement']].drop_duplicates()

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

## Embeddings

In [10]:
%pip install "whatlies[tfhub]"

In [7]:
from sklearn.preprocessing import QuantileTransformer
from whatlies.language import BytePairLanguage, UniversalSentenceLanguage

2022-05-06 14:08:22.054359: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-06 14:08:22.054383: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Let's first train a bag of words model.

In [29]:
%%time

model_cv = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
model_cv.fit(list(df['text']), df['excitement'])

CPU times: user 11.3 s, sys: 39 s, total: 50.3 s
Wall time: 4.79 s


Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression(max_iter=1000))])

Let's also use BytePair embeddings.

In [8]:
%%time

model_bp = make_pipeline(BytePairLanguage("en"), QuantileTransformer(), LogisticRegression(max_iter=1000))
model_bp.fit(list(df['text']), df['excitement'])

CPU times: user 20.3 s, sys: 10.8 s, total: 31.1 s
Wall time: 8.82 s


Pipeline(steps=[('bytepairlanguage', BytePairLanguage(lang='en')),
                ('quantiletransformer', QuantileTransformer()),
                ('logisticregression', LogisticRegression(max_iter=1000))])

Let's also use the Universal Sentence Encoder embeddings.

In [11]:
%%time 
model_use = make_pipeline(UniversalSentenceLanguage(), QuantileTransformer(), LogisticRegression(max_iter=1000))
model_use.fit(list(df['text']), df['excitement'])

2022-05-06 14:08:44.760606: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-06 14:08:44.760756: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-06 14:08:44.760763: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-06 14:08:44.760778: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pop-os): /proc/driver/nvidia/version does not exist
2022-05-06 14:08:44.760891: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

CPU times: user 5min 15s, sys: 56.2 s, total: 6min 11s
Wall time: 1min 31s


Pipeline(steps=[('tfhublanguage',
                 TFHubLanguage(url='https://tfhub.dev/google/universal-sentence-encoder/4')),
                ('quantiletransformer', QuantileTransformer()),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [None]:
%%time
df_pred = (df
           .assign(pred_cv=lambda d: model_cv.predict(d['text']), 
                   proba_cv=lambda d: model_cv.predict_proba(d['text'])[:, 1]))

In [None]:
%%time 

df_pred = (df_pred
           .assign(pred_bp=lambda d: model_bp.predict(list(d['text'])), 
                   proba_bp=lambda d: model_bp.predict_proba(list(d['text']))[:, 1]))

In [12]:
%%time 

df_pred = (df_pred
           .assign(pred_use=lambda d: model_use.predict(list(d['text'])), 
                   proba_use=lambda d: model_use.predict_proba(list(d['text']))[:, 1]))

CPU times: user 6min 6s, sys: 28.6 s, total: 6min 35s
Wall time: 2min 22s


In [35]:
name = 'use'

(df_pred
 .loc[lambda d: d[f'pred_{name}'] != d['excitement']]
 .sort_values(f"proba_{name}", ascending=False)
 .head(10)
 [['text', 'excitement', f'pred_{name}', f'proba_{name}']])

Unnamed: 0,text,excitement,pred_use,proba_use
60361,"omg I remember be exactly like this and being so excited, gahhhh I might even believe I’m more excited than you hahahah.",0,1,0.821158
127857,I’m so excited for next season. My expectations are slowly climbing.,0,1,0.732249
46358,"Congratulations! Idk how long you've been living here, but welcome to Colorado!! :D I hope you're enjoying it!",0,1,0.710787
181343,Yeah let's wrap up AS4 I'm suddenly so much more excited for S11,0,1,0.709047
3058,"Ahhh, my birthday month. I must go..",0,1,0.694658
90678,Exciting finish here.,0,1,0.687051
107058,enjoy it and happy New Year!! I was at the cheesecake factory yesterday with one of my kids.. had a blast,0,1,0.671981
96902,Woo hoo!,0,1,0.670619
83352,happy early birthday OP! have fun !,0,1,0.65938
7335,"Awesome, let's celebrate! Send the free pack over here ;)",0,1,0.657701


<br><br><br><br><br><br><br><br><br><br><br>

In [28]:
df_pred.loc[lambda d: d['pred_cv'] != d['pred_use']][['text', 'excitement']].iloc[10:20]

Unnamed: 0,text,excitement
1646,"starting 2019 of with a banger, can't wait what else will happen this year",0
1687,I’m glad to see [NAME] finally get the big screen presence they deserve.,0
1729,Happy new years!!,0
1821,I have to go with [NAME]. Rookie who lived up to the hype and I actually got excited when he got snaps.,0
2031,"It's crisp, no filler material. Interesting protagonists. Amazing music. Good way to spend 6 hours.",0
2111,"Seriously doubt it will be Knights of Ren, but that would certainly be a great way to get people excited and build hype.",1
2361,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0
2812,Wow that’s a fascinating belief,0
2952,Im tiddly at beer oclock. Hello from taranaki new Zealand.,0
2963,Wow... congrats,1
