# core

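> Weak-supervision walkthrough on YouTube comments with Snorkel: labeling functions, a label model, synonym-based augmentation, a slicing function, and a simple text classifier.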

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
# !pip install snorkel

In [None]:
import re

import pandas as pd

In [None]:
# Raw YouTube comment export; the path is relative to the notebook's directory.
comments_csv = '../all_sqlbelle_youtube_comments_4200rows.csv'
df_train = pd.read_csv(comments_csv)

In [None]:
# Last two comments, final row first (positional slice walking back from the end).
df_train["textDisplay"].iloc[:-3:-1]

4201                              Thank you George :)
4200    Awesome! Looking forward to many more videos.
Name: textDisplay, dtype: object

In [None]:
# Show the final two rows with every column.
df_train.iloc[-2:]

Unnamed: 0,response_kind,vid_channel_id,vid_id,vid_published_at,vid_title,vid_desc,vid_chan_title,is_live_brodcast,vid_publish_time,view_count,...,canRate,viewerRating,likeCount,publishedAt,updatedAt,tcomment_id,reply_id,canReply,totalReplyCount,isPublic
4200,youtube#commentThread,,ZxcWBnXY8ps,,,,,,,,...,True,none,1.0,2020-09-02T06:16:13Z,2020-09-02T06:16:13Z,UgzO3LCmld1HWW3XpNt4AaABAg,,True,1.0,True
4201,youtube#comment,,ZxcWBnXY8ps,,,,,,,,...,True,none,0.0,2020-09-03T05:16:12Z,2020-09-03T05:16:12Z,UgzO3LCmld1HWW3XpNt4AaABAg,UgzO3LCmld1HWW3XpNt4AaABAg.9D5QmTk8TpI9D7thkihuok,,,


#### Create simple labels to start

In [None]:
# Label vocabulary shared by every labeling function below:
# -1 = no vote, 0 = legitimate comment, 1 = spam.
abstain, not_spam, spam = -1, 0, 1

#### Keyword: simple function to flag comments that use "my"

In [None]:
from snorkel.labeling import labeling_function


@labeling_function()
def lf_keyword_my(x):
    """Vote spam when the comment contains the standalone word "my".

    Many spam comments talk about 'my channel', 'my video', etc. The match is
    case-insensitive and anchored on word boundaries, so words that merely
    contain those letters (e.g. "army", "mystery") do not trigger the vote.

    Args:
        x: data point exposing the comment string as ``x.text``.

    Returns:
        ``spam`` if the word is present, otherwise ``abstain``.
    """
    return spam if re.search(r"\bmy\b", x.text, flags=re.I) else abstain

#### Regular Expression: simple function

In [None]:
import re


@labeling_function()
def lf_regex_check_out(x):
    """Vote spam for 'check ... out' phrasing ('check out my video', 'check it out')."""
    # Case-insensitive: "check", then anything on the same line, then "out".
    hit = re.search(r"check.*out", x.text, flags=re.I)
    if hit:
        return spam
    return abstain

#### Arbitrary Heuristics

In [None]:
@labeling_function()
def lf_short_comment(x):
    """Vote not-spam for comments under five whitespace-separated words ('cool video!')."""
    word_count = len(x.text.split())
    if word_count < 5:
        return not_spam
    return abstain

#### Third Party Models:
> - We use a third-party sentiment classification model, TextBlob.
> - We combine this with the heuristic that non-spam comments are often positive.

In [None]:
# !pip install textblob

In [None]:
from textblob import TextBlob


@labeling_function()
def lf_textblob_polarity(x):
    """Vote not-spam when TextBlob scores the comment's polarity above 0.3."""
    polarity = TextBlob(x.text).sentiment.polarity
    if polarity > 0.3:
        return not_spam
    return abstain

# Combining & Cleaning the Labels

In [None]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

In [None]:
# Toy dataset: comment text of the first 100 rows, NaNs dropped, cast to str.
# reset_index() turns the original row label into a column, renamed to `num`.
# NOTE: this cell rebinds `df_train` from the raw frame to the cleaned one, so
# re-running it without reloading the CSV fails (no `textDisplay` column left).
df = (
    df_train["textDisplay"]
    .iloc[:100]
    .dropna()
    .astype(str)
    .reset_index()
    .rename(columns={"index": "num", "textDisplay": "text"})
)

# `df` and `df_train` refer to the same object from here on.
df_train = df
df_train.head(3)

Unnamed: 0,num,text
0,45,Thank you. Very clear and informative
1,46,Really good content. Looking forward to an ad...
2,47,"Hello Belle, I am having a problem, when I am ..."


In [None]:
# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

100%|██████████| 55/55 [00:00<00:00, 456.43it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.072]
INFO:root:[50 epochs]: TRAIN:[loss=0.003]
 11%|█         | 56/500 [00:00<00:00, 557.85epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.002]
 24%|██▍       | 122/500 [00:00<00:00, 615.84epoch/s]INFO:root:[150 epochs]: TRAIN:[loss=0.002]
INFO:root:[200 epochs]: TRAIN:[loss=0.001]
 41%|████      | 205/500 [00:00<00:00, 711.17epoch/s]INFO:root:[250 epochs]: TRAIN:[loss=0.001]
 55%|█████▌    | 277/500 [00:00<00:00, 702.15epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.001]
 70%|██████▉   | 349/500 [00:00<00:00, 706.93epoch/s]INFO:root:[350 epochs]: TRAIN:[loss=0.000]
INFO:root:[400 epochs]: TRAIN:[loss=0.000]
 84%|████████▍ | 420/500 [00:00<00:00, 509.34epoch/s]INFO:root:[450 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 601.04epoch/s]
INFO:root:Finished Training


#### Filter out all rows on which the label model abstained

In [None]:
# Keep only rows that received an actual label (anything except `abstain`).
has_label = df_train["label"] != abstain
df_train = df_train[has_label]

#### Augmentation Transformers
> Create functions that use NLTK WordNet synonyms to add more versions of our data for model training

In [None]:
import random

import nltk
from nltk.corpus import wordnet as wn

from snorkel.augmentation import transformation_function

nltk.download("wordnet", quiet=True)


def get_synonyms(word):
    """Get the synonyms of word from Wordnet.

    Lemma names from every synset of ``word`` are lower-cased and have their
    underscores replaced by spaces; the word itself (normalized the same way)
    is excluded.

    Args:
        word: the word to look up.

    Returns:
        Alphabetically sorted list of distinct synonyms (possibly empty).
        Sorting makes the order, and therefore the caller's first-synonym
        pick, stable from one interpreter session to the next; a raw set's
        iteration order is not.
    """
    candidates = {
        lemma.name().lower().replace("_", " ")
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }
    # Compare in the same normalized form the candidates are stored in.
    candidates.discard(word.lower().replace("_", " "))
    return sorted(candidates)


@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace a random word with a synonym.

    The text is lower-cased and split on whitespace, one word is picked at
    random, and it is replaced by the first synonym ``get_synonyms`` returns.

    Args:
        x: data point exposing the comment string as ``x.text``.

    Returns:
        ``x`` with its ``text`` rewritten, or ``None`` when the text has no
        words or the chosen word has no synonym (nothing to transform).
    """
    words = x.text.lower().split()
    if not words:
        # Empty / whitespace-only text: there is no word to pick.
        return None
    idx = random.randrange(len(words))
    synonyms = get_synonyms(words[idx])
    if not synonyms:
        return None
    x.text = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1 :])
    return x

#### Apply these functions to our dataset

In [None]:
# Open Multilingual Wordnet data; presumably needed by the installed NLTK's
# WordNet reader for the synonym lookup — TODO confirm for the pinned version.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/davidramsey/nltk_data...


True

In [None]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

# Seed Python's RNG so the word picked by tf_replace_word_with_synonym is
# repeatable across runs (same seed value as the label model fit).
random.seed(123)

# Request two augmented copies per row and keep the original row as well.
tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train)

100%|██████████| 34/34 [00:02<00:00, 11.93it/s]


### Writing a Slicing Function
> Create functions that flag ("slice") subsets of the data that deserve separate attention, such as comments containing shortened links

In [None]:
from snorkel.slicing import slicing_function


@slicing_function()
def short_link(x):
    """Flag text containing a shortened ".ly"-style link: 1 if found, else 0."""
    has_short_link = re.search(r"\w+\.ly", x.text) is not None
    return int(has_short_link)

### Train a Classifier

In [None]:
# Preview the labeled rows rather than rendering the whole frame.
df_train.head(10)

Unnamed: 0,num,text,label
1,46,Really good content. Looking forward to an ad...,0
3,48,Do u hv any udemy course belle??,0
4,49,Thank you s o much! you are awesome teacher,0
10,55,recommend this for anyone who want to learn ta...,0
11,56,Thank you Michal! :),0
12,57,All your videos are amazing! Thank you for you...,0
13,58,Thank you Suganthi!,1
15,60,"Thank you Saurabh, I appreciate the kind comme...",0
16,61,Thanks for sharing the great detailed presenta...,0
17,62,Thank you Guna!,1


In [None]:
# Plain loop: a list comprehension used only for printing also echoes a
# meaningless [None, None, None] as the cell result.
for comment in df_train_augmented.text[:3]:
    print(comment)

Really good content.  Looking forward to an advanced table calc video soon.  Super helpful looking at financials over time
really good content. looking forward to an advanced board calc video soon. super helpful looking at financials over time
Do u hv any udemy course belle??


[None, None, None]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

train_text = df_train_augmented.text.tolist()

# Keep the fitted vectorizer: its vocabulary is required to transform any text
# the classifier is later asked to score (vectorizer.transform(new_text)).
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_text)

clf = LogisticRegression(solver="lbfgs")
clf.fit(X=X_train, y=df_train_augmented.label.values)

LogisticRegression()

In [None]:
#| export
def foo():
    """Placeholder export; does nothing and returns None."""
    return None

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the
# module named by `default_exp` — confirm against the project's nbdev setup).
import nbdev; nbdev.nbdev_export()