### Importing clickbait data

The clickbait dataset: https://github.com/bhargaviparanjape/clickbait/tree/master/dataset. Please download the csv file here: https://drive.google.com/file/d/1EwBhBTRdmCCAOObcEAy--TfiWaq78SSE/view?usp=sharing. 

In [10]:
import pandas as pd

# read dataset:
data = pd.read_csv('/Users/kaiyang/Desktop/clickbait_data.csv')
data.head()

Unnamed: 0,title,label
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


### Preprocessing 

In [15]:
# preparing the data: X = titles, y = labels:
titles = data['title']
labels = data['label']

# the frequence of labels:
labels.value_counts()

label
0    15151
1    10074
Name: count, dtype: int64

In [19]:
# Preparing internal datasets for text cleaning:
import nltk   # Natural Language Processing (NLP) resources using the Natural Language Toolkit (NLTK), is used for text processing tasks like tokenization, stemming, lemmatization, and removing stopwords.
nltk.download('stopwords')    # Downloads a list of common stopwords (e.g., "the", "is", "in", "and") from various languages
nltk.download('wordnet')    # Downloads the WordNet lexical database, which helps with lemmatization (reducing words to their base form). Example: "running" → "run", "better" → "good".
nltk.download('omw-1.4')    # Downloads Open Multilingual WordNet (OMW), which provides synonyms and word meanings for multiple languages.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaiyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kaiyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kaiyang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
# the clean function:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

punctuation = set(punctuation)
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def remove_punctuation(text):
    return ''.join([ch for ch in text if ch not in punctuation])

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords])

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(w) for w in text.split(" ")])

def clean(text):
    text = [x.lower() for x in text]    # Convert to lowercase
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

In [25]:
titles = titles.apply(clean)
titles[0:10]

0                                            get bings
1                        tv female friend group belong
2        new star war force awakens trailer give chill
3    vine new york celebrity big brother fucking pe...
4    couple stunning photo shoot baby learning inop...
5           flirt queer girl without making total fool
6          32 cute thing distract awkward thanksgiving
7                              disney princess florida
8          whats quote lyric best describes depression
9    natalie dormer sam claflin play game see theyd...
Name: title, dtype: object

### A Simple Baseline Model (logistic regression)

In [28]:
from sklearn import model_selection, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [30]:
# split dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(titles, labels, test_size=0.33, random_state=42)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}') #\w{1,} means 1 or more words

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.fit_transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
xtrain_tfidf =  tfidf_vect.fit_transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [32]:
# run a logistc regression model
classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)

In [34]:
# predictions and performance metrics:
predictions = classifier.predict(xvalid_count)
print(metrics.accuracy_score(predictions, valid_y))
print(metrics.precision_score(predictions, valid_y))
print(metrics.recall_score(predictions, valid_y))
print(metrics.f1_score(predictions, valid_y))

0.9495495495495495
0.9101258238466148
0.9620012666244458
0.9353448275862069


In [36]:
# using TF-IDF instead of term frequencency(count):
classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_tfidf, train_y)
predictions = classifier.predict(xvalid_tfidf)
print(metrics.accuracy_score(predictions, valid_y))
print(metrics.precision_score(predictions, valid_y))
print(metrics.recall_score(predictions, valid_y))
print(metrics.f1_score(predictions, valid_y))

0.9421021021021021
0.8786698621929299
0.9744186046511628
0.9240705734089477


### Reusable Classifier

In [39]:
# use the fitted classifier to make predictions on new data:
text = "At Town Hall Event, Harris Agrees That Trump Is a Fascist"
text = clean(text)
classifier.predict(count_vect.transform([text]))

array([0])

In [41]:
# define a function to test any texts if they are clickbait or not:
def clickbait(text):
    text = clean(text)
    r = classifier.predict(count_vect.transform([text]))[0]
    if r == 1:
        return "Clickbait"
    else:
        return "Not clickbait"

In [43]:
clickbait("The Perfect Pizza Doesn't Exi-: A Slice of Heaven in Every Bite!")

'Clickbait'

The end of the session!