In [1]:
import pandas as pd
import numpy as np
import nltk
import time
import random
from IPython.core.display import display
from sklearn.neighbors import KNeighborsClassifier
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from time import time
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Gutierya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Gutierya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Gutierya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Data sorting 1/2

In [2]:
#pulling in non clickbait txt file
raw_nonclick = pd.read_csv('./data/non_clickbait_data.txt',
                   sep='delimiter', header=None,
                   names=['feature', 'target'])
#adding float 0 'false' value to non clickbait df
target_nonclick = raw_nonclick['target'] = 0.0
#peeking at df table
print('Non clickbait Data: ')
display(raw_nonclick.head())
#pulling in clickbait txt file
raw_click = pd.read_csv('./data/clickbait_data.txt',
                   sep='delimiter', header=None,
                   names=['feature', 'target'])
#adding float 1 'true' value to non clickbait df
target_click = raw_click['target'] = 1.0
#peeking at df table
print('Clickbait Data: ')
display(raw_click.head())

#Merging 2 files into one
print('Merged Data: ')
merged = pd.concat([raw_click, raw_nonclick], ignore_index=True, sort=True)
display(merged.head())

#Creating two variables for feature and target
X = merged['feature']
Y = merged['target']


#Optionally instead of zeroes and ones for target values, using strings:
    # "clickbait" vs "nonclickbait"
#creating copies of previous df's
raw_nonclick_copy = raw_nonclick
raw_click_copy = raw_click
#assigning string values to 'target' column for both txt files
y_str_nonclick = raw_nonclick_copy['target'] = "not clickbait"
y_str_click = raw_click_copy['target'] = "clickbait"
#Merging 2 files into one (*but with string type target values vs floats)
Y_String = pd.concat([raw_click_copy, raw_nonclick_copy],sort=True)

Non clickbait Data: 


  return func(*args, **kwargs)


Unnamed: 0,feature,target
0,Bill Changing Credit Card Rules Is Sent to Oba...,0.0
1,"In Hollywood, the Easy-Money Generation Toughe...",0.0
2,1700 runners still unaccounted for in UK's Lak...,0.0
3,Yankees Pitchers Trade Fielding Drills for Put...,0.0
4,Large earthquake rattles Indonesia; Seventh in...,0.0


Clickbait Data: 


  return func(*args, **kwargs)


Unnamed: 0,feature,target
0,Should I Get Bings,1.0
1,Which TV Female Friend Group Do You Belong In,1.0
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1.0
3,"This Vine Of New York On ""Celebrity Big Brothe...",1.0
4,A Couple Did A Stunning Photo Shoot With Their...,1.0


Merged Data: 


Unnamed: 0,feature,target
0,Should I Get Bings,1.0
1,Which TV Female Friend Group Do You Belong In,1.0
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1.0
3,"This Vine Of New York On ""Celebrity Big Brothe...",1.0
4,A Couple Did A Stunning Photo Shoot With Their...,1.0


Data sorting 2/2

In [3]:
# Split data into train & test
X_train, X_test, y_train, y_test = train_test_split(merged['feature'], merged['target'], test_size=5000, random_state=123,
                                                    stratify=merged['target'])# Append sentiment back using indices
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)# Check dimensions
print(f"Train: {train.shape[0]} rows and {train.shape[1]} columns")
print(f"{train['target'].value_counts()}\n")
print(f"Test: {test.shape[0]} rows and {test.shape[1]} columns")
print(test['target'].value_counts())

Train: 27000 rows and 2 columns
0.0    13501
1.0    13499
Name: target, dtype: int64

Test: 5000 rows and 2 columns
0.0    2500
1.0    2500
Name: target, dtype: int64


Preprocessing

In [4]:
def analytics(vectoriser, X):
    start_time = time()
    print(f"\nThere are {vectoriser.fit_transform(X).shape[1]} columns\n")
    end_time = time()
    print(f"This took {round((end_time-start_time),2)} seconds\n")
    word_tokens = list(vectoriser.vocabulary_.keys())
    word_tokens.sort()
    print(f"Sample tokens: {word_tokens[:50]}\n")
    ignored = vectoriser.stop_words_
    if len(ignored)==0:
        print("No token is ignored.")
    elif len(ignored)>50:
        print(f"Sample ignored tokens: {random.sample(ignored, 50)}")
    else:
        print(f"Sample ignored tokens: {ignored}")

def custom_text_preprocessor(corpus):
    tokenizer = RegexpTokenizer(r'[A-Za-z]+')
    word_tokens = tokenizer.tokenize(corpus)
    pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    pos_tagz = pos_tag(word_tokens)
    lem = WordNetLemmatizer()
    word_tokens = [lem.lemmatize(t.lower(), pos=pos_map.get(p[0], 'v')) for t, p in pos_tagz]
    return word_tokens

#tfidf
vectoriser = TfidfVectorizer(analyzer=custom_text_preprocessor, stop_words='english',
                              max_df=.5, max_features=500)
#printing analytics for X_train
analytics(vectoriser, X_train)

#fitting tfidf -- for X_train:
X_train = vectoriser.fit_transform(X_train)
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train


#fitting tfidf -- for X test:
# Fit to the data and transform to feature matrix
X_test = vectoriser.fit_transform(X_test)
# Convert sparse matrix to dataframe
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
# Save mapping on which index refers to which words
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
# Rename each column using the mapping
for col in X_test.columns:
    X_test.rename(columns={col: col_map[col]}, inplace=True)
X_test


#fitting tfidf -- for X:
# Fit to the data and transform to feature matrix
X = vectoriser.fit_transform(X)
# Convert sparse matrix to dataframe
X = pd.DataFrame.sparse.from_spmatrix(X)
# Save mapping on which index refers to which words
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
# Rename each column using the mapping
for col in X.columns:
    X.rename(columns={col: col_map[col]}, inplace=True)
X


There are 500 columns

This took 16.76 seconds

Sample tokens: ['a', 'about', 'actually', 'adorable', 'af', 'afghan', 'afghanistan', 'after', 'again', 'against', 'age', 'aid', 'air', 'all', 'amazing', 'america', 'american', 'an', 'and', 'animal', 'announce', 'another', 'anyone', 'are', 'around', 'arrest', 'as', 'ask', 'at', 'attack', 'australia', 'australian', 'award', 'baby', 'back', 'bad', 'ban', 'bank', 'base', 'be', 'beautiful', 'been', 'before', 'begin', 'being', 'best', 'big', 'bill', 'billion', 'birth']

Sample ignored tokens: ['commander', 'yule', 'easily', 'martyn', 'libra', 'wittle', 'emerisque', 'bremen', 'fortune', 'zendaya', 'suite', 'nicholas', 'sa', 'yummy', 'fassbender', 'kilimanjaro', 'pretender', 'slip', 'stripe', 'cursive', 'prince', 'moo', 'lup', 'angering', 'exchequer', 'temperament', 'frosh', 'offered', 'apec', 'eisenhower', 'scrapped', 'lift', 'sham', 'natalie', 'pascal', 'kiichi', 'zit', 'caitlin', 'nickel', 'harper', 'bind', 'sudden', 'jan', 'nifong', 'argenti

since Python 3.9 and will be removed in a subsequent version.
  print(f"Sample ignored tokens: {random.sample(ignored, 50)}")


Unnamed: 0,a,about,actually,adorable,af,afghan,afghanistan,after,again,against,...,worker,world,worst,would,year,york,you,your,zealand,zodiac
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.217385,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.183638,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.387489,0.000000,0.0,0.0,0.0
4,0.316231,0.0,0.0,0.0,0.0,0.0,0.0,0.236183,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,0.356680,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
31996,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
31997,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
31998,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


K-nearest neighbors

In [5]:
knn = KNeighborsClassifier(n_neighbors=7)

cv_scores_train = cross_val_score(knn, X_train, y_train, cv=5)
cv_scores_test = cross_val_score(knn, X_test, y_test, cv=5)
knn.fit(X_train, y_train)

print("non cv Training accuracy for comparison below: ", knn.score(X_train, y_train))
print("non cv Testing accuracy for comparison below: ", knn.score(X_test, y_test))

print("\ncv accuracy scores for train folds: ", cv_scores_train)
print('cv accuracy scores *mean for train folds: {}'.format(np.mean(cv_scores_train)))
print("\t ±",(100*np.std(cv_scores_train)))
print("\ncv sccuracy scores for test folds: ", cv_scores_test)
print('cv accuracy scores *mean for test folds: {}'.format(np.mean(cv_scores_test)))
print("\t ±",(100*np.std(cv_scores_test)))

non cv Training accuracy for comparison below:  0.9045555555555556
non cv Testing accuracy for comparison below:  0.5286

cv accuracy scores for train folds:  [0.8537037  0.86222222 0.87074074 0.85592593 0.86259259]
cv accuracy scores *mean for train folds: 0.8610370370370373
	 ± 0.5964688041243642

cv sccuracy scores for test folds:  [0.721 0.71  0.721 0.675 0.729]
cv accuracy scores *mean for test folds: 0.7112
	 ± 1.9082976707002475


Naïve Bayes - "multinomial" (using since popular for text classification)

In [6]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

#quick testing - click bait headline to see if I get 'clickbait (1.) or nonclick (0.)'
test_msg = ['If Disney Princesses Were From Florida']
test_msg_counts = vectoriser.transform(test_msg)
classifications = classifier.predict(test_msg_counts)
classifications
    #appears to be working! showing 1 as value which is labeled "Yes" (1.)

#cross validation using a 5-fold cross validator
scores = cross_val_score(classifier, X_train, y_train, cv=5)
scores_ = cross_val_score(classifier, X_test, y_test, cv=5)
#printing accuracy per fold - train
print("cv accuracy scores per train fold: ", scores)
#printing mean accuracy of folds
print(f"cv *mean accuracy of train folds: ", scores.mean(), f"± {100*np.std(scores):.2f}")

#printing accuracy per fold - test
print("\ncv accuracy scores per test fold: ", scores_)
#printing mean accuracy of folds
print(f"cv *mean accuracy of test folds: ", scores_.mean(), f"± {100*np.std(scores_):.2f}")

cv accuracy scores per train fold:  [0.92481481 0.92944444 0.93092593 0.93018519 0.92592593]
cv *mean accuracy of train folds:  0.9282592592592593 ± 0.24

cv accuracy scores per test fold:  [0.92  0.937 0.917 0.918 0.941]
cv *mean accuracy of test folds:  0.9266 ± 1.02


Multilayer perceptron

In [7]:
#using 10 network configs (layer sizes below)
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
classifier_mp = MLPClassifier(hidden_layer_sizes=10,max_iter=30,
                              activation='logistic', learning_rate='adaptive')
mp_scores = []
mp_scores_test = []

for train_indices, test_indices in kfold.split(X_train, y_train):
    classifier_mp.fit(X_train, y_train)
    mp_scores.append(classifier_mp.score(X_train, y_train))
classifier_mp.fit(X_train, y_train)

for train_indices, test_indices in kfold.split(X_test, y_test):
    classifier_mp.fit(X_test, y_test)
    mp_scores_test.append(classifier_mp.score(X_test, y_test))
classifier_mp.fit(X_test, y_test)

predictions_mp = classifier_mp.predict(X_test)
scores_mp = accuracy_score(y_test, predictions_mp)#cross validation using a 5-fold cross validator
print("Mean prediction accuracy: ", scores_mp)
print("Accuracy on training set ", classifier.score(X_train, y_train))
print("Accuracy on test set ", classifier.score(X_test, y_test))

print("\nBelow metrics of interest from Cross Validation - train ~ ")
print('\tKFold CV scores for:');[print(f'{(100*score):.2f}%') for score in mp_scores]
print(f'\n*Mean score of KFold CV: {100*np.mean(mp_scores):.2f}% ± {100*np.std(mp_scores):.2f}%')

print("\nBelow metrics of interest from Cross Validation - testing ~ ")
print('\tKFold CV scores for:');[print(f'{(100*s):.2f}%') for s in mp_scores_test]
print(f'\n*Mean score of KFold CV: {100*np.mean(mp_scores_test):.2f}% ± {100*np.std(mp_scores_test):.2f}%')



Mean prediction accuracy:  0.9398
Accuracy on training set  0.9302222222222222
Accuracy on test set  0.6672

Below metrics of interest from Cross Validation - train ~ 
	KFold CV scores for:
94.82%
94.85%
94.80%
94.81%
94.85%
94.85%
94.84%
94.84%
94.87%
94.84%

*Mean score of KFold CV: 94.84% ± 0.02%

Below metrics of interest from Cross Validation - testing ~ 
	KFold CV scores for:
93.98%
94.10%
94.08%
93.88%
93.88%
93.94%
93.50%
93.48%
94.00%
93.90%

*Mean score of KFold CV: 93.87% ± 0.21%




Report:

- Data representation used - Tfidf.
- Model metric used for ranking - mean cross validation score.
- Scoring of models on metric:
    - Knn training = .86 ± .59 ; Knn test = .71 ± 1.9
    - Naive Bayes train = .928 ± .24 ; Naive Bayes test = .926 ± 1.02
    - Multilayer perceptron train = .94 ±.03 ; Multilayer perceptron test = .93 ± .11
- Hyperparameter values that gave optimal results in cross validation were
conservative values/ lower iterations surprisingly I had to test many values for example number of iterations, where the lower
hyper parameters gave faster calculations and did not cause the program to break.
- A way that the classifier could be used as a plugin for a web browser
is, say an extension that filters news articles as spam or not or another example
would be "ad blocking" via text classification of url, in order to decide to block
the url/advise user of the possible scam website/ad.