In [1]:
import pandas as pd
import numpy as np

In [2]:
bait = pd.read_csv('dataset/clickbait_data', sep="\n", header=None, names=['title'])

In [3]:
bait['bait'] = 1

In [4]:
bait.head()

Unnamed: 0,title,bait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [5]:
nobait = pd.read_csv('dataset/non_clickbait_data', sep="\n", header=None, names=['title'])

In [6]:
nobait.head()

Unnamed: 0,title
0,Bill Changing Credit Card Rules Is Sent to Oba...
1,"In Hollywood, the Easy-Money Generation Toughe..."
2,1700 runners still unaccounted for in UK's Lak...
3,Yankees Pitchers Trade Fielding Drills for Put...
4,Large earthquake rattles Indonesia; Seventh in...


In [7]:
nobait['bait'] = 0

In [8]:
nobait.head()

Unnamed: 0,title,bait
0,Bill Changing Credit Card Rules Is Sent to Oba...,0
1,"In Hollywood, the Easy-Money Generation Toughe...",0
2,1700 runners still unaccounted for in UK's Lak...,0
3,Yankees Pitchers Trade Fielding Drills for Put...,0
4,Large earthquake rattles Indonesia; Seventh in...,0


In [10]:
db = pd.concat([bait, nobait])

In [11]:
db.head()

Unnamed: 0,title,bait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [13]:
db.tail()

Unnamed: 0,title,bait
15996,"To Make Female Hearts Flutter in Iraq, Throw a...",0
15997,"British Liberal Democrat Patsy Calton, 56, die...",0
15998,Drone smartphone app to help heart attack vict...,0
15999,"Netanyahu Urges Pope Benedict, in Israel, to D...",0
16000,Computer Makers Prepare to Stake Bigger Claim ...,0


In [14]:
from sklearn.utils import shuffle
db = shuffle(db, random_state=26).reset_index(drop=True)

In [16]:
db.tail()

Unnamed: 0,title,bait
31995,These Cops Took Care Of A Sick Woman's Five Ch...,1
31996,This Word Association Test Will Tell You If Yo...,1
31997,China Says Rio Tinto Bribed Most Big Steel Makers,0
31998,NATO deploys helicopters in Libya,0
31999,Advertisers Change Game Plans for Super Bowl,0


In [17]:
db.describe()

Unnamed: 0,bait
count,32000.0
mean,0.499969
std,0.500008
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [18]:
from sklearn.model_selection import train_test_split
X = db.title
y = db.bait
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [20]:
from sklearn.pipeline import Pipeline
# Pipeline to have different parts of program to be cross validated together

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Count vectorizer for converting to token
# tfidfvectorizer to convert count matrix to normalised tf representation
# multinomialNB - Naive Bayes for word counts for text classification

from sklearn.naive_bayes import MultinomialNB

pipeline1 = Pipeline([('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('mnb', MultinomialNB())])


In [21]:
pipeline1.fit(X,y)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('mnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [23]:
X_test.head()

19450             Do You See The Giant Penis On This Woman
25509    Paris court jails rioters for attempted murder...
21263        Business Graduates Looking Beyond Wall Street
19121    This Is What Thousands Of People Singing "Swee...
8574     Israeli airstrikes damage more offices housing...
Name: title, dtype: object

In [24]:
predicted = pipeline1.predict(X_test)

In [28]:
for i in range(0,5):
    print(predicted[i])

1
0
0
1
0


In [29]:
print(np.mean(predicted == y_test))

0.983625


In [31]:
pipeline1.predict(["Easiest way to enlarge your "])[0]

1

In [33]:
pipeline1.predict_proba(["Loveisn't easy so try this"])[0]

array([0.00751429, 0.99248571])

In [34]:
pipeline1.predict(["Loveisn't easy so try this"])[0]

1

In [35]:
from sklearn.externals import joblib



In [36]:
import pickle

In [37]:
joblib.dump(pipeline1, 'clickbait.pkl')

['clickbait.pkl']