In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [21]:
import pickle
import json

In [2]:
data = pd.read_csv("clickbait_data.csv")

In [3]:
cv = CountVectorizer(ngram_range=(1, 3)).fit(data.headline)

In [4]:
print("Vocabulary size: {}".format(len(cv.vocabulary_))) 
# print("Vocabulary:\n{}".format(cv.get_feature_names()))

Vocabulary size: 348722


In [5]:
X_train = data.headline[0:int(0.8*len(data.headline))]
X_test = data.headline[int(0.8*len(data.headline)):]

In [6]:
y_train = data.clickbait[0:int(0.8*len(text))]
y_test = data.clickbait[int(0.8*len(text)):]

NameError: name 'text' is not defined

In [9]:
y_train = data.clickbait[0:int(0.8*len(data.headline))]
y_test = data.clickbait[int(0.8*len(data.headline)):]

In [10]:
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression(max_iter=1000))
param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100],
                  "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)]}

In [11]:
# param_grid = {"logisticregression__C": [100],
#                   "tfidfvectorizer__ngram_range": [(1, 1)]}

In [12]:
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_)) 
print("Best parameters:\n{}".format(grid.best_params_))

Best cross-validation score: 0.97
Best parameters:
{'logisticregression__C': 10, 'tfidfvectorizer__ngram_range': (1, 2)}


In [26]:
import sklearn
sklearn.__version__

'1.1.3'

# Create a pickle file for the pipline and a json file for local prediction

In [13]:
# create a new pipeline with the best hyperparameters
best_pipeline = grid.best_estimator_

# fit the pipeline on the training set
best_pipeline.fit(X_train, y_train)

In [16]:
with open('model.pkl', 'wb') as model_file:
  pickle.dump(best_pipeline, model_file)

In [22]:
X_train_json = X_train.to_json()

with open('X_train.json', 'w') as f:
    # write the JSON data to the file
    json.dump(X_train_json, f)

# fitting a model not using cross validation

In [61]:
text = cv.transform(data.headline).toarray().tolist()
text_train = text[0:int(0.8*len(text))]
text_test = text[int(0.8*len(text)):]

In [33]:
model = LogisticRegression()
model.fit(text_train, y_train)

In [37]:
y_pred = model.predict_proba(text_train)

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_train)

0.994296875

In [40]:
y_pred_test = model.predict(text_test)
accuracy_score(y_pred_test, y_test)

0.9675

# stemming

In [67]:
import spacy
import nltk

In [71]:
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 10.2 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [73]:
en_nlp = spacy.load('en_core_web_sm')

In [74]:
stemmer = nltk.stem.PorterStemmer()

In [75]:
def compare_normalization(doc):
    # tokenize document in spacy
    doc_spacy = en_nlp(doc)
    # print lemmas found by spacy
    print("Lemmatization:")
    print([token.lemma_ for token in doc_spacy])
    # print tokens found by Porter stemmer
    print("Stemming:")
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])


In [76]:
compare_normalization(u"Our meeting today was worse than yesterday, "
                           "I'm scared of meeting the clients tomorrow.")

Lemmatization:
['our', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', 'I', 'be', 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', 'i', 'am', 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']


In [79]:
import re
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load('en_core_web_sm')
old_tokenizer = en_nlp.tokenizer
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))
def custom_tokenizer(document):
    doc_spacy = en_nlp(document, entity=False, parse=False) 
    return [token.lemma_ for token in doc_spacy]

lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5)

In [82]:
X_train_lemma = lemma_vect.fit_transform(X_train) 
# print("X_train_lemma.shape: {}".format(X_train_lemma.shape))

TypeError: __call__() got an unexpected keyword argument 'entity'