# Feature extraction

## Read the data

In [2]:
import pandas as pd
X_train = pd.read_csv("/code/data/train.csv")['headline']
y_train = pd.read_csv("/code/data/train.csv")['is_sarcastic']

X_val = pd.read_csv("/code/data/val.csv")['headline']
y_val = pd.read_csv("/code/data/val.csv")['is_sarcastic']

## Feature extraction using TFIDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english') 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the val set 
tfidf_val = tfidf_vectorizer.transform(X_val)


In [58]:
tfidf_train.count_nonzero()

118204

In [60]:
tfidf_train.size

118204

In [65]:
tfidf_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [69]:
import numpy as np

In [72]:
np.mean(tfidf_train)

0.00012299332755879073

## Prediction using Multinominal Naive Bayes

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB() 

# Predict bag of words

clf.fit(tfidf_train, y_train)
pred = clf.predict(tfidf_val)
metrics.accuracy_score(y_val, pred)

0.8322851153039832

## Confusion matrix 

In [73]:
metrics.confusion_matrix(y_val, pred)

array([[1965,  325],
       [ 516, 1487]])

## Check bad predictions

In [74]:
for input, prediction, label in zip(X_val, pred, y_val):
  if prediction != label:
    print(input, 'has been classified as ', prediction, 'and should be ', label)

justin timberlake tells jessica biel no one will believe her has been classified as  0 and should be  1
what selma blair's 'outburst' teaches us about mixing pills and alcohol has been classified as  1 and should be  0
the 4 types of bosses… and how to manage up to them has been classified as  1 and should be  0
u.s. job growth rebounds sharply, unemployment rate hits 4.4 percent has been classified as  1 and should be  0
ann romney's grandma tips are 'freakin' awesome' has been classified as  1 and should be  0
fantasy football star confident he can make leap to general manager of nfl team has been classified as  0 and should be  1
progressive parents allow child to choose how he's ostracized by peers has been classified as  0 and should be  1
storybook romance leads to in-flight-magazine marriage has been classified as  0 and should be  1
girl in park acts like it's no big deal she's wearing bikini has been classified as  0 and should be  1
tea party congressman listens to constituen

mtv movie awards snubs director jonas mekas yet again has been classified as  0 and should be  1
world leaders pour into washington to pay last respects to dying nation has been classified as  0 and should be  1
try this one thing before assuming you have a sleep disorder has been classified as  1 and should be  0
tonight: house faces his greatest challenge yet has been classified as  0 and should be  1
woman takes break from dating to focus on everything about herself no one could ever love has been classified as  0 and should be  1
nestle to switch to cage-free eggs in u.s. by 2020 has been classified as  1 and should be  0
lung cancer: saved by the scan has been classified as  1 and should be  0
emergency responders working to dislodge commercial jet from thick, polluted cloud over new delhi has been classified as  0 and should be  1
stormy daniels, james comey arrive at white house for state dinner has been classified as  0 and should be  1
here's the 'hocus pocus' remake you never

## Saving the model to disk using Pickle

In [76]:
import pickle
pickle.dump(pred, open('/code/models/tfidf.80.pickle', 'wb'))

## Prediction using Passive Aggressive Classifier

In [19]:
from sklearn.linear_model import PassiveAggressiveClassifier

linear_clf = PassiveAggressiveClassifier(max_iter=2000, random_state=1994)

# Predict Passive Agressive classifier

linear_clf.fit(count_train, y_train)
pred = linear_clf.predict(count_val)
metrics.accuracy_score(y_val, pred)

0.7654320987654321