In [2]:
import pandas as pd
import numpy as np

## Prepare and load training and testing datasets

In [3]:
pd.options.display.max_colwidth = 150

train_set = pd.read_csv("Train_Dataset.csv")


test_set = pd.read_csv("Test_Dataset.csv")
train_set.head(50)

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldly vices,1
1,hungover man horrified to learn he made dozens of plans last night,1
2,emily's list founder: women are the 'problem solvers' in congress,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0
5,james corden and the red hot chili peppers strip down for 'carpool karaoke',0
6,u.s. dignity reserves nearly depleted,1
7,"how to re-ignite the spark in your body, mind and soul",0
8,report: there still time to convert to christianity before christmas starts,1
9,education reform and evidence,0


In [4]:
#Checking for na values
train_set.isnull().sum()

headline        0
is_sarcastic    0
dtype: int64

In [5]:
train_set['is_sarcastic'].value_counts()
train_set.shape

(44262, 2)

In [6]:
test_set.shape

(11066, 1)

This means that about 54% of the data is non-sarcastic and 46% sarcastic.

## Split datasets

In [7]:
from sklearn.model_selection import train_test_split

X = train_set[['headline']]
y = train_set['is_sarcastic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.shape

(29655, 1)

From the corpus, we will extract the following from each document:
* Word count - total number of words in the doc/headline
* Character count - total number of characters in the doc/headline
* Average word density - mean length of words in the doc

In [8]:
import string

X_train['char_count'] = X_train['headline'].apply(len)
X_train['word_count'] = X_train['headline'].apply(lambda x: len(x.split()))
X_train['word_density'] = X_train['char_count'] / (X_train['word_count']+1)

X_test['char_count'] = X_test['headline'].apply(len)
X_test['word_count'] = X_test['headline'].apply(lambda x: len(x.split()))
X_test['word_density'] = X_test['char_count'] / (X_test['word_count']+1)

# X_train['punctuation_count'] = X_train['headline'].apply(lambda x: len("".join(char for char in x if char in string.punctuation)))
# X_train['upper_case'] = X_train['headline'].apply(lambda x: len([word for word in x.split() if word.isupper()]))
# X_train['title_word_count'] = X_train['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [9]:
X_train.head()

Unnamed: 0,headline,char_count,word_count,word_density
22779,google introduces a new way to screen telemarketers,51,8,5.666667
39644,the true meaning of the ray rice scandal,40,8,4.444444
423,a christmas message to vice president mike pence,48,8,5.333333
36581,burger king unveils new low-fat cashier,39,6,5.571429
42039,argument about capital of australia occurs 10 feet from encyclopedia,68,10,6.181818


In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42, solver='liblinear')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alpha

In [11]:
lr.fit(X_train.drop(['headline'], axis=1), y_train)
predictions = lr.predict(X_test.drop(['headline'], axis=1))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


In [12]:
from sklearn import metrics

print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.58      0.75      0.65      7864
           1       0.55      0.36      0.44      6743

   micro avg       0.57      0.57      0.57     14607
   macro avg       0.57      0.56      0.55     14607
weighted avg       0.57      0.57      0.55     14607



In [13]:
print(metrics.confusion_matrix(y_test, predictions))

[[5878 1986]
 [4287 2456]]


In [14]:
print(metrics.accuracy_score(y_test,predictions))

0.5705483672211953


In [15]:
df = pd.DataFrame(metrics.confusion_matrix(y_test,predictions), index=['0','1'], columns=['0','1'])
df

Unnamed: 0,0,1
0,5878,1986
1,4287,2456


<font color=green>These results are terrible. More non-sarcastic headlines are labeled as sarcastic (4287) than correctly identified as sarcastic (2456), although a relatively small number of sarcastic headlines (1986) were confused as non-sarcastic. Let's try another classifier called Naive Bayes.</font>

In [16]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

nb.fit(X_train.drop(['headline'], axis=1), y_train)
predictions = nb.predict(X_test.drop(['headline'], axis=1))

In [17]:
print(metrics.confusion_matrix(y_test, predictions))

[[7862    2]
 [6692   51]]


In [18]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.54      1.00      0.70      7864
           1       0.96      0.01      0.02      6743

   micro avg       0.54      0.54      0.54     14607
   macro avg       0.75      0.50      0.36     14607
weighted avg       0.74      0.54      0.38     14607



In [19]:
print(metrics.accuracy_score(y_test,predictions))

0.541726569453002


<font color=green>Clearly, the predictions using the naive bayes classifier is worst than the predictions of the logistic regression model. Although more non-sarcastic headlines are correctly predicted (7862), almost none (51) are correctly recognized as sarcastic. The false non-sarcastic headlines shot up to 6692 counts from 4287 from the logistic regression model. Let's try another model from the support vector machine (SVM) classifier.</font>

In [None]:
from sklearn.svm import SVC
svc_model = SVC(gamma="auto")
svc_model.fit(X_train.drop(['headline'], axis=1), y_train)

In [None]:
predictions = svc_model.predict(X_test.drop(['headline'], axis=1))

In [None]:
print(metrics.confusion_matrix(y_test, predictions))

In [None]:
print(metrics.classification_report(y_test, predictions))

In [None]:
print(metrics.accuracy_score(y_test,predictions))

<font color=green>The SVC model performs the best compared to the logistic regression and naive bayes models. We will aggregate the results from this model to the results of sentiment analysis to get a better predictions</font>

## Text Pre-Processing and Wrangling

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import contractions
import re

stop_words = nltk.corpus.stopwords.words('english')

def normalize_doc(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_doc)

In [None]:
X_train['clean headline'] = normalize_corpus(X_train['headline'].values)
X_test['clean headline'] = normalize_corpus(X_test['headline'].values)

In [None]:
X_train.reset_index(drop=True, inplace=True)
X_train

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train['headline'], y_train)  

In [None]:
from sklearn import metrics

In [None]:
predictions = text_clf.predict(X_test['headline'])

In [None]:
print(metrics.classification_report(y_test, predictions))

In [None]:
print(metrics.confusion_matrix(y_test, predictions))

In [None]:
print(metrics.classification_report(y_test,predictions))

In [None]:
results = text_clf.predict(test_set['headline'])

In [None]:
test_set['predicted sarcasm'] = results

In [None]:
test_set