# In this notebook we classify sarcastic and nonsarcastic comments

In [1]:
import numpy as np
import pandas as pd
import random
import string

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB

import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# %config InlineBackend.figure_format = 'retina'

#### Create function to read in a random sample of the data

In [2]:
# p is set to a value between 0.0 and 1.0, so we can read in a percentage `p` sized portion of the data
p = 1.0

def read_in_data(file):
    return pd.read_csv(file, 
                       sep='\t', 
                       header=None, 
                       names=['label','comment','author','subreddit','score',
                              'ups','downs','date','created_utc','parent_comment'],
                       usecols=['label','comment','author','subreddit','score',
                              'date','created_utc','parent_comment'],  
                       skiprows=lambda i: i>0 and random.random() > p)

Read in a random sample of the data from **data_train**

In [3]:
data_train = read_in_data("../data/train-balanced.csv")
data_train.head()

Unnamed: 0,label,comment,author,subreddit,score,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,2016-10,1476662123,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,2016-11,1477959850,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,2016-09,1474580737,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,2016-10,1476824627,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,2016-12,1483117213,Yep can confirm I saw the tool they use for th...


#### Create function to reorder cols for easy comparison of comments as they are transformed

In [4]:
def reorder_col_headers(df, cols):
    return df[cols]

Run reorder_col_headers on **data_train**

In [5]:
data_train = reorder_col_headers(data_train, 
                                 ['label', 'comment', 'author', 'subreddit', 'score', 'parent_comment'])
data_train.head()

Unnamed: 0,label,comment,author,subreddit,score,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...


#### Create function to find NA values, if any, in data

In [6]:
def identify_NA_values_in_data(df):
    # Any NAs?
    print('NAs exist in this sample (T/F):', df.isnull().values.any(), '\n')
    # How many?
    print('How many NAs in this sample?', df.isnull().sum().sum(), '\n')
    # In which columns?
    print('Which cols have NAs in this sample?\n')
    print(df.isnull().any(), '\n')

    # How many of them are sarcasm?
    print('Of these NAs, how many are labeled sarcasm?\n')
    nulls = df[df['comment'].isnull()]  
    print('\tNumber of null comments in sample:', len(nulls))
    nulls_sarc = nulls[nulls['label'] == 1]
    print('\tNumber of null comments in sample that are sarcasm:',len(nulls_sarc))
    if not len(nulls) == 0:
        print('\tRatio of sarcastic null comments to all null comments in sample:', len(nulls_sarc) / len(nulls))
    else:
        print('\tRatio of sarcastic null comments to all null comments in sample: 0 of 0')
        

Run identify_NA_values_in_data on **data_train**

In [7]:
# identify_NA_values_in_data(data_train)

Read in a random sample of the data from **data_test**

In [8]:
data_test = read_in_data("../data/test-balanced.csv")
data_test = reorder_col_headers(data_test, 
                                 ['label', 'comment', 'author', 'subreddit', 'score', 'parent_comment'])

print(data_test.shape, '\n')
data_test.head()

(251608, 6) 



Unnamed: 0,label,comment,author,subreddit,score,parent_comment
0,0,Actually most of her supporters and sane peopl...,Quinnjester,politics,3,Hillary's Surrogotes Told to Blame Media for '...
1,0,They can't survive without an echo chamber whi...,TheGettysburgAddress,The_Donald,13,Thank God Liberals like to live in concentrate...
2,0,you're pretty cute yourself 1729 total,Sempiternally_free,2007scape,8,Saw this cutie training his Attack today...
3,0,If you kill me you'll crash the meme market,Catacomb82,AskReddit,2,If you were locked in a room with 49 other peo...
4,0,I bet he wrote that last message as he was sob...,Dorian-throwaway,niceguys,5,You're not even that pretty!


#### Create function to inspect data types and memory usage

In [9]:
def view_df_info(df):
    return df.info(memory_usage='deep', null_counts=True)

Run view_df_info on **data_train**

In [10]:
view_df_info(data_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010826 entries, 0 to 1010825
Data columns (total 6 columns):
label             1010826 non-null int64
comment           1010773 non-null object
author            1010826 non-null object
subreddit         1010826 non-null object
score             1010826 non-null int64
parent_comment    1010826 non-null object
dtypes: int64(2), object(4)
memory usage: 437.7 MB


Run `view_df_info` on **data_test**

In [11]:
view_df_info(data_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251608 entries, 0 to 251607
Data columns (total 6 columns):
label             251608 non-null int64
comment           251594 non-null object
author            251608 non-null object
subreddit         251608 non-null object
score             251608 non-null int64
parent_comment    251608 non-null object
dtypes: int64(2), object(4)
memory usage: 108.9 MB


Run `describe()` on **data_test**

In [12]:
print('data_train\n', data_test.describe())

data_train
                label          score
count  251608.000000  251608.000000
mean        0.500000       6.757452
std         0.500001      48.450781
min         0.000000    -329.000000
25%         0.000000       1.000000
50%         0.500000       2.000000
75%         1.000000       4.000000
max         1.000000    9923.000000


Run identify_NA_values_in_data on **data_train**

In [13]:
identify_NA_values_in_data(data_train)

NAs exist in this sample (T/F): True 

How many NAs in this sample? 53 

Which cols have NAs in this sample?

label             False
comment            True
author            False
subreddit         False
score             False
parent_comment    False
dtype: bool 

Of these NAs, how many are labeled sarcasm?

	Number of null comments in sample: 53
	Number of null comments in sample that are sarcasm: 45
	Ratio of sarcastic null comments to all null comments in sample: 0.8490566037735849


Run identify_NA_values_in_data on **data_test**

In [14]:
identify_NA_values_in_data(data_test)

NAs exist in this sample (T/F): True 

How many NAs in this sample? 14 

Which cols have NAs in this sample?

label             False
comment            True
author            False
subreddit         False
score             False
parent_comment    False
dtype: bool 

Of these NAs, how many are labeled sarcasm?

	Number of null comments in sample: 14
	Number of null comments in sample that are sarcasm: 9
	Ratio of sarcastic null comments to all null comments in sample: 0.6428571428571429


In [15]:
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

## Preprocessing

### Punctuation

#### Create function to remove punctuation from corpus

In [16]:
def remove_punc(comment):
    '''This function uses str methods from the string class to remove punctuation from the text.'''
    
    # replace punctuation with '' (no space)
    translator = str.maketrans('', '', string.punctuation)
    
    # return the text stripped of punctuation marks
    return comment.translate(translator)

Run function `` on **data_train**

In [17]:
data_train['comment'] = data_train['comment'].apply(remove_punc)

print('data_train\n')
data_train.head()

data_train



Unnamed: 0,label,comment,author,subreddit,score,parent_comment
0,0,NC and NH,Trumpbart,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,They were underdogs earlier today but since Gr...,Creepeth,nfl,3,They're favored to win.
3,0,This meme isnt funny none of the new york nigg...,icebrotha,BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools,cush2push,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...


Run function `` on **data_train**

In [18]:
data_test['comment'] = data_test['comment'].apply(remove_punc)

print('data_test\n')
data_test.head()

data_test



Unnamed: 0,label,comment,author,subreddit,score,parent_comment
0,0,Actually most of her supporters and sane peopl...,Quinnjester,politics,3,Hillary's Surrogotes Told to Blame Media for '...
1,0,They cant survive without an echo chamber whic...,TheGettysburgAddress,The_Donald,13,Thank God Liberals like to live in concentrate...
2,0,youre pretty cute yourself 1729 total,Sempiternally_free,2007scape,8,Saw this cutie training his Attack today...
3,0,If you kill me youll crash the meme market,Catacomb82,AskReddit,2,If you were locked in a room with 49 other peo...
4,0,I bet he wrote that last message as he was sob...,Dorian-throwaway,niceguys,5,You're not even that pretty!


### Deal with stopwords and letter casing

In [19]:
# load stopwords from NLTK
sw = stopwords.words('english')
# view stop words
np.array(sw)

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

#### Create function to ...

In [20]:
def remove_stopwords_and_lowercase(comment):
    '''This function lowercases words and then remove stopwords.'''
    
    comment = \
    [word.lower() for word in comment.split() if word.lower() not in sw]
    return ' '.join(comment)

Run function `` on **data_train**

In [21]:
# Apply the function to each comment
data_train['comment_lc_stopped'] = data_train['comment'].apply(remove_stopwords_and_lowercase)
data_train = reorder_col_headers(data_train, 
                                 ['label', 'comment', 'comment_lc_stopped', 'author', 
                                  'subreddit', 'score', 'parent_comment'])
data_train.head()

Unnamed: 0,label,comment,comment_lc_stopped,author,subreddit,score,parent_comment
0,0,NC and NH,nc nh,Trumpbart,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,know west teams play west teams east teams right,Shbshb906,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,They were underdogs earlier today but since Gr...,underdogs earlier today since gronks announcem...,Creepeth,nfl,3,They're favored to win.
3,0,This meme isnt funny none of the new york nigg...,meme isnt funny none new york nigga ones,icebrotha,BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools,could use one tools,cush2push,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...


Run function `` on **data_train**

In [22]:
data_test['comment_lc_stopped'] = data_test['comment'].apply(remove_stopwords_and_lowercase)
data_test = reorder_col_headers(data_test, 
                                 ['label', 'comment', 'comment_lc_stopped', 'author', 
                                  'subreddit', 'score', 'parent_comment'])
data_test.head()

Unnamed: 0,label,comment,comment_lc_stopped,author,subreddit,score,parent_comment
0,0,Actually most of her supporters and sane peopl...,actually supporters sane people saw media doin...,Quinnjester,politics,3,Hillary's Surrogotes Told to Blame Media for '...
1,0,They cant survive without an echo chamber whic...,cant survive without echo chamber great america,TheGettysburgAddress,The_Donald,13,Thank God Liberals like to live in concentrate...
2,0,youre pretty cute yourself 1729 total,youre pretty cute 1729 total,Sempiternally_free,2007scape,8,Saw this cutie training his Attack today...
3,0,If you kill me youll crash the meme market,kill youll crash meme market,Catacomb82,AskReddit,2,If you were locked in a room with 49 other peo...
4,0,I bet he wrote that last message as he was sob...,bet wrote last message sobbing,Dorian-throwaway,niceguys,5,You're not even that pretty!


### Stem all words

In [23]:
stemmer = SnowballStemmer('english')

#### Create function to ...

In [24]:
def stem_each_word(text):
    '''This function stems each word in `text`'''
    
    text = [stemmer.stem(word) for word in text.split()]
    return ' '.join(text)

Run function `` on **data_train**

In [25]:
# apply stem_each_word function to each TRAIN and TEST comment

# TRAIN
data_train['comment_stemmed'] = data_train['comment_lc_stopped'].apply(stem_each_word)
data_train = reorder_col_headers(data_train, 
                                 ['label', 'comment', 'comment_lc_stopped', 'comment_stemmed', 'author', 
                                  'subreddit', 'score', 'parent_comment'])
data_train.head()

Unnamed: 0,label,comment,comment_lc_stopped,comment_stemmed,author,subreddit,score,parent_comment
0,0,NC and NH,nc nh,nc nh,Trumpbart,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,know west teams play west teams east teams right,know west team play west team east team right,Shbshb906,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,They were underdogs earlier today but since Gr...,underdogs earlier today since gronks announcem...,underdog earlier today sinc gronk announc afte...,Creepeth,nfl,3,They're favored to win.
3,0,This meme isnt funny none of the new york nigg...,meme isnt funny none new york nigga ones,meme isnt funni none new york nigga one,icebrotha,BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools,could use one tools,could use one tool,cush2push,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...


Run function `` on **data_train**

In [26]:
# TEST
data_test['comment_stemmed'] = data_test['comment_lc_stopped'].apply(stem_each_word)
data_test = reorder_col_headers(data_test, 
                                 ['label', 'comment', 'comment_lc_stopped', 'comment_stemmed', 'author', 
                                  'subreddit', 'score', 'parent_comment'])
data_test.head()

Unnamed: 0,label,comment,comment_lc_stopped,comment_stemmed,author,subreddit,score,parent_comment
0,0,Actually most of her supporters and sane peopl...,actually supporters sane people saw media doin...,actual support sane peopl saw media doingespec...,Quinnjester,politics,3,Hillary's Surrogotes Told to Blame Media for '...
1,0,They cant survive without an echo chamber whic...,cant survive without echo chamber great america,cant surviv without echo chamber great america,TheGettysburgAddress,The_Donald,13,Thank God Liberals like to live in concentrate...
2,0,youre pretty cute yourself 1729 total,youre pretty cute 1729 total,your pretti cute 1729 total,Sempiternally_free,2007scape,8,Saw this cutie training his Attack today...
3,0,If you kill me youll crash the meme market,kill youll crash meme market,kill youll crash meme market,Catacomb82,AskReddit,2,If you were locked in a room with 49 other peo...
4,0,I bet he wrote that last message as he was sob...,bet wrote last message sobbing,bet wrote last messag sob,Dorian-throwaway,niceguys,5,You're not even that pretty!


### Lemmatize

In [27]:
# use nltk.stem WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

#### Create function to ...

In [28]:
def lemmatize_each_word(text):
    '''This function lemmatizes each word, e.g., 
    - Remove the final "s" or "es" to singularize plurals and change person of some verbs
      (gets --> get,
      passes --> pass, 
      BUT not always as expected: 
      does --> doe (a deer, a female deer), 
      capitalizes --> captializes)
    '''
    text = [lemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(text)

Run function `` on **data_train**

In [29]:
# apply lemmatize_each_word(text) function to each TRAIN and TEST comment

# TRAIN
data_train['comment_stemmed_lemmed'] = data_train['comment_stemmed'].apply(lemmatize_each_word)
data_train = reorder_col_headers(data_train, 
                                 ['label', 'comment', 'comment_lc_stopped', 'comment_stemmed', 'comment_stemmed_lemmed',
                                  'author', 'subreddit', 'score', 'parent_comment'])
data_train.head()

Unnamed: 0,label,comment,comment_lc_stopped,comment_stemmed,comment_stemmed_lemmed,author,subreddit,score,parent_comment
0,0,NC and NH,nc nh,nc nh,nc nh,Trumpbart,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,know west teams play west teams east teams right,know west team play west team east team right,know west team play west team east team right,Shbshb906,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,They were underdogs earlier today but since Gr...,underdogs earlier today since gronks announcem...,underdog earlier today sinc gronk announc afte...,underdog earlier today sinc gronk announc afte...,Creepeth,nfl,3,They're favored to win.
3,0,This meme isnt funny none of the new york nigg...,meme isnt funny none new york nigga ones,meme isnt funni none new york nigga one,meme isnt funni none new york nigga one,icebrotha,BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools,could use one tools,could use one tool,could use one tool,cush2push,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...


Run function `` on **data_train**

In [30]:
# TEST

data_test['comment_stemmed_lemmed'] = data_test['comment_stemmed'].apply(lemmatize_each_word)
data_test = reorder_col_headers(data_test, 
                                 ['label', 'comment', 'comment_lc_stopped', 'comment_stemmed', 'comment_stemmed_lemmed',
                                  'author', 'subreddit', 'score',  'parent_comment'])
data_test.head()

Unnamed: 0,label,comment,comment_lc_stopped,comment_stemmed,comment_stemmed_lemmed,author,subreddit,score,parent_comment
0,0,Actually most of her supporters and sane peopl...,actually supporters sane people saw media doin...,actual support sane peopl saw media doingespec...,actual support sane peopl saw medium doingespe...,Quinnjester,politics,3,Hillary's Surrogotes Told to Blame Media for '...
1,0,They cant survive without an echo chamber whic...,cant survive without echo chamber great america,cant surviv without echo chamber great america,cant surviv without echo chamber great america,TheGettysburgAddress,The_Donald,13,Thank God Liberals like to live in concentrate...
2,0,youre pretty cute yourself 1729 total,youre pretty cute 1729 total,your pretti cute 1729 total,your pretti cute 1729 total,Sempiternally_free,2007scape,8,Saw this cutie training his Attack today...
3,0,If you kill me youll crash the meme market,kill youll crash meme market,kill youll crash meme market,kill youll crash meme market,Catacomb82,AskReddit,2,If you were locked in a room with 49 other peo...
4,0,I bet he wrote that last message as he was sob...,bet wrote last message sobbing,bet wrote last messag sob,bet wrote last messag sob,Dorian-throwaway,niceguys,5,You're not even that pretty!


# Feature Extraction

#### Instantiate CountVectorizer() from sklearn.feature_extraction.text 

In [31]:
count_vectorizer = CountVectorizer()

Fit and transform count_vectorizer on **data_train.comment_stemmed_lemmed**

In [32]:
X_train_counts = count_vectorizer.fit_transform(data_train.comment_stemmed_lemmed)
type(X_train_counts)
X_train_counts.shape

(1010773, 165660)

Transform count_vectorizer (do NOT fit) on **data_test.comment_stemmed_lemmed**

In [33]:
X_test_counts = count_vectorizer.transform(data_test.comment_stemmed_lemmed)
X_test_counts.shape

(251594, 165660)

#### Instantiate TfidfTransformer() from sklearn.feature_extraction.text 

In [34]:
tfidf_transformer = TfidfTransformer()

Fit and transform tfidf_transformer on **X_train_counts**

In [35]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1010773, 165660)

Transform tfidf_transformer (do NOT fit) on **X_test_counts**

In [36]:
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf.shape

(251594, 165660)

## Train a classifier

#### Fit a Multinomial Naive Bayes linear model on (X_train_tfidf, data_train.label) to create a classifier

In [37]:
classifer = MultinomialNB().fit(X_train_tfidf, data_train.label)

#### Use the classifier to predict the labels in X_test_tfidf

In [38]:
y_test_predict = classifer.predict(X_test_tfidf)

In [39]:
y_test_predict[0:5]

array([0, 1, 1, 1, 0], dtype=int64)

## Metrics

#### Evaluate classifier based on accuracy

In [43]:
sum(y_test_predict == 1)

125341

In [44]:
sum(y_test_predict == 0)

126253

In [45]:
# false positives

predicted_actual = pd.DataFrame(y_test_predict, data_test.label)

In [48]:
predicted_actual[0:30].T

label,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
0,0,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,1,1,1,1,0


In [40]:
accuracy = np.mean(y_test_predict == data_test.label)

In [41]:
print(accuracy)

0.6646263424405987


In [51]:
data_test.label[100:120]

100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    1
111    0
112    0
113    1
114    0
115    0
116    0
117    0
118    0
119    0
Name: label, dtype: int64