In [1]:
#import the dependencies
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn import preprocessing
import pickle

In [2]:
#import the data
df = pd.read_csv("~/Downloads/train-balanced-sarcasm.csv")

In [3]:
#the data is BIG, set a sample
sample_size = 0.10
seed = 123
df1_5 = df.sample(frac = sample_size, random_state = seed)

In [4]:
#set X and Y variables
xVar = df1_5["comment"]
yVar = df1_5["label"].astype(int)

In [5]:
#split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(xVar.values.astype('U'), yVar, test_size=0.2) 
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(80866,) (80866,)
(20217,) (20217,)


In [6]:
#import a vectorizer to handle the words
cvec = CountVectorizer()

In [7]:
#fit the vectorizer
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
#transform the X data into a usable format
X_train = cvec.transform(X_train) 
X_test = cvec.transform(X_test)

In [9]:
#instantiate our models
clf = RandomForestClassifier(n_jobs=2, random_state=0, max_depth = 15)
lr = LogisticRegression(solver = 'liblinear')
bayes = BernoulliNB()

In [10]:
#ift the models
lr.fit(X_train, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
bayes.fit(X_train, y_train.values.ravel())

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [13]:
#score the model on the training data
lr.score(X_train, y_train.values.ravel())

0.8007073430118963

In [14]:
#score the model on the test data
lr.score(X_test, y_test.values.ravel())

0.6651333036553395

In [15]:
#score the model on the training data
clf.score(X_train, y_train.values.ravel())

0.6809784087255459

In [16]:
#score the model on the test data
clf.score(X_test, y_test.values.ravel())

0.6539051293465895

In [17]:
#score the model on the training data
bayes.score(X_train, y_train.values.ravel())

0.7776444983058393

In [18]:
#score the model on the test data
bayes.score(X_test, y_test.values.ravel())

0.6621160409556314

In [19]:
#display to a text UI
def print_me(input):
    if input[0] == 1: 
        print ("sarcastic") 
    else: 
        print ("not sarcastic")

In [20]:
#test a sarcastic sentence
sentence1 = ["yeah, as if"]
sentence_vector1 = cvec.transform(sentence1) 
print_me(lr.predict(sentence_vector1))

sarcastic


In [22]:
#test a sincere sentence
sentence2 = ["Like I'd be caught dead "]
sentence_vector2 = cvec.transform(sentence2) 
print_me(lr.predict(sentence_vector2))

not sarcastic


In [None]:
# save the vectorizer to disk
vectorizer_filename = 'vectorizer.sav'
pickle.dump(cvec, open(vectorizer_filename, 'wb'))

In [None]:
# save the clf_model to disk
#clf_filename = 'clf_finalized_model.sav'
#pickle.dump(clf, open(clf_filename, 'wb'))

In [None]:
# save the lr_model to disk
lr_filename = 'lr_finalized_model.sav'
pickle.dump(lr, open(lr_filename, 'wb'))

In [None]:
# save the bayes_model to disk
bayes_filename = 'bayes_finalized_model.sav'
pickle.dump(bayes, open(bayes_filename, 'wb'))