In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn import preprocessing
import pickle

In [2]:
df = pd.read_csv("~/Downloads/train-balanced-sarcasm.csv")

In [3]:
sample_size = 0.10
seed = 123
df1_5 = df.sample(frac = sample_size, random_state = seed)

In [4]:
xLabel = ["comment"]
yLabel = ["label"]

In [5]:
df2 = df1_5[xLabel]
yVar = df1_5[yLabel].astype(int)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df2['comment'].values.astype('U'), yVar, test_size=0.2) 
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(80866,) (80866, 1)
(20217,) (20217, 1)


In [7]:
cvec = CountVectorizer()

In [8]:
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [9]:
X_train = cvec.transform(X_train) 
X_test = cvec.transform(X_test)

In [10]:
clf = RandomForestClassifier(n_jobs=2, random_state=0, max_depth = 15)
lr = LogisticRegression(solver = 'liblinear')
bayes = BernoulliNB()

In [11]:
lr.fit(X_train, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [13]:
bayes.fit(X_train, y_train.values.ravel())

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [14]:
lr.score(X_train, y_train.values.ravel())

0.800682610738753

In [15]:
lr.score(X_test, y_test.values.ravel())

0.6662709600830984

In [16]:
clf.score(X_train, y_train.values.ravel())

0.6791482205129473

In [17]:
clf.score(X_test, y_test.values.ravel())

0.6444081713409506

In [18]:
bayes.score(X_train, y_train.values.ravel())

0.7762594910098187

In [19]:
bayes.score(X_test, y_test.values.ravel())

0.6632042340604442

In [20]:
# save the clf_model to disk
#clf_filename = 'clf_finalized_model.sav'
#pickle.dump(clf, open(clf_filename, 'wb'))

In [21]:
# save the lr_model to disk
lr_filename = 'lr_finalized_model.sav'
pickle.dump(lr, open(lr_filename, 'wb'))

In [22]:
# save the lr_model to disk
bayes_filename = 'bayes_finalized_model.sav'
pickle.dump(bayes, open(bayes_filename, 'wb'))

In [34]:
sentence1 = ["yeah, as if"]
sentence_vector1 = cvec.transform(sentence1) 
lr.predict(sentence_vector1)

array([1])

In [35]:
sentence2 = ["I like puppies"]
sentence_vector2 = cvec.transform(sentence2) 
lr.predict(sentence_vector2)

array([0])

In [41]:
x1 = lr.predict(sentence_vector1)

In [42]:
x2 = lr.predict(sentence_vector2)

In [43]:
def print_me(input):
    if input[0] == 1: 
        print ("sarcastic") 
    else: 
        print ("not sarcastic")

In [44]:
print_me(x1)


sarcastic


In [45]:
print_me(x2)

not sarcastic
