In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn import preprocessing
import pickle

In [2]:
df = pd.read_csv("~/Downloads/train-balanced-sarcasm.csv")

In [37]:
sample_size = 0.10
seed = 123
df1_5 = df.sample(frac = sample_size, random_state = seed)

In [38]:
xLabel = ["comment"]
yLabel = ["label"]

In [39]:
df2 = df1_5[xLabel]
yVar = df1_5[yLabel].astype(int)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(df2['comment'].values.astype('U'), yVar, test_size=0.2) 
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(80866,) (80866, 1)
(20217,) (20217, 1)
<class 'numpy.ndarray'>


In [41]:
cvec = CountVectorizer()

In [42]:
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [43]:
X_train = cvec.transform(X_train) 
X_test = cvec.transform(X_test)

In [44]:
clf = RandomForestClassifier(n_jobs=2, random_state=0, max_depth = 15)
lr = LogisticRegression(solver = 'liblinear')
bayes = BernoulliNB()

In [45]:
lr.fit(X_train, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [47]:
bayes.fit(X_train, y_train.values.ravel())

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [48]:
lr.score(X_train, y_train.values.ravel())

0.7993223357158756

In [49]:
lr.score(X_test, y_test.values.ravel())

0.67136568234654

In [50]:
clf.score(X_train, y_train.values.ravel())

0.682252120792422

In [51]:
clf.score(X_test, y_test.values.ravel())

0.653212642825345

In [52]:
bayes.score(X_train, y_train.values.ravel())

0.7763089555561051

In [53]:
bayes.score(X_test, y_test.values.ravel())

0.6645892071029331

In [54]:
# save the clf_model to disk
clf_filename = 'clf_finalized_model.sav'
pickle.dump(clf, open(clf_filename, 'wb'))

In [55]:
# save the lr_model to disk
lr_filename = 'lr_finalized_model.sav'
pickle.dump(lr, open(lr_filename, 'wb'))

In [56]:
# save the lr_model to disk
bayes_filename = 'bayes_finalized_model.sav'
pickle.dump(bayes, open(bayes_filename, 'wb'))

In [59]:
sentence = ["yeah, as if"]
sentence_vector = cvec.transform(sentence) 
lr.predict(sentence_vector)

array([1])