In [1]:
# https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

In [2]:
import sys
!{sys.executable} -m pip install emoji --upgrade --user
!{sys.executable} -m pip install mlxtend --upgrade --user

Requirement already up-to-date: emoji in /Users/emilyroller/Library/Python/3.8/lib/python/site-packages (1.6.1)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Requirement already up-to-date: mlxtend in /Users/emilyroller/Library/Python/3.8/lib/python/site-packages (0.19.0)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas
import numpy
import string
import emoji
import re
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk import word_tokenize
from nltk.corpus import stopwords

In [4]:
# load in the data tweets and corresponding labels
trainDF = pandas.read_csv('../Data/sarcasm_db.csv')

In [5]:
options = ['text', 'parsed with emojis', 'parsed without emojis']
print('There are three text processing options:')
print('1. Keeping emojis in as their emoji representation (text)')
print('2. Keeping emojis in as their text description representation (parsed with emojis)')
print('3. Removing all and any emoji representations (parsed without emojis)')

opt = input("Choose from one: text, parsed with emojis, parsed without emojis\n").lower()
while opt.strip() not in options:
    opt = input("Not a valid option! Choose from one: text, parsed with emojis, parsed without emojis\n").lower()

There are three text processing options:
1. Keeping emojis in as their emoji representation (text)
2. Keeping emojis in as their text description representation (parsed with emojis)
3. Removing all and any emoji representations (parsed without emojis)
Choose from one: text, parsed with emojis, parsed without emojis
text


In [6]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF[opt], trainDF['sarcasm labels'], test_size=0.10, random_state=69)

In [7]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [8]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF[opt])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [9]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,4), max_features=None)
tfidf_vect_ngram.fit(trainDF[opt])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [10]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.accuracy_score(predictions, valid_y)

In [11]:
print('Calculating accuracy for ' + opt + ' data ...\n')
strategies = ['uniform', 'stratified', 'most_frequent']
  
for s in strategies:
    dclf = DummyClassifier(strategy = s, random_state = 1234)
    accuracy = train_model(dclf, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print('Dummy classifier,', s, ':', accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("Naive-Bayes N-Gram Vectors : ", accuracy)

accuracy = train_model(KNeighborsClassifier(n_neighbors = 5), xtrain_count, train_y, xvalid_count)
print('K-Neighbors :', accuracy)

Calculating accuracy for text data ...

Dummy classifier, uniform : 0.5210643015521065
Dummy classifier, stratified : 0.49223946784922396
Dummy classifier, most_frequent : 0.49889135254988914
Naive-Bayes N-Gram Vectors :  0.7594235033259423
K-Neighbors : 0.9212860310421286
