In [240]:
import sys
!{sys.executable} -m pip install emoji --upgrade --user
!{sys.executable} -m pip install tensorflow --upgrade --user

Requirement already up-to-date: emoji in /Users/emilyroller/Library/Python/3.8/lib/python/site-packages (1.6.1)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Requirement already up-to-date: tensorflow in /Users/emilyroller/Library/Python/3.8/lib/python/site-packages (2.7.0)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [354]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import random
import pandas, numpy, string
import emoji
import re
import tensorflow as tf 
from keras import layers, models, optimizers, callbacks
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

In [342]:
# Read the tweets and their labels from the CSV into the notebook-wide frame
DATA_PATH = '../Data/sarcasm_db.csv'
trainDF = pandas.read_csv(DATA_PATH)

In [343]:
# here the user can specify which type of preprocessed data they want to use

# Accepted answers. The chosen value is used as a column name of trainDF,
# so it has to match one of these strings exactly.
options = ('text', 'parsed with emojis', 'parsed without emojis')

print('There are three text processing options:')
print('1. Keeping emojis in as their emoji representation (text)')
print('2. Keeping emojis in as their text description representation (parsed with emojis)')
print('3. Removing all and any emoji representations (parsed without emojis)')

# Normalise the answer once (trim + lowercase) so the value that is stored in
# `opt` is the same value that was validated; otherwise stray whitespace would
# pass the check but fail later as a column lookup.
opt = input("Choose from one: text, parsed with emojis, parsed without emojis\n").strip().lower()
while opt not in options:
    opt = input("Not a valid option! Choose from one: text, parsed with emojis, parsed without emojis\n").strip().lower()

There are three text processing options:
1. Keeping emojis in as their emoji representation (text)
2. Keeping emojis in as their text description representation (parsed with emojis)
3. Removing all and any emoji representations (parsed without emojis)
Choose from one: text, parsed with emojis, parsed without emojis
parsed without emojis


In [346]:
# Hold out 10% of the rows for validation; random_state fixes the shuffle
text_column = trainDF[opt]
label_column = trainDF['sarcasm labels']
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    text_column,
    label_column,
    test_size=0.10,
    random_state=69,
)

In [347]:
# label encode the target variable
encoder = preprocessing.LabelEncoder()
# Learn the class -> integer mapping from the training labels only ...
train_y = encoder.fit_transform(train_y)
# ... and reuse that same mapping for the validation labels. Re-fitting on the
# validation split could assign different integers if it does not contain
# every class; transform() raises instead if it meets an unseen label.
valid_y = encoder.transform(valid_y)

In [348]:
# Build a word-level count vectorizer; its vocabulary is learned from the
# whole selected text column (fit() hands back the fitted vectorizer)
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(trainDF[opt])

# Turn the training and validation texts into count matrices, in that order
xtrain_count, xvalid_count = (
    count_vect.transform(split) for split in (train_x, valid_x)
)

In [349]:
# Fit a Keras tokenizer on the whole selected text column
token = text.Tokenizer()
token.fit_on_texts(trainDF[opt])
word_index = token.word_index

# Convert each split to integer token sequences and pad them with maxlen=70
train_seq_x, valid_seq_x = (
    sequence.pad_sequences(token.texts_to_sequences(split), maxlen=70)
    for split in (train_x, valid_x)
)

# Row i of the matrix receives the vector looked up for the word whose index
# is i; words missing from embeddings_index keep their all-zero row.
# NOTE(review): embeddings_index is not defined in the visible notebook cells;
# presumably a word -> 300-d vector mapping loaded elsewhere. Confirm.
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [351]:
def create_cnn(max_len=70):
    """Build and compile a 1-D convolutional binary text classifier.

    The network is: input -> frozen embedding (weights taken from the
    notebook-level ``embedding_matrix``) -> spatial dropout -> Conv1D ->
    global max pooling -> Dense(50, relu) -> dropout -> Dense(1, sigmoid).

    Args:
        max_len: Length of each input token sequence. Defaults to 70, the
            value that was previously hard-coded here.

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    # Read both embedding dimensions from the matrix itself so the layer's
    # declared shape can never disagree with the weights it is given.
    vocab_size, embedding_dim = embedding_matrix.shape

    # Add an Input Layer
    input_layer = layers.Input((max_len, ))

    # Add the word embedding Layer (trainable=False keeps the supplied vectors fixed)
    embedding_layer = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=tf.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [352]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, label_valid=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training features passed to ``fit``.
        label: Training labels passed to ``fit``.
        feature_vector_valid: Validation features passed to ``predict``.
        label_valid: True labels for the validation features. When omitted,
            the notebook-level ``valid_y`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        The value of ``metrics.accuracy_score`` for the rounded predictions.
    """
    if label_valid is None:
        # Backward-compatible fallback for existing four-argument calls.
        label_valid = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = numpy.asarray(classifier.predict(feature_vector_valid))

    # Use the first column of a 2-D (n, 1) output; a 1-D output is used as is.
    scores = predictions[:, 0] if predictions.ndim > 1 else predictions

    # Same rule as before: a score that rounds to 0 is class 0, anything else
    # is class 1 (numpy.rint rounds halves to even, like built-in round()).
    rounded_predictions = (numpy.rint(scores) != 0).astype(int)

    # accuracy_score expects (y_true, y_pred)
    return metrics.accuracy_score(label_valid, rounded_predictions)

In [353]:
# Seed NumPy and TensorFlow before the model is built so that the random
# weight initialisation starts from the same state on every run and the
# reported accuracy can be reproduced (69 mirrors the random_state used for
# the train/validation split). Some accelerator kernels may still be
# non-deterministic.
SEED = 69
numpy.random.seed(SEED)
tf.random.set_seed(SEED)

classifier = create_cnn()

accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x)
print("CNN, Word Embeddings with", opt,  accuracy)

CNN, Word Embeddings with parsed without emojis 0.9789356984478935
