<a href="https://colab.research.google.com/github/gcfc/academic_projects/blob/main/18_0651_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 18.0651 Final Project: Can a Model Distinguish Human vs Machine?
## by George Chen and Steven Diaz


## **0: Mount Google Drive**

In your Google Drive root directory, create a folder named `180651data`, and place the 3 `.txt` files in it.

In [None]:
import os
from google.colab import drive

# Mount Google Drive only when the mount point is not already present,
# so re-running this cell does not try to mount a second time.
drive_is_mounted = os.path.isdir('/content/drive')
if not drive_is_mounted:
  drive.mount('/content/drive/')

## **1	Imports**

Import all the ML libraries we need

In [None]:
import keras
from keras import optimizers
from keras.models import Model, Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D, Bidirectional, BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()

from gensim.models import KeyedVectors

from sklearn import manifold
from sklearn.metrics import accuracy_score

import random
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

print("bread has been got")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
bread has been got


## **2 Data**

First, make functions to use **below**

In [None]:
# Folder (relative to the working directory) that holds the project's data files.
data_directory = 'drive/MyDrive/180651data/'

def get_dir(filename):
  """Return the path of `filename` inside `data_directory`."""
  return ''.join([data_directory, filename])

Read the CSV data using pandas (takes at least 2 minutes to run)

In [None]:
#get original data if uncleaned, else get cleaned data

# Read only 1/read_factor of the large csv.
read_factor = 5
# Random phase, so that different runs sample a different subset of rows.
rand_offset = random.randint(0, read_factor - 1)

def should_skip(index):
  """Decide whether `pd.read_csv(skiprows=...)` should drop file line `index`.

  Args:
    index: 0-based line number in the file, as passed by pandas.

  Returns:
    False for line 0 and for every `read_factor`-th line (shifted by
    `rand_offset`); True for all other lines.
  """
  # Line 0 is the header. It must always be kept: if it were skipped, pandas
  # would promote the first surviving data row to column names.
  if index == 0:
    return False
  return (index + rand_offset) % read_factor != 0

# Load the cached, already-cleaned CSV when it exists; otherwise sample the raw file.
if not os.path.isfile(get_dir('train_cleaned.csv')):
  print('Reading 1/{0} of train csv, with a random offset of {1}'.format(read_factor, rand_offset))
  # `should_skip` is called by pandas with each 0-based line number of the file.
  train = pd.read_csv(get_dir('train.txt'), sep='\t', skiprows=should_skip)
else:
  print('Data already cleaned; reading from train_cleaned')
  train = pd.read_csv(get_dir('train_cleaned.csv'))
  # A response that was cleaned down to an empty string is stored as an empty
  # CSV field and comes back as NaN. Fill it before casting, otherwise
  # `astype(str)` would turn it into the literal word "nan".
  train['response_clean'] = train['response_clean'].fillna('').astype(str)

data = train # this is data we'll use throughout code

Data already cleaned; reading from train_cleaned


### Preprocessing

Handle emojis (text emoticons)

In [None]:
# Text emoticons that are stripped from the responses during preprocessing.
# Backslashes are written as "\\" so that every entry contains a literal
# backslash: "\0" would otherwise be a NUL byte and "\o" an invalid escape.
emojies = [":‑)", ":)", ":D", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}", ":^)", ":っ)", ":‑D", "8‑D", "8D", "x‑D", "xD", "X‑D", "XD", "=‑D", "=D", "=‑3", "=3", "B^D", ":-))", ">:[", ":‑(", ":(", ":‑c", ":c", ":‑<", ":っC", ":<", ":‑[", ":[", ":{", ";(", ":-||", ":@", ">:(", ":'‑(", ":'(", ":'‑)", ":')", "D:<", "D:", "D8", "D;", "D=", "DX", "v.v", "D‑':", ">:O", ":‑O", ":O", ":‑o", ":o", "8‑0", "O_O", "o‑o", "O_o", "o_O", "o_o", "O-O", ":*", ":-*", ":^*", "(", "}{'", ")", ";‑)", ";)", "*-)", "*)", ";‑]", ";]", ";D", ";^)", ":‑,", ">:P", ":‑P", ":P", "X‑P", "x‑p", "xp", "XP", ":‑p", ":p", "=p", ":‑Þ", ":Þ", ":þ", ":‑þ", ":‑b", ":b", "d:", ">:\\", ">:/", ":‑/", ":‑.", ":/", ":\\", "=/", "=\\", ":L", "=L", ":S", ">.<", ":|", ":‑|", ":$", ":‑X", ":X", ":‑#", ":#", "O:‑)", "0:‑3", "0:3", "0:‑)", "0:)", "0;^)", ">:)", ">;)", ">:‑)", "}:‑)", "}:)", "3:‑)", "3:)", "o/\\o", "^5", ">_>^", "^<_<", "|;‑)", "|‑O", ":‑J", ":‑&", ":&", "#‑)", "%‑)", "%)", ":‑###..", ":###..", "<:‑|", "<*)))‑{", "><(((*>", "><>", "\\o/", "*\\0/*", "@}‑;‑'‑‑‑", "@>‑‑>‑‑", "~(_8^(I)", "5:‑)", "~:‑\\", "//0‑0\\\\", "*<|:‑)", "=:o]", "7:^]", ",:‑)", "</3", "<3"]


Handle contractions

In [None]:
# Space-separated contractions (as they appear in the pre-tokenized data,
# e.g. "don 't") mapped to their expanded form.
cList = {
  "ain 't": "am not",
  "aren 't": "are not",
  "can 't": "cannot",
  "can 't've": "cannot have",
  "'cause": "because",
  "could 've": "could have",
  "couldn 't": "could not",
  "couldn 't 've": "could not have",
  "didn 't": "did not",
  "doesn 't": "does not",
  "don 't": "do not",
  "hadn 't": "had not",
  "hadn 't 've": "had not have",
  "hasn 't": "has not",
  "haven 't": "have not",
  "he 'd": "he would",
  "he 'd 've": "he would have",
  "he 'll": "he will",
  "he 'll 've": "he will have",
  "he 's": "he is",
  "how 'd": "how did",
  "how 'd 'y": "how do you",
  "how 'll": "how will",
  "how 's": "how is",
  "I 'd": "I would",
  "I 'd 've": "I would have",
  "I 'll": "I will",
  "I 'll 've": "I will have",
  "I 'm": "I am",
  "I 've": "I have",
  "isn 't": "is not",
  "it 'd": "it had",
  "it 'd 've": "it would have",
  "it 'll": "it will",
  "it 'll 've": "it will have",
  "it 's": "it is",
  "let 's": "let us",
  "ma 'am": "madam",
  "mayn 't": "may not",
  "might 've": "might have",
  "mightn 't": "might not",
  "mightn 't 've": "might not have",
  "must 've": "must have",
  "mustn 't": "must not",
  "mustn 't 've": "must not have",
  "needn 't": "need not",
  "needn 't 've": "need not have",
  "o 'clock": "of the clock",
  "oughtn 't": "ought not",
  "oughtn 't 've": "ought not have",
  "shan 't": "shall not",
  "sha 'n 't": "shall not",
  "shan 't 've": "shall not have",
  "she 'd": "she would",
  "she 'd 've": "she would have",
  "she 'll": "she will",
  "she 'll 've": "she will have",
  "she 's": "she is",
  "should 've": "should have",
  "shouldn 't": "should not",
  "shouldn 't 've": "should not have",
  "so 've": "so have",
  "so 's": "so is",
  "that 'd": "that would",
  "that 'd 've": "that would have",
  "that 's": "that is",
  "there 'd": "there had",
  "there 'd 've": "there would have",
  "there 's": "there is",
  "they 'd": "they would",
  "they 'd 've": "they would have",
  "they 'll": "they will",
  "they 'll 've": "they will have",
  "they 're": "they are",
  "they 've": "they have",
  "to 've": "to have",
  "wasn 't": "was not",
  "we 'd": "we had",
  "we 'd 've": "we would have",
  "we 'll": "we will",
  "we 'll 've": "we will have",
  "we 're": "we are",
  "we 've": "we have",
  "weren 't": "were not",
  "what 'll": "what will",
  "what 'll 've": "what will have",
  "what 're": "what are",
  "what 's": "what is",
  "what 've": "what have",
  "when 's": "when is",
  "when 've": "when have",
  "where 'd": "where did",
  "where 's": "where is",
  "where 've": "where have",
  "who 'll": "who will",
  "who 'll 've": "who will have",
  "who 's": "who is",
  "who 've": "who have",
  "why 's": "why is",
  "why 've": "why have",
  "will 've": "will have",
  "won 't": "will not",
  "won 't 've": "will not have",
  "would 've": "would have",
  "wouldn 't": "would not",
  "wouldn 't 've": "would not have",
  "y 'all": "you all",
  "y 'alls": "you alls",
  "y 'all 'd": "you all would",
  "y 'all 'd 've": "you all would have",
  "y 'all 're": "you all are",
  "y 'all 've": "you all have",
  "you 'd": "you had",
  "you 'd 've": "you would have",
  "you 'll": "you will",
  "you 'll 've": "you will have",
  "you 're": "you are",
  "you 've": "you have"
}

# Lookup keyed by the lower-cased contraction, so a case-insensitive match
# ("i 'm", "Don 't") still resolves to its expansion.
_contraction_lookup = {key.lower(): value for key, value in cList.items()}

# Longest keys first: a regex alternation takes the first alternative that
# matches, so "he 'd" must not be tried before "he 'd 've".
c_re = re.compile(
  '(%s)' % '|'.join(re.escape(key) for key in sorted(cList, key=len, reverse=True)),
  re.IGNORECASE
)

def expandContractions(text, c_re=c_re):
  """Replace every contraction listed in `cList` by its expanded form.

  Matching ignores case; the replacement is always the value stored in
  `cList`, so the capitalisation of the matched text is not preserved.

  Args:
    text: the string to expand.
    c_re: compiled pattern used to find contractions (defaults to the
      module-level pattern built from `cList`).

  Returns:
    `text` with each match replaced; unchanged when nothing matches.
  """
  def replace(match):
    return _contraction_lookup[match.group(0).lower()]
  return c_re.sub(replace, text)

Add a new column to the pandas DataFrame that holds the cleaned, pre-processed responses (takes a long time to run)

In [None]:
def _build_emoticon_regex(emoticons):
  """Compile one case-insensitive regex that matches any entry of `emoticons`.

  An emoticon whose first or last character is a letter, digit or underscore
  is only matched when that edge is not glued to another word character, so
  that "xp" does not match inside "experience" and ":3" does not match
  inside "10:30".
  """
  parts = []
  # Longest first, so a short emoticon cannot pre-empt a longer one.
  for emoticon in sorted({e for e in emoticons if e}, key=lambda e: (-len(e), e)):
    part = re.escape(emoticon)
    if re.match(r'\w', emoticon[0]):
      part = r'(?<!\w)' + part
    if re.match(r'\w', emoticon[-1]):
      part = part + r'(?!\w)'
    parts.append(part)
  # `(?!)` never matches, so an empty emoticon list removes nothing.
  return re.compile('|'.join(parts) if parts else r'(?!)', re.IGNORECASE)

# Compiled lazily on the first call; re-running this cell resets it.
_emoticon_regex = None

def preprocess_text(text):
  """Return a lower-cased, punctuation-free version of `text`.

  Steps, in order: cast to str, lower-case and drop '@@ ' markers, expand
  contractions, remove emoticons, remove <tag> tokens, remove punctuation,
  strip surrounding whitespace.
  """
  global _emoticon_regex
  if _emoticon_regex is None:
    _emoticon_regex = _build_emoticon_regex(emojies)

  text = str(text)
  # turn to lowercase + remove '@@ '
  text = text.lower().replace('@@ ', '')
  # expand the contractions; lower-case again because some expansions
  # (e.g. "I would") contain capital letters
  text = expandContractions(text).lower()
  # remove emojies (case-insensitively, and never from inside a word)
  text = _emoticon_regex.sub('', text)
  # remove <..> tags
  text = re.sub(r'<[a-zA-Z0-9_]*>', '', text)
  # remove punctuations 
  text = re.sub(r'[^\w\s]', '', text)
  # remove leading + trailing whitespace
  text = text.strip()

  # stemming (not doing for now)
  # text = PorterStemmer().stem(text)

  return text

# Clean the responses only when the loaded frame does not already carry a
# 'response_clean' column, then write the frame to the data folder.
needs_cleaning = 'response_clean' not in data
if needs_cleaning:
  cleaned_responses = data['response'].apply(preprocess_text)
  data['response_clean'] = cleaned_responses
  cleaned_csv_path = get_dir('train' + '_cleaned.csv')
  data.to_csv(cleaned_csv_path, float_format=str)

In [None]:
# Preview the first ten rows of the frame.
preview_rows = data.head(10)
print(preview_rows)

# Spot-check the cleaning: first raw response, then its cleaned version.
for column_name in ('response', 'response_clean'):
  print(data.at[0, column_name])

   Unnamed: 0  ...                                     response_clean
0           0  ...  ok  will do  do not be late though  you will m...
1           1  ...  the fans inbetween dont have the kneejerk reac...
2           2  ...  i m not sure if i m going to be able to do any...
3           3  ...           i  i  i did not   looks at you curiously
4           4  ...                       thank you so much for the rt
5           5  ...  thaaaaaat fuccking sucks  i would just jump of...
6           6  ...  ur serius too huh   lol its str8 dependin on w...
7           7  ...                     that is what i m talking about
8           8  ...                    i m sure you will find out soon
9           9  ...        are you agreeing or is that a smart comment

[10 rows x 6 columns]
<first_speaker> <at> ok , will do - don 't be late though , you 'll miss the fun !
ok  will do  do not be late though  you will miss the fun


#### Word Cloud Data Visualization
Show word cloud for visualization

In [None]:
# all_words = ' '.join(validation['response_clean'].values)
# print(all_words[:50])
# from wordcloud import WordCloud
# wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
# plt.figure(figsize=(10, 7))
# plt.imshow(wordcloud, interpolation="bilinear")
# plt.axis('off')
# plt.show()

### Semantic Feature Extraction



Tokenizing

In [None]:
# Vocabulary cap handed to the tokenizer; None means no cap.
# MAX_NB_WORDS = 50000
MAX_NB_WORDS = None

# Cleaned responses as a plain Python list, used to fit the tokenizer.
data_responses = data['response_clean'].tolist()

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_responses)

In [None]:
# Convert each cleaned response into a list of integer word indices
# using the tokenizer fitted above.
data_sequences = tokenizer.texts_to_sequences(data_responses)

In [None]:
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

# Number of distinct words seen while fitting.
vocab_size = len(word_index)
print('Found %s unique tokens' % (vocab_size,))

Found 198337 unique tokens


In [None]:
# Pad (and, if needed, truncate) at the end of each sequence so that all
# rows of the resulting array have the same length.
padding_options = dict(padding='post', truncating='post')
data_array = pad_sequences(data_sequences, **padding_options)

print('Shape of data tensor: ', data_array.shape)
first_row = data_array[0]
print(first_row)

Shape of data tensor:  (1438195, 98)
[145  34  10  10   8  25 401 146   2  34 165   3 210   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]


Word2Vec and Embedding

Map each word to a dense vector of real numbers that reflects its meaning,

using a pretrained embedding model (takes >= 3.5 minutes to run)

In [None]:
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# word2vec_twitter = "https://drive.google.com/file/d/1lw5Hr6Xw0G0bMT1ZllrtMqEgCTrM7dzc/view?usp=sharing"
# google_news = "https://drive.google.com/u/0/open?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM"

# EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
# word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [None]:
EMBEDDING_DIM = 300

# Number of rows of the embedding matrix: one per word index plus one for
# index 0, capped by MAX_NB_WORDS when a cap is set.
nb_words = vocab_size + 1
if MAX_NB_WORDS is not None:
  nb_words = min(MAX_NB_WORDS, nb_words)

# # the embedding matrix
# embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

# for (word, idx) in word_index.items():
#     if word in word2vec.vocab and idx < nb_words:
#         embedding_matrix[idx] = word2vec.word_vec(word)

# print(embedding_matrix.shape)

In [None]:
# Parse the GloVe text file into a dict: word -> float32 coefficient vector.
# Each line holds a word followed by its space-separated coefficients.
embeddings_index = dict()
# `with` closes the file even if parsing fails; the encoding is explicit so
# non-ASCII words do not depend on the platform default.
with open(get_dir('glove.6B.50d.txt'), encoding='utf-8') as glove_file:
  for line in glove_file:
    values = line.split()
    if not values:
      # skip blank lines instead of failing on values[0]
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
# Build the embedding matrix for the GloVe vectors: row i holds the vector of
# the word with tokenizer index i; words without a GloVe vector stay all-zero.
EMBEDDING_DIM = 50
embedding_matrix2 = np.zeros((nb_words, EMBEDDING_DIM))
for (word, index) in word_index.items():
  if index >= nb_words:
    # Only reachable when MAX_NB_WORDS caps the vocabulary; without this
    # guard the assignment below would raise IndexError.
    continue
  embeddings_vector = embeddings_index.get(word)
  if embeddings_vector is not None:
    embedding_matrix2[index] = embeddings_vector

#### Word Vector Data Visualization
Write words and word vectors to output file to use for data visualization

Use website https://projector.tensorflow.org/ and click 'Load'. Then, upload the 'word_vecs.tsv' file for Step 1 for the word vectors, and upload 'words.tsv' for Step 2 to label each point


In [None]:
import csv 

# Export the vocabulary words that have a GloVe vector, for the TensorFlow
# Embedding Projector: 'word_vecs.tsv' gets one vector per row and
# 'words.tsv' the matching label per row.
# The vectors come from `embeddings_index` because the word2vec model is
# never loaded in this notebook (its loading cell is commented out).
# Both files are written in the same pass so their rows cannot get out of step.
with open('word_vecs.tsv', 'wt', newline='', encoding='utf-8') as vecs_file, \
     open('words.tsv', 'wt', newline='', encoding='utf-8') as words_file:
  vec_writer = csv.writer(vecs_file, delimiter='\t')
  word_writer = csv.writer(words_file, delimiter='\t')
  for (word, idx) in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None and idx < nb_words:
      vec_writer.writerow(vec.tolist())
      word_writer.writerow([word])

#### Splitting and Labelling Data

60-20-20 train-val-test

In [None]:
# Labels as a float array. The builtin `float` is used because the
# `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24.
data_labels = data['human-generated'].values.astype(float)
print('Original size: ', data_labels.shape[0])

# first, make 80-20 split on data to create train-test
X_train, X_test, y_train, y_test = train_test_split(
    data_array, data_labels, train_size=0.8, shuffle=True, random_state=10
)

# then, do 75-25 split on train to create train-validation (0.75 * 0.8 = 0.6, 0.25 * 0.8 = 0.2)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, train_size=0.75, shuffle=True, random_state=10
)

# check we have 60-20-20 train-val-test split
print('60%: ', data_labels.shape[0]*0.6)
print('20%: ', data_labels.shape[0]*0.2)
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)
print('X_validation shape: ', X_validation.shape)
print('y_validation shape: ', y_validation.shape)

Original size:  1438195
60%:  862917.0
20%:  287639.0
X_train shape:  (862917, 98)
y_train shape:  (862917,)
X_test shape:  (287639, 98)
y_test shape:  (287639,)
X_validation shape:  (287639, 98)
y_validation shape:  (287639,)


## 3 Machine Learning Model

### Neural Network Architecture (LSTM + CNN)

In [None]:
# CNN + stacked-LSTM binary classifier on top of frozen pretrained embeddings.
model = Sequential()
# Embedded layer
# Uses the GloVe matrix `embedding_matrix2`: the word2vec `embedding_matrix`
# is never built in this notebook (its cell is commented out), so referring
# to it raised a NameError. The output dimension is read from the matrix so
# the layer always agrees with its weights.
model.add(Embedding(input_dim=len(embedding_matrix2), output_dim=embedding_matrix2.shape[1], weights=[embedding_matrix2], 
                            input_length=data_array.shape[1], trainable=False))

# Convolutional Layer
# to be tuned later
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))

# LSTM Layer
model.add(LSTM(300, return_sequences=True))
model.add(Dropout(rate=0.5))
model.add(LSTM(300, return_sequences=True))
model.add(Dropout(rate=0.5))
model.add(LSTM(300))
# Single sigmoid unit: one probability per response.
model.add(Dense(units=1, activation='sigmoid'))

nadam = optimizers.Nadam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)

model.compile(loss='binary_crossentropy', optimizer=nadam, metrics=['acc'])
print(model.summary())

In [None]:
# Stacked bidirectional-LSTM binary classifier on top of trainable embeddings
# initialised from the GloVe matrix.
glove_embedding = Embedding(input_dim=len(embedding_matrix2), output_dim=EMBEDDING_DIM, input_length=data_array.shape[1], weights=[embedding_matrix2], trainable=True)

model_glove = Sequential([
  glove_embedding,
  # three recurrent stages, each followed by dropout and batch normalisation
  Bidirectional(LSTM(20, return_sequences=True)),
  Dropout(0.2),
  BatchNormalization(),
  Bidirectional(LSTM(20, return_sequences=True)),
  Dropout(0.2),
  BatchNormalization(),
  Bidirectional(LSTM(20)),
  Dropout(0.2),
  BatchNormalization(),
  # dense head ending in a single sigmoid unit
  Dense(64, activation='relu'),
  Dense(64, activation='relu'),
  Dense(1, activation='sigmoid'),
])
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# From here on `model` refers to the GloVe model.
model = model_glove
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 98, 50)            9916900   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 98, 40)            11360     
_________________________________________________________________
dropout_3 (Dropout)          (None, 98, 40)            0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 98, 40)            160       
_________________________________________________________________
bidirectional_4 (Bidirection (None, 98, 40)            9760      
_________________________________________________________________
dropout_4 (Dropout)          (None, 98, 40)            0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 98, 40)           

### Fit the model to the training data

In [None]:
# batch size was 16 before
# Per-epoch metrics are collected in our own History callback, so `hist`
# exists even when training stops early. Previously `hist` was only bound
# when `fit` returned, and the `finally` clause raised a NameError that hid
# the real error.
hist = keras.callbacks.History()
try:
  model.fit(X_train, y_train,
            validation_data=(X_validation, y_validation),
            epochs=50, batch_size=2048, shuffle=True,
            callbacks=[hist])
except KeyboardInterrupt:
  # Manual interruption is the one case worth tolerating: keep what was
  # trained so far. Every other exception propagates.
  print('Training interrupted; keeping the epochs completed so far.')

# Dict of metric name -> list of per-epoch values (empty if no epoch finished).
history = getattr(hist, 'history', {})

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Analyze the model's accuracy and loss

In [None]:
#Results: summarize the history for accuracy
# One curve per data split, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for metric_name in ('accuracy', 'val_accuracy'):
  ax.plot(hist.history[metric_name])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='lower right')
plt.show()

In [None]:
#Summarize for loss
# One curve per data split, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for metric_name in ('loss', 'val_loss'):
  ax.plot(hist.history[metric_name])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

### Test the model on unseen data

In [None]:
# Percentage accuracy of test data
# Round the predicted probabilities to hard 0/1 labels before scoring.
predicted_probabilities = model.predict(X_test)
y_pred = np.round(predicted_probabilities.flatten())
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy*100))

# Values reported by Keras itself for the same test set.
res = model.evaluate(X_test, y_test)
print(res)