# Downloading Data from Git

In [18]:
! curl -O https://raw.githubusercontent.com/gaylorav/NLPFinal/main/bg_descriptions_v2.csv
! ls

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3230k  100 3230k    0     0  7281k      0 --:--:-- --:--:-- --:--:-- 7275k
bg_descriptions_v2.csv	sample_data


#Imports

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('popular')
nltk.download('punkt_tab')
import re
from collections import defaultdict
import string

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

# Selecting Relevant Columns From CSV -> PD

In [21]:
descriptions_df = pd.read_csv("bg_descriptions_v2.csv", encoding="utf-8")
descriptions_df['sentiment'] = descriptions_df['sentiment'].astype(int)

print(len(descriptions_df))
descriptions_df.head()

2600


Unnamed: 0,id,description,usersrated,average,name,sentiment
0,10869,From the Mayfair Catalog: Heart of Africa is a...,375,5.46693,Heart of Africa,0
1,72482,Heroes of Graxia is a deck building game for 2...,408,5.47314,Heroes of Graxia,0
2,295394,"In Juduku (aka Guatafac or Dreister), you hav...",141,5.11554,Juduku,0
3,172032,"You are mighty VIKINGS! Mighty, but very small...",497,5.08663,Bottlecap Vikings,0
4,72766,SmileyFace is a card game of face-to-face fami...,475,5.44406,SmileyFace,0


## Normalize descriptions

In [22]:
stoplist = stopwords.words('english')
stoplist = stoplist + list(string.punctuation) + ["'s", "n't"]
stoplist = set(stoplist)

pattern = re.compile(r'^[^A-Za-z0-9\s]+$')

word_lens = []
words = []

for d in descriptions_df["description"]:
  d = nltk.word_tokenize(d)
  word_lens.append(len(d))
  for word in d:
    word = word.lower()
    if word not in stoplist and not pattern.match(word):
      words.append(word)

In [23]:
print(descriptions_df["usersrated"].median())
print(descriptions_df["usersrated"].mean())
print(sum(word_lens) / len(word_lens))
df_words = pd.Series(words).value_counts().reset_index()
df_words.columns = ['word', 'frequency']
df_words.head(50)

276.0
2404.449230769231
236.4526923076923


Unnamed: 0,word,frequency
0,game,7257
1,players,4085
2,player,3266
3,cards,3147
4,one,2638
5,card,1866
6,play,1652
7,new,1486
8,two,1247
9,board,1234


# RNN-LSTM Analysis

In [24]:
#More necessary imports
#Implement RNN using Keras
from sklearn.model_selection import train_test_split

import keras
from keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [25]:
#check how data looks so we can compare later
print(descriptions_df['description'][0])

From the Mayfair Catalog: Heart of Africa is a meaty trading game where the players each lead a trading company which tries to make profits in Africa. Although Africa had been circumnavigated by the start of the 19th century, the continent remained largely undiscovered. One of the great mysteries was the source of the Nile, and explorers like Burton, Speke, Baker and Livingstone became immortal in their attempts to find it. 


### 1. Preprocess data for LSTM

In [26]:
#Clean up the text a little bit (normalize)
def clean(text):
  text = text.lower() #make text lowercase
  text = re.sub('[^a-zA-Z]',' ', text) #remove punctuation
  text = re.sub("\s+[a-zA-Z]\s+", ' ', text) #remove single letters (results from removing apostraphes, etc)
  text = re.sub(r'\s+', ' ', text) #remove extra spaces
  return text

descriptions = []
all_desc = list(descriptions_df["description"])
for desc in all_desc:
  descriptions.append(clean(desc)) #clean each description

#Remove stopwords:
stoplist = stopwords.words('english')
stoplist = stoplist + list(string.punctuation) + ["'s", "n't"]
stoplist = set(stoplist)

pattern = re.compile(r'^[^A-Za-z0-9\s]+$')

word_lens = []
words = []

for d in descriptions:
  d = nltk.word_tokenize(d)
  word_lens.append(len(d))
  for word in d:
    word = word.lower()
    if word not in stoplist and not pattern.match(word):
      words.append(word)

In [27]:
#check that descriptions were formatted the way we intended:
print(descriptions[0])
print(descriptions_df["sentiment"][0])

from the mayfair catalog heart of africa is meaty trading game where the players each lead trading company which tries to make profits in africa although africa had been circumnavigated by the start of the th century the continent remained largely undiscovered one of the great mysteries was the source of the nile and explorers like burton speke baker and livingstone became immortal in their attempts to find it 
0


In [28]:
#Split data before we continue working with it
#split data into 80 = train, 20 = temp
#split temp into 10 = validate, 10 = test

ratings = descriptions_df['sentiment']
train_desc, temp_desc = train_test_split(descriptions, test_size=0.2, random_state=42)
train_rating, temp_rating = train_test_split(ratings, test_size=0.2, random_state=42)

val_desc, test_desc = train_test_split(temp_desc, test_size=0.5, random_state=42)
val_rating, test_rating = train_test_split(temp_rating, test_size=0.5, random_state=42)

In [29]:
#First step is padding all input to be the same length (necessary for RNN)
#So find longest description, and pad all other reviews to be the same length

#tokenizer.fit_on_texts will create a vocabulary from our descriptions
#vocab will be ordered by frequency (most frequent word will be "1", etc.)
#vocab is limited to top 5000 frequent words by default
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions)

#each description is mapped onto the numbers corresponding to each word in the description
#(basically we turned our descriptions into numbers)
train_desc = tokenizer.texts_to_sequences(train_desc)
val_desc = tokenizer.texts_to_sequences(val_desc)
test_desc = tokenizer.texts_to_sequences(test_desc)

#define vocab length
VOCAB_LEN = len(tokenizer.word_index) + 1

#pad data so that each sequence is length of the longest description sequence
MAX_LEN = 250 #changed to average length of descriptions to avoid excessive padding that kept breaking the model
train_desc = pad_sequences(train_desc, maxlen=MAX_LEN, padding='post')
val_desc = pad_sequences(val_desc, maxlen=MAX_LEN, padding='post')
test_desc = pad_sequences(test_desc, maxlen=MAX_LEN, padding='post')


#one-hot encode sentiment categories
#i.e., if a game is "positively" rated, [0,0,1]
#first shift labels from -1,0,1 to 0,1,2
def one_hot(labels):
  labels = to_categorical(labels, num_classes=2)
  return labels

train_rating = one_hot(train_rating)
val_rating = one_hot(val_rating)
test_rating = one_hot(test_rating)


#check all variables and encodings worked:
print("vocab length:", VOCAB_LEN)
print("max length:", MAX_LEN)
print(train_desc[-5:])
print(train_rating[-5:])

#Check shape
print('Shape of training data: ')
print(train_desc.shape)
print(train_rating.shape)
print('Shape of test data: ')
print(test_desc.shape)
print(test_rating.shape)





vocab length: 24995
max length: 250
[[ 437 7012    7 ...    0    0    0]
 [  14    1 9564 ...    0    0    0]
 [ 190  587   83 ...   53  884 1282]
 [  64   25   27 ...    0    0    0]
 [9013    5    1 ...    0    0    0]]
[[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
Shape of training data: 
(2080, 250)
(2080, 2)
Shape of test data: 
(260, 250)
(260, 2)


### Defining and Training the Model

In [30]:
from tensorflow.keras.metrics import Recall, Precision

#define model
#input dimension:how many words the model can learn (should be our vocab size)
#output dimesnion: size of word embedding vector **************
#LSTM: argument is the number of LSTM units model has (does the learning part of the model)
#Dense: final output layer, arguments are num categories, softmax turns output into prob dist over categories (does actual classificaiton)

#Build model, need embedding layer (since we didn't embed the words ourselves), LSTM layer with 128 units,
# and dense layer that produces the output (hence 3 dimensions, one for each category)
# dense layer activated by softmax since we have multiple categories
model = Sequential()
model.add(Embedding(VOCAB_LEN, output_dim=100, trainable=True))
model.add(LSTM(128))
model.add(Dense(2, activation='sigmoid'))

#compile and train model
optimizer = keras.optimizers.RMSprop(learning_rate=0.0005)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', Precision(), Recall()])
results = model.fit(train_desc, train_rating, epochs=5, batch_size=32, validation_data=(val_desc, val_rating))


Epoch 1/5
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.5721 - loss: 0.6825 - precision_2: 0.5859 - recall_2: 0.6020 - val_accuracy: 0.6154 - val_loss: 0.6605 - val_precision_2: 0.6122 - val_recall_2: 0.6192
Epoch 2/5
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6764 - loss: 0.6207 - precision_2: 0.6757 - recall_2: 0.6830 - val_accuracy: 0.6077 - val_loss: 0.6478 - val_precision_2: 0.6077 - val_recall_2: 0.6077
Epoch 3/5
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6753 - loss: 0.6187 - precision_2: 0.6743 - recall_2: 0.6778 - val_accuracy: 0.6077 - val_loss: 0.6509 - val_precision_2: 0.6077 - val_recall_2: 0.6077
Epoch 4/5
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.6571 - loss: 0.6281 - precision_2: 0.6568 - recall_2: 0.6620 - val_accuracy: 0.6192 - val_loss: 0.6479 - val_precision_2: 0.6098 - val_recall_2: 0.6192


In [31]:
for key in results.history.keys():
  print(key)
  print(results.history[key])

accuracy
[0.6149038672447205, 0.661057710647583, 0.6677884459495544, 0.6625000238418579, 0.6706730723381042]
loss
[0.672082245349884, 0.632451057434082, 0.6260349154472351, 0.6270585656166077, 0.618315577507019]
precision_2
[0.6184335947036743, 0.6600000262260437, 0.6668262481689453, 0.6619047522544861, 0.6677756905555725]
recall_2
[0.6225961446762085, 0.6663461327552795, 0.6697115302085876, 0.6682692170143127, 0.6754807829856873]
val_accuracy
[0.6153846383094788, 0.607692301273346, 0.607692301273346, 0.6192307472229004, 0.6192307472229004]
val_loss
[0.6605053544044495, 0.6478320956230164, 0.6509251594543457, 0.6478798985481262, 0.6446498036384583]
val_precision_2
[0.6121672987937927, 0.607692301273346, 0.607692301273346, 0.6098484992980957, 0.6121672987937927]
val_recall_2
[0.6192307472229004, 0.607692301273346, 0.607692301273346, 0.6192307472229004, 0.6192307472229004]


In [32]:
results = model.evaluate(test_desc, test_rating)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6821 - loss: 0.5954 - precision_2: 0.6809 - recall_2: 0.6848


[0.5857352018356323,
 0.7153846025466919,
 0.7110266089439392,
 0.7192307710647583]

In [37]:
precision = 0.6809
recall = 0.6848

f1_score = 2 * (precision * recall) / (precision + recall)
print(f1_score)

0.6828444314271069
