# Data Cleaning


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

# --- Load and clean the raw reviews -----------------------------------------
# Produces `df` with exactly two columns:
#   reviews   - raw review text
#   sentiment - 0 for negative (rating bin 0-2), 1 for positive (rating bin 4-5)
# Rows whose rating falls in bin 3 (neutral) are removed.

RAW_CSV_PATH = '7282_1.csv'
# Index label (in the raw CSV) of the row whose review is the placeholder
# text "xxxxxxxxxxxxxxx".
INVALID_REVIEW_ROW = 98
# Half-open bins [lo, hi): <0.5 -> 0, [0.5, 1.5) -> 1, ... , >=4.5 -> 5
RATING_BIN_EDGES = [-np.inf, 0.5, 1.5, 2.5, 3.5, 4.5, np.inf]

df = (
    pd.read_csv(RAW_CSV_PATH)
    ## Rename columns
    .rename(columns={'reviews.rating': 'ratings',
                     'reviews.text': 'reviews',
                     'reviews.username': 'username'})
    ## drop rows with a missing rating or a missing review
    .dropna(subset=['ratings', 'reviews'])
    ## drop the row that has an invalid review "xxxxxxxxxxxxxxx"
    ## (raises KeyError if that label is no longer present)
    .drop(index=INVALID_REVIEW_ROW)
    .reset_index(drop=True)
)

# NOTE(review): the original cell called df.drop_duplicates(subset=['username'])
# without keeping the result, so no rows were ever removed. That no-op is
# dropped here so the dataset stays identical to the one the rest of the
# notebook was run on. TODO: decide whether de-duplication is wanted and on
# which columns (de-duplicating on username alone would discard every
# additional review written by the same user).

## standardize the ratings: values above 5 are halved
## (presumably those come from a 10-point scale - TODO confirm)
ratings = df['ratings'].where(df['ratings'] <= 5, df['ratings'] / 2)

## categorizing ratings into the integer bins 0..5
rating_bins = pd.cut(ratings, bins=RATING_BIN_EDGES, right=False, labels=False)

## sentiment mask based on ratings
sentiment = rating_bins.map({0: 'negative', 1: 'negative', 2: 'negative',
                             3: 'neutral', 4: 'positive', 5: 'positive'})
df = df.assign(sentiment=sentiment)[['reviews', 'sentiment']]

## drop all rows where sentiment is neutral; .copy() makes `df` an independent
## frame so the assignment below never writes into a slice of another frame
df = df[df['sentiment'] != 'neutral'].copy()

# converting labels to 0 and 1
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})



# Preprocessing data

In [2]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re

# Fetch the NLTK resources used by the helpers below (stop-word list,
# tokenizer models, WordNet data).
for nltk_package in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_package)

# Lazily-built cache of the English stop-word set, so the list is loaded once
# instead of once per review.
_ENGLISH_STOP_WORDS = None


def remove_Stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with NLTK's word tokenizer; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. The remaining tokens
    are joined with single spaces.

    Args:
        text: the string to filter.

    Returns:
        The filtered string (empty if every token was a stop word).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    # Call the tokenizer through the `nltk` module: a later cell rebinds the
    # bare name `word_tokenize` to a Keras Tokenizer object, which would make
    # this function fail if it were re-run after that cell.
    tokens = nltk.word_tokenize(text)
    return " ".join(
        token for token in tokens if token.lower() not in _ENGLISH_STOP_WORDS
    )

def lemmatize_text(text):
    """Return `text` with every token replaced by its WordNet lemma.

    The text is split into sentences, each sentence into word tokens, and each
    token is passed to `WordNetLemmatizer.lemmatize` without a part-of-speech
    argument. Tokens are re-joined with single spaces, sentence by sentence.

    Args:
        text: the string to lemmatize.

    Returns:
        The lemmatized string.
    """
    lemmatizer = WordNetLemmatizer()
    # Call the tokenizers through the `nltk` module: a later cell rebinds the
    # bare name `word_tokenize` to a Keras Tokenizer object, which would make
    # this function fail if it were re-run after that cell.
    return " ".join(
        " ".join(lemmatizer.lemmatize(token)
                 for token in nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(text)
    )

# Translation table that deletes every character in string.punctuation.
# Built once rather than on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Strip punctuation, collapse whitespace and lower-case `text`.

    Args:
        text: the string to normalise.

    Returns:
        `text` with all `string.punctuation` characters removed, runs of
        whitespace reduced to single spaces, leading/trailing whitespace
        trimmed, and all letters lower-cased.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    # split() with no argument drops empty fields, which normalises spacing
    return " ".join(without_punctuation.split()).lower()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aymanadil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/aymanadil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aymanadil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Cleaned data 

In [3]:
# Normalise every review in three passes, applied in this order:
# punctuation/case clean-up, stop-word removal, then lemmatization.
df['reviews'] = (
    df['reviews']
    .apply(clean_text)
    .apply(remove_Stopwords)
    .apply(lemmatize_text)
)

# negative_df = df[df['sentiment'] == 'negative']
# negative_df.head(10)

In [4]:
# Neutral rows were already removed in the data-cleaning cell, so this only
# reports the (rows, columns) of the frame that goes into the split.

df.shape

(29243, 2)

# Splitting data

In [5]:
from sklearn.model_selection import train_test_split
# Features are the cleaned review texts; labels are the 0/1 sentiment column.
X, y = df['reviews'], df['sentiment']

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shapes of each features/labels pair: train first, then test.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(23394,) (23394,)
(5849,) (5849,)


# Preparing embedding layer

In [7]:
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM

# NOTE: this rebinds `word_tokenize`, which an earlier cell imported from
# nltk.tokenize. The name is kept because later cells read
# `word_tokenize.word_index`.
word_tokenize = Tokenizer()
# The vocabulary is learned from the training texts only.
word_tokenize.fit_on_texts(X_train)

# Convert each review into a list of integer word indices.
train_sequences = word_tokenize.texts_to_sequences(X_train)
test_sequences = word_tokenize.texts_to_sequences(X_test)

# One more than the number of known words
# (presumably index 0 is reserved for padding - TODO confirm).
vocab_size = len(word_tokenize.word_index) + 1

# Pad every sequence to the length of the longest training sequence.
maxlen = max(len(sequence) for sequence in train_sequences)
print(maxlen)

X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)


289


### Creating feature matrix using GloVe word embeddings

In [8]:
from numpy import asarray
from numpy import zeros

In [9]:
# Build a {word: vector} lookup from the GloVe text file. Each non-empty line
# is parsed as a token followed by its float components.
embeddings_dict = dict()

# `with` closes the file even if a line fails to parse.
with open('a2_glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # skip blank lines instead of failing on records[0]
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dict[word] = vector_dimensions

# Creating embedding matrix 
This will contain the 100-dimensional GloVe word embeddings for all words
in our corpus.

In [14]:
# One 100-wide row per vocabulary index. Rows start as zeros and are filled
# from the GloVe lookup; words without a GloVe vector keep an all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for token, row in word_tokenize.word_index.items():
    glove_vector = embeddings_dict.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

embedding_matrix.shape

(28258, 100)