# Data Cleaning


In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

df = pd.read_csv('7282_1.csv')
## Rename columns 
df.rename(columns = {'reviews.rating':'ratings', 'reviews.text':'reviews','reviews.username':'username'}, inplace = True)

## drop null values
df.dropna(subset=['ratings'], inplace=True)
df.dropna(subset=['reviews'], inplace=True)

## drop duplicates
df.drop_duplicates(subset=['username'])
df.dropna(subset=['ratings'], inplace=True)

## drop row 98 that has an invalid review "xxxxxxxxxxxxxxx"
df.drop(98, inplace=True)

df.reset_index(drop=True, inplace = True)

## standardize the ratings
filtered_values = df.loc[df['ratings'] > 5, 'ratings']
# Divide the filtered values by 2
filtered_values_divided = filtered_values / 2
# Update the original DataFrame with the new values
df.loc[df['ratings'] > 5, 'ratings'] = filtered_values_divided


## categorizing ratings
df.loc[(df['ratings'] >= 4.5), 'ratings'] = 5.0

df.loc[(df['ratings'] >= 3.5) & (df['ratings'] < 4.5), 'ratings'] = 4.0

df.loc[(df['ratings'] >= 2.5) & (df['ratings'] < 3.5), 'ratings'] = 3.0

df.loc[(df['ratings'] >= 1.5) & (df['ratings'] < 2.5), 'ratings'] = 2.0

df.loc[(df['ratings'] >= 0.5) & (df['ratings'] < 1.5), 'ratings'] = 1.0

df.loc[(df['ratings'] < 0.5), 'ratings'] = 0.0

## sentiment mask based on ratings 
df['sentiment'] = df['ratings'].map({0:'negative', 1:'negative', 2:'negative', 3:'neutral', 4:'positive', 5:'positive'})
df = df[['reviews', 'sentiment']]

## drop all rows where sentiment is neutral
df.drop(df[df['sentiment'] =='neutral'].index, inplace=True)


# Preprocessing data

In [42]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def remove_Stopwords(text):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    words = [word for word in words if word.lower() not in stop_words]
    return " ".join(words)

#change word back to its original form
def lemmatize_text(text):
    wordlist = []
    lemmatizer = WordNetLemmatizer()
    sentences = sent_tokenize(text)
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [lemmatizer.lemmatize(word) for word in words]
        wordlist.append(" ".join(words))
    return " ".join(wordlist)

#remove special characters and punctuations from the text
def clean_text(text):
    delete_dic = {sp_character: "" for sp_character in string.punctuation}
    delete_dic[" "] = " "
    table = str.maketrans(delete_dic)
    text1 = text.translate(table)
    textArr = text1.split()
    text2 = " ".join([word for word in textArr])
    return text2.lower()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aymanadil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/aymanadil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aymanadil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Cleaned data 

In [43]:
df['reviews'] = df['reviews'].apply(clean_text)
df['reviews'] = df['reviews'].apply(remove_Stopwords)
df['reviews'] = df['reviews'].apply(lemmatize_text)

# negative_df = df[df['sentiment'] == 'negative']
# negative_df.head(10)

Unnamed: 0,reviews,sentiment
19,hotellihuone oli ullakolla jossa ei pystynyt k...,negative
20,dont stay unless youre le 2 foot tall like sle...,negative
44,wall extremely thin hear everything excessive ...,negative
57,share opinion businesswith yp visitor across u...,negative
58,share opinion businesswith yp visitor across u...,negative
59,share opinion businesswith yp visitor across u...,negative
79,share opinion businesswith yp visitor across u...,negative
82,pathetic discriminatory free shuttle service a...,negative
85,room clean hotel worker good breakfast also go...,negative
91,visited comfort suite new iberia louisiana dau...,negative


In [40]:
# df.drop(df[df['sentiment'] == 'neutral'].index, inplace=True)

df.shape

(29243, 2)