## 1. Read the source data

In [1]:
#import pandas module for creating dataframe
import pandas as pd

# Read the reviews CSV into a DataFrame.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, which suggests
# the file carries a saved index column; passing index_col=0 would likely absorb
# it — confirm against the CSV before changing.
data = pd.read_csv("Review.csv")
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,Review
0,I like this books very much!!! It is VERY INTE...
1,Do not like this book. so boring 2. Too length...


## 2. Remove punctuation using the string library and convert the documents to lowercase

In [2]:
#import string module for string manipulation
import string

# string.punctuation is the constant listing the characters that the
# punctuation-removal step strips out; displayed here for reference.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
# Translation table that maps every character in string.punctuation to None,
# built once so that each call is a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character listed in
        ``string.punctuation``; all other characters (letters, digits,
        whitespace, non-ASCII symbols) are kept in their original order.
    """
    # str.translate deletes the mapped characters in one pass, replacing the
    # manual loop that rebuilt the result string one character at a time.
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every entry of the 'Review' column and keep the result
# in a new 'clean_punctuation' column; the 'Review' column itself is not modified.
data['clean_punctuation']= data['Review'].apply(remove_punctuation)
data

Unnamed: 0,Review,clean_punctuation
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy


In [4]:
# Lowercase the punctuation-free text so that words differing only in case
# (e.g. "VERY" and "very") end up as the same string in later steps.
data['clean_lower']= data['clean_punctuation'].str.lower()
data


Unnamed: 0,Review,clean_punctuation,clean_lower
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy


## 3. Remove numbers using re.sub() from the regular expression library

In [5]:
#import regular expression library
import re

# Function to replace every digit (\d) or hyphen (-) in a document with an empty string ''
def remove_numbers(text):
    """Return ``text`` with all digits and hyphens removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every digit and every ``-`` has been
        deleted. Surrounding whitespace is left as is, so removing a
        standalone number keeps the spaces on either side of it.
    """
    # Raw string: in a plain literal the backslash-d pair is an invalid escape
    # sequence, which newer Python versions report as a warning at compile time.
    return re.sub(r"[\d-]", '', text)

# Remove digits and hyphens from the lowercased text and store the result in a
# new 'clean_number' column.
data['clean_number'] = data['clean_lower'].apply(remove_numbers)
data


Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy


## 4. Break the documents down into word tokens using the nltk library

In [6]:
# Import the Natural Language Toolkit (NLTK), a natural language processing
# library, and download the 'punkt' tokenizer data.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead; if word_tokenize raises a LookupError, download that resource too —
# confirm against the installed NLTK version.
import nltk
nltk.download('punkt')

# Import the word-level tokenizer
from nltk.tokenize import word_tokenize

# Split each cleaned document into a list of word tokens, stored in 'token_data'
data['token_data']= data['clean_number'].apply(word_tokenize)
data


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver..."
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l..."


## 5. Remove stopwords using the nltk library

In [7]:
# Download the stopword corpus
nltk.download('stopwords')

# Load the English stopword list from the downloaded corpus
stopwords = nltk.corpus.stopwords.words('english')

# Print the list to inspect which words will be filtered out.
# NOTE(review): the list contains negations such as 'no', 'nor' and 'not', so
# removing stopwords can drop the negation from a review.
print(stopwords)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Filter a tokenized document down to the tokens that are not stopwords
def remove_stopwords(text):
    """Return the tokens of ``text`` that are absent from ``stopwords``.

    ``text`` is an iterable of tokens; the result is a new list holding the
    surviving tokens in their original order.
    """
    return [token for token in text if token not in stopwords]

# Drop stopwords from every token list in 'token_data' and store the result in a
# new 'clean_xstopwords' column.
# NOTE(review): in the second review "do not like" is reduced to "like", because
# 'do' and 'not' are both in the stopword list — the negation is lost.
data['clean_xstopwords'] = data['token_data'].apply(remove_stopwords)
data


Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[like, books, much, interesting]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[like, book, boring, lengthy]"


## 6. Perform word stemming using the Porter stemmer in the nltk library

In [9]:
#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer

# Create one Porter stemmer instance that the stemming function reuses for every word
porter_stemmer = PorterStemmer()

# Stem every token of a document
def stemming(text):
    """Return a list holding the Porter stem of each word in ``text``, in order."""
    return [porter_stemmer.stem(token) for token in text]

# Stem the stopword-free tokens and store the result in a new 'clean_stemmed' column.
# Stems need not be dictionary words: the output shows 'boring' -> 'bore' and
# 'lengthy' -> 'lengthi'.
data['clean_stemmed'] = data['clean_xstopwords'].apply(stemming)
data


Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords,clean_stemmed
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[like, books, much, interesting]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[like, book, boring, lengthy]","[like, book, bore, lengthi]"


## 7. Perform word lemmatization using WordNetLemmatizer() in the nltk library

In [10]:
# Download the 'wordnet' package
nltk.download('wordnet')

# Import the lemmatizer class from the nltk library
from nltk.stem import WordNetLemmatizer

# Create one lemmatizer instance that the lemmatizer function reuses for every word
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize every token of a document
def lemmatizer(text):
    """Return a list holding the WordNet lemma of each word in ``text``, in order."""
    return [wordnet_lemmatizer.lemmatize(token) for token in text]

# Lemmatize the stopword-free tokens and store the result in a new 'clean_lemmatized1' column
data['clean_lemmatized1']=data['clean_xstopwords'].apply(lemmatizer)

# Lemmatize the already-stemmed tokens and store the result in a new 'clean_lemmatized2' column.
# In the rendered output this column is identical to 'clean_stemmed': the
# lemmatizer leaves the stems unchanged.
data['clean_lemmatized2']=data['clean_stemmed'].apply(lemmatizer)
data


[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords,clean_stemmed,clean_lemmatized1,clean_lemmatized2
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[like, books, much, interesting]","[like, book, much, interest]","[like, book, much, interesting]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[like, book, boring, lengthy]","[like, book, bore, lengthi]","[like, book, boring, lengthy]","[like, book, bore, lengthi]"
