# Installing Libraries

In [1]:
%%capture

!pip install pycld2
!pip install contractions
!pip install spacy

# Importing Libraries

In [2]:
#Libraries for data and regex handling
import re
import numpy as np
import pandas as pd
import time
from google.colab import drive

#Libraries for text preprocessing
import contractions
from pycld2 import detect
import nltk;
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer,word_tokenize

#Library for warnings
import warnings

ModuleNotFoundError: ignored

# Functions

In [None]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [None]:
def clean_text(document):
  document = re.sub("'<.*?>'", "", document)
  document = remove_emoji(document)
  document = re.sub('[-!.,?:;_()*/\|<>^°#@=%"&]+', "", document)
  document = re.sub(r'([a-z])\1+', r'\1\1', document)
  return document

In [None]:
def lang_dect(text):
    isReliable, textBytesFound, details = detect(text)
    if isReliable and (details[0][0] == 'ENGLISH'):
        return "English"
    return "Other"

In [None]:
def parser(document):
  document = contractions.fix(document) #contractions expansion
  document = document.lower() #converting all words in lower case
  document = re.sub("\d+", "", document) #numbers removal
  document = re.sub("[£$€₽¥]+", "money", document)

  return document

In [None]:
def text_prep(document):
  tokens = word_tokenize(document, language="english") #tokenization
  filtered_tokens = [token for token in tokens if token not in stop_words] #stop words removal
  #reconstruct document
  document = ' '.join(filtered_tokens)
  return document

In [None]:
def parser_ema(document):
  document = contractions.fix(document) #contractions expansion
  document = re.sub("\d+", "", document) #numbers removal
  document = re.sub("[£$€₽¥]+", "money", document)

  return document

In [None]:
def clean_text_ale(document):
  document = contractions.fix(document) 
  document = re.sub("<br>", "", document)
  document = re.sub("[£$€₽¥]+", "money", document)
  document = remove_emoji(document)
  document = re.sub('[-_()*/\|<>^°#@=%"&]+', "", document)
  return document



# Text Preprocessing

In [None]:
# Supressing warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# NLTK Stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#mount the drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Text_Mining_Project_Amazon/Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
print(df.shape)

(568454, 10)


In [None]:
subset = {"UserId", "Time", "Text"}
AmazonReviews = df.drop_duplicates(subset=subset, keep="first")

In [None]:
AmazonReviews.shape

(393892, 10)

In [None]:
AmazonReviews.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               11
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                    3
Text                       0
dtype: int64

In [None]:
start = time.time()
AmazonReviews['Text'] = AmazonReviews['Text'].apply(clean_text)
end = time.time()
print(end - start)

40.842726707458496


In [None]:
start = time.time()
Text = AmazonReviews['Text'].to_list()
Language = []
for i in range(0, len(Text)):
    try:
        Language.append(lang_dect(Text[i]))
    except:
        Language.append(False)
        continue
AmazonReviews['Language'] = Language
end = time.time()
print(end - start)

16.318511486053467


In [None]:
print(AmazonReviews['Language'].value_counts())

English    393735
Other         149
False           8
Name: Language, dtype: int64


In [None]:
AmazonReviews = AmazonReviews[AmazonReviews['Language'] == "English"]
AmazonReviews_ema = AmazonReviews.copy()

In [None]:
start = time.time()
AmazonReviews['Text'] = AmazonReviews['Text'].apply(parser)
end = time.time()
print(end - start)

51.48536157608032


In [None]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
start = time.time()
Documents = AmazonReviews['Text'].apply(text_prep)
end = time.time()
print(end - start)

134.93397974967957


In [None]:
AmazonFinal = pd.DataFrame({'Text':Documents,
                            'Summary': AmazonReviews['Summary'],  
                            'Score': AmazonReviews['Score'],
                            })

In [None]:
AmazonFinal

Unnamed: 0,Text,Summary,Score
0,bought several vitality canned dog food produc...,Good Quality Dog Food,5
1,product arrived labeled jumbo salted peanutsth...,Not as Advertised,1
2,confection around centuries light pillowy citr...,"""Delight"" says it all",4
3,looking secret ingredient robitussin believe f...,Cough Medicine,2
4,great taffy great price wide assortment yummy ...,Great taffy,5
...,...,...,...
568449,great sesame chickenthis good better resturant...,Will not do without,5
568450,disappointed flavor chocolate notes especially...,disappointed,2
568451,stars small give one training session tried tr...,Perfect for our maltipoo,5
568452,best treats training rewarding dog good groomi...,Favorite Training and reward treat,5


In [None]:
AmazonFinal.to_csv("/content/drive/MyDrive/Text_Mining_Project_Amazon/AmazonFinal_diandre.csv", index = False)

# Some changes for Emanuele's tasks

In [None]:
start = time.time()
AmazonReviews_ema['Text'] = AmazonReviews_ema['Text'].apply(parser_ema)
end = time.time()
print(end - start)

22.71772003173828


In [None]:
start = time.time()
Documents_ema = AmazonReviews_ema['Text'].apply(text_prep)
end = time.time()
print(end - start)

179.2514684200287


In [None]:
AmazonReviews.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'Language'],
      dtype='object')

In [None]:
AmazonFinal_ema = pd.DataFrame({'Text':Documents_ema,
                            'Summary': AmazonReviews['Summary'],  
                            'Score': AmazonReviews['Score'],
                            'Time': AmazonReviews['Time'],
                            'HelpfulnessNumerator': AmazonReviews['HelpfulnessNumerator']
                            })

In [None]:
AmazonFinal_ema

Unnamed: 0,Text,Summary,Score,Time,HelpfulnessNumerator
0,I bought several Vitality canned dog food prod...,Good Quality Dog Food,5,1303862400,1
1,Product arrived labeled Jumbo Salted Peanutsth...,Not as Advertised,1,1346976000,0
2,This confection around centuries It light pill...,"""Delight"" says it all",4,1219017600,1
3,If looking secret ingredient Robitussin I beli...,Cough Medicine,2,1307923200,3
4,Great taffy great price There wide assortment ...,Great taffy,5,1350777600,0
...,...,...,...,...,...
568449,Great sesame chickenthis good better resturant...,Will not do without,5,1299628800,0
568450,I disappointed flavor The chocolate notes espe...,disappointed,2,1331251200,0
568451,These stars small give one training session I ...,Perfect for our maltipoo,5,1329782400,2
568452,These BEST treats training rewarding dog good ...,Favorite Training and reward treat,5,1331596800,1


In [None]:
AmazonFinal_ema.to_csv("/content/drive/MyDrive/Text_Mining_Project_Amazon/AmazonFinal_diandreperema.csv", index = False)