In [1]:
import pandas as pd
import time

import re

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from wordcloud import WordCloud

import string
from langdetect import detect
import numpy as np

# 1. Load Data

In [2]:
# Read the scraped reviews; index_col=False tells pandas not to use the
# first column as the index.
raw_reviews_path = 'web_content.csv'
df = pd.read_csv(raw_reviews_path, index_col=False)

In [3]:
# Cast both text columns to str so the regex and text-cleaning steps below
# always receive strings.
for text_col in ('reviews', 'rating'):
    df[text_col] = df[text_col].astype(str)

In [4]:
#converting ratings to a number
# Rating labels look like "Rating 3 out of 5"; capture the digits that
# precede "out of 5". The pattern is compiled once and searched once per row
# (the previous lambda ran the same search twice for every row).
_RATING_PATTERN = re.compile(r'(\d+) out of 5')


def _parse_rating(label):
    """Return the integer rating found in `label`, or None if there is none."""
    found = _RATING_PATTERN.search(label)
    return int(found.group(1)) if found else None


df['rating_value'] = df['rating'].apply(_parse_rating)

In [12]:
# (rows, columns) of `df`.
# NOTE(review): the output recorded below was produced at execution count 12,
# i.e. after the later cleaning/filtering cells had already run, so it does not
# describe the frame as it stands at this point of a top-to-bottom run --
# confirm with Restart Kernel -> Run All.
df.shape

(1011, 4)

# 2. Cleaning Text

## Removing HTML tags, non-alphanumeric characters and extra whitespace

In [6]:
# define a function to clean the text
def clean_text(text):
    """Strip markup, punctuation and superfluous whitespace from a review.

    Steps, in order:
      1. remove HTML tags such as ``<br>`` or ``<b>`` (the text between an
         opening and a closing tag is kept);
      2. remove every character that is neither a word character (letters,
         digits, underscore) nor whitespace;
      3. collapse each run of whitespace into a single space and trim both ends.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    # remove HTML tags
    text = re.sub(r'<[^<]+?>', '', text)

    # remove non-alphanumeric characters (raw string: '\w' and '\s' are not
    # valid escapes in a plain string literal and trigger a warning)
    text = re.sub(r'[^\w\s]', '', text)

    # Removing punctuation/symbols leaves doubled spaces behind
    # ("3.5 *** I" -> "35  I"), so collapse runs instead of only trimming ends.
    return re.sub(r'\s+', ' ', text).strip()

def filter_english_reviews(df, text_column='reviews', language='en', detector=None):
    """Return the rows of `df` whose text is detected as `language`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to classify.
    text_column : str, default 'reviews'
        Column whose text is passed to the detector.
    language : str, default 'en'
        Language code a row must be detected as to be kept.
    detector : callable, optional
        Function mapping a text to a language code. Defaults to the
        module-level ``detect`` imported from ``langdetect``.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows. Index labels, column order and dtypes of
        `df` are preserved, also when no row matches.

    Notes
    -----
    Rows for which the detector raises (e.g. text with nothing to detect) are
    dropped, as before. Only ``Exception`` is caught, so Ctrl-C / kernel
    interrupt is no longer swallowed by the loop.
    NOTE(review): langdetect documents its results as non-deterministic unless
    ``DetectorFactory.seed`` is set -- consider seeding it once in the setup cell.
    """
    if detector is None:
        detector = detect

    def _is_target_language(text):
        try:
            return detector(text) == language
        except Exception:
            # Best effort: an undetectable text is treated as "not a match".
            return False

    # astype(bool) keeps the mask boolean even for an empty frame.
    keep = df[text_column].map(_is_target_language).astype(bool)
    return df.loc[keep].copy()


In [7]:

# run every review through clean_text and store the result in a new column
cleaned_column = df['reviews'].map(clean_text)
df['clean_reviews'] = cleaned_column

In [8]:
# Display the cleaned column to eyeball the result of clean_text
# (pandas abbreviates long Series, so only the first and last rows are shown).
df['clean_reviews']

0       This is the most difficult review Ive ever wri...
1       I really enjoyed this on audio Over 15 hours i...
2       35  I listened to Prince Harry reading this on...
3       My curiosity got the better of me and I read P...
4       So I have always liked Harry This memoir made ...
                              ...                        
1110    Rating 25 Ill admit it The hype got the best o...
1111    If you go into Spare baring negative preconcei...
1112    okay so i dont really give a shit about the ro...
1113    35 stjärnor Är man inte intresserad av kungafa...
1114    The first 60 of this book was a slog to get th...
Name: clean_reviews, Length: 1115, dtype: object

In [9]:


# filter out non-English reviews
# Rebinds `df` to the frame returned by filter_english_reviews: only rows whose
# `reviews` text is detected as 'en' are kept, and rows where detection raises
# are dropped. Original index labels are kept, so the index has gaps afterwards.
df = filter_english_reviews(df)

In [10]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the wrapped plain-text dump that print() produces.
df

                 rating                                            reviews  \
0     Rating 1 out of 5  This is the most difficult review I’ve ever wr...   
1     Rating 5 out of 5  I really enjoyed this on audio!! Over 15 hours...   
2     Rating 3 out of 5  3.5 ⭐⭐⭐ I listened to Prince Harry reading thi...   
3     Rating 3 out of 5  My curiosity got the better of me and I read P...   
4     Rating 3 out of 5  So, I have always “liked” Harry. This memoir m...   
...                 ...                                                ...   
1109  Rating 3 out of 5  I have many conflicting feelings about this bo...   
1110  Rating 2 out of 5  Rating: 2.5 I’ll admit it. The hype got the be...   
1111  Rating 4 out of 5  If you go into “Spare” baring negative preconc...   
1112  Rating 2 out of 5  okay so i don't really give a shit about the r...   
1114  Rating 2 out of 5  The first 60% of this book was a slog to get t...   

      rating_value                                      clean_r

In [11]:
# Persist the filtered frame; index=False leaves the row index out of the file.
cleaned_reviews_path = 'cleaned_reviews.csv'
df.to_csv(cleaned_reviews_path, index=False)

In [13]:
# Preview the first rows of `df` (head() shows five rows by default).
df.head()

Unnamed: 0,rating,reviews,rating_value,clean_reviews
0,Rating 1 out of 5,This is the most difficult review I’ve ever wr...,1,This is the most difficult review Ive ever wri...
1,Rating 5 out of 5,I really enjoyed this on audio!! Over 15 hours...,5,I really enjoyed this on audio Over 15 hours i...
2,Rating 3 out of 5,3.5 ⭐⭐⭐ I listened to Prince Harry reading thi...,3,35 I listened to Prince Harry reading this on...
3,Rating 3 out of 5,My curiosity got the better of me and I read P...,3,My curiosity got the better of me and I read P...
4,Rating 3 out of 5,"So, I have always “liked” Harry. This memoir m...",3,So I have always liked Harry This memoir made ...
