#cleaning new dataset

In [1]:
import pandas as pd
import re
import nltk

In [2]:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
df = pd.read_csv("data/real data collected/New folder/Reviews.csv")

In [5]:

df.head()

Unnamed: 0,overall,review text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [6]:
df = df.drop_duplicates(subset=["review text"])

In [7]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 393578 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   overall      393578 non-null  int64 
 1   review text  393578 non-null  object
dtypes: int64(1), object(1)
memory usage: 9.0+ MB


In [8]:
print(df['overall'].value_counts())

overall
5    250715
4     56042
1     36275
3     29754
2     20792
Name: count, dtype: int64


In [9]:
print(df.isnull().sum())

overall        0
review text    0
dtype: int64


In [10]:
def clean_text(text):
    text = str(text).lower()                                # lowercase
    text = re.sub(r"http\S+|www\S+", "", text)              # remove links
    text = re.sub(r"<.*?>", "", text)                       # remove HTML
    text = re.sub(r"[^a-z\s]", " ", text)                   # keep only letters
    text = re.sub(r"\s+", " ", text).strip()                # remove extra spaces
    return text


In [11]:
def process_text(text):
    words = text.split()
    words = [w for w in words if w not in stop_words]       # remove stopwords
    words = [lemmatizer.lemmatize(w) for w in words]        # lemmatize
    return " ".join(words)

In [12]:
df["cleaned_reviews"] = df["review text"].apply(clean_text).apply(process_text)

In [14]:
df.head()

Unnamed: 0,overall,review text,cleaned_reviews
0,5,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut pe...
2,4,This is a confection that has been around a fe...,confection around century light pillowy citrus...
3,2,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,5,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 393578 entries, 0 to 568453
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   overall          393578 non-null  int64 
 1   review text      393578 non-null  object
 2   cleaned_reviews  393578 non-null  object
dtypes: int64(1), object(2)
memory usage: 12.0+ MB


In [16]:
df["review_length"] = df["cleaned_reviews"].str.split().apply(len)

In [17]:
df = df.reset_index(drop=True)

In [18]:
df = df[(df['review_length']>=3)&(df['review_length']<=200)]

In [19]:

df.head()

Unnamed: 0,overall,review text,cleaned_reviews,review_length
0,5,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,23
1,1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut pe...,18
2,4,This is a confection that has been around a fe...,confection around century light pillowy citrus...,40
3,2,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,18
4,5,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,13


In [20]:
df.tail()

Unnamed: 0,overall,review text,cleaned_reviews,review_length
393573,5,Great for sesame chicken..this is a good if no...,great sesame chicken good better resturants ea...,12
393574,2,I'm disappointed with the flavor. The chocolat...,disappointed flavor chocolate note especially ...,22
393575,5,"These stars are small, so you can give 10-15 o...",star small give one training session tried tra...,37
393576,5,These are the BEST treats for training and rew...,best treat training rewarding dog good groomin...,18
393577,5,"I am very satisfied ,product is as advertised,...",satisfied product advertised use cereal raw vi...,9


In [22]:
df = df.drop(columns=['review text'])

In [23]:
df.tail()

Unnamed: 0,overall,cleaned_reviews,review_length
393573,5,great sesame chicken good better resturants ea...,12
393574,2,disappointed flavor chocolate note especially ...,22
393575,5,star small give one training session tried tra...,37
393576,5,best treat training rewarding dog good groomin...,18
393577,5,satisfied product advertised use cereal raw vi...,9


In [24]:
max_count = 2000  

# Downsample df2 to at most 2000 per rating
df_balanced = (
    df.groupby("overall", group_keys=False)
       .apply(lambda x: x.sample(n=min(len(x), max_count), random_state=42))
       .reset_index(drop=True)
)

print(df_balanced['overall'].value_counts())

overall
1    2000
2    2000
3    2000
4    2000
5    2000
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min(len(x), max_count), random_state=42))


In [25]:
df_balanced.to_csv("data/cleaned_dataset/last_data.csv", index=False)


In [26]:
df_balanced.head()

Unnamed: 0,overall,cleaned_reviews,review_length
0,1,taste much like cherry flavored cough syrup dr...,16
1,1,note purchased box july may amazon must used d...,171
2,1,first little disappointed got one pouch try ta...,41
3,1,needed filler item figured get food sure use w...,46
4,1,highly recommended hei customer like fragmenta...,14


In [27]:
df_balanced.tail()

Unnamed: 0,overall,cleaned_reviews,review_length
9995,5,using nupro year pleased result nupro balanced...,34
9996,5,good quality dog eat many dry dog food turn nose,10
9997,5,fed trio min american eskimo french bulldog bl...,43
9998,5,complaint purchase delicious nut shell half wa...,24
9999,5,store manager always interested customer would...,71
