# phase 1

data exploration and cleaning

## data exploration

In [13]:
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...


True

In [11]:
chunks = pd.read_json("yelp_academic_dataset_review.json", lines=True, chunksize=100000)

for chunk in chunks:
    print("HEAD")
    print(chunk.head())
    print("\nCOLUMNS")
    print(chunk.columns)
    break

HEAD
                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      5       1      0     1   
2      3       0      0     0   
3      5       1      0     1   
4      4       1      0     1   

                                                text                date  
0  If you decide to eat here, just be aware it is... 2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year... 2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delic

## data cleaning

In [14]:
# drop irrelevant columns, process text and check for missing values

def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text) # remove special characters
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    stop_words.discard('not')
    filtered_tokens = [token for token in tokens if token.casefold() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return " ".join(stemmed_tokens)

irrelevant_columns = ["review_id", "user_id", "business_id", "useful", "funny", "cool", "date"]

for i, chunk in enumerate(chunks):
    chunk = chunk.drop(columns=irrelevant_columns)
    chunk["text"] = chunk["text"].apply(clean_text)
    with open("yelp_reviews_cleaned.csv", mode="a", newline="", encoding="utf-8") as file:
        chunk.to_csv(file, index=False, header=(i==0))

    print("COLUMNS")
    print(chunk.columns)
    print("\nINFO")
    print(chunk.info())

    break

COLUMNS
Index(['stars', 'text'], dtype='object')

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 200000 to 299999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   stars   100000 non-null  int64 
 1   text    100000 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
None
