# phase 1 - data preparation

In [1]:
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Fetch every NLTK resource this notebook relies on. Besides the stop-word list
# and WordNet, `word_tokenize` needs the Punkt tokenizer data; newer NLTK
# releases look it up as "punkt_tab", older ones as "punkt", so both are
# requested. Packages that are already installed are reported as up-to-date.
for resource in ("stopwords", "wordnet", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## original data exploration

In [None]:
# Open the review dump as a lazy reader yielding 100 000 records per chunk and
# inspect the first chunk: sample rows, column names and the per-column summary.
# NOTE: pulling a chunk advances this reader object; any later loop over the
# same `chunks` object continues from the second chunk.

chunks = pd.read_json("yelp_academic_dataset_review.json", lines=True, chunksize=100000)

for chunk in chunks:
    print("HEAD")
    print(chunk.head())
    print("\nCOLUMNS")
    print(chunk.columns)
    print("\nINFO")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    chunk.info()
    break  # only the first chunk is inspected

HEAD
                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      5       1      0     1   
2      3       0      0     0   
3      5       1      0     1   
4      4       1      0     1   

                                                text                date  
0  If you decide to eat here, just be aware it is... 2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year... 2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delic

## data cleaning

In [None]:
# drop irrelevant columns, clean the review text and save the result to CSV

# Loop-invariant helpers for clean_text(). They are built once here instead of
# on every call, because clean_text() is applied to each review individually.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")
# English stop words with "not" deliberately kept (presumably so that negations
# survive the cleaning -- confirm with the modelling phase).
_STOP_WORDS = frozenset(stopwords.words("english")) - {"not"}
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text: str) -> str:
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Remove every character that is not an ASCII letter or whitespace.
      2. Tokenize the remaining text with ``word_tokenize``.
      3. Drop tokens whose case-folded form is an English stop word
         (``"not"`` is kept).
      4. Lemmatize each remaining token with ``WordNetLemmatizer``.

    The original casing of the kept tokens is preserved; only the stop-word
    comparison is case-insensitive.

    Args:
        text: Raw review text.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    letters_only = _NON_LETTER_RE.sub("", text)
    kept_tokens = [
        token
        for token in word_tokenize(letters_only)
        if token.casefold() not in _STOP_WORDS
    ]
    return " ".join(_LEMMATIZER.lemmatize(token) for token in kept_tokens)

# Columns that are dropped before export; only `stars` and `text` remain.
irrelevant_columns = ["review_id", "user_id", "business_id", "useful", "funny", "cool", "date"]

# Start from a fresh reader so this cell always cleans the first 100 000
# records, regardless of how many chunks were already pulled from a previously
# created reader (by another cell or by an earlier run of this cell).
chunks = pd.read_json("yelp_academic_dataset_review.json", lines=True, chunksize=100000)

for i, chunk in enumerate(chunks):
    chunk = chunk.drop(columns=irrelevant_columns)
    chunk["text"] = chunk["text"].apply(clean_text)

    # The first chunk truncates the output file and writes the header; any
    # later chunk appends. Always appending would make a re-run of this cell
    # add duplicate rows and a second header line to the CSV.
    write_mode = "w" if i == 0 else "a"
    with open("yelp_reviews_cleaned.csv", mode=write_mode, newline="", encoding="utf-8") as file:
        chunk.to_csv(file, index=False, header=(i == 0))

    print("COLUMNS")
    print(chunk.columns)

    break  # only the first chunk is cleaned and exported

COLUMNS
Index(['stars', 'text'], dtype='object')


## split data into training and testing sets

In [None]:
# Load the cleaned reviews. keep_default_na=False stops pandas from converting
# empty text fields (reviews left without tokens after cleaning) or literal
# strings such as "NA" / "null" into float NaN, so `text` contains only strings.
df = pd.read_csv("yelp_reviews_cleaned.csv", keep_default_na=False)

# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["stars"], test_size=0.2, random_state=42
)

type(train_texts)

75220    fried pickle appetizer lunch fried green tomat...
48955    Went great review yelpit everything expected f...
44966    LOVE NEW PLACE Ate pickled vegetable chicken w...
13568    wife three time time look forward trying somet...
92727    first time visiting Libertine resulted really ...
                               ...                        
6265     hand best florida like barbqyou love place goi...
54886    Damn good steak salad perfect Bloody Marys bac...
76820    lunch afternoon wife Good food great service t...
860      Food cold Waitress said one person working bac...
15795    One best prime rib dip Ive ever hand attached ...
Name: text, Length: 80000, dtype: object