# phase 1 - data preparation

In [None]:
import functools
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split


# Fetch every NLTK resource used later in this notebook:
#   - "stopwords": the English stop-word list filtered out in clean_text
#   - "wordnet":   the dictionary behind WordNetLemmatizer
#   - "omw-1.4":   some NLTK releases need it to load WordNet (harmless otherwise)
#   - "punkt" / "punkt_tab": tokenizer models required by word_tokenize
#     (newer NLTK releases look for "punkt_tab", older ones for "punkt")
for resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(resource)

## original data exploration

In [None]:
# select first 100 000 records
#
# The full review dump is read lazily in chunks of 100 000 lines; only the
# first chunk is written to the short file and the rest is never parsed.
# The reader is used as a context manager so the handle on the source file
# is closed as soon as that first chunk has been saved.

with pd.read_json(
    "C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/yelp_academic_dataset_review.json",
    lines=True,
    chunksize=100000,
) as chunks:
    for chunk in chunks:
        chunk.to_json("../data/yelp_academic_dataset_review_short.json", orient="records", lines=True)
        break

In [None]:
# check first records, columns and missing information

# Load the shortened review file (one JSON record per line).
data = pd.read_json("../data/yelp_academic_dataset_review_short.json", lines=True)

print("HEAD")
print(data.head())
print("\nCOLUMNS")
print(data.columns)
print("\nINFO")
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()


## data cleaning

In [None]:
# define the text-cleaning helper, drop irrelevant columns, clean the review text and save the result

# Anything that is not an ASCII letter or whitespace is stripped from the text.
_NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z\s]")


@functools.lru_cache(maxsize=None)
def _stop_words():
    """Return the English stop-word set without "not", built once and cached."""
    # "not" is deliberately kept in the text (presumably because negation
    # matters for the sentiment labels -- confirm with the modelling phase).
    return frozenset(stopwords.words("english")) - {"not"}


@functools.lru_cache(maxsize=None)
def _lemmatizer():
    """Return a single shared WordNetLemmatizer instance."""
    return WordNetLemmatizer()


def clean_text(text: str) -> str:
    """Normalize a raw review into a lowercase, lemmatized token string.

    Steps: remove every character that is not an ASCII letter or whitespace,
    tokenize, lowercase, drop English stop words (except "not"), lemmatize
    each remaining token and join the tokens with single spaces.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    letters_only = _NON_LETTER_PATTERN.sub("", text)

    # Lowercase *before* lemmatizing: WordNet lemmas are looked up in
    # lowercase, so a capitalized token such as "Dogs" would otherwise pass
    # through the lemmatizer unchanged and end up as "dogs" instead of "dog".
    tokens = [token.lower() for token in word_tokenize(letters_only)]

    # The stop-word set and the lemmatizer are cached helpers, so they are
    # not rebuilt for every review this function is applied to.
    stop_words = _stop_words()
    lemmatize = _lemmatizer().lemmatize

    return " ".join(lemmatize(token) for token in tokens if token not in stop_words)

# drop irrelevant columns (only those actually present in the frame)
irrelevant_columns = ["review_id", "user_id", "business_id", "useful", "funny", "cool", "date"]
present_columns = [col for col in irrelevant_columns if col in data.columns]
for col in present_columns:
    print(f"Dropping: {col}")
data = data.drop(columns=present_columns)

# apply cleaning function to data
data["text"] = data["text"].apply(clean_text)

# Reviews consisting only of stop words / non-letter characters clean down to
# an empty string. An empty CSV field is read back by pd.read_csv as NaN, so
# such rows are removed here instead of surfacing later as missing texts.
empty_text = data["text"].str.strip() == ""
if empty_text.any():
    print(f"Dropping {int(empty_text.sum())} reviews with empty text after cleaning")
    data = data.loc[~empty_text]

# save cleaned data to a file
# The file is overwritten, not appended to: appending on a re-run would add a
# second header line and duplicate rows to the existing CSV.
data.to_csv("../data/yelp_reviews_cleaned.csv", index=False, encoding="utf-8")

# view cleaned data columns
print("COLUMNS")
print(data.columns)

## split data into training and testing sets

In [None]:
df = pd.read_csv("../data/yelp_reviews_cleaned.csv")

# Empty text fields come back from the CSV as NaN; rows without a text or a
# label cannot be used for training or evaluation, so they are dropped first.
df = df.dropna(subset=["text", "stars"])

# Produce both train/test splits (80/20 and 70/30) with the same seed and
# write each of them into its own directory. After the loop the variables
# train_texts / test_texts / train_labels / test_labels hold the 70/30 split.
for split_dir, test_size in (("../data/80_20", 0.2), ("../data/70_30", 0.3)):
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["text"], df["stars"], test_size=test_size, random_state=42
    )

    # Create the target directory on first use so to_csv does not fail.
    output_dir = Path(split_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_texts.to_csv(output_dir / "train_texts.csv")
    train_labels.to_csv(output_dir / "train_labels.csv")
    test_texts.to_csv(output_dir / "test_texts.csv")
    test_labels.to_csv(output_dir / "test_labels.csv")