*This project includes a large data file (`data/yelp-reviews.csv`) that is tracked with Git LFS.*

Follow these steps to set up Git LFS before pushing to GitHub:
- git lfs install
- git lfs track 'data/yelp-reviews.csv'
- git add .gitattributes
- git add data/yelp-reviews.csv
- git commit -m 'Track large file with Git LFS'
- git push origin main

In [1]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

### **Load Data**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load data
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_path)

# Path to the Yelp reviews CSV, relative to the directory the notebook runs from
file_path = 'data/yelp-reviews.csv'
# Read the file into the notebook-wide `df` that the cells below operate on
df = load_data(file_path)
# Preview the first rows as this cell's output
df.head()

### **EDA**

In [None]:
# EDA
def perform_eda(df):
    """Print how many missing values each column of ``df`` contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The per-column counts are only written to stdout.
    """
    missing_values = df.isna().sum()

    # `missing_values` is a Series whose repr spans several lines; start it on
    # its own line so the first column stays aligned with the rest of the table.
    print(f'Number of missing values:\n{missing_values}')

# Print the number of missing values in each column
perform_eda(df)
# Print pandas' own structural summary of the frame
df.info()

In [None]:
# Summary statistics
# Transpose so each described column becomes a row and each statistic a column
statistics = df.describe().T
print('Summary Statistics')
# Bare last expression: the table is rendered as this cell's output
statistics

### **Data Visualization**

In [None]:
# Dark mode (applies to every figure drawn from this point on)
plt.style.use('dark_background')

# Count reviews per star rating, ordered by rating value rather than by frequency
star_counts = df['stars'].value_counts().sort_index()

# Draw on an explicit Axes; rot=0 keeps the rating labels horizontal
fig, ax = plt.subplots(figsize=(10, 6))
star_counts.plot(kind='bar', color='steelblue', ax=ax, rot=0)

ax.set_title('Distribution of Star Ratings for Sandbar', fontsize=16)
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

### **Data Preprocessing**

In [None]:
import re

# Clean text data
def clean_data(df, text_column):
    """Return a cleaned copy of ``df`` with normalized review text.

    Rows missing either ``text_column`` or ``'stars'`` are dropped. The
    remaining text is lower-cased and every character that is not an ASCII
    letter or whitespace is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``text_column`` and a ``'stars'`` column.
        It is not modified.
    text_column : str
        Name of the column holding the free-text reviews.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels preserved) with the cleaned text.
    """
    # Remove missing values. The explicit copy detaches the result from the
    # caller's frame, so the column assignment below cannot trigger
    # SettingWithCopyWarning or write through to the input.
    cleaned = df.dropna(subset=[text_column, 'stars']).copy()

    cleaned[text_column] = (
        cleaned[text_column]
        .astype(str)   # tolerate non-string cells (e.g. purely numeric reviews)
        .str.lower()   # normalize case
        # Remove punctuation, digits and other special characters
        .str.replace(r'[^A-Za-z\s]', '', regex=True)
    )

    return cleaned

# Rebind `df` to the cleaned frame; every later cell works on the cleaned text
df = clean_data(df, text_column='text')
# Preview the cleaned rows as this cell's output
df.head()

In [None]:
# Check for NaN/missing values in the text column
# (clean_data drops rows with missing text, so this is a sanity check on its output)
missing_text = df['text'].isna().sum()
print(f'Number of missing values in text column: {missing_text}')

### **Feature Engineering** (Tokenization and Lemmatization)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# nltk.download('punkt_tab') if not downloaded

# Build the English stopword set once, at module level, so that
# preprocess_text_to_tokens can do a cheap set-membership test per token.
# Needs the NLTK 'stopwords' corpus (see the commented-out nltk.download calls in the first cell).
stop_words = set(stopwords.words('english'))

# Preprocess text => tokens, remove stopwords and join tokens
def preprocess_text_to_tokens(text):
    """Tokenize ``text``, drop stopwords, and return the rest as one string.

    The text is lower-cased and split with ``word_tokenize``. Tokens found in
    the module-level ``stop_words`` set are discarded, and the remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw or pre-cleaned review text.

    Returns
    -------
    str
        Space-separated tokens with stopwords removed (empty if none remain).
    """
    kept_tokens = []

    # Tokenize the lower-cased text and skip anything that is a stopword
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    # Join the surviving tokens back into a single string
    return ' '.join(kept_tokens)

# Apply the tokenization/stopword-removal step to every review and store the
# resulting space-joined string in a new `tokens` column
df['tokens'] = df['text'].apply(preprocess_text_to_tokens)

# Check transformations
df[['text', 'tokens']].head() # return only the text and tokens columns