*This project includes a large data file (`data/yelp-reviews.csv`) that is tracked with Git LFS.*

Follow these steps to set up Git LFS and push the file to GitHub:
- git lfs install
- git lfs track 'data/yelp-reviews.csv'
- git add .gitattributes
- git add data/yelp-reviews.csv
- git commit -m 'Track large file with Git LFS'
- git push origin main

In [1]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

KeyboardInterrupt: 

### **Load Data**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load data
def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame and return it."""
    return pd.read_csv(file_path)

# Relative path to the raw reviews CSV
file_path = 'data/yelp-reviews.csv'
# Read the file into the notebook-wide `df` used by the following cells
df = load_data(file_path)
# Preview the first rows as the cell output
df.head()

### **EDA**

In [None]:
# EDA
def perform_eda(df):
    """Print and return the number of missing values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Series
        Count of missing (NaN/None) entries per column, indexed by column name.
    """
    missing_values = df.isna().sum()

    print(f'Number of missing values: {missing_values}')

    # Hand the counts back so callers can assert on or reuse them instead of
    # only seeing the printed text.
    return missing_values

# Print the per-column missing-value counts
perform_eda(df)
# Print the column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics
# .T transposes the describe() table so each described column becomes a row
statistics = df.describe().T
print('Summary Statistics')
# Bare last expression so the notebook renders the table
statistics

### **Data Visualization**

In [None]:
# Dark mode
# NOTE: plt.style.use changes matplotlib's global style, so it also affects every later figure
plt.style.use('dark_background')

plt.figure(figsize=(10, 6))
# Count reviews per star value; sort_index orders the bars by rating instead of by frequency
df['stars'].value_counts().sort_index().plot(kind='bar', color='steelblue')

plt.title('Distribution of Star Ratings for Sandbar', fontsize=16)
plt.xlabel('Rating')
plt.ylabel('Count')
# Keep the tick labels horizontal
plt.xticks(rotation=0)
plt.show()

### **Data Preprocessing**

In [None]:
import re

# Clean text data
def clean_data(df, text_column, rating_column='stars'):
    """Return a cleaned copy of ``df`` ready for text processing.

    Rows missing either the text or the rating are dropped, the text is
    lower-cased, and every character that is not an ASCII letter or
    whitespace (digits and punctuation included) is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews. The caller's frame is left unmodified.
    text_column : str
        Name of the column holding the review text.
    rating_column : str, default 'stars'
        Name of the rating column that must also be non-missing.

    Returns
    -------
    pandas.DataFrame
        New frame without the incomplete rows and with ``text_column`` normalised.
    """
    # Explicit copy: assigning columns on the object returned by dropna() can
    # trigger SettingWithCopyWarning, and the copy guarantees the input is untouched.
    cleaned = df.dropna(subset=[text_column, rating_column]).copy()

    # astype(str) keeps non-string cells (e.g. numbers) from breaking the string operations
    lowered = cleaned[text_column].astype(str).str.lower()

    # Strip everything except letters and whitespace
    cleaned[text_column] = lowered.str.replace(r'[^A-Za-z\s]', '', regex=True)

    return cleaned

# Replace the notebook-wide frame with its cleaned version.
# NOTE: this rebinds `df`, so the raw data is only available again after re-running the load cell.
df = clean_data(df, text_column='text')
df.head()

In [None]:
# Check for NaN/missing values in the text column
# Sanity check: clean_data drops rows with missing text, so this is expected to print 0
missing_text = df['text'].isna().sum()
print(f'Number of missing values in text column: {missing_text}')

### **Feature Engineering** (Tokenization and Lemmatization)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# nltk.download('punkt_tab') if not downloaded

# English stopwords, held in a set for membership tests
stop_words = set(stopwords.words('english'))

# Text => space-separated tokens with stopwords removed
def preprocess_text_to_tokens(text):
    """Tokenize ``text`` and return its non-stopword tokens joined by spaces.

    The text is lower-cased before tokenization, and any token found in the
    module-level ``stop_words`` set is discarded.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Add a `tokens` column: each review's text with stopwords removed, stored as one space-joined string
df['tokens'] = df['text'].apply(preprocess_text_to_tokens)

# Check transformations
df[['text', 'tokens']].head() # show the original text next to its tokens

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lemmatizer instance and English stopword set used by the function below
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Text => space-separated lemmas with stopwords removed
def preprocess_text_to_lemmas(text):
    """Tokenize ``text``, drop stopwords, lemmatize the rest and join with spaces.

    The text is lower-cased before tokenization. Stopword filtering is applied
    to the raw tokens, i.e. before they are passed to ``lemmatizer.lemmatize``.
    """
    kept_words = (
        word for word in word_tokenize(text.lower()) if word not in stop_words
    )

    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

# Add a `lemmas` column: each review's text with stopwords removed and the remaining words lemmatized
df['lemmas'] = df['text'].apply(preprocess_text_to_lemmas)

# Check transformations
df[['text', 'lemmas']].head() # show the original text next to its lemmas

In [None]:
# Display the transformed dataframe, now including the `tokens` and `lemmas` columns
df.head()

### **Time Series Preprocessing**
- Create a resampled dataset for time series analysis

In [11]:
# import pandas as pd
# import matplotlib.pyplot as plt

# # Convert review dates to datetime objects
# def convert_to_datetime(df, date_column):
#     df[date_column] = pd.to_datetime(df[date_column], errors='coerce') # Convert to datetime, handle errors
    
#     return df

# # Set the date as the index
# def set_date_as_index(df, date_column):
#     df.set_index(date_column, inplace=True)
    
#     return df

# # Resample the data by a specific time interval
# def resample_data(df, interval='ME'): # 'M' stands for months, 'D' stands for daily, 'W' for weekly
#     df_resampled = df.resample(interval).mean()
    
#     return df_resampled

In [None]:
import pandas as pd

# Parse a date column into datetime values
def convert_to_datetime(df, date_column):
    """Convert ``df[date_column]`` to datetimes in place and return ``df``.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``). The
    column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    parsed_dates = pd.to_datetime(df[date_column], errors='coerce')
    df[date_column] = parsed_dates

    return df

# Convert date to datetime (unparseable values become NaT)
df = convert_to_datetime(df, date_column='date')

# Set date as index
# NOTE: inplace=True moves 'date' out of the columns, so re-running this cell without
# rebuilding `df` first raises a KeyError on the 'date' lookup above.
df.set_index('date', inplace=True)

# Average the numeric columns over fixed time intervals
def resample_numeric_data(df, interval='M'):
    """Return the per-interval mean of the numeric columns of ``df``.

    Non-numeric columns are dropped first; ``interval`` is passed unchanged to
    ``DataFrame.resample``.

    NOTE(review): the 'M' alias appears to be deprecated in recent pandas
    releases in favour of 'ME' — confirm against the pandas version in use.
    """
    return df.select_dtypes(include='number').resample(interval).mean()

# Resample numeric data only: per-interval mean of the numeric columns, indexed by date
df_resampled = resample_numeric_data(df, interval='M')
print(df_resampled.head())

In [None]:
# Save the resampled dataframe
output_path = 'data/reviews_processed.csv'
# The index is written out as the first CSV column (to_csv default)
df_resampled.to_csv(output_path)
print(f'Processed dataframed saved to {output_path}')

In [None]:
# Load the resampled dataframe back from disk.
# index_col=0 + parse_dates=True restore the datetime index that to_csv wrote as the
# first column; without them the frame gets a 0..N index and the dates become a plain
# string column, so time plots and idxmax()/idxmin() stop referring to dates.
# Default NaN parsing is kept on purpose: intervals with no data are stored as empty
# cells, and keep_default_na=False would load them as '' strings (turning the column
# into text and making the isna() check below always report 0).
df_resampled = pd.read_csv('data/reviews_processed.csv', index_col=0, parse_dates=True)

# Check for missing values
missing_values = df_resampled.isna().sum()
print(f'Number of missing values: {missing_values}')

### **Time Series Analysis**

In [None]:
import matplotlib.pyplot as plt

# Locate the highest and lowest points so they can be annotated on the plot
max_value = df_resampled['stars'].max()
min_value = df_resampled['stars'].min()
# idxmax/idxmin return the index labels of those rows; they are used as x-coordinates below
max_date = df_resampled['stars'].idxmax()
min_date = df_resampled['stars'].idxmin()

plt.figure(figsize=(15, 6))
plt.plot(df_resampled.index, df_resampled['stars'], color='skyblue')
plt.title('Average Star Ratings Over Time', fontsize=16)
plt.xlabel('Time')
plt.ylabel('Average Star Rating')

# Label the maximum slightly above the point, with an arrow pointing at it
plt.annotate(f'Max {max_value:.2f}',
             xy=(max_date, max_value),
             xytext=(max_date, max_value + 0.05),
             arrowprops=dict(facecolor='green', shrink=0.05))

# Label the minimum slightly below the point (`+ - 0.05` evaluates to `- 0.05`)
plt.annotate(f'Min {min_value:.2f}',
             xy=(min_date, min_value),
             xytext=(min_date, min_value + - 0.05),
             arrowprops=dict(facecolor='red', shrink=0.05))

plt.show()

In [None]:
# Calculate the 3-period moving average of the resampled ratings
# (the first two rows are NaN because the window is not yet full).
# NOTE: this adds a '3-month-MA' column to df_resampled in place.
df_resampled['3-month-MA'] = df_resampled['stars'].rolling(window=3).mean()

# Overlay the raw series and its moving average on one figure
plt.figure(figsize=(15, 6))
plt.plot(df_resampled.index, df_resampled['stars'], label='Monthly Average', color='skyblue')
plt.plot(df_resampled.index, df_resampled['3-month-MA'], label='3-Month Moving Average', color='violet')
plt.title('Average Star Ratings Over Time with 3 Month Moving Average', fontsize=16)
plt.xlabel('Time')
plt.ylabel('Average Star Rating')
plt.legend(loc='best')
plt.show()

### **Deep Learning**
- Data Preparation

In [17]:
# import numpy as np
# from sklearn.model_selection import train_test_split

# # Create sequences from the time series data
# def create_sequences(data, sequence_length):
    
#     sequences = []
#     targets = []
    
#     # Loop
#     for start_index in range(len(data) - sequence_length):
#         end_index = start_index + sequence_length
#         sequence = data[start_index:end_index]
#         target = data[end_index]
        
#         sequences.append(sequence)
#         targets.append(target)
    
#     return np.array(sequences), np.array(targets)

# # Sequence length 3 for quarterly, 6 is semi-annual, 12 is for annual
# sequence_length = 3 # for 3 months or quarterly
# X, y = create_sequences(df_resampled['stars'].values, sequence_length)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

#### LSTM Model (Long Short Term Memory) Deep Learning Model

Common errors:
- `ModuleNotFoundError: No module named 'tensorflow'`
  - Fix: install the package with `pip install tensorflow`

In [18]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout

# def build_lstm_model(input_shape, units_first_layer, units_second_layer, dropout_first, dropout_second):
    
#     # Initialize model
#     model = Sequential()
    
#     # Add first layer with dropout
#     model.add(LSTM(units=units_first_layer, return_sequences=True, input_shape=input_shape))
#     model.add(Dropout(dropout_first))
    
#     # Add second layer with dropout
#     model.add(LSTM(units=units_second_layer, return_sequences=False))
#     model.add(Dropout(dropout_second))
    
#     # Add output layer
#     model.add(Dense(units=1))
    
#     return model

# # Compile the model
# def compile_model(model, optimizer, loss):
#     model.compile(optimizer=optimizer, loss=loss)
    
#     return model

In [19]:
# # Define input shape
# input_shape = (X_train.shape[1], 1)

# # Build the model
# model = build_lstm_model(input_shape, units_first_layer=100, units_second_layer=50, dropout_first=0.5, dropout_second=0.2)

# # Compile the model
# model = compile_model(model, optimizer='adam', loss='mean_squared_error')

In [20]:
# from tqdm.keras import TqdmCallback

# # Train the model
# history = model.fit(
#     X_train,
#     y_train,
#     epochs=100,
#     batch_size=32,
#     validation_split=0.2,
#     callbacks=[TqdmCallback(verbose=1)]
# )

### **Model Evaluation**

In [21]:
# import matplotlib.pyplot as plt
# from sklearn.metrics import mean_absolute_error, mean_squared_error

# def evaluate_model(model, X_test, y_test):
    
#     test_loss = model.evaluate(X_test, y_test)
#     print(f'Test Loss: {test_loss}')
    
#     return test_loss

# def calculate_metrics(y_test, predictions):
#     mse = mean_squared_error(y_test, predictions)
#     mae = mean_absolute_error(y_test, predictions)
    
#     print(f'Mean Squared Error: {mse}')
#     print(f'Mean Absolute Error: {mae}')
    
#     return mse, mae

# def predict_and_evaluate(model, X_test, y_test):
    
#     # Evaluate model
#     test_loss = evaluate_model(model, X_test, y_test)
#     predictions = model.predict(X_test)
    
#     # Calculate metrics
#     mse, mae = calculate_metrics(y_test, predictions)
    
#     return predictions, mse, mae

# predictions, mse, mae = predict_and_evaluate(model, X_test, y_test)

# import matplotlib.pyplot as plt
# from sklearn.metrics import mean_absolute_error, mean_squared_error

# def evaluate_model(model, X_test, y_test):
#     test_loss = model.evaluate(X_test, y_test)
#     print(f'Test Loss: {test_loss}')
    
#     return test_loss

# def calculate_metrics(y_test, predictions):
#     mse = mean_squared_error(y_test, predictions)
#     mae = mean_absolute_error(y_test, predictions)
    
#     print(f'Mean Squared Error: {mse}')
#     print(f'Mean Absolute Error: {mae}')
    
#     return mse, mae

# def predict_and_evaluate(model, X_test, y_test):
    
#     # Evaluate model
#     test_loss = evaluate_model(model, X_test, y_test)
#     predictions = model.predict(X_test)
    
#     # Calculate metrics
#     mse, mae = calculate_metrics(y_test, predictions)
    
#     return predictions, mse, mae

# predictions, mse, mae = predict_and_evaluate(model, X_test, y_test)

In [22]:
# import matplotlib.pyplot as plt

# def plot_training_and_predictions(history, y_test, predictions):
    
#     fig, axes = plt.subplots(1, 2, figsize=(18, 6))
    
#     # Training and validation loss
#     axes[0].plot(history.history['loss'], label='Training Loss')
#     axes[0].plot(history.history['val_loss'], label='Validation Loss')
#     axes[0].set_title('Model Loss')
#     axes[0].set_ylabel('Loss')
#     axes[0].set_xlabel('Epoch')
#     axes[0].legend()
    
#     # Prediction VS Actual Values
#     axes[1].plot(y_test, label='Actual Values')
#     axes[1].plot(predictions, label='Predicted Values')
#     axes[1].set_title('Model Predictions VS Actual Values')
#     axes[1].set_xlabel('Time')
#     axes[1].set_ylabel('Star Rating')
#     axes[1].legend()
    
#     plt.tight_layout()
#     plt.show()
    
# print('LSTM Model Performance')
# plot_training_and_predictions(history, y_test, predictions)

In [23]:
# # Save model
# model.save('models/lstm_model.h5')

### **VADER**

In [24]:
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# nltk.download('vader_lexicon') # if not already downloaded

# Shared VADER analyzer instance, created once and reused for every review
analyzer = SentimentIntensityAnalyzer()

# Score a single piece of text with VADER
def apply_vader(text):
    """Return the result of ``analyzer.polarity_scores`` for ``text``."""
    scores = analyzer.polarity_scores(text)
    return scores

# Create vader_scores and vader_compound columns:
# vader_scores holds the full score dict per review, vader_compound its 'compound' entry.
# NOTE(review): `text` has already been lower-cased and stripped of punctuation by clean_data;
# VADER is documented to use capitalization and punctuation as intensity cues — confirm that
# scoring the cleaned text (rather than the raw text) is intended.
df['vader_scores'] = df['text'].apply(apply_vader)
df['vader_compound'] = df['vader_scores'].apply(lambda score_dict: score_dict['compound'])

# Bucket a compound score into a sentiment label
def vader_sentiment_label(compound_score):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral'.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and everything else (including NaN) is 'Neutral'.
    """
    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'
    
# Predicted label per review, derived from the VADER compound score
df['vader_sentiment'] = df['vader_compound'].apply(vader_sentiment_label)

# Derive the reference sentiment label from a star rating
def map_stars_to_sentiment(stars):
    """Map a star rating to 'Positive', 'Negative' or 'Neutral'.

    Ratings of 4 or more are 'Positive', ratings below 3 are 'Negative', and
    anything else (3 up to but not including 4, or NaN) is 'Neutral'.
    """
    if stars >= 4:
        return 'Positive'
    return 'Negative' if stars < 3 else 'Neutral'
    
# Reference label per review, derived from its star rating (used to evaluate the predictions)
df['true_label'] = df['stars'].apply(map_stars_to_sentiment)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Fixed label order for the rows/columns of the confusion matrix
labels = ['Negative', 'Neutral', 'Positive']

# Create confusion matrix: star-derived labels (true) vs. VADER labels (predicted)
cm = confusion_matrix(df['true_label'], df['vader_sentiment'], labels=labels)

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=labels)

disp.plot(cmap='Purples')
# NOTE(review): the title is an empty string, so this figure renders without a title — confirm intent
plt.title('')
plt.show()

### **Naive Bayes**

In [26]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Turn raw text into TF-IDF features
def vectorize_text(text_data):
    """Fit a default ``TfidfVectorizer`` on ``text_data``.

    Returns a ``(features, vectorizer)`` tuple: the matrix produced by
    ``fit_transform`` and the fitted vectorizer itself, so the same
    vocabulary can be reused to transform new text later.
    """
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(text_data)

    return features, tfidf

# Binary target: does the rating reach the threshold?
def prepare_target_variable(stars, threshold=4):
    """Return the result of ``stars >= threshold``.

    Works on whatever supports the ``>=`` comparison; for example, a scalar
    gives a bool and a pandas Series gives an element-wise boolean Series.
    """
    meets_threshold = stars >= threshold
    return meets_threshold

# Fit the Naive Bayes classifier
def train_naive_bayes(X_train, y_train):
    """Fit a default ``MultinomialNB`` on the training data and return it."""
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return MultinomialNB().fit(X_train, y_train)

# Score a fitted model on held-out data
def evaluate_model_nb(model, X_test, y_test):
    """Predict on ``X_test`` and return ``(accuracy, predictions)``.

    ``accuracy`` is ``accuracy_score(y_test, predictions)``; ``predictions``
    is whatever ``model.predict(X_test)`` returned.
    """
    predicted = model.predict(X_test)

    return accuracy_score(y_test, predicted), predicted

In [None]:
from sklearn.model_selection import train_test_split

# Vectorize the lemmatized text; the target is the three-class star-derived label
# NOTE(review): the vectorizer is fitted on the full dataset before the split below, so the
# test rows contribute to the vocabulary/IDF weights — confirm this is acceptable.
X, vectorizer = vectorize_text(text_data=df['lemmas'])
y = df['true_label']

# Split the dataset into train and test (80/20; random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Train the model, then score it on the held-out split
nb_classifier = train_naive_bayes(X_train, y_train)
accuracy, test_pred = evaluate_model_nb(nb_classifier, X_test, y_test)

print(f'Naive Bayes Accuracy with TF-IDF (Multi-Class): {accuracy:.2f}')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Classification report
# NOTE: `labels` is not defined in this cell; it comes from the earlier VADER confusion-matrix cell,
# which must have been run first.
report = classification_report(y_test, test_pred, labels=labels)
print(report)

# Confusion matrix on the held-out split
cm = confusion_matrix(y_test, test_pred, labels=labels)

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=labels)
disp.plot(cmap='Purples')
plt.title('Sentiment Analysis with Naive Bayes and TF-IDF (Multi-Class)')
plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import learning_curve

# Calculate learning curves: 5-fold cross-validation at 10 training-set sizes from 10% to 100%,
# run in parallel (n_jobs=-1)
train_sizes, train_scores, test_scores = learning_curve(nb_classifier, X, y,
                                                        cv=5,
                                                        n_jobs=-1,
                                                        train_sizes=np.linspace(0.1, 1.0, 10))

# Plot the curve; the scores are averaged over the folds (axis=1).
# The 'Testing score' line shows the cross-validation scores returned by learning_curve.
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score', color='blue')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Testing score', color='orange')
plt.title('Naive Bayes Learning Curve')
plt.xlabel('Training Set Size')
plt.ylabel('Score')
plt.legend(loc='best')
plt.show()

In [None]:
import joblib
import os

# Create a directory for models if it doesn't exist already
os.makedirs('models', exist_ok=True)

# Save the model and vectorizer; both are needed later to score new text
joblib.dump(nb_classifier, 'models/naive-bayes_model.pkl')
joblib.dump(vectorizer, 'models/vectorizer.pkl')
print('Model and vectorizer saved in the models directory')

In [None]:
# Load model and vectorizer back from the files written by the save cell
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load trusted files
loaded_nb_classifier = joblib.load('models/naive-bayes_model.pkl')
loaded_vectorizer = joblib.load('models/vectorizer.pkl')
print(f'Loaded Naive Bayes Model: {type(loaded_nb_classifier)} and vectorizer: {type(loaded_vectorizer)}')

In [None]:
# Test with sample input
# NOTE(review): the training text went through clean_data and preprocess_text_to_lemmas, while this
# sample is passed to the vectorizer raw — confirm whether the same preprocessing should be applied.
sample_text = ['This app is awesome BRO!']
sample_vectorized = loaded_vectorizer.transform(sample_text) # TF-IDF features using the fitted vocabulary
prediction = loaded_nb_classifier.predict(sample_vectorized)

# prediction[0] is the predicted sentiment label for the single sample
print(f'Predicted Statement: {prediction[0]}')