In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from imblearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Source CSV for the labelled reviews; named once so both read attempts use the same address.
labelled_reviews_url = 'https://raw.githubusercontent.com/derrickyau9/IMDB-Review-Emotion_Rating-Predictor/main/CSVs/Merged_Reviews_Final.csv'

try:
    # First choice: decode the file as UTF-8.
    df = pd.read_csv(labelled_reviews_url, encoding='utf-8')
except UnicodeDecodeError:
    # The bytes were not valid UTF-8, so decode them as ISO-8859-1 instead.
    df = pd.read_csv(labelled_reviews_url, encoding='ISO-8859-1')



# Stopword set and lemmatizer are built once and shared by every call instead of
# being re-created for each review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip HTML tags, drop every character that is
    not an ASCII letter or whitespace, split on whitespace, remove English
    stopwords, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, the float ``NaN``
        pandas uses for a missing cell, numbers) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Non-string cells have no .lower(); treat them as an empty review
    # instead of raising AttributeError halfway through a Series.apply.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Replace HTML tags with a space rather than nothing, so words separated
    # only by markup ("great<br />movie") do not fuse into a single token.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenize on whitespace, drop stopwords, lemmatize what remains.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    # Joining the tokens back into text
    return ' '.join(tokens)

In [None]:
# Run preprocess_text over each review and store the cleaned text back in the same column
cleaned_reviews = df['Review'].map(preprocess_text)
df['Review'] = cleaned_reviews

# TF-IDF

In [None]:
# Inputs are the review strings; every other column of df is used as a target label.
X = df['Review']
y = df.drop(columns=['Review'])

# Hold out 25% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Fit the TF-IDF vocabulary on the training reviews only, then reuse it for the test reviews.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Network: two 64-unit ReLU hidden layers, each followed by 30% dropout, then
# one sigmoid output unit per column of y_train.
n_features = X_train_tfidf.shape[1]
n_labels = y_train.shape[1]
model = Sequential([
    Dense(64, input_dim=n_features, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(n_labels, activation='sigmoid'),  # Output layer
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The network is trained on a dense array, so expand the TF-IDF matrix first.
X_train_tfidf_dense = X_train_tfidf.toarray()

# 20 epochs in mini-batches of 32, with 20% of the training rows held out for validation.
model.fit(
    X_train_tfidf_dense,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7d612b9f1720>

In [None]:
# Source CSV for the full review set; named once so both read attempts use the same address.
full_data_url = 'https://raw.githubusercontent.com/derrickyau9/IMDB-Review-Emotion_Rating-Predictor/main/CSVs/50k_Full_Data.csv'

try:
    # First choice: decode the file as UTF-8.
    full_df = pd.read_csv(full_data_url, encoding='utf-8')
except UnicodeDecodeError:
    # The bytes were not valid UTF-8, so decode them as ISO-8859-1 instead.
    full_df = pd.read_csv(full_data_url, encoding='ISO-8859-1')

# Clean the reviews with the same preprocess_text used for the training data.
full_df['review'] = full_df['review'].map(preprocess_text)

# The cleaned review text is the only model input.
X_full = full_df['review']

# Reuse the already-fitted vectorizer (transform only, no refit), then expand
# the result to a dense array before handing it to the model.
X_full_tfidf = vectorizer.transform(X_full)
X_full_tfidf_dense = X_full_tfidf.toarray()

# One row of per-label scores for each review.
predictions = model.predict(X_full_tfidf_dense)



In [None]:
# A label is marked 1 when its predicted score is strictly above the cut-off, otherwise 0.
threshold = 0.25
binary_predictions = np.greater(predictions, threshold).astype(int)

In [None]:
import warnings

# Names assigned, in order, to the columns of binary_predictions.
label_columns = [
    'Disgust', 'Disappointment', 'Sadness', 'Confusion', 'Anger', 'Fear',
    'Indifference', 'Surprise', 'Interest', 'Happiness', 'Reflective',
]

# The model has one output per y_train column, in y_train's column order.
# If the hard-coded list disagrees with that order, every prediction would be
# attached to the wrong name without any error, so flag it loudly.
training_label_columns = list(y_train.columns)
if training_label_columns != label_columns:
    warnings.warn(
        "label_columns does not match the training label columns "
        f"{training_label_columns}; predictions may be attached to the wrong names."
    )

# Build the prediction frame on full_df's own index: concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would misalign rows (and introduce
# NaNs) whenever full_df does not carry the default RangeIndex.
predictions_df = pd.DataFrame(
    binary_predictions,
    columns=label_columns,
    index=full_df.index,
)

# Concatenate the predictions with the full DataFrame
full_df_with_predictions = pd.concat([full_df, predictions_df], axis=1)

In [None]:
# Write the reviews plus their predicted labels to CSV, omitting the index column.
output_csv_path = '/content/50k_full_data_with_predictions.csv'
full_df_with_predictions.to_csv(output_csv_path, index=False)