In [2]:
import csv
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
train_path = "/content/train.csv"
test_path  = "/content/test.csv"

In [4]:
with open(train_path, 'r') as f:
    for i in range(10):
        print(f.readline())

"2","Stuning even for the non-gamer","This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^"

"2","The best soundtrack ever to anything.","I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."

"2","Amazing!","This soundtrack is my favorite 

In [5]:
train_df = pd.read_csv(train_path, header=None, names=["label","title","review"], quoting=csv.QUOTE_MINIMAL, quotechar='"', on_bad_lines='skip', engine='python')

In [6]:
test_df = pd.read_csv(test_path, header=None,names=["label","title","review"], quoting=csv.QUOTE_MINIMAL, quotechar='"', on_bad_lines='skip', engine='python')

**Data Preprocessing Steps:-**

1. removes duplicate rows and rows with missing values in the "label" or "review" columns from the train and test set.

In [7]:
train_df = train_df.drop_duplicates().dropna(subset=["label","review"])
test_df  = test_df.drop_duplicates().dropna(subset=["label","review"])

2. Accept only labels 1(neg) and 2(pos)

In [10]:
train_df = train_df[train_df["label"].isin([1,2])]
test_df  = test_df[test_df["label"].isin([1,2])]


3. combine title + review
- concats the title & review into 1 cleaned text field (handling missing values) so, can preprocess/tokenize a single input for the model.

In [44]:
train_df["text"] = (train_df["title"].fillna("") + " " + train_df["review"].fillna("")).str.strip()
test_df["text"]  = (test_df["title"].fillna("")  + " " + test_df["review"].fillna("")).str.strip()


4. Map labels: 1->0 (negative), 2->1 (positive)

In [45]:
train_df["y"] = (train_df["label"] == 2).astype(int)
test_df["y"]  = (test_df["label"] == 2).astype(int)

5. class distribution

In [17]:
print("Class counts (train):")
print(train_df["y"].value_counts().rename({0:"negative",1:"positive"}))
print("+ve %age:", round(train_df["y"].mean()*100, 2), "%")

Class counts (train):
y
positive    63870
negative    62258
Name: count, dtype: int64
+ve %age: 50.64 %


In [22]:
train_lengths = train_df["text"].str.split().apply(len)
print("\nReview length (words) summary (train):")
print(train_lengths.describe(percentiles=[.5,.75,.90,.95]))


Review length (words) summary (train):
count    126128.000000
mean         79.924624
std          43.236601
min          10.000000
50%          72.000000
75%         110.000000
90%         146.000000
95%         162.000000
max         241.000000
Name: text, dtype: float64


**Text preprocessing**

In [49]:
import re, string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet') # Download wordnet data
nltk.download('omw-1.4') # Download open multilingual wordnet data

url_re   = re.compile(r'(https?://\S+|www\.\S+)')
num_re   = re.compile(r'\d+')
punct_tbl = str.maketrans('', '', string.punctuation)
stop     = set(ENGLISH_STOP_WORDS)

lemmatizer = WordNetLemmatizer()

def clean_text(s):
    s = str(s).lower() #lowercase
    s = url_re.sub(' ', s) # remove URLs
    s = num_re.sub(' ', s)  # remove numbers
    s = s.translate(punct_tbl) # remove punctuation
    toks = [w for w in s.split() if w not in stop and len(w) > 2]
    toks = [lemmatizer.lemmatize(w) for w in toks]  # Added lemmatization
    return ' '.join(toks)

train_df['clean'] = train_df['text'].apply(clean_text)
test_df['clean']  = test_df['text'].apply(clean_text)

#Drop empty rows
train_df = train_df[train_df['clean'].str.strip().astype(bool)]
test_df  = test_df[test_df['clean'].str.strip().astype(bool)]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


 Tokenize the processed text

In [62]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding

MAX_VOCAB = 20000
MAX_LEN   = 256

# tokenize (fit on TRAIN only)
tok = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tok.fit_on_texts(train_df['clean'])

X_train = pad_sequences(tok.texts_to_sequences(train_df['clean']), maxlen=MAX_LEN)
X_test  = pad_sequences(tok.texts_to_sequences(test_df['clean']),  maxlen=MAX_LEN)
y_train = train_df['y'].values
y_test  = test_df['y'].values

print("Shapes:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)


Shapes: (126128, 256) (126128,) (131757, 256) (131757,)


In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dense, Dropout

vocab_size = min(MAX_VOCAB, len(tok.word_index)+1)
EMBED_DIM  = 128
LSTM_UNITS = 64

model = Sequential([Input(shape=(MAX_LEN,), dtype='int32'), Embedding(vocab_size, EMBED_DIM), Bidirectional(LSTM(LSTM_UNITS, return_sequences=True)), GlobalMaxPool1D(), Dense(64, activation='relu'), Dropout(0.3), Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [64]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

es = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
ckpt = ModelCheckpoint('/content/best.keras', monitor='val_loss', save_best_only=True)

history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    callbacks=[es, ckpt],
    verbose=1
)

Epoch 1/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 33ms/step - accuracy: 0.7964 - loss: 0.4169 - val_accuracy: 0.8848 - val_loss: 0.2750
Epoch 2/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.9157 - loss: 0.2185 - val_accuracy: 0.8834 - val_loss: 0.2869


In [65]:
from sklearn.metrics import precision_recall_curve, accuracy_score, precision_recall_fscore_support

y_prob = model.predict(X_test).ravel()
prec, rec, thr = precision_recall_curve(y_test, y_prob)
f1 = 2*prec*rec/(prec+rec+1e-9)
best_thr = thr[f1.argmax()]
y_pred = (y_prob >= best_thr).astype(int)

acc = accuracy_score(y_test, y_pred)
p, r, f1s, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
print(f"Best threshold: {best_thr:.3f} | Acc: {acc:.4f} | P: {p:.4f} | R: {r:.4f} | F1: {f1s:.4f}")

[1m4118/4118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8ms/step
Best threshold: 0.390 | Acc: 0.8812 | P: 0.8621 | R: 0.9107 | F1: 0.8857
