# Word2Vec

In [3]:
!conda install -y gdown

Retrieving notices: ...working... done
Channels:
 - rapidsai
 - nvidia
 - conda-forge
 - defaults
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - gdown


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.8.30  |       hbcca054_0         155 KB  conda-forge
    conda-24.7.1               |  py310hff52083_0         940 KB  conda-forge
    filelock-3.15.4            |     pyhd8ed1ab_0          17 KB  conda-forge
    gdown-5.2.0                |     pyhd8ed1ab_0          21 KB  conda-forge
    openssl-3.3.1              |       hb9d3cd8_3         2.8 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.9 MB

The following NEW packages will be 

In [4]:
!gdown 1ihU82zD5LKm8KvgvpSYM_32LC841MFGG

Downloading...
From: https://drive.google.com/uc?id=1ihU82zD5LKm8KvgvpSYM_32LC841MFGG
To: /kaggle/working/balanced_yelp_reviews.csv
100%|███████████████████████████████████████| 95.8M/95.8M [00:00<00:00, 102MB/s]


In [5]:
file_path = './balanced_yelp_reviews.csv'

In [6]:
import pandas as pd
import numpy as np

import time
import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from gensim.utils import tokenize
from gensim.parsing.porter import PorterStemmer
from gensim.models import Word2Vec

In [7]:
VECTOR_SIZE = 500
WINDOW = 5
MIN_COUNT = 10
WORKERS = 3
SG = 1

BATCH_SIZE = 32
NUM_EPOCHS = 15
LEARNING_RATE = 1e-3
PATIENCE = 3
DROPOUT_RATE = 0.5

In [8]:
statement_df = pd.read_csv(file_path)

In [9]:
statement_df.head()

Unnamed: 0,statement,sentiment
0,My wife and I were down town for a show and we...,neutral
1,My friends and I had a craving for fried chick...,negative
2,"They have very nice tiles and displays, but th...",negative
3,THE ABSOLUTE WORST! They pretty much charged u...,negative
4,The restaurant by itself was so cute but the b...,neutral


In [10]:
statement_df['tokenized_text'] = [list(tokenize(line, lowercase=False, deacc=True)) for line in statement_df['statement']]

def stem_preserving_case(word):
    porter_stemmer = PorterStemmer()
    stemmed_word = porter_stemmer.stem(word.lower())
    if word.isupper():
        return stemmed_word.upper()
    elif word[0].isupper():
        return stemmed_word.capitalize()
    else:
        return stemmed_word
    
# porter_stemmer = PorterStemmer()
# statement_df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in statement_df['tokenized_text']]
statement_df['stemmed_tokens'] = [[stem_preserving_case(word) for word in tokens] for tokens in statement_df['tokenized_text']]

le = LabelEncoder()
statement_df['sentiment'] = le.fit_transform(statement_df['sentiment'])

In [11]:
statement_df.head()

Unnamed: 0,statement,sentiment,tokenized_text,stemmed_tokens
0,My wife and I were down town for a show and we...,1,"[My, wife, and, I, were, down, town, for, a, s...","[My, wife, and, I, were, down, town, for, a, s..."
1,My friends and I had a craving for fried chick...,0,"[My, friends, and, I, had, a, craving, for, fr...","[My, friend, and, I, had, a, crave, for, fri, ..."
2,"They have very nice tiles and displays, but th...",0,"[They, have, very, nice, tiles, and, displays,...","[Thei, have, veri, nice, tile, and, displai, b..."
3,THE ABSOLUTE WORST! They pretty much charged u...,0,"[THE, ABSOLUTE, WORST, They, pretty, much, cha...","[THE, ABSOLUT, WORST, Thei, pretti, much, char..."
4,The restaurant by itself was so cute but the b...,1,"[The, restaurant, by, itself, was, so, cute, b...","[The, restaur, by, itself, wa, so, cute, but, ..."


In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(statement_df[['statement', 'stemmed_tokens']],
                                                    statement_df['sentiment'],
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=15)

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                  shuffle=True,
                                                  test_size=0.25,
                                                  random_state=15)

X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.to_frame().reset_index(drop=True)
Y_val = Y_val.to_frame().reset_index(drop=True)
Y_test = Y_test.to_frame().reset_index(drop=True)

In [13]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))

In [14]:
start = time.time()

stemmed_tokens = pd.Series(X_train['stemmed_tokens']).values
w2v_model = Word2Vec(stemmed_tokens, min_count=MIN_COUNT, vector_size=VECTOR_SIZE, workers=WORKERS, window=WINDOW, sg=SG)

print(format_time(time.time() - start))

word2vec_model_file = 'word2vec_' + str(VECTOR_SIZE) + '.model'
w2v_model.save(word2vec_model_file)
sg_w2v_model = Word2Vec.load(word2vec_model_file)

0:04:10


In [15]:
def get_mean_feature_vector(data, model, vector_size):
    features = []
    for tokens in data:
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if len(vectors) > 0:
            mean_vector = np.mean(vectors, axis=0)
        else:
            mean_vector = np.zeros(vector_size)
        features.append(mean_vector)
    return np.array(features)

In [16]:
X_train_vec = get_mean_feature_vector(X_train['stemmed_tokens'], sg_w2v_model, VECTOR_SIZE)
X_val_vec = get_mean_feature_vector(X_val['stemmed_tokens'], sg_w2v_model, VECTOR_SIZE)
X_test_vec = get_mean_feature_vector(X_test['stemmed_tokens'], sg_w2v_model, VECTOR_SIZE)

X_train_tensor = torch.tensor(X_train_vec, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train['sentiment'].values, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_vec, dtype=torch.float32)
Y_val_tensor = torch.tensor(Y_val['sentiment'].values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test['sentiment'].values, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, Y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [18]:
input_size = VECTOR_SIZE
num_classes = len(le.classes_)
model = SimpleNN(input_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

best_val_loss = float('inf')
early_stopping_counter = 0
best_model_path = 'best_model.pth'

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    epoch_start_time = time.time()
    
    print(f"\nEpoch {epoch + 1} / {NUM_EPOCHS}")
    print("-" * 50)
    
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for X_batch, Y_batch in train_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, Y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total_train += Y_batch.size(0)
        correct_train += (predicted == Y_batch).sum().item()

    train_accuracy = correct_train / total_train

    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for X_batch, Y_batch in val_loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

            outputs = model(X_batch)
            loss = criterion(outputs, Y_batch)
            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_val += Y_batch.size(0)
            correct_val += (predicted == Y_batch).sum().item()

            all_val_preds.extend(predicted.cpu().numpy())
            all_val_labels.extend(Y_batch.cpu().numpy())

    val_accuracy = correct_val / total_val
    
    print(f"Train Loss: {train_loss / len(train_loader)} | Train Accuracy: {train_accuracy}")
    print(f"Val Loss: {val_loss / len(val_loader)} | Val Accuracy: {val_accuracy}\n")
    print(f'Epoch Train Time: {format_time(time.time() - epoch_start_time)}')
    print('\n')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), best_model_path)
        print("Saved best model")
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= PATIENCE:
            print("Early stopping triggered")
            break
print('Finished Training.')
print(f'Fold Train Time: {format_time(time.time() - start_time)}')
print('\n')


Epoch 1 / 15
--------------------------------------------------
Train Loss: 0.6490584009913334 | Train Accuracy: 0.7146222222222223
Val Loss: 0.5701985127095983 | Val Accuracy: 0.7597

Epoch Train Time: 0:00:08


Saved best model

Epoch 2 / 15
--------------------------------------------------
Train Loss: 0.5904728404533978 | Train Accuracy: 0.7496666666666667
Val Loss: 0.6025833055408779 | Val Accuracy: 0.7393

Epoch Train Time: 0:00:07



Epoch 3 / 15
--------------------------------------------------
Train Loss: 0.5817115993271762 | Train Accuracy: 0.7530333333333333
Val Loss: 0.5633497564777382 | Val Accuracy: 0.7567666666666667

Epoch Train Time: 0:00:07


Saved best model

Epoch 4 / 15
--------------------------------------------------
Train Loss: 0.576572515968322 | Train Accuracy: 0.7548
Val Loss: 0.5821544419028866 | Val Accuracy: 0.7488666666666667

Epoch Train Time: 0:00:07



Epoch 5 / 15
--------------------------------------------------
Train Loss: 0.5695277075516639 | T

In [19]:
model.load_state_dict(torch.load(best_model_path))

model.eval()

print("Classification Report on Validation Set:")
print(classification_report(all_val_labels, all_val_preds, target_names=le.classes_))

Classification Report on Validation Set:
              precision    recall  f1-score   support

    negative       0.78      0.83      0.81      9890
     neutral       0.68      0.67      0.67     10114
    positive       0.84      0.80      0.82      9996

    accuracy                           0.77     30000
   macro avg       0.77      0.77      0.77     30000
weighted avg       0.77      0.77      0.77     30000



## Test set evaluation (generalization error)

In [20]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, Y_batch in test_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

        outputs = model(X_batch)
        _, predicted = torch.max(outputs.data, 1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(Y_batch.cpu().numpy())

test_accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {test_accuracy:.2f}')

print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=le.classes_))

Test Accuracy: 0.77

Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.78      0.80      9948
     neutral       0.67      0.71      0.69     10136
    positive       0.83      0.82      0.82      9916

    accuracy                           0.77     30000
   macro avg       0.77      0.77      0.77     30000
weighted avg       0.77      0.77      0.77     30000

