In [33]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from typing import List, Tuple, Optional, Any
import warnings
import re
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
import gensim.models
import gensim.downloader as api
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm 
from gensim import utils

In [29]:
# Global package object instantiation 

warnings.filterwarnings("ignore")
nltk.download('stopwords')
tqdm.pandas()
stop_words = set(stopwords.words('english'))


# Precompile regular expressions for performance
pattern_contractions = re.compile(r'\b(' + '|'.join(contraction_mapping.keys()) + r')\b')
pattern_html = re.compile(r'http\S+')
pattern_non_alphabetic = re.compile(r'[^a-zA-Z]')
pattern_whitespace = re.compile(r'\s+')


# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

NVIDIA GeForce RTX 3050 Ti Laptop GPU


## 1) Dataset Generation 

In [11]:
df = pd.read_csv("../data/data.tsv", sep = '\t', on_bad_lines = 'skip') #, nrows = 1000)#, usecols=['review_body','star_rating']) #lineterminator='\r'
df.drop(df.columns[0], axis = 1, inplace = True)
df = df[['review_body', 'star_rating']]
df.columns

Index(['review_body', 'star_rating'], dtype='object')

### Preprocess/Cleaning take 15 minutes

In [12]:
contraction_mapping = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}


def expand_contractions(text: str, contraction_map: dict = contraction_mapping) -> str:
    """
    Expands contractions in a given text using a specified mapping.

    Args:
        text (str): The text to be processed.
        contraction_map (dict): A dictionary where keys are contractions and values are their expanded forms.

    Returns:
        str: The processed text with contractions expanded.
    """
    return pattern_contractions.sub(lambda match: contraction_map[match.group(0)], text)

def remove_stopwords(text: str, stopwords: List[str]) -> str:
    """
    Removes stopwords from a given text.

    Args:
        text (str): The text to be processed.
        stopwords (List[str]): A list of stopwords to be removed.

    Returns:
        str: The text with stopwords removed.
    """
    words = text.split()
    filtered_words = [word for word in words if word not in stopwords]
    return ' '.join(filtered_words)

def lemmatize_text(text: str) -> str:
    """
    Lemmatizes the words in a given text.

    Args:
        text (str): The text to be processed.

    Returns:
        str: The text with words lemmatized.
    """
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def clean_and_preprocess_reviews(reviews, stopwords: List[str]) -> pd.Series:
    """
    Cleans and preprocesses review texts.

    Args:
        reviews (pd.Series): A Pandas Series containing review texts.
        stopwords (List[str]): A list of stopwords to be removed during preprocessing.

    Returns:
        pd.Series: The preprocessed review texts.
    """
    # Cleaning
    reviews = reviews.str.lower()
    reviews = reviews.progress_apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
    reviews = reviews.replace(pattern_html, '', regex=True)
    reviews = reviews.replace(pattern_non_alphabetic, " ", regex=True)
    reviews = reviews.replace(pattern_whitespace, ' ', regex=True).str.strip()
    reviews = reviews.progress_apply(lambda x: expand_contractions(x))

    # Preprocessing
    reviews = reviews.progress_apply(lambda x: remove_stopwords(x, stopwords))
    reviews = reviews.progress_apply(lemmatize_text)

    return reviews

In [13]:
df['review_body'] = df['review_body'].astype(str)
df.dropna(subset=['review_body'], inplace=True)
df['review_body'] = clean_and_preprocess_reviews(df['review_body'], stop_words)
df.dropna(subset=['review_body'], inplace=True)

df['label'] = df['star_rating'].progress_apply(lambda x: 0 if x in [4, 5] else (1 if x in [1, 2] else 2))
samples = [df[df['star_rating'] == rating].sample(n = 50000, random_state=42) for rating in [5, 4, 3, 2, 1]]
merged_dataset = pd.concat(samples)


100%|██████████| 2640254/2640254 [01:47<00:00, 24548.70it/s]
100%|██████████| 2640254/2640254 [01:15<00:00, 34828.15it/s]
100%|██████████| 2640254/2640254 [00:19<00:00, 134135.10it/s]
100%|██████████| 2640254/2640254 [04:09<00:00, 10584.75it/s]
100%|██████████| 2640254/2640254 [00:01<00:00, 1436903.61it/s]


## 2) Word Embedding

### (a)
Load the pretrained “word2vec-google-news-300” Word2Vec model and learn
how to extract word embeddings for your dataset. Try to check semantic
similarities of the generated vectors using two examples of your own, e.g.,
King − M an + W oman = Queen or excellent ∼ outstanding.


Reference https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [18]:
pretrained_model = api.load('word2vec-google-news-300')
print("Similarity between 'Ocean' and 'Sea': ", pretrained_model.similarity('ocean', 'sea'))

Similarity between 'Ocean' and 'Sea':  0.76435417


### (b)
Check the semantic similarities for the same two examples
in part (a). What do you conclude from comparing vectors generated by
yourself and the pretrained model? Which of the Word2Vec models seems
to encode semantic similarities between words better?

In the pretrained model, the similarity score between Outstanding and Excellent was lower than the custom model I trained, thus showing that similarities between vectors were stronger in my model. But the pretrained model does a better join of building relationships between words

In [31]:
class MyCorpus:
    def __init__(self, df, col):
        self.df = df
        self.col = col

    def __iter__(self):
        for line in self.df[self.col]:
            yield utils.simple_preprocess(line)

In [34]:
my_model = gensim.models.Word2Vec(sentences = MyCorpus(merged_dataset, 'review_body'), vector_size = 300, window = 11, min_count = 10, workers = 4)

In [35]:
word_vectors = my_model.wv

print("Similarity between 'ocean' and 'sea':", word_vectors.similarity('ocean', 'sea'))

Similarity between 'ocean' and 'sea': 0.5721045


In [23]:
def document_vector(word2vec_model: KeyedVectors, doc_review: List[str]) -> np.ndarray:
    """
    Generates a document vector by averaging the vectors of words in the document
    that are present in the word2vec model.

    Args:
        word2vec_model (KeyedVectors): A word2vec model.
        doc_review (List[str]): A list of words in the document.

    Returns:
        np.ndarray: The averaged vector of the document.
    """
    doc_review = [word for word in doc_review if word in word2vec_model.key_to_index]
    
    if not doc_review:
        return np.zeros(word2vec_model.vector_size)
        
    return np.mean(word2vec_model[doc_review], axis=0)

def gen_concat_feature_vector(word2vec_model: KeyedVectors, doc_review: List[str], vector_size: int = 300, max_words: int = 10) -> np.ndarray:
    """
    Generates a concatenated feature vector for the first 'max_words' words in a document,
    using the word vectors from the provided word2vec model.

    Args:
        word2vec_model (KeyedVectors): A word2vec model.
        doc_review (List[str]): A list of words in the document.
        vector_size (int): The size of the word vectors.
        max_words (int): The maximum number of word vectors to concatenate.

    Returns:
        np.ndarray: The concatenated vector of the document.
    """
    concatenated_vector = np.zeros(vector_size * max_words)
    
    for i, word in enumerate(doc_review[:max_words]):
        if word in word2vec_model.key_to_index:
            concatenated_vector[i * vector_size:(i + 1) * vector_size] = word2vec_model[word]
            
    return concatenated_vector


### Retrieve all word embeddings for training

In [36]:
merged_dataset['processed_text'] = merged_dataset['review_body'].progress_apply(gensim.utils.simple_preprocess)
merged_dataset['pretrained_vector'] = merged_dataset['processed_text'].progress_apply(lambda doc_review: document_vector(pretrained_model, doc_review))
merged_dataset['custom_vector'] = merged_dataset['processed_text'].progress_apply(lambda doc_review: document_vector(my_model.wv, doc_review))
merged_dataset['pre_concatenated_vector'] = merged_dataset['processed_text'].progress_apply(lambda row_indx: gen_concat_feature_vector(pretrained_model, row_indx))
merged_dataset['custom_concatenated_vector'] = merged_dataset['processed_text'].progress_apply(lambda row_indx: gen_concat_feature_vector(pretrained_model, row_indx)) 

### Create the binary labels ###
filtered_dataset = merged_dataset[merged_dataset['label'] != 2]
filtered_dataset['binary_label'] = filtered_dataset['label'].astype(int)

100%|██████████| 250000/250000 [00:08<00:00, 28351.26it/s]
100%|██████████| 250000/250000 [00:20<00:00, 12061.32it/s]
100%|██████████| 250000/250000 [00:19<00:00, 12603.98it/s]
100%|██████████| 250000/250000 [00:13<00:00, 19024.73it/s]
100%|██████████| 250000/250000 [00:17<00:00, 14404.71it/s]


## 3) Simple models 

What do you conclude from comparing performances for the models
trained using the three different feature types (TF-IDF, pretrained Word2Vec,
your trained Word2Vec)?

It seems tha pretrained Word2Vec embeddings marginally perform better than the custom models embeddings and better than TF-IDF 
most likely due to the moderately large window size that is being used in the Word2Vec model.


In [37]:
def evaluate(y_label: np.ndarray, y_predicted: np.ndarray) -> Tuple[float, float, float, float]:
    """
    Evaluates the performance of a model using accuracy, precision, recall, and F1 score.

    Args:
        y_label (np.ndarray): The true labels.
        y_predicted (np.ndarray): The predicted labels by the model.

    Returns:
        Tuple[float, float, float, float]: A tuple containing accuracy, precision, recall, and F1 score.
    """
    accuracy = accuracy_score(y_label, y_predicted)
    precision = precision_score(y_label, y_predicted, average='weighted')
    recall = recall_score(y_label, y_predicted, average='weighted')
    f1 = f1_score(y_label, y_predicted, average='weighted')

    return accuracy, precision, recall, f1

def sklearn_model_train(X: np.ndarray, Y: np.ndarray, model_types: List[Tuple[str, BaseEstimator]], prefix: str, test_size: float = 0.2, random_state: int = 42) -> None:
    """
    Trains and evaluates models specified in model_types on the provided dataset, 
    printing the accuracy of each model.

    Args:
        X (np.ndarray): Feature vectors of the dataset.
        Y (np.ndarray): Labels of the dataset.
        model_types (List[Tuple[str, BaseEstimator]]): A list of tuples containing model names and their instances.
        prefix (str): A prefix string for printing model performance.
        test_size (float): The proportion of the dataset to include in the test split.
        random_state (int): Controls the shuffling applied to the data before applying the split.
    """
    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)

    for name, model in model_types:
        model.fit(X_train, y_train.ravel()) 
        y_pred_test = model.predict(X_test)
        te_acc, _, _, _ = evaluate(y_test, y_pred_test)

    
        print(f"{prefix} {name} Testing: Accuracy: {te_acc:.4f}")

In [40]:

model_names = [
    ("Perceptron Model", Perceptron()),
    ("SVM Model", SVC(max_iter = 1000))
]

# Train and evaluate models for ternary labels
sklearn_model_train(np.vstack(merged_dataset['custom_vector'].values), np.vstack( merged_dataset['label'].values), model_names, prefix='Ternary Custom Embedding')
sklearn_model_train(np.vstack(merged_dataset['pretrained_vector'].values), np.vstack( merged_dataset['label'].values), model_names, prefix='Ternary Pretrained Embedding')

# Train and evaluate models for binary labels
sklearn_model_train(np.vstack(filtered_dataset['custom_vector'].values),  np.vstack(filtered_dataset['binary_label'].values), model_names, prefix='Binary Custom Embedding')
sklearn_model_train(np.vstack(filtered_dataset['pretrained_vector'].values), np.vstack(filtered_dataset['binary_label'].values), model_names, prefix='Binary Pretrained Embedding')


Ternary Custom Embedding Perceptron Model Testing: Accuracy: 0.6327
Ternary Custom Embedding SVM Model Testing: Accuracy: 0.3597
Ternary Pretrained Embedding Perceptron Model Testing: Accuracy: 0.5828
Ternary Pretrained Embedding SVM Model Testing: Accuracy: 0.2840
Binary Custom Embedding Perceptron Model Testing: Accuracy: 0.7805
Binary Custom Embedding SVM Model Testing: Accuracy: 0.6251
Binary Pretrained Embedding Perceptron Model Testing: Accuracy: 0.7404
Binary Pretrained Embedding SVM Model Testing: Accuracy: 0.5642


## 4) Feedforward Neural Network Approach

In [None]:
class Net(nn.Module):
    """
    A simple feedforward neural network for classification tasks.

    Attributes:
        n_classes (int): The number of output classes.
        n_dim (int): The dimensionality of the input features.
        fc1 (nn.Linear): The first fully connected layer.
        fc2 (nn.Linear): The second fully connected layer.
        fc3 (nn.Linear): The third fully connected layer, producing the output.
        dropout (nn.Dropout): Dropout layer for regularization.

    Args:
        n_classes (int): The number of classes in the target classification task.
        n_dim (int): The size of each input sample.
    """

    def __init__(self, n_classes: int, n_dim: int):
        super(Net, self).__init__()

        hidden_1 = 50
        hidden_2 = 10

        self.n_classes = n_classes
        self.n_dim = n_dim

        # Define layers
        self.fc1 = nn.Linear(n_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, n_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """
        Defines the forward pass of the network. Utilizing GeLu activation
        functions for NLP task specific nonlinearity 

        Args:
            x (Tensor): The input tensor containing the features of the input samples.

        Returns:
            Tensor: The output tensor after passing through the network.
        """
        
        x = f.gelu(self.fc1(x))
        x = self.dropout(x)
        x = f.gelu(self.fc2(x))
        x = self.dropout(x)
        x = f.softmax(self.fc3(x)) 
        return x


ternary_model = Net(n_classes = 3, n_dim = 300)
binary_model = Net(n_classes = 2, n_dim = 300)
concat_ternary_model = Net(n_classes = 3, n_dim = 3000)
concat_binary_model = Net(n_classes = 2, n_dim = 3000)

print(ternary_model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:
class TextDataset(Dataset):
    """
    A custom dataset class for text data to be used with PyTorch DataLoader.

    Attributes:
        features (torch.Tensor): Tensor containing all input features.
        labels (torch.Tensor): Tensor containing all labels.
    """
    def __init__(self, features: torch.Tensor, labels: torch.Tensor):
        self.features = features
        self.labels = labels

    def __len__(self) -> int:
        return len(self.features)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.features[idx], self.labels[idx]


def model_preprocess(x: List, y: List, model: nn.Module, cnn_bit: int = 0, optimi: Optional[torch.optim.Optimizer] = None) -> Tuple[DataLoader, DataLoader, nn.Module, torch.optim.Optimizer]:
    """
    Prepares and preprocesses the dataset for training and validation, including creating DataLoader instances.

    Args:
        x (List): Input features as a list of tensors.
        y (List): Corresponding labels as a list.
        model (nn.Module): The PyTorch model to be trained.
        cnn_bit (int, optional): Flag to select the optimizer type. Defaults to 0.
        optimi (Optional[torch.optim.Optimizer], optional): Optionally, a specific optimizer can be provided. Defaults to None.

    Returns:
        Tuple[DataLoader, DataLoader, nn.Module, torch.optim.Optimizer]: A tuple containing training and test DataLoader, the loss criterion, and the optimizer.
    """

    # Split the dataset into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    
    # Convert lists to tensors
    x_train = torch.tensor(x_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.long)
    x_test = torch.tensor(x_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.long)

    # Create DataLoader instances for training and test sets
    train_loader = DataLoader(TextDataset(x_train, y_train), batch_size=64, shuffle=True)
    test_loader = DataLoader(TextDataset(x_test, y_test), batch_size=64, shuffle=False)

    # Determine the optimizer
    if optimi is None:
        if cnn_bit == 1:
            optimi = torch.optim.Adadelta(model.parameters(), lr = 0.25, rho = 0.95)
        else:
            optimi = torch.optim.Adam(model.parameters(), lr = 0.0001)

    return train_loader, test_loader, nn.CrossEntropyLoss(), optimi


In [None]:
def train_model(hyperparams: Tuple[DataLoader, DataLoader, nn.Module, torch.optim.Optimizer], 
                model: nn.Module, 
                debug_mode: int = 0, 
                n_epochs: int = 50) -> float:
    """
    Trains a given PyTorch model with specified hyperparameters, and evaluates it on a test set.

    Args:
        hyperparams (Tuple[DataLoader, DataLoader, nn.Module, torch.optim.Optimizer]): 
            A tuple containing training and testing DataLoaders, the loss criterion, and the optimizer.
        model (nn.Module): The neural network model to be trained and evaluated.
        debug_mode (int, optional): If set to 1, prints detailed training progress and accuracy. Defaults to 0.
        n_epochs (int, optional): Number of epochs for training the model. Defaults to 50.

    Returns:
        float: The accuracy of the model on the test set after the final epoch of training.
    """
    train_loader, test_loader, criterion, optimizer = hyperparams
    valid_loss_min = np.inf

    for epoch in range(n_epochs):
        # Initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        correct_train = 0
        correct_valid = 0
        total_train = 0
        total_valid = 0

        # Training phase
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * data.size(0)
            _, predicted = torch.max(output.data, 1)
            correct_train += (predicted == target).sum().item()
            total_train += target.size(0)

        # Validation phase
        model.eval()
        with torch.no_grad():
            for data, target in test_loader:
                output = model(data)
                loss = criterion(output, target)
                valid_loss += loss.item() * data.size(0)
                _, predicted = torch.max(output.data, 1)
                correct_valid += (predicted == target).sum().item()
                total_valid += target.size(0)

        # Calculate average losses
        train_loss = train_loss / total_train
        valid_loss = valid_loss / total_valid
        train_accuracy = correct_train / total_train
        valid_accuracy = correct_valid / total_valid

        # Print training/validation statistics 
        if debug_mode == 1:
            print(f'Epoch: {epoch + 1}/{n_epochs} \tTraining Loss: {train_loss:.6f} \tTraining Accuracy: {train_accuracy * 100:.2f}% \tValidation Loss: {valid_loss:.6f} \tValidation Accuracy: {valid_accuracy * 100:.2f}%')

            # Save model if validation loss has decreased
            if valid_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, valid_loss))
                torch.save(model.state_dict(), 'model.pt')
                valid_loss_min = valid_loss

    return valid_accuracy


### Custom Embeddings and 3 Class MLP

In [94]:
model_hyperparameters_tern_custom = model_preprocess(merged_dataset['custom_vector'].values, merged_dataset['label'].values, ternary_model )
train_model(model_hyperparameters_tern_custom, ternary_model)

Test Accuracy: 


0.6971

### Pretrained Embeddings and 3 Class MLP

In [95]:
model_hyperparameters_tern_pre = model_preprocess(merged_dataset['pretrained_vector'].values, merged_dataset['label'].values, ternary_model)
train_model(model_hyperparameters_tern_pre, ternary_model)

Test Accuracy: 


0.66548

### Custom Embeddings and 2 Class MLP

In [96]:
model_hyperparameters_bin_custom = model_preprocess(filtered_dataset['custom_vector'].values, filtered_dataset['binary_label'].values, binary_model)
train_model(model_hyperparameters_bin_custom, binary_model)

Test Accuracy: 


0.86325

### Pretrained Embeddings and 2 Class MLP

In [98]:
model_hyperparameters_bin_pre = model_preprocess(filtered_dataset['pretrained_vector'].values, filtered_dataset['binary_label'].values, binary_model)
train_model(model_hyperparameters_bin_pre, binary_model)

Test Accuracy: 


0.822525

(b) (15 points)
What do you conclude by comparing accuracy values you obtain with
those obtained in the “’Simple Models” section (note you can compare the
accuracy values for binary classification)

In the ternary classifiation, simple models perform worse by an average of 10% in their validation accuracies. This is due to the unique choice of hyperparamters (hidden dimensions, nonlinerity and learning etc) that pytorch allows us to customize. 

### Pretrained Concatenated Embeddings and 2 Class MLP

In [45]:
model_hyperp_concat_bin_pre = model_preprocess(filtered_dataset['pre_concatenated_vector'].values, filtered_dataset['binary_label'].values, concat_binary_model)
train_model(model_hyperp_concat_bin_pre, concat_binary_model)

Test Accuracy: 


0.786475

### Custom Concatenated Embeddings and 2 Class MLP

In [27]:
model_hyperp_concat_bin_custom = model_preprocess(filtered_dataset['custom_concatenated_vector'].values, filtered_dataset['binary_label'].values, concat_binary_model, optimi = torch.optim.SGD(concat_binary_model.parameters(), lr = 0.001))
train_model(model_hyperp_concat_bin_custom, concat_binary_model)

Test Accuracy: 


0.854975

### Pretrained Concatenated Embeddings and 3 Class MLP

In [None]:
model_hyperp_concat_ter_pre = model_preprocess(merged_dataset['pre_concatenated_vector'].values, merged_dataset['label'].values, concat_ternary_model, optimi = torch.optim.Adam(concat_ternary_model.parameters(), lr = 0.001))
train_model(model_hyperp_concat_ter_pre, concat_ternary_model, n_epochs = 1, debug_mode = 0) #0.68756

Test Accuracy: 


0.68756

### Custom Concatenated Embeddings and 3 Class MLP

In [29]:
model_hyperp_concat_ter_custom = model_preprocess(merged_dataset['custom_concatenated_vector'].values, merged_dataset['label'].values, concat_ternary_model)
train_model(model_hyperp_concat_ter_custom, concat_ternary_model, n_epochs = 10)

Test Accuracy: 


0.62498

## 5) Convolutional Neural Networks

In [35]:
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.num_classes = num_classes
        self.conv1 = nn.Conv1d(1, 50, kernel_size = 5, padding = 2) ## 50 words store as 300 dim wv's 
        self.conv2 = nn.Conv1d(50, 10, kernel_size = 5, padding = 2) 
        self.fc = nn.Linear(3000, self.num_classes)
        
    def forward(self, x):
        x = x.squeeze()  
        # x = self.embedding(x).permute(0, 2, 1)
        x = x.reshape(x.shape[0], 1 , x.shape[1])
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

### Custom embeddings & 3 Class CNN

In [31]:
CNNnet_ter_custom_complex = CNN(num_classes = 3)
cnn_hyperp_ter_cus = model_preprocess(x = merged_dataset['custom_vector'].values, y = merged_dataset['label'].values, model = CNNnet_ter_custom_complex, cnn_bit = 1)
train_model(hyperparams = cnn_hyperp_ter_cus, model = CNNnet_ter_custom_complex, n_epochs = 20 )

Test Accuracy: 


0.68908

### Custom embeddings & 2 Class CNN

In [32]:
CNNnet_bin_custom = CNN(num_classes = 2)
cnn_hyperp_bin_cust = model_preprocess(x = filtered_dataset['custom_vector'].values, y = filtered_dataset['label'].values, model = CNNnet_bin_custom, cnn_bit = 1)
train_model(hyperparams = cnn_hyperp_bin_cust, model = CNNnet_bin_custom, n_epochs = 15) 

Test Accuracy: 


0.855325

### Pretrained embeddings & 2 Class CNN

In [33]:
CNNnet_bin_pre = CNN(num_classes = 2)
cnn_hyperp_bin_pre = model_preprocess(x = filtered_dataset['pretrained_vector'].values, y = filtered_dataset['label'].values, model = CNNnet_bin_pre, cnn_bit = 1)
train_model(hyperparams = cnn_hyperp_bin_pre, model = CNNnet_bin_pre, n_epochs = 42)

Test Accuracy: 


0.8189

### Pretrained embeddings & 3 Class CNN

In [34]:
CNNnet_ter_pre = CNN(num_classes = 3)
cnn_hyperp_ter_pre = model_preprocess(x = merged_dataset['pretrained_vector'].values, y = merged_dataset['label'].values, model = CNNnet_ter_pre, cnn_bit = 1)
train_model(hyperparams = cnn_hyperp_ter_pre, model = CNNnet_ter_pre, n_epochs = 35) # 66.71

Test Accuracy: 


0.67174