We first import the necessary packages and then define the helper functions used in our calculations. Please note that although we started by writing our own functions to compute a TF-IDF score, we do not use them in the rest of the assignment. We use sklearn's TfidfVectorizer instead, because its parameters let us remove common English words as well as rare words. We kept the hand-written functions for practice.


In [None]:
#import necessary libraries and set graphing theme
import pandas as pd
import math
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import scipy.special
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")


In [None]:
#Define all functions that we use in this project
#To calculate a tf score
def tf_score(text):
    """Return the relative frequency of every term in ``text``.

    ``text`` is a sequence of tokens. The result is a ``Counter`` that maps
    each distinct token to its count divided by ``len(text)``. An empty
    sequence yields an empty ``Counter``.
    """
    total_terms = float(len(text))
    frequencies = Counter(text)
    # Snapshot the items so the values can be overwritten while looping.
    for term, count in list(frequencies.items()):
        frequencies[term] = count / total_terms
    return frequencies

#To calculate an idf score
def idf_score(document_list):
    """Return the inverse document frequency of every word in a corpus.

    Parameters
    ----------
    document_list : list of mappings ``word -> weight`` (one per document),
        for example the per-document output of ``tf_score``.

    Returns
    -------
    dict mapping each word seen in any document to ``log(n / df)``, where
    ``n`` is the number of documents and ``df`` is the number of documents
    in which the word has a weight greater than zero. A word whose weight is
    never positive gets an IDF of ``0.0`` instead of triggering a division
    by zero. An empty corpus yields an empty dict.
    """
    n = len(document_list)
    document_counter = {}

    for document in document_list:
        # Keys of a mapping are unique, so each word is visited at most once
        # per document and no extra "already seen" bookkeeping is needed.
        for word, val in document.items():
            occurs = 1 if val > 0 else 0
            document_counter[word] = document_counter.get(word, 0) + occurs

    return {
        word: math.log(n / float(count)) if count else 0.0
        for word, count in document_counter.items()
    }

#To calculate a tfidf score
def compute_tf_idf(tf, idf):
    """Multiply each term frequency by the matching inverse document frequency.

    ``tf`` maps word -> term frequency and ``idf`` maps word -> IDF. Every
    word of ``tf`` must be present in ``idf`` (a missing word raises
    ``KeyError``). Returns a new dict with one entry per word of ``tf``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

#To create tfidf dataframe
def create_tfidf_dataframe(data, vectorizer):
    """Build a dense TF-IDF dataframe with the sentiment label attached.

    ``data`` must provide a ``'review'`` column (transformed by the already
    fitted ``vectorizer``) and a ``'sentiment'`` column. The result has one
    column per vectorizer feature plus a final ``'sentiment'`` column, and a
    fresh 0..n-1 index.
    """
    dense_scores = vectorizer.transform(data['review']).toarray()
    feature_names = vectorizer.get_feature_names_out()
    frame = pd.DataFrame(dense_scores, columns=feature_names)
    # The new frame is indexed 0..n-1, so the labels are re-indexed the same
    # way before assignment; otherwise pandas would align on the old index.
    labels = data['sentiment'].reset_index(drop=True)
    frame['sentiment'] = labels
    return frame

#To separate features and target
def separate_features_and_target(dataframe, target_column='sentiment'):
    """Split a dataframe into a feature matrix and a target vector.

    Returns ``(X, y)`` where ``y`` holds the values of ``target_column`` and
    ``X`` holds the values of all remaining columns, in their original order.
    """
    target = dataframe[target_column].values
    features = dataframe.drop(columns=target_column).values
    return features, target

#To calculate the step size
def ro(t):
    """Return the step size ``1/t`` for iteration number ``t``.

    ``t`` must be non-zero; ``beta_map_sweep`` calls this with a counter
    that starts at 1.
    """
    step_size = 1.0 / t
    return step_size

#To calculate the sigmoid function
def sigmoid(x):
    """Return the logistic function of ``x``, clipped to ``[1e-20, 1]``.

    The lower clip keeps the result strictly positive so that taking its
    logarithm never produces ``-inf``.
    """
    probabilities = scipy.special.expit(x)
    return np.clip(probabilities, 1e-20, 1)

def conditional_expectation(x, beta):
    """Return the sigmoid of the linear predictor ``x . beta``."""
    linear_predictor = np.dot(x, beta)
    return sigmoid(linear_predictor)

#To calculate the gradient
def gradient(lamb, beta, n, B, y, x, y_hat):
    """Return the minibatch gradient used to update ``beta``.

    The result is ``-beta / lamb**2`` (the prior term) plus ``n / B`` times
    the sum over the batch rows of ``(y - y_hat) * x``.

    Parameters
    ----------
    lamb : scale of the prior term; must be non-zero.
    beta : current coefficient vector.
    n, B : the data term is multiplied by ``n / B``.
    y, y_hat : observed and predicted values for the batch rows.
    x : batch feature matrix, one row per entry of ``y``.
    """
    residuals = (y - y_hat).reshape(-1, 1)
    data_term = (residuals * x).sum(axis=0)
    prior_term = (-1 / lamb**2) * beta
    batch_scale = n / B
    return prior_term + batch_scale * data_term

#To check for convergence by comparing the norms of two consecutive betas
def convergence_by_norm(beta1, beta2, tolerance):
    """Return ``True`` when the norms of ``beta1`` and ``beta2`` differ by less than ``tolerance``.

    Raises ``ValueError`` if either vector contains NaN.

    NOTE(review): this compares the two norms, not the norm of the
    difference, so two different vectors of equal length count as converged.
    """
    if np.isnan(beta1).any() or np.isnan(beta2).any():
        raise ValueError('Invalid values encountered: ', beta1, beta2)
    norm_gap = np.abs(np.linalg.norm(beta1) - np.linalg.norm(beta2))
    return bool(norm_gap < tolerance)
     
#To find beta MAP while adding diagnostics metrics and using minibatches and data sweeping
def beta_map_sweep(X, y, lamb, B, step_function, convergence_test, seed=None, verbose=False,
                   tolerance=1e-6, max_iter=1000000):
    """Estimate the MAP coefficients with minibatch stochastic gradient ascent.

    Each epoch visits the rows of ``X`` in a fresh random order, in batches
    of at most ``B`` rows. After every batch the coefficients are moved by
    ``step_function(t) * gradient(...)``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features).
    y : array of length n_samples.
    lamb : prior scale forwarded to ``gradient``.
    B : batch size (positive integer). It may exceed ``n_samples``, in which
        case every batch is the full data set.
    step_function : callable ``t -> step size``; ``t`` starts at 1.
    convergence_test : callable ``(beta_new, beta, tolerance) -> bool``.
    seed : seed for the row shuffling; ``None`` gives a different order on
        every call.
    verbose : print the coefficients every 1000 iterations.
    tolerance : threshold forwarded to ``convergence_test``.
    max_iter : upper bound on the number of coefficient updates.

    Returns
    -------
    (beta, t, diagnostics) : the last accepted coefficient vector (the
        update that triggered convergence is not applied), the number of
        iterations performed, and a dict of per-iteration lists with the keys
        ``"log_likelihood"``, ``"gradient_magnitude"`` and
        ``"beta_update_magnitude"``.

    Raises
    ------
    ValueError : if ``X`` and ``y`` disagree on the number of samples, if
        ``X`` is empty, or if ``B`` is smaller than 1.
    """
    n_samples = X.shape[0]
    if n_samples != len(y):
        raise ValueError("Mismatched dimensions: X has shape {} but y has length {}".format(X.shape, len(y)))
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if B < 1:
        raise ValueError("Batch size B must be at least 1, got {}".format(B))

    beta = np.zeros(X.shape[1])
    converged = False
    t = 0
    epoch = 0
    # A private generator shuffles reproducibly for a given seed without
    # reseeding NumPy's global random state as a side effect.
    rng = np.random.RandomState(seed)

    diagnostics = {
        "log_likelihood": [],
        "gradient_magnitude": [],
        "beta_update_magnitude": []
    }

    while not converged and t < max_iter:
        epoch += 1
        permuted_indices = rng.permutation(n_samples)

        for i in range(0, n_samples, B):
            if t >= max_iter:
                break
            t += 1
            indices = permuted_indices[i: i + B]
            x_sample = X[indices]
            y_sample = y[indices]

            y_hat = conditional_expectation(x_sample, beta)

            # Scale by the real batch size: the last batch of a sweep is
            # smaller than B whenever B does not divide n_samples, and using
            # B there would under-weight the data term.
            grad = gradient(lamb, beta, n_samples, len(indices), y_sample, x_sample, y_hat)
            step = step_function(t)
            beta_new = beta + step * grad

            # Diagnostics are measured on the current batch with the
            # coefficients from before this update.
            diagnostics["log_likelihood"].append(predictive_log_likelihood(beta, y_sample, x_sample))
            diagnostics["gradient_magnitude"].append(np.linalg.norm(grad))
            diagnostics["beta_update_magnitude"].append(np.linalg.norm(step * grad))

            converged = convergence_test(beta_new, beta, tolerance)
            if converged:
                break

            beta = beta_new

            if verbose and t % 1000 == 0:
                print(f'Epoch {epoch}, Iteration {t}, Beta: {beta}')

    return beta, t, diagnostics

#To predict future values
def predict(x, beta):
    """Return 0/1 class predictions for the rows of ``x``.

    A row is labelled 1 when ``sigmoid(x . beta)`` is at least 0.5 and 0
    otherwise.
    """
    scores = np.dot(x, beta)
    positive_probability = sigmoid(scores)
    return np.where(positive_probability >= 0.5, 1, 0)

#To calculate predictive log likelihood
def predictive_log_likelihood(beta, y, x_out):
    """Return the average Bernoulli log-likelihood of ``y`` under ``beta``.

    For every row the contribution is ``y * log(sigmoid(z))`` plus
    ``(1 - y) * log(sigmoid(-z))`` with ``z = x_out . beta``; the sum is
    divided by ``len(y)``, which must be non-zero.
    """
    m = len(y)
    logits = np.dot(x_out, beta)
    positive_part = y * np.log(sigmoid(logits))
    negative_part = (1 - y) * np.log(sigmoid(-logits))
    total = np.sum(positive_part + negative_part)
    return (1 / m) * total

#To evaluate the model
def evaluate_model(beta, X, y):
    """Score the predictions of ``beta`` on ``X`` against the labels ``y``.

    Returns the tuple ``(accuracy, precision, recall, f1)``.
    """
    predictions = predict(X, beta)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y, predictions) for scorer in scorers)

#To evaluate the model with predictive log likelihood
def evaluate_model_with_ll(beta, X_train, y_train, X_test, y_test):
    """Score ``beta`` on the test set and report both log-likelihoods.

    Returns ``(accuracy, precision, recall, f1, train_ll, test_ll)``. The
    four classification metrics are computed on the test data only; the two
    log-likelihoods are the average predictive log-likelihood on the
    training and test data respectively.
    """
    predictions = predict(X_test, beta)
    classification_scores = [
        scorer(y_test, predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    ]
    log_likelihood_scores = [
        predictive_log_likelihood(beta, y_train, X_train),
        predictive_log_likelihood(beta, y_test, X_test),
    ]
    return tuple(classification_scores + log_likelihood_scores)

We will read the dataset, initialize the vectorizer, and split our data into a training and a testing set. We then run two for-loops to ensure that both dataframes, the in-sample and the out-of-sample one, have the same columns.


In [None]:
#To read the dataset and encode the labels: "positive" becomes 1, "negative" becomes 0
df = pd.read_csv("IMDB Dataset.csv")

# Assign the encoded column back to the frame. Calling an in-place method on
# df['sentiment'] acts on a temporary Series, which leaves df unchanged when
# pandas Copy-on-Write is active.
sentiment_codes = {"positive": 1, "negative": 0}
df['sentiment'] = df['sentiment'].map(sentiment_codes)

# map() turns any other label into NaN; fail here rather than deep inside the
# training code.
if df['sentiment'].isna().any():
    raise ValueError("Unexpected sentiment labels: only 'positive' and 'negative' are supported")
df['sentiment'] = df['sentiment'].astype(int)

In [None]:
#To split the data into training and testing, and to initialize the vectorizer
train_df, test_df = train_test_split(df, random_state=42, test_size=0.1)

# Drop English stop words, terms that appear in more than 90% of the reviews
# and terms that appear in fewer than 10% of them. The vocabulary is learned
# from the training reviews only.
vectorizer = TfidfVectorizer(stop_words="english", min_df=0.1, max_df=0.9)
vectorizer.fit(train_df['review'])

# Both splits are transformed with the same fitted vectorizer.
tfidf_train_df, tfidf_test_df = (
    create_tfidf_dataframe(split, vectorizer) for split in (train_df, test_df)
)

In [None]:
#To ensure that the test and train dataframes have the same columns in the same order
extra_columns = set(tfidf_test_df.columns) - set(tfidf_train_df.columns)
missing_columns = set(tfidf_train_df.columns) - set(tfidf_test_df.columns)

# Sets have no guaranteed iteration order, so add the columns in sorted order
# to keep the layout deterministic.
for column in sorted(extra_columns):
    tfidf_train_df[column] = 0.0

for column in sorted(missing_columns):
    tfidf_test_df[column] = 0.0

# Equal column sets are not enough: the features are later extracted by
# position with .values, so the test frame must follow the train frame's
# column order exactly.
tfidf_test_df = tfidf_test_df[tfidf_train_df.columns]

In [None]:
#To separate the X and y values in train and test
# Apply the same split to both frames and unpack the two (X, y) pairs.
(X_train, y_train), (X_test, y_test) = (
    separate_features_and_target(frame)
    for frame in (tfidf_train_df, tfidf_test_df)
)

We will find the optimal lambda by sweeping lambda over a range of values, fitting beta MAP for each value, computing the predictive log-likelihood of the fitted model, and keeping the lambda with the highest log-likelihood.


In [None]:
#To find the optimal lambda value
# Candidates are scored on a validation split held out from the training
# data. Scoring them on the test set would leak the test data into model
# selection and make the test metrics reported afterwards optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

performances = []
lambda_values = np.logspace(0, 4, 20)
B = 100
for lamb in lambda_values:
    # seed=0 gives every candidate the same shuffling, so the scores differ
    # only because of lambda.
    beta = beta_map_sweep(X_fit, y_fit, lamb, B, ro, convergence_by_norm, seed=0)[0]
    performances.append(predictive_log_likelihood(beta, y_val, X_val))

optimal_lambda = lambda_values[np.argmax(performances)]

In [None]:
# Show the lambda value selected by the search in the previous cell.
print(optimal_lambda)

We run a few tests and plots to understand the influence of the prior, the relation between the number of iterations and the predictive log-likelihood, the most influential words, and the influence of the batch size on convergence.


In [None]:
#To plot the influence of the prior by plotting the predictive log likelihood against lambda, and to collect some metrics to evaluate that influence
lambdas = [0.1, 1, 10, 100, 1000]
log_likelihoods = []

metrics = {
    "lambda": [],
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1": [],
    "train_log_likelihood": [],
    "test_log_likelihood": []
}

for lamb in lambdas:
    beta, _, _ = beta_map_sweep(X_train, y_train, lamb, 100, ro, convergence_by_norm)

    accuracy, precision, recall, f1, train_ll, test_ll = evaluate_model_with_ll(beta, X_train, y_train, X_test, y_test)

    # test_ll already is the predictive log-likelihood on the test set, so it
    # is reused for the plot instead of being computed a second time.
    log_likelihoods.append(test_ll)

    metrics["lambda"].append(lamb)
    metrics["accuracy"].append(accuracy)
    metrics["precision"].append(precision)
    metrics["recall"].append(recall)
    metrics["f1"].append(f1)
    metrics["train_log_likelihood"].append(train_ll)
    metrics["test_log_likelihood"].append(test_ll)


plt.plot(lambdas, log_likelihoods, 'o-')
plt.xscale('log')
# The gradient penalises beta with -beta / lambda**2, i.e. lambda acts as the
# standard deviation of the prior: a larger lambda means weaker, not stronger,
# regularization.
plt.xlabel('Lambda (Prior Standard Deviation)')
plt.ylabel('Predictive Log Likelihood')
plt.title('Influence of the Prior')
plt.show()

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


In [None]:
#To plot the predictive log likelihood against the number of iterations
def beta_map_sweep_with_ll(X, y, lamb, B, step_function, convergence_test, seed=None, verbose=False):
    """Run ``beta_map_sweep`` and return the log-likelihood trajectory.

    Returns ``(beta, log_likelihoods)`` where ``log_likelihoods`` holds one
    value per iteration: the log-likelihood that ``beta_map_sweep`` records
    on the current minibatch before each update.

    Evaluating the final ``beta`` once per iteration instead would repeat a
    single number ``t`` times, giving a flat curve at the cost of ``t``
    passes over the full data set.
    """
    beta, _, diagnostics = beta_map_sweep(X, y, lamb, B, step_function, convergence_test, seed, verbose)
    log_likelihoods = diagnostics["log_likelihood"]
    return beta, log_likelihoods

beta, likelihoods = beta_map_sweep_with_ll(
    X_train, y_train, optimal_lambda, 100, ro, convergence_by_norm
)

# One line plot: x is the position in the returned list, y its value.
fig, ax = plt.subplots()
ax.plot(likelihoods)
ax.set_xlabel('Iteration')
ax.set_ylabel('Predictive Log Likelihood')
ax.set_title('Predictive Log-Likelihood vs Iteration')
plt.show()

In [None]:
#To ensure that convergence occurs without any issues
beta, _, diagnostics = beta_map_sweep(X_train, y_train, optimal_lambda, 100, ro, convergence_by_norm)

# One panel per diagnostic series: (key in diagnostics, line colour, title).
panels = [
    ("log_likelihood", "blue", "Log-Likelihood per Iteration"),
    ("gradient_magnitude", "green", "Gradient Magnitude per Iteration"),
    ("beta_update_magnitude", "red", "Beta Update Magnitude per Iteration"),
]

plt.figure(figsize=(15,5))
for position, (series_key, line_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(diagnostics[series_key], color=line_color)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
#To plot the top influential words and their coefficients
# Only the coefficients are needed here, so fit with beta_map_sweep directly
# instead of going through the wrapper that also builds a likelihood list.
beta, _, _ = beta_map_sweep(X_train, y_train, optimal_lambda, 100, ro, convergence_by_norm)

# Never ask for more bars than there are coefficients.
top_n = min(10, len(beta))
# argsort is ascending, so the last top_n entries have the largest |beta|.
sorted_indices = np.argsort(np.abs(beta))[-top_n:]
feature_names = vectorizer.get_feature_names_out()

plt.barh(range(top_n), beta[sorted_indices], align='center')
plt.yticks(range(top_n), [feature_names[i] for i in sorted_indices])
plt.xlabel('Coefficient Value')
plt.title('Top Influential Words')
plt.show()


In [None]:
#To plot the number of iterations needed to converge against the batch size
batch_sizes = [10, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000]
iterations_to_converge = []

# The loop variable is named batch_size so that it does not overwrite the
# module-level B defined earlier in the notebook.
for batch_size in batch_sizes:
    _, iterations, _ = beta_map_sweep(X_train, y_train, optimal_lambda, batch_size, ro, convergence_by_norm)
    iterations_to_converge.append(iterations)

plt.plot(batch_sizes, iterations_to_converge, marker='o', linestyle='-', color='b')
plt.xlabel('Batch Size')
plt.ylabel('Iterations to Converge')
plt.title('Influence of Batch Size on Convergence')
plt.grid(True)
plt.xscale('log')
plt.show()
