# Import libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [3]:
# Config "default", "dictionary", "ptb"
default_dataset = load_dataset("sst", "default")
# dictionary_dataset = load_dataset("sst", "dictionary")
# ptb_dataset = load_dataset("sst", "ptb")

In [4]:
print("Default:",default_dataset)
# print("Dictionary:",dictionary_dataset)
# print("PTB:",ptb_dataset)

Default: DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})


## Check dataset

### Default format

In [5]:
train_def = default_dataset["train"].to_pandas()
test_def = default_dataset["test"].to_pandas()
val_def = default_dataset["validation"].to_pandas()
print(train_def.head())
print(test_def.head())
print(val_def.head())

                                            sentence    label  \
0  The Rock is destined to be the 21st Century 's...  0.69444   
1  The gorgeously elaborate continuation of `` Th...  0.83333   
2  Singer\/composer Bryan Adams contributes a sle...  0.62500   
3  You 'd think by now America would have had eno...  0.50000   
4               Yet the act is still charming here .  0.72222   

                                              tokens  \
0  The|Rock|is|destined|to|be|the|21st|Century|'s...   
1  The|gorgeously|elaborate|continuation|of|``|Th...   
2  Singer\/composer|Bryan|Adams|contributes|a|sle...   
3  You|'d|think|by|now|America|would|have|had|eno...   
4               Yet|the|act|is|still|charming|here|.   

                                                tree  
0  70|70|68|67|63|62|61|60|58|58|57|56|56|64|65|5...  
1  71|70|69|69|67|67|66|64|63|62|62|61|61|58|57|5...  
2  72|71|71|70|68|68|67|67|66|63|62|62|60|60|58|5...  
3  36|35|34|33|33|32|30|29|27|26|25|24|23|23|22|2...

### Dictionary format

In [6]:
# dic = dictionary_dataset["dictionary"].to_pandas()
# print(dic.head())

### PTB format

In [7]:
# train_ptb = ptb_dataset["train"].to_pandas()
# test_ptb = ptb_dataset["test"].to_pandas()
# val_ptb = ptb_dataset["validation"].to_pandas()
# print(train_ptb.head())
# print(test_ptb.head())
# print(val_ptb.head())

## We will use the default config

In [8]:
X_train_def = train_def["sentence"]
y_train_def = train_def["label"]
X_test_def = test_def["sentence"]
y_test_def = test_def["label"]
X_val_def = val_def["sentence"]
y_val_def = val_def["label"]
print(X_train_def.head())
print(y_train_def.head())
print(X_test_def.head())
print(y_test_def.head())
print(X_val_def.head())
print(y_val_def.head())

0    The Rock is destined to be the 21st Century 's...
1    The gorgeously elaborate continuation of `` Th...
2    Singer\/composer Bryan Adams contributes a sle...
3    You 'd think by now America would have had eno...
4                 Yet the act is still charming here .
Name: sentence, dtype: object
0    0.69444
1    0.83333
2    0.62500
3    0.50000
4    0.72222
Name: label, dtype: float32
0                       Effective but too-tepid biopic
1    If you sometimes like to go to the movies to h...
2    Emerges as something rare , an issue movie tha...
3    The film provides some great insight into the ...
4    Offers that rare combination of entertainment ...
Name: sentence, dtype: object
0    0.51389
1    0.73611
2    0.86111
3    0.59722
4    0.83333
Name: label, dtype: float32
0    It 's a lovely film with lovely performances b...
1    No one goes unindicted here , which is probabl...
2    And if you 're not nearly moved to tears by a ...
3                     A warm , funny , 

# Preprocessing

### Map function

In [9]:
# Map the scores to sentiment classes
def map_sentiment(score):
    if score <= 0.2:
        return 0
    elif score <= 0.4:
        return 1
    elif score <= 0.6:
        return 2
    elif score <= 0.8:
        return 3
    else:
        return 4

In [10]:
# Map the scores to sentiment classes
y_train = y_train_def.apply(map_sentiment)
y_test = y_test_def.apply(map_sentiment)
y_val = y_val_def.apply(map_sentiment)
print(y_train.unique())
print(y_test.unique())
print(y_val.unique())

[3 4 2 1 0]
[2 3 4 1 0]
[3 2 4 0 1]


# Models

## Naive Bayes

### Feature extraction

In [15]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X_train = vectorizer.fit_transform(X_train_def).astype(int)
X_test = vectorizer.transform(X_test_def).astype(int)
X_val = vectorizer.transform(X_val_def).astype(int)
print(vectorizer.get_feature_names_out())
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

79268
8544
(2210, 79268)
(1101, 79268)


### Implementation from scratch

In [59]:
import numpy as np

class NaiveBayes:
    def __init__(self, smoothing_factor=1):
        self.classes = None
        self.class_priors = None
        self.feature_log_prob_ = None  # Changed name to match scikit-learn's convention
        self.smoothing_factor = smoothing_factor
        self.vocab = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.class_priors = self.calculate_class_priors(y)
        self.feature_log_prob_ = self.calculate_feature_log_prob(X, y)  # Changed method name

    def calculate_class_priors(self, y):
        class_counts = np.bincount(y)
        total_samples = len(y)
        class_priors = class_counts / total_samples
        return class_priors

    def calculate_feature_log_prob(self, X, y):  # Changed method name
        num_features = X.shape[1]
        feature_log_prob_ = []  

        for c in self.classes:
            X_c = X[y == c]
            log_probabilities = []

            for feature in range(num_features):
                feature_values = X_c[:, feature]
                if feature_values.getnnz() != 0:
                    self.vocab = vectorizer.get_feature_names_out()
                    print(feature_values)
                feature_counts = feature_values.getnnz(axis=0)
                feature_probabilities = (feature_counts + self.smoothing_factor) / (X_c.shape[0] + self.smoothing_factor * num_features)
                feature_log_probabilities = np.log(feature_probabilities)
                log_probabilities.append(feature_log_probabilities)

            feature_log_prob_.append(log_probabilities)

        return np.array(feature_log_prob_).transpose()


    def predict(self, X):
        predictions = []

        for j in range(len(self.classes)):
            class_prior = self.class_priors[j]
            class_scores = []

            for i in range(X.shape[0]):  # Iterate over the number of rows
                sample = X[i]
                log_likelihood = 0

                if sample in self.vocab:
                    for feature in range(len(sample)):
                        if sample[feature] in self.vocab:
                            log_likelihood += self.feature_log_prob_[j][feature]

                    class_score = np.log(class_prior) + log_likelihood
                    class_scores.append(class_score)
                    # Sample exists in the vocabulary
                    print("Sample exists in the vocabulary")
                else:
                    continue

            predicted_class = self.classes[np.argmax(class_scores)]
            predictions.append(predicted_class)

        return predictions


    def log_prior(self):
        return np.log(self.class_priors)

    def log_likelihood(self):
        return self.feature_log_prob_  # No need to modify this, as it already returns log probabilities


In [60]:
naive_bayes = NaiveBayes()
naive_bayes.fit(X_train, y_train)
nb1 = naive_bayes

  (13, 0)	1
  (247, 0)	1
  (310, 0)	1
  (498, 0)	1
  (841, 0)	1
  (662, 0)	1
  (181, 0)	1
  (701, 0)	1
  (848, 0)	1
  (360, 0)	1
  (292, 0)	1
  (74, 0)	1
  (0, 0)	1
  (183, 0)	1
  (665, 0)	1
  (263, 0)	1
  (739, 0)	1
  (898, 0)	1
  (804, 0)	1
  (432, 0)	1
  (783, 0)	1
  (102, 0)	1
  (83, 0)	1
  (560, 0)	1
  (360, 0)	1
  (770, 0)	1
  (247, 0)	1
  (169, 0)	1
  (431, 0)	1
  (247, 0)	1
  (169, 0)	1
  (67, 0)	1
  (918, 0)	1
  (432, 0)	1
  (70, 0)	1
  (1038, 0)	1
  (651, 0)	1
  (43, 0)	1
  (675, 0)	1
  (1000, 0)	1
  (870, 0)	1
  (167, 0)	1
  (367, 0)	1
  (84, 0)	1
  (235, 0)	1
  (84, 0)	1
  (422, 0)	1
  (10, 0)	1
  (560, 0)	1
  (879, 0)	1
  (211, 0)	1
  (956, 0)	1
  (970, 0)	1
  (73, 0)	1
  (227, 0)	1
  (46, 0)	1
  (460, 0)	1
  (194, 0)	1
  (818, 0)	1
  (633, 0)	1
  (760, 0)	1
  (798, 0)	1
  (798, 0)	1
  (507, 0)	1
  (868, 0)	1
  (507, 0)	1
  (202, 0)	1


KeyboardInterrupt: 

In [None]:
nb1_predictions = nb1.predict(X_test)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2210,) + inhomogeneous part.

### Implementation using scikit learn

In [None]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
nb2 = naive_bayes

In [None]:
print(nb2.class_log_prior_)
print(nb1.log_prior())
print(nb2.feature_log_prob_)
print(nb1.log_likelihood()[0].transpose())

[-2.0572184  -1.34862339 -1.66033704 -1.3028004  -1.89213865]
[-2.0572184  -1.34862339 -1.66033704 -1.3028004  -1.89213865]
[[-11.47047715 -11.47047715 -11.47047715 ... -11.47047715 -11.47047715
  -10.77732997]
 [-11.63595693 -10.94280975 -10.94280975 ... -11.63595693 -11.63595693
  -11.63595693]
 [-10.8486961  -11.54184329 -11.54184329 ... -10.8486961  -11.54184329
  -11.54184329]
 [-11.65872134 -11.65872134 -11.65872134 ... -11.65872134 -11.65872134
  -11.65872134]
 [-11.49646066 -11.49646066 -11.49646066 ... -11.49646066 -10.80331348
  -11.49646066]]
[[-11.29427182 -11.29427182 -11.29427182 ... -11.29427182 -11.29427182
  -10.60112464]
 [-11.30818651 -10.61503932 -10.61503932 ... -11.30818651 -11.30818651
  -11.30818651]
 [-10.60772303 -11.30087021 -11.30087021 ... -10.60772303 -11.30087021
  -11.30087021]
 [-11.30946198 -11.30946198 -11.30946198 ... -11.30946198 -11.30946198
  -11.30946198]
 [-11.29670787 -11.29670787 -11.29670787 ... -11.29670787 -10.60356069
  -11.29670787]]


Same logs so both models work the same

## Logistic Regression

### Implement Logistic Regression from scratch

In [265]:
class LogisticRegressionClassifier:
    def __init__(self, learning_rate=0.01, num_iterations=100):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def initialize_parameters(self, num_features):
        self.weights = np.zeros(num_features)
        self.bias = 0
    
    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.initialize_parameters(num_features)
        
        for _ in range(self.num_iterations):
            linear_model = X.dot(self.weights.T) + self.bias
            y_pred = self.sigmoid(linear_model)
            
            dw = (1 / num_samples) * X.T.dot((y_pred - y))
            db = (1 / num_samples) * np.sum(y_pred - y)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
    
    def predict(self, X):
        linear_model = X.dot(self.weights.T) + self.bias
        y_pred = self.sigmoid(linear_model)
        y_pred_class = np.where(y_pred > 0.5, 1, 0)
        return y_pred_class

In [266]:
(type(X_train))

scipy.sparse._csr.csr_matrix

In [267]:
lr_classifier = LogisticRegressionClassifier()
lr_classifier.fit(X_train, y_train)
lr1 = lr_classifier

### Implement Logistic Regression from scikit learn

In [268]:
lr_classifier = LogisticRegression()
lr2 = lr_classifier.fit(X_train, y_train)

In [269]:
lr_classifier = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(max_iter=100, tol=1e-3, shuffle=True, random_state=42))
lr3 = lr_classifier.fit(X_train, y_train)

# Confusion Matrix &Evaluation Metrics

### Confusion Matrix using scikit learn

In [270]:
def generate_confusion_matrix1(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    labels = np.unique(np.concatenate((y_true, y_pred)))
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    return cm_df

### Confusion Matrix from scratch

In [271]:
def generate_confusion_matrix2(y_true, y_pred):
    labels = np.unique(np.concatenate((y_true, y_pred)))
    num_labels = len(labels)
    cm = np.zeros((num_labels, num_labels), dtype=int)
    
    for i in range(len(y_true)):
        true_label = np.where(labels == y_true[i])[0][0]
        pred_label = np.where(labels == y_pred[i])[0][0]
        cm[true_label][pred_label] += 1
    
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    return cm_df

### Metrics from scratch

In [272]:
def compute_metrics(confusion_matrix):
    # Get the unique labels
    labels = confusion_matrix.index.tolist()

    # Initialize lists to store precision, recall, and F1 score per class
    precision_per_class = []
    recall_per_class = []
    f1_score_per_class = []

    # Compute precision, recall, and F1 score per class
    for label in labels:
        true_positives = confusion_matrix.loc[label, label]
        false_positives = confusion_matrix.loc[:, label].sum() - true_positives
        false_negatives = confusion_matrix.loc[label, :].sum() - true_positives

        precision = true_positives / (true_positives + false_positives)
        recall = true_positives / (true_positives + false_negatives)
        f1 = 2 * (precision * recall) / (precision + recall)

        precision_per_class.append(precision)
        recall_per_class.append(recall)
        f1_score_per_class.append(f1)

    # Compute macro-averaged precision, recall, and F1 score
    macro_precision = sum(precision_per_class) / len(precision_per_class)
    macro_recall = sum(recall_per_class) / len(recall_per_class)
    macro_f1_score = sum(f1_score_per_class) / len(f1_score_per_class)

    return {
        'precision_per_class': precision_per_class,
        'recall_per_class': recall_per_class,
        'f1_score_per_class': f1_score_per_class,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1_score': macro_f1_score
    }

In [301]:
model = nb1 # Change to nb2, lr1, or lr2 to generate confusion matrix for other models
y_pred = model.predict(X_test)

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

Confusion matrix using scikit learn

In [None]:
cm1 = generate_confusion_matrix1(y_test, y_pred)
# Plot the confusion matrix
sns.heatmap(cm1, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

Confusion matrix from scratch

In [None]:
cm2 = generate_confusion_matrix2(y_test, y_pred)
# Plot the confusion matrix
sns.heatmap(cm2, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
Metrics = compute_metrics(cm2)
precision_scratch, recall_scratch, f1_scratch = Metrics['precision_per_class'], Metrics['recall_per_class'], Metrics['f1_score_per_class']
# Calculate precision, recall, and F1 score using sklearn functions
precision_sklearn = precision_score(y_test, y_pred, average="macro")
recall_sklearn = recall_score(y_test, y_pred, average="macro")
f1_sklearn = f1_score(y_test, y_pred, average="macro")

# Compare the metrics
print("Precision (from scratch):", precision_scratch)
print("Precision (sklearn):", precision_sklearn)
print("Recall (from scratch):", recall_scratch)
print("Recall (sklearn):", recall_sklearn)
print("F1 Score (from scratch):", f1_scratch)
print("F1 Score (sklearn):", f1_sklearn)
