In [0]:
%load_ext autoreload
%autoreload 2

from domain.text_vectorizers import ASCIIVectorizer, BoWVectorizer, BiLSTMVectorizer
import pandas as pd
import numpy as np
import torch

### Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

### 1. Spam Classification

In [0]:
# Read the pre-split spam-classification CSVs; `train` and `test` are the
# DataFrames every cell in this section draws its texts and labels from.
train, test = map(
    pd.read_csv,
    ("data/spam_classify/train.csv", "data/spam_classify/test.csv"),
)

In [0]:
# Column `v2` supplies the texts and column `v1` the labels; both are
# materialised as plain Python lists before being handed to the vectorizers.
train_texts = train["v2"].tolist()
train_labels = train["v1"].tolist()
test_texts = test["v2"].tolist()
test_labels = test["v1"].tolist()

#### ASCII

In [28]:
# Logistic regression on ASCIIVectorizer features, once per character
# aggregation method. The same estimator object is refit on every pass.
model = LogisticRegression(random_state=0, solver='liblinear')

# Metrics in reporting order: accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for method in ("mean", "first"):
    print(f"Agg method: {method}")
    vectorizer = ASCIIVectorizer(max_features=128, char_aggregator=method)

    # transform() hands back a (features, labels) pair for each split;
    # no fit() call is made on this vectorizer.
    X_train, y_train = vectorizer.transform(train_texts, train_labels)
    X_test, y_test = vectorizer.transform(test_texts, test_labels)

    # Train on the training split, then predict on both splits.
    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    # Score the training split first, then the test split.
    train_acc, train_prec, train_rec, train_f1 = [
        score(y_train, y_train_pred) for score in scorers
    ]
    test_acc, test_prec, test_rec, test_f1 = [
        score(y_test, y_test_pred) for score in scorers
    ]

    # One line per split; the trailing "\n" leaves a blank line between runs.
    print(
        f"Training accuracy: {train_acc:.3f}, "
        f"Precison: {train_prec:.3f}, "
        f"Recall: {train_rec:.3f}, "
        f"F1 score: {train_f1:.3f}",
        f"Test accuracy: {test_acc:.3f}, "
        f"Precison: {test_prec:.3f}, "
        f"Recall: {test_rec:.3f}, "
        f"F1 score: {test_f1:.3f}\n",
        sep="\n",
    )

Agg method: mean
Training accuracy: 0.867, Precison: 0.522, Recall: 0.179, F1 score: 0.267
Test accuracy: 0.856, Precison: 0.373, Recall: 0.152, F1 score: 0.216

Agg method: first
Training accuracy: 0.868, Precison: 0.531, Recall: 0.198, F1 score: 0.288
Test accuracy: 0.858, Precison: 0.377, Recall: 0.138, F1 score: 0.202



#### BoW 

In [29]:
# Logistic regression on BoWVectorizer features, sweeping max_features.
# The same estimator object is refit on every pass.
model = LogisticRegression(random_state=0, solver='liblinear')

# Metrics in reporting order: accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for nf in (8, 16, 32, 64, 128, 256, 512, 1024, 2048):
    print(f"Vocab size: {nf}")

    # The vectorizer is fitted on the training texts only.
    vectorizer = BoWVectorizer(max_features=nf)
    vectorizer.fit(train_texts)

    # transform() hands back a (features, labels) pair for each split.
    X_train, y_train = vectorizer.transform(train_texts, train_labels)
    X_test, y_test = vectorizer.transform(test_texts, test_labels)

    # Train on the training split, then predict on both splits.
    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    # Score the training split first, then the test split.
    train_acc, train_prec, train_rec, train_f1 = [
        score(y_train, y_train_pred) for score in scorers
    ]
    test_acc, test_prec, test_rec, test_f1 = [
        score(y_test, y_test_pred) for score in scorers
    ]

    # One line per split; the trailing "\n" leaves a blank line between runs.
    print(
        f"Training accuracy: {train_acc:.3f}, "
        f"Precison: {train_prec:.3f}, "
        f"Recall: {train_rec:.3f}, "
        f"F1 score: {train_f1:.3f}",
        f"Test accuracy: {test_acc:.3f}, "
        f"Precison: {test_prec:.3f}, "
        f"Recall: {test_rec:.3f}, "
        f"F1 score: {test_f1:.3f}\n",
        sep="\n",
    )

Vocab size: 8
Training accuracy: 0.864, Precison: 0.441, Recall: 0.025, F1 score: 0.047
Test accuracy: 0.873, Precison: 1.000, Recall: 0.028, F1 score: 0.054

Vocab size: 16
Training accuracy: 0.897, Precison: 0.738, Recall: 0.374, F1 score: 0.496
Test accuracy: 0.905, Precison: 0.753, Recall: 0.400, F1 score: 0.523

Vocab size: 32
Training accuracy: 0.919, Precison: 0.774, Recall: 0.563, F1 score: 0.652
Test accuracy: 0.922, Precison: 0.796, Recall: 0.538, F1 score: 0.642

Vocab size: 64
Training accuracy: 0.939, Precison: 0.849, Recall: 0.664, F1 score: 0.746
Test accuracy: 0.935, Precison: 0.841, Recall: 0.621, F1 score: 0.714

Vocab size: 128
Training accuracy: 0.969, Precison: 0.946, Recall: 0.816, F1 score: 0.876
Test accuracy: 0.956, Precison: 0.875, Recall: 0.772, F1 score: 0.821

Vocab size: 256
Training accuracy: 0.982, Precison: 0.980, Recall: 0.882, F1 score: 0.928
Test accuracy: 0.973, Precison: 0.946, Recall: 0.841, F1 score: 0.891

Vocab size: 512
Training accuracy: 0.98

#### BiLSTM

In [30]:
# Logistic regression on BiLSTMVectorizer features, sweeping vocab_size.
# The same estimator object is refit on every pass.
# NOTE(review): this cell prints no per-run header, yet the recorded output
# shows "Vocab size : N" lines — presumably emitted by BiLSTMVectorizer
# itself; confirm.
model = LogisticRegression(random_state=0, solver='liblinear')

# Metrics in reporting order: accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for vs in (10, 20, 50, 100, 200, 500, 1000, 2000):
    vectorizer = BiLSTMVectorizer(vocab_size=vs)

    # transform() hands back a (features, labels) pair for each split;
    # no fit() call is made on this vectorizer.
    X_train, y_train = vectorizer.transform(train_texts, train_labels)
    X_test, y_test = vectorizer.transform(test_texts, test_labels)

    # Train on the training split, then predict on both splits.
    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    # Score the training split first, then the test split.
    train_acc, train_prec, train_rec, train_f1 = [
        score(y_train, y_train_pred) for score in scorers
    ]
    test_acc, test_prec, test_rec, test_f1 = [
        score(y_test, y_test_pred) for score in scorers
    ]

    # One line per split; the trailing "\n" leaves a blank line between runs.
    print(
        f"Training accuracy: {train_acc:.3f}, "
        f"Precison: {train_prec:.3f}, "
        f"Recall: {train_rec:.3f}, "
        f"F1 score: {train_f1:.3f}",
        f"Test accuracy: {test_acc:.3f}, "
        f"Precison: {test_prec:.3f}, "
        f"Recall: {test_rec:.3f}, "
        f"F1 score: {test_f1:.3f}\n",
        sep="\n",
    )

Vocab size : 10
Training accuracy: 0.891, Precison: 0.764, Recall: 0.274, F1 score: 0.403
Test accuracy: 0.880, Precison: 0.704, Recall: 0.131, F1 score: 0.221

Vocab size : 20
Training accuracy: 0.921, Precison: 0.838, Recall: 0.517, F1 score: 0.639
Test accuracy: 0.895, Precison: 0.684, Recall: 0.359, F1 score: 0.471

Vocab size : 50
Training accuracy: 0.955, Precison: 0.929, Recall: 0.721, F1 score: 0.812
Test accuracy: 0.936, Precison: 0.863, Recall: 0.607, F1 score: 0.713

Vocab size : 100
Training accuracy: 0.968, Precison: 0.947, Recall: 0.809, F1 score: 0.873
Test accuracy: 0.946, Precison: 0.938, Recall: 0.628, F1 score: 0.752

Vocab size : 200
Training accuracy: 0.977, Precison: 0.970, Recall: 0.859, F1 score: 0.911
Test accuracy: 0.952, Precison: 0.876, Recall: 0.731, F1 score: 0.797

Vocab size : 500
Training accuracy: 0.985, Precison: 0.980, Recall: 0.910, F1 score: 0.944
Test accuracy: 0.963, Precison: 0.941, Recall: 0.766, F1 score: 0.844

Vocab size : 1000
Training accu

### 2. Binary Sentiment Analysis

In [0]:
# Read the pre-split IMDB sentiment CSVs. This rebinds `train` and `test`,
# replacing whatever those names previously referred to.
train, test = map(
    pd.read_csv,
    ("data/sen_imdb/train.csv", "data/sen_imdb/test.csv"),
)

In [0]:
# Column `text` supplies the texts and column `pos` the labels; both are
# materialised as plain Python lists before being handed to the vectorizers.
train_texts = train["text"].tolist()
train_labels = train["pos"].tolist()
test_texts = test["text"].tolist()
test_labels = test["pos"].tolist()

#### ASCII

In [33]:
# Logistic regression on ASCIIVectorizer features, once per character
# aggregation method. The same estimator object is refit on every pass.
model = LogisticRegression(random_state=0, solver='liblinear')

# Metrics in reporting order: accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for method in ("mean", "first"):
    print(f"Agg method: {method}")
    vectorizer = ASCIIVectorizer(max_features=128, char_aggregator=method)

    # transform() hands back a (features, labels) pair for each split;
    # no fit() call is made on this vectorizer.
    X_train, y_train = vectorizer.transform(train_texts, train_labels)
    X_test, y_test = vectorizer.transform(test_texts, test_labels)

    # Train on the training split, then predict on both splits.
    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    # Score the training split first, then the test split.
    train_acc, train_prec, train_rec, train_f1 = [
        score(y_train, y_train_pred) for score in scorers
    ]
    test_acc, test_prec, test_rec, test_f1 = [
        score(y_test, y_test_pred) for score in scorers
    ]

    # One line per split; the trailing "\n" leaves a blank line between runs.
    print(
        f"Training accuracy: {train_acc:.3f}, "
        f"Precison: {train_prec:.3f}, "
        f"Recall: {train_rec:.3f}, "
        f"F1 score: {train_f1:.3f}",
        f"Test accuracy: {test_acc:.3f}, "
        f"Precison: {test_prec:.3f}, "
        f"Recall: {test_rec:.3f}, "
        f"F1 score: {test_f1:.3f}\n",
        sep="\n",
    )

Agg method: mean
Training accuracy: 0.538, Precison: 0.538, Recall: 0.540, F1 score: 0.539
Test accuracy: 0.513, Precison: 0.513, Recall: 0.516, F1 score: 0.514

Agg method: first
Training accuracy: 0.535, Precison: 0.538, Recall: 0.494, F1 score: 0.515
Test accuracy: 0.511, Precison: 0.512, Recall: 0.475, F1 score: 0.493



#### BoW

In [0]:
# Logistic regression on BoWVectorizer features, sweeping max_features.
# The same estimator object is refit on every pass.
model = LogisticRegression(random_state=0, solver='liblinear')

# Metrics in reporting order: accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for nf in (8, 16, 32, 64, 128, 256, 512, 1024, 2048):
    print(f"Vocab size: {nf}")

    # The vectorizer is fitted on the training texts only.
    vectorizer = BoWVectorizer(max_features=nf)
    vectorizer.fit(train_texts)

    # transform() hands back a (features, labels) pair for each split.
    X_train, y_train = vectorizer.transform(train_texts, train_labels)
    X_test, y_test = vectorizer.transform(test_texts, test_labels)

    # Train on the training split, then predict on both splits.
    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    # Score the training split first, then the test split.
    train_acc, train_prec, train_rec, train_f1 = [
        score(y_train, y_train_pred) for score in scorers
    ]
    test_acc, test_prec, test_rec, test_f1 = [
        score(y_test, y_test_pred) for score in scorers
    ]

    # One line per split; the trailing "\n" leaves a blank line between runs.
    print(
        f"Training accuracy: {train_acc:.3f}, "
        f"Precison: {train_prec:.3f}, "
        f"Recall: {train_rec:.3f}, "
        f"F1 score: {train_f1:.3f}",
        f"Test accuracy: {test_acc:.3f}, "
        f"Precison: {test_prec:.3f}, "
        f"Recall: {test_rec:.3f}, "
        f"F1 score: {test_f1:.3f}\n",
        sep="\n",
    )

Vocab size: 8
Training accuracy: 0.581, Precison: 0.586, Recall: 0.554, F1 score: 0.569
Test accuracy: 0.579, Precison: 0.584, Recall: 0.553, F1 score: 0.568

Vocab size: 16
Training accuracy: 0.612, Precison: 0.610, Recall: 0.619, F1 score: 0.615
Test accuracy: 0.612, Precison: 0.610, Recall: 0.619, F1 score: 0.614

Vocab size: 32
Training accuracy: 0.628, Precison: 0.625, Recall: 0.641, F1 score: 0.633
Test accuracy: 0.627, Precison: 0.624, Recall: 0.638, F1 score: 0.631

Vocab size: 64
Training accuracy: 0.681, Precison: 0.673, Recall: 0.704, F1 score: 0.688
Test accuracy: 0.675, Precison: 0.669, Recall: 0.694, F1 score: 0.681

Vocab size: 128
Training accuracy: 0.756, Precison: 0.747, Recall: 0.773, F1 score: 0.760
Test accuracy: 0.746, Precison: 0.739, Recall: 0.761, F1 score: 0.750

Vocab size: 256
Training accuracy: 0.797, Precison: 0.787, Recall: 0.815, F1 score: 0.800
Test accuracy: 0.787, Precison: 0.778, Recall: 0.803, F1 score: 0.790

Vocab size: 512
Training accuracy: 0.84

#### BiLSTM

In [43]:
# Logistic regression on BiLSTMVectorizer features, sweeping vocab_size.
# The same estimator object is refit on every pass.
# NOTE(review): this cell prints no per-run header, yet the recorded output
# shows "Vocab size : N" lines — presumably emitted by BiLSTMVectorizer
# itself; confirm.
model = LogisticRegression(random_state=0, solver='liblinear')

# Value passed as the third positional argument of transform() for both
# splits, for every vocab size.
# NOTE(review): presumably a batch size — confirm against
# BiLSTMVectorizer.transform's signature.
batch_size = 32

# Metrics in reporting order: accuracy, precision, recall, F1.
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for vs in [10, 20, 50, 100, 200, 500, 1000, 2000]:
    vectorizer = BiLSTMVectorizer(vocab_size=vs)

    # transform() hands back a (features, labels) pair for each split;
    # no fit() call is made on this vectorizer.
    X_train, y_train = vectorizer.transform(train_texts, train_labels, batch_size)
    X_test, y_test = vectorizer.transform(test_texts, test_labels, batch_size)

    # Fit the model to training data
    model.fit(X_train, y_train)

    # Make prediction using the trained model
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate: training split first, then the test split.
    train_acc, train_prec, train_rec, train_f1 = [
        score(y_train, y_train_pred) for score in scorers
    ]
    test_acc, test_prec, test_rec, test_f1 = [
        score(y_test, y_test_pred) for score in scorers
    ]

    # One line per split; the trailing "\n" leaves a blank line between runs.
    print(f"Training accuracy: {train_acc:.3f}, "
          f"Precison: {train_prec:.3f}, "
          f"Recall: {train_rec:.3f}, "
          f"F1 score: {train_f1:.3f}")
    print(f"Test accuracy: {test_acc:.3f}, "
          f"Precison: {test_prec:.3f}, "
          f"Recall: {test_rec:.3f}, "
          f"F1 score: {test_f1:.3f}\n")

Vocab size : 10
Training accuracy: 0.648, Precison: 0.649, Recall: 0.647, F1 score: 0.648
Test accuracy: 0.607, Precison: 0.605, Recall: 0.612, F1 score: 0.609

Vocab size : 20
Training accuracy: 0.674, Precison: 0.675, Recall: 0.671, F1 score: 0.673
Test accuracy: 0.623, Precison: 0.623, Recall: 0.622, F1 score: 0.623

Vocab size : 50
Training accuracy: 0.744, Precison: 0.740, Recall: 0.753, F1 score: 0.746
Test accuracy: 0.694, Precison: 0.691, Recall: 0.703, F1 score: 0.697

Vocab size : 100
Training accuracy: 0.774, Precison: 0.769, Recall: 0.781, F1 score: 0.775
Test accuracy: 0.722, Precison: 0.720, Recall: 0.729, F1 score: 0.724

Vocab size : 200
Training accuracy: 0.805, Precison: 0.805, Recall: 0.804, F1 score: 0.805
Test accuracy: 0.765, Precison: 0.767, Recall: 0.761, F1 score: 0.764

Vocab size : 500
Training accuracy: 0.834, Precison: 0.832, Recall: 0.837, F1 score: 0.835
Test accuracy: 0.790, Precison: 0.788, Recall: 0.795, F1 score: 0.791

Vocab size : 1000
Training accu