In [9]:
# README:
# This notebook is used for the tokenization-variation experiment.
# The main notebook, SI630_Assignment1_junqich.ipynb, uses only better_tokenize().
# This notebook, tokenize_variation.ipynb, uses only tokenize()
# and saves the result for Task 3.4, which the main notebook then reads.

In [10]:
from tqdm import tqdm

import numpy as np
import pandas as pd
import re

from collections import Counter
from collections import defaultdict
from scipy import sparse
from scipy.stats import uniform

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# Part 1: Representing Text Data

In [11]:
# Load the training split and preview its first rows; the preview shows the
# columns party_affiliation, email_text and uid_email plus an unnamed index column.
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,party_affiliation,email_text,uid_email
0,Democratic Party,⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱⋱...,3083493a6b205eabd8d0f1e7772db09b
1,Democratic Party,This new report needs your attention now. \n ...,3a583e28c820e1fac8902e4df0ef50e0
2,Democratic Party,<PARTY> <ORG> <ORG> <ORG> \n <> NEWS \n <PERS...,044219f46cca419d1d95242dfe036c15
3,Democratic Party,Chip in today to <GPE> to our virtual grassroo...,68059dd1d93d0cbf456763822d1ab680
4,Democratic Party,I ’m really sorry to bother you but I ’m not o...,9f41a878e2839dc013546e615da83efa


In [12]:
# Load the validation (dev) split and preview its first rows; it has the same
# columns as the training preview (party_affiliation, email_text, uid_email).
dev = pd.read_csv("dev.csv")
dev.head()

Unnamed: 0,party_affiliation,email_text,uid_email
0,Democratic Party,"<> <>, \n <DATE>, <> <PERSON> <PERSON> suspend...",64241625785edfde727dd84c08e5cda2
1,Democratic Party,We simply could not run our campaign without o...,d005af10b61a2565704c237fd506b5e9
2,Democratic Party,I have to give <> <> <PERSON> and the team an ...,75088c6211cca345172d18aab778b93c
3,Democratic Party,"Here ’s your challenge: Can we raise $ 10,000 ...",b56badd20bd35485b7197587333283d1
4,Democratic Party,"This week, I and more than 30 of my colleagues...",f60dc5576465f00970e35e36e57e9f1e


In [13]:
# Load the test split and preview its first rows; the preview shows only
# email_text and uid_email (no party_affiliation label column).
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,email_text,uid_email
0,"20,000 <GPE> <GPE> signatures needed \n 20,000...",5dfbe09ce5b500dd3dcb9f93c8fb185f
1,We 've worked way too hard and given way too m...,07e48f4183b98420a18503791fb412f6
2,If you use your 800%-MATCH \n <> \n <ORG> <OR...,b58c8607d96a414db0e9cc10108c35f5
3,"<PERSON>, a proposal for Vote- by- <> threaten...",3901d9539d69ada89e5c82e2f1ca950d
4,The <ORG> <ORG> <ORG> is under immediate threa...,6a3e8e6f31381e84a34571deee0f1238


## Task 1.1: Tokenization

In [14]:
# Characters removed by better_tokenize(): the ASCII punctuation set plus the em dash.
# The backslash is spelled "\\" so the literal contains no invalid escape sequence
# (a bare "\]" triggers an invalid-escape warning on recent Python versions).
punctuations = '''!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~—'''

# Lower-case English stopwords dropped by better_tokenize().
stopwords = {
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during',
    'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for',
    'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
    'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
    'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don',
    'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
    'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all',
    'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have',
    'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over',
    'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself',
    'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after',
    'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
    'doing', 'it', 'how', 'further', 'was', 'here', 'than',
}

In [15]:
def tokenize(input_str):
    """Split ``input_str`` on runs of whitespace and return the list of tokens.

    No lower-casing, punctuation stripping or stopword removal is applied.
    """
    return input_str.split()

def better_tokenize(input_str):
    """Tokenize ``input_str`` with light normalisation.

    Steps, in order:
      1. turn newlines into spaces,
      2. drop every ``<...>`` placeholder (anonymised entity tags),
      3. delete every character listed in the module-level ``punctuations``,
      4. lower-case and split on whitespace,
      5. drop tokens found in the module-level ``stopwords`` set.

    INPUT: a string
    OUTPUT: a list of lower-case tokens
    """
    # A newline is replaced by a space (not by the empty string) so that two
    # words separated only by a line break are not glued into a single token.
    text = input_str.replace('\n', ' ')
    text = re.sub(r"\<[^<>]*\>", '', text) # remove hidden names inside <>
    # One translate() pass deletes all punctuation characters at once.
    text = text.translate(str.maketrans('', '', punctuations))
    return [tok for tok in text.lower().split() if tok not in stopwords]

## Task 1.2: Building the Term-Document Matrix

In [16]:
min_tf = 10  # minimum corpus-wide frequency a term needs to be kept
all_term_dict = defaultdict(int)  # every term seen in train -> total tf
term_dict = defaultdict(int)  # mapping from kept term to total tf

# accumulate the corpus-wide frequency of every token, document by document
for _, row in train.iterrows():
    for term in tokenize(row["email_text"]):
        all_term_dict[term] += 1

# keep only the terms whose total frequency reaches the minimum
term_dict.update(
    (term, freq) for term, freq in all_term_dict.items() if freq >= min_tf
)

print(f"Number of terms in term_dict is {len(term_dict)}.")

Number of terms in term_dict is 29060.


In [17]:
# set up the COO sparse matrix
# DO NOT USE! TOO SLOW!

# doc_id = np.array([]) # doc dimension D
# term_id = np.array([]) # vocabulary dimension V
# doc_tf = np.array([]) # term df in a doc
# for index, row in tqdm(train.iterrows(), total = train.shape[0]):
#     temp = Counter(tokenize(row["email_text"])) # tf in one doc
#     for term, freq in temp.items():
#         if term in term_id_dict.keys():
#             doc_id = np.append(doc_id, index)
#             term_id = np.append(term_id, term_id_dict[term])
#             doc_tf = np.append(doc_tf, freq)
        
# term_doc_mat = sparse.coo_matrix((doc_tf, (doc_id, term_id)))

In [18]:
# set up the CSR sparse matrix
def CSRMatrixGeneration(docs, vocabulary = None, term_dict = term_dict):
    """Build a document-by-term count matrix in CSR format.

    INPUT: docs is an iterable of token lists, one list per document
    INPUT: vocabulary maps term -> column index; it is extended in place with
           any kept term not seen before. When omitted, a fresh dict is
           created for this call (a literal ``{}`` default would be shared
           between calls and leak column ids from one call into the next).
    INPUT: term_dict is the collection of terms to keep; tokens that are not
           in it are skipped (terms with tf < 10 when the default is used)
    OUTPUT: (csr_mat, vocabulary). csr_mat has one row per document. No shape
            is passed to scipy, so its column count is the largest column
            index that actually occurs plus one, which can be smaller than
            len(vocabulary).
    """
    if vocabulary is None:
        vocabulary = {}

    indptr = [0]   # row pointer: indptr[d]..indptr[d + 1] delimits document d
    indices = []   # column id of every kept token occurrence
    data = []      # one count of 1 per occurrence; duplicates add up in the matrix

    print("Generating CSR sparse matrix...")
    for doc in tqdm(docs):
        for term in doc:
            if term in term_dict: # remove term with tf < 10
                indices.append(vocabulary.setdefault(term, len(vocabulary)))
                data.append(1)
        indptr.append(len(indices))

    csr_mat = sparse.csr_matrix((data, indices, indptr), dtype=int)
    return csr_mat, vocabulary

In [19]:
# generate doc list
docs = []
print("Generating doc list for train...")
for index, row in tqdm(train.iterrows(), total = train.shape[0]):
    docs.append(tokenize(row["email_text"]))

train_mat, train_vocabulary = CSRMatrixGeneration(docs)
# Add the bias column of ones. The row count is read from .shape so the sparse
# matrix is never expanded into a dense (docs x terms) array just to be measured.
train_mat = sparse.hstack([train_mat, np.ones(train_mat.shape[0])[:,None]]) # add the bias column
train_mat = sparse.csr_matrix(train_mat)

Generating doc list for train...


100%|█████████████████████████████████████████████████████████████████████████| 59999/59999 [00:03<00:00, 17408.27it/s]


Generating CSR sparse matrix...


100%|██████████████████████████████████████████████████████████████████████████| 59999/59999 [00:06<00:00, 9300.36it/s]


In [20]:
# Read the shape straight from the sparse matrix; converting it with .toarray()
# first would allocate the full dense array only to look at its dimensions.
print(f"The shape of the constructed term-document matrix is (D, V) = {train_mat.shape}")

The shape of the constructed term-document matrix is (D, V) = (59999, 29061)


# Part 2: Logistic Regression in numpy

In [21]:
def sigmoid(X):
    """Element-wise logistic function 1 / (1 + exp(-X)).

    Input: an np.array (a scalar works as well)
    Output: an np.array of the same shape
    """
    denominator = 1 + np.exp(-X)
    return 1 / denominator

def log_likelihood(X, y, beta):
    """Log-likelihood of a logistic-regression model over all documents.

    Computes sum_i [ y_i * (beta . x_i) - ln(1 + exp(beta . x_i)) ].

    INPUT: X is a (D, V) matrix (scipy sparse or dense), one row per document
    INPUT: y is a 1-d array-like of D labels in {0, 1}
    INPUT: beta is a 1-d np.array of V parameters
    OUTPUT: the scalar log-likelihood
    """
    y = np.asarray(y)
    # One matrix-vector product yields beta . x_i for every document (every
    # row of X), instead of densifying rows one at a time.
    scores = np.asarray(X @ beta).ravel()
    # ln(1 + exp(z)) is evaluated as logaddexp(0, z): natural log, to match the
    # units of the y * z term, and safe against overflow for large z.
    return np.dot(y, scores) - np.logaddexp(0, scores).sum()

def compute_gradient(x, y, beta):
    """Gradient contribution of a single example: (sigmoid(beta . x) - y) * x.

    INPUT: x is a 1-d np.array of length V (one document's features)
    INPUT: y is the label of that document
    INPUT: beta is a 1-d np.array of length V
    OUTPUT: 1-d np.array of length V
    """
    residual = sigmoid(np.dot(beta, x)) - y  # prediction error for this example
    return np.dot(residual, x)

In [22]:
def logistic_regression(X, y, learning_rate = 5e-5, num_step = 1000, is_plot = False):
    """Fit logistic-regression parameters with one-example-per-step gradient updates.

    INPUT: X is a sparse matrix of shape (D, V), one row per document
    INPUT: y is a 1-d np.array of D labels
    INPUT: learning_rate is the step size of each update
    INPUT: num_step is the number of updates; step s uses document s mod D
    INPUT: is_plot, when True, records the log-likelihood every 100 steps and
           plots it against the step number
    OUTPUT: the trained parameter vector beta of length V
    """
    # Rows are documents and columns are features; the two sizes must not be
    # confused, otherwise the cycle below would only visit the first V documents.
    n_docs, n_features = X.shape
    beta = np.zeros(n_features) # init beta
    if is_plot:
        step_list = []
        ll_list = [] # log-likelihood recorded per 100 steps

    print("Starting Logistic Regression to find the parameter vector beta...")
    for step_count in tqdm(range(num_step)):
        doc_id = step_count % n_docs # cycle through every document
        beta = beta - learning_rate * compute_gradient(X[doc_id].toarray().flatten(),
                                                       y[doc_id],
                                                       beta)
        if step_count % 100 == 0:
            if is_plot:
                step_list.append(step_count)
                ll_list.append(log_likelihood(X, y, beta))

            # Optional early stopping (disabled). To enable it, initialise
            # prev_ll = log_likelihood(X, y, beta) before the loop.
#             curr_ll = log_likelihood(X, y, beta)
#             if abs(curr_ll - prev_ll) < 1e-5:
#                 print("The hyperparameter has converged. Early stop.")
#                 break
#             else:
#                 prev_ll = curr_ll

    if is_plot:
        plt.plot(step_list, ll_list)

    return beta

In [23]:
def predict(text, beta, vocab_dict = train_vocabulary):
    """Classify one raw text with a numpy parameter vector.

    The text is tokenized with tokenize(), turned into a term-count vector
    over vocab_dict (terms outside the vocabulary are ignored) with a trailing
    bias entry of 1, and scored with sigmoid(beta . x).

    OUTPUT: 0 when the score is below 0.5, otherwise 1
    """
    x_pred = np.zeros(len(vocab_dict) + 1)

    for term, freq in Counter(tokenize(text)).items():
        if term in vocab_dict:
            x_pred[vocab_dict[term]] = freq
    x_pred[-1] = 1 # bias

    return 0 if sigmoid(np.dot(beta, x_pred)) < 0.5 else 1

## Task 2.1: Plot log-likelihood

In [39]:
# party name -> integer class id
label_dict = {
    'Democratic Party': 0,
    'Republican Party': 1,
}
# integer label of every training email, in row order
y_train = np.array(list(map(label_dict.__getitem__, train["party_affiliation"])))

# beta = logistic_regression(X = train_mat,
#                            y = y_train, 
#                            is_plot = True)

## Task 2.2: Make prediction on validation dataset

In [25]:
# # train the beta
# beta = logistic_regression(X = train_mat,
#                            y = y_train, 
#                            learning_rate = 5e-5, 
#                            num_step = 600000)

In [26]:
# make prediction
# y_test = [label_dict[p] for p in dev["party_affiliation"]]
# y_pred = []
# print("Starting prediction on validation dataset...")
# for i in tqdm(range(len(dev))):
#     y_pred.append(predict(dev["email_text"][i], beta))

In [45]:
# compute f1score
from sklearn.metrics import f1_score
# f1_score(y_test, y_pred)

## Task 2.3: Make prediction on test dataset

In [28]:
# rev_label_dict = {0: 'Democratic Party', 1: 'Republican Party'}

# # make prediction
# y_pred = []
# print("Starting prediction on test dataset...")
# for i in tqdm(range(len(test))):
#     y_pred.append(predict(test["email_text"][i], beta))
    
# result = pd.DataFrame()
# result["uid_email"] = test["uid_email"]
# result["party_affiliation"] = [rev_label_dict[p] for p in y_pred]

In [29]:
# output result
# result.to_csv("part2_result.csv", index=False)

# Part 3: Logistic Regression with PyTorch

In [30]:
# ignore the installation command if torch has been installed
# ! pip3 install torch torchvision torchaudio
import torch
import torch.nn as nn

In [31]:
def to_sparse_tensor(np_sparse_mat):
    """Convert a scipy sparse matrix into a torch sparse COO tensor.

    INPUT: a scipy sparse matrix (anything offering .tocoo())
    OUTPUT: a torch sparse COO tensor of the same shape with float32 values
    """
    coo = np_sparse_mat.tocoo()

    # torch expects the coordinates as a 2 x nnz integer tensor: rows on top, columns below
    index_tensor = torch.LongTensor(np.vstack((coo.row, coo.col)))
    value_tensor = torch.FloatTensor(coo.data)

    return torch.sparse_coo_tensor(index_tensor, value_tensor, torch.Size(coo.shape))

# Convert the training term-document matrix into a torch sparse COO tensor.
train_mat_torch = to_sparse_tensor(train_mat)

In [32]:
sig = nn.Sigmoid()
class LogisticRegression(nn.Module):
    """Logistic regression as a single linear layer followed by a sigmoid.

    input_features: size of each input vector (defaults to the column count
                    of train_mat_torch at the time the class is defined)
    output_features: number of outputs (1 for binary classification)
    bias: whether the linear layer learns an additive bias term
    """
    def __init__(self, input_features = train_mat_torch.shape[1], output_features = 1, bias=True):
        super().__init__()
        # Pass the flag through to nn.Linear so that bias=False really
        # disables the bias term instead of being silently ignored.
        self.layer = nn.Linear(input_features, output_features, bias=bias)

    def forward(self, x):
        """Return sigmoid(W x + b) for the input tensor x."""
        return sig(self.layer(x))

In [33]:
# useless helper function
def GetTensorRow(tensor, i):
    """Return row ``i`` of ``tensor`` as a dense 1-d tensor.

    The row index wraps around: row i modulo the number of rows is selected.
    """
    wrapped_index = torch.tensor([i % len(tensor)])
    selected = tensor.index_select(0, wrapped_index)
    return selected.to_dense()[0]

In [34]:
# dev data preprocessing for prediction
docs = []
print("Generating doc list for dev ...")
for index, row in tqdm(dev.iterrows(), total = dev.shape[0]):
    docs.append(tokenize(row["email_text"]))

dev_mat, dev_vocabulary = CSRMatrixGeneration(docs, vocabulary=train_vocabulary)
# CSRMatrixGeneration passes no shape to scipy, so dev_mat only has as many
# columns as the largest column id that occurs in dev, plus one. Pad it with
# empty columns up to the full training vocabulary width, however many are
# missing, and read the sizes from .shape instead of densifying the matrix.
n_dev_docs, n_dev_cols = dev_mat.shape
n_missing_cols = len(train_vocabulary) - n_dev_cols
if n_missing_cols > 0:
    dev_mat = sparse.hstack([dev_mat, sparse.csr_matrix((n_dev_docs, n_missing_cols))]) # add the missing column(s)
dev_mat = sparse.hstack([dev_mat, np.ones(n_dev_docs)[:,None]]) # add the bias column
dev_mat = sparse.csr_matrix(dev_mat)

dev_mat_torch = to_sparse_tensor(dev_mat)

Generating doc list for dev ...


100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:01<00:00, 19362.66it/s]


Generating CSR sparse matrix...


100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [00:02<00:00, 8866.85it/s]


In [36]:
# given a model, predict on the new text
def predict(text, model, vocab_dict = train_vocabulary):
    """Classify one raw text with a torch model.

    The text is tokenized with tokenize(), turned into a term-count vector
    over vocab_dict (terms outside the vocabulary are ignored) with a trailing
    bias entry of 1, and passed to ``model`` as a float32 tensor.

    NOTE: this definition replaces the earlier numpy predict(text, beta, ...)
    defined in the same notebook.

    OUTPUT: 0 when the model output is below 0.5, otherwise 1
    """
    x_pred = np.zeros(len(vocab_dict) + 1)

    for term, freq in Counter(tokenize(text)).items():
        if term in vocab_dict:
            x_pred[vocab_dict[term]] = freq
    x_pred[-1] = 1 # bias

    # Pure inference: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        y_pred = model(torch.tensor(x_pred, dtype=torch.float32))
    return 0 if y_pred < 0.5 else 1

# given a model, predict on the tensor of new text
def better_predict(x_tensor, model):
    """Classify an already-vectorised example.

    INPUT: x_tensor is the feature tensor handed to ``model`` unchanged
    INPUT: model is a callable returning a single score
    OUTPUT: 0 when the score is below 0.5, otherwise 1
    """
    # Pure inference: run the forward pass without recording an autograd graph.
    with torch.no_grad():
        y_pred = model(x_tensor)
    return 0 if y_pred < 0.5 else 1

In [37]:
# make prediction and get f1 score
# The dev tensor is converted to dense once here; better_GetF1Score() below
# reads rows from this global instead of converting on every call.
dev_mat_torch_dense = dev_mat_torch.to_dense() # kept as a global so repeated evaluation runs faster
def GetF1Score(model, dev = dev):
    """F1 score of ``model`` on ``dev``, predicting from the raw email text.

    Each email is vectorised again by predict(), so this is the slow variant.
    """
    y_test = [label_dict[party] for party in dev["party_affiliation"]]
    y_pred = [predict(dev["email_text"][i], model) for i in tqdm(range(len(dev)))]
    return f1_score(y_test, y_pred) # compute f1score

def better_GetF1Score(model, dev = dev):
    """F1 score of ``model`` on the pre-vectorised dev set.

    Labels are taken from ``dev``; features are taken from the global
    dev_mat_torch_dense (one row per entry of the global dev_mat_torch),
    so both globals must exist before this is called.
    """
    y_test = [label_dict[party] for party in dev["party_affiliation"]]
    y_pred = [
        better_predict(dev_mat_torch_dense[row], model)
        for row in range(len(dev_mat_torch))
    ]
    return f1_score(y_test, y_pred) # compute f1score

In [40]:
# The training tensor is converted to dense once here; it is the default X of
# TrainModel() below, which indexes it one row per training step.
train_mat_torch_dense = train_mat_torch.to_dense()  # kept as a global so training runs faster

def TrainModel(X = train_mat_torch_dense, y = y_train, num_epoch = 1, num_step = len(train_mat_torch), \
               opt_choice = 1, learning_rate = 5e-5, l2penalty = 0, \
               isloss = False, isf1score = False):
    """Train a LogisticRegression model one example at a time.

    INPUT: X is a 2-d tensor of features, indexed by row (one example per row)
    INPUT: y is a list or np.array of labels aligned with the rows of X
    INPUT: num_epoch is the number of passes; each pass runs num_step updates
           on rows 0 .. num_step - 1
    INPUT: opt_choice selects the optimizer: 1 = SGD, 2 = RMSprop, 3 = AdamW
    INPUT: learning_rate / l2penalty are passed to the optimizer as lr / weight_decay
    INPUT: isloss, when True, records the loss summed over each 1000-step window
    INPUT: isf1score, when True, records better_GetF1Score(model) every 1000 steps
    OUTPUT: (model, step_list, loss_list, f1score_list) if isloss and isf1score,
            (model, step_list, loss_list) if only isloss,
            otherwise just the model (also when only isf1score is set)
    RAISES: ValueError for an unknown opt_choice
    """
    optimizer_classes = {
        1: torch.optim.SGD, # default optimizer
        2: torch.optim.RMSprop,
        3: torch.optim.AdamW,
    }
    # Fail early with a clear message instead of an unbound `optimizer` later on.
    if opt_choice not in optimizer_classes:
        raise ValueError(f"opt_choice must be 1, 2 or 3, got {opt_choice!r}")

    # Size the model from the data actually passed in (same as the class
    # default when X is the default training matrix).
    model = LogisticRegression(input_features = X.shape[1])
    criterion = nn.BCELoss() # loss function
    optimizer = optimizer_classes[opt_choice](model.parameters(), lr=learning_rate, weight_decay=l2penalty)

    # for plotting
    step_list = []
    loss_list = []
    f1score_list = []

    for epoch in range(num_epoch):  # loop over the dataset multiple times
        running_loss = 0.0
        print(f"Starting training in epoch {epoch + 1}...")
        for i in tqdm(range(num_step)):
            # get the inputs and label; the label comes from the `y` argument,
            # not from the global y_train, so custom labels are honoured
            inputs = X[i]
            labels = torch.tensor(y[i], dtype=torch.float32).reshape(1)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # record statistics once per 1000 steps
            running_loss += loss.item()
            if (i + 1) % 1000 == 0:
                step_list.append(epoch * num_step + i)
                if isloss:
                    loss_list.append(running_loss)
                if isf1score:
                    f1score_list.append(better_GetF1Score(model))

                running_loss = 0.0

    print('Finished Training')

    if isloss and isf1score:
        return model, step_list, loss_list, f1score_list
    elif isloss:
        return model, step_list, loss_list
    else:
        return model

## Task 3.1: Compute loss and F1score for 5 epochs

## Task 3.2: Compute loss and F1score for L2 penalty variations

## Task 3.3: Compute loss and F1score for optimizer variations

## Task 3.4: Compute loss and F1score for tokenization variations

In [46]:
# Train one epoch on the tokenize()-based features, recording the loss and the
# dev F1 score every 1000 steps.
# NOTE(review): this comment used to point to "tokenize.ipynb", while the README
# cell calls this notebook tokenize_variation.ipynb; confirm which name is current.
tokenize_model1, step_list1, loss_list1, f1score_list1 = TrainModel(isloss=True, isf1score=True)

Starting training in epoch 1...


100%|██████████████████████████████████████████████████████████████████████████| 59999/59999 [00:59<00:00, 1009.40it/s]

Finished Training





In [49]:
# collect the recorded curves into one table and write it out for the main file
tokenize_data_df1 = pd.DataFrame(
    dict(
        step=step_list1,
        loss=loss_list1,
        f1score=f1score_list1,
        label="Worse",
    )
)

tokenize_data_df1.to_csv("tokenize_data.csv", index=False)

## Task 3.5: Compute loss and F1score for learning rate variations