In [17]:
import math
import pandas as pd
import numpy as np
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression
import nltk
# Download the NLTK resources requested below ('punkt' and 'stopwords');
# as the captured output shows, already-present packages are just reported
# as up-to-date.
nltk.download ('punkt')
nltk.download('stopwords')

# NOTE: this rebinds the name `stopwords` from the imported nltk.corpus
# reader to a plain set of English stop words. filter_stopwords() looks the
# set up under this name, so after this line the corpus reader is no longer
# reachable as `stopwords`.
stopwords = set(stopwords.words("english"))
# Single Porter stemmer instance shared by stem().
ps = PorterStemmer()
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)



def load_data(file_name):
    """
    Read a CSV file and return its 'id', 'text' and 'label' columns.

    :param file_name: path (or file-like object) accepted by pandas.read_csv
    return a 3-tuple (ids, reviews, labels), each a column of the loaded table
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    frame = pd.read_csv(file_name)
    ids = frame["id"]
    texts = frame["text"]
    labels = frame["label"]
    return ids, texts, labels



def load_labels(file_name):
    """
    Read a CSV file and return only its 'label' column.

    :param file_name: path (or file-like object) accepted by pandas.read_csv
    return the 'label' column of the loaded table
    """
    frame = pd.read_csv(file_name)
    return frame["label"]


def tokenize(text):
    """
    Split a document into word tokens with nltk.word_tokenize.

    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    word_list = nltk.word_tokenize(text)
    return word_list

def stem(tokens):
    """
    Apply the module-level stemmer `ps` to every token, preserving order.

    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    stemmed = list()
    for word in tokens:
        stemmed.append(ps.stem(word))
    return stemmed

def n_gram(tokens, n=1):
    """
    Build space-joined n-grams from consecutive tokens.

    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    (for n == 1 the input list itself is returned, not a copy)
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    # Unigrams are just the tokens themselves.
    if n == 1:
        return tokens

    # One window per valid start position; the slice end (start + n) is exclusive.
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]


def filter_stopwords(tokens):
    """
    Drop tokens that are in the module-level `stopwords` set or are numeric.

    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    kept = list()
    for word in tokens:
        # Skip stop words and purely numeric tokens (str.isnumeric()).
        if word in stopwords or word.isnumeric():
            continue
        kept.append(word)
    return kept


def get_onehot_vector(feats, feats_dict):
    """
    Encode the features of one example as a binary presence vector.

    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a float64 numpy vector of length len(feats_dict) whose element at
    feats_dict[f] is 1 for every f in feats that is a key of feats_dict;
    features missing from feats_dict are ignored and repeated features still
    yield 1 (presence, not counts)
    """
    # initialize the vector as all zeros
    # (np.float64 instead of the alias `np.float`, which was removed from NumPy)
    vector = np.zeros(len(feats_dict), dtype=np.float64)
    for f in feats:
        # get the feature index, -1 marks a feature that is not in the dict
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

def to_numerical(train_n_gram, test_n_gram, min_thresh=10):
    """
    Turn per-document n-gram lists into sparse binary feature matrices.

    The vocabulary is built from the training documents only: an n-gram is
    kept when its total count over train_n_gram is strictly greater than
    min_thresh. Each document then becomes one row with a 1 in the column of
    every vocabulary n-gram it contains (presence, not counts).

    Only the arguments are used; the function does not depend on module-level
    variables, so it can be called on any pair of document collections.

    :param train_n_gram: one list of n-gram strings per training doc, type: list
    :param test_n_gram: one list of n-gram strings per test doc, type: list
    :param min_thresh: count an n-gram must exceed to enter the vocabulary, type: int
    return (train_feats_matrix, test_feats_matrix), two float64 COO sparse
    matrices of shape (number of docs, vocabulary size)
    """
    # count every n-gram over the whole training set
    feat_cnter = Counter()
    for feats in train_n_gram:
        feat_cnter.update(feats)

    # keep the n-grams that occur more than min_thresh times and give each
    # one a column index (first-seen order)
    ngram_vocab = [f for f, cnt in feat_cnter.items() if cnt > min_thresh]
    feats_dict = {f: idx for idx, f in enumerate(ngram_vocab)}

    def _to_sparse(docs):
        # Collect the (row, column) coordinates of the non-zero entries so the
        # sparse matrix is built directly, without a dense intermediate.
        rows, cols = [], []
        for row, feats in enumerate(docs):
            # a set removes repeats: COO would otherwise sum duplicate entries
            present = {feats_dict[f] for f in feats if f in feats_dict}
            cols.extend(sorted(present))
            rows.extend([row] * len(present))
        data = np.ones(len(rows), dtype=np.float64)
        coords = (np.array(rows, dtype=np.int64), np.array(cols, dtype=np.int64))
        return coo_matrix((data, coords),
                          shape=(len(docs), len(feats_dict)),
                          dtype=np.float64)

    train_feats_matrix = _to_sparse(train_n_gram)
    test_feats_matrix = _to_sparse(test_n_gram)

    return train_feats_matrix, test_feats_matrix

def classify_lr(train_feats_matrix, test_feats_matrix, train_labels):
    """
    Fit a default LogisticRegression on the training set and predict the test set.

    :param train_feats_matrix: training features; must provide .toarray()
    :param test_feats_matrix: test features; must provide .toarray()
    :param train_labels: training labels; must provide .values
    return the predicted labels for the test set
    """
    model = LogisticRegression()

    # Both matrices are densified before being handed to the classifier.
    dense_train = train_feats_matrix.toarray()
    model.fit(dense_train, train_labels.values)

    dense_test = test_feats_matrix.toarray()
    return model.predict(dense_test)


############ Naive Bayes related############

def normalize(P, smoothing_prior=0):
    """
    Normalize P along axis 0 with additive smoothing.

    Every entry becomes
        (P + smoothing_prior) / (sum of P over axis 0 + smoothing_prior * N)
    where N is P.shape[0].

    :param P: an array of non-normalized values (frequencies)
    :param smoothing_prior: constant added to every entry before normalizing
    return an array with the same shape as P
    """
    numerator = P + smoothing_prior
    denominator = np.sum(P, axis=0, keepdims=True) + smoothing_prior * P.shape[0]
    return numerator / denominator

def compute_prior(data_label, data_matrix):
    """
    Estimate smoothed class priors and build the one-hot label matrix.

    :param data_label: labels of the examples; labels begin with 1
    :param data_matrix: feature matrix, used only for its number of rows
    return (P_y, data_label_onehot_matrix): the class frequencies normalized
    with a smoothing prior of 1, and an (N, K) 0/1 matrix where N is
    data_matrix.shape[0] and K is the largest label
    """
    num_examples = data_matrix.shape[0]
    num_classes = max(data_label)  # labels begin with 1

    # Row i gets a 1 in column (label_i - 1).
    onehot = np.zeros((num_examples, num_classes))
    label_columns = np.asarray(data_label) - 1
    onehot[np.arange(len(label_columns)), label_columns] = 1

    # Number of examples per class.
    class_counts = onehot.sum(axis=0)

    # (use 1 as the smoothing prior)
    return normalize(class_counts, smoothing_prior=1), onehot

def compute_likelihood(data_matrix, data_label_onehot_matrix):
    """
    Estimate smoothed per-class feature likelihoods.

    :param data_matrix: feature matrix with one row per example
    :param data_label_onehot_matrix: one-hot label matrix with one row per example
    return P_xy: transpose(data_matrix) . data_label_onehot_matrix, normalized
    over axis 0 with a smoothing prior of 1
    """
    # (features x examples) . (examples x classes) -> per-class feature totals
    features_by_examples = data_matrix.transpose()
    class_feature_totals = features_by_examples.dot(data_label_onehot_matrix)

    # (use 1 as the smoothing prior)
    return normalize(class_feature_totals, smoothing_prior=1)

def classify_nb(data_matrix, P_y, P_xy):
    """
    Predict labels with Naive Bayes scores computed in log space.

    :param data_matrix: feature matrix with one row per example
    :param P_y: class prior probabilities
    :param P_xy: feature likelihoods, one column per class
    return for every row the 1-based index of the class with the highest
    log P(y) + data_matrix . log P(x|y)
    """
    # Prior as a single row so it is added to every example's scores.
    prior_row = np.log(P_y)[np.newaxis, ...]
    evidence = data_matrix.dot(np.log(P_xy))
    scores = prior_row + evidence

    # argmax is 0-based, labels start at 1
    return np.argmax(scores, axis=1) + 1



# Driver script: load the data, build 3-gram presence features, then evaluate
# a logistic-regression classifier and a Naive Bayes classifier on the test set.

# Input CSV locations (relative paths).
train_file = "data/train.csv"
test_file = "data/test.csv"
ans_file = "data/answer.csv"

# load data
train_ids, train_texts, train_labels = load_data(train_file)
# the 'label' column of the test file is discarded; the labels used for
# scoring are read from the answer file instead
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(ans_file)

# extract features

# tokenization
train_tokens = [tokenize(text) for text in train_texts]
test_tokens = [tokenize(text) for text in test_texts]

# stemming
train_stemmed = [stem(tokens) for tokens in train_tokens]
test_stemmed = [stem(tokens) for tokens in test_tokens]

# stop-word / numeric-token removal, applied to the already-stemmed tokens
# NOTE(review): because filtering runs after stemming, a stop word whose stem
# differs from its list entry would no longer match - confirm this order is
# intended.
train_stemmed = [filter_stopwords(tokens) for tokens in train_stemmed]
test_stemmed = [filter_stopwords(tokens) for tokens in test_stemmed]

# 3-grams over the filtered, stemmed tokens
train_3_gram = [n_gram(tokens, 3) for tokens in train_stemmed]
test_3_gram = [n_gram(tokens, 3) for tokens in test_stemmed]

# to numerical

# min_thresh=2 overrides the default of 10 in to_numerical
train_feats_matrix, test_feats_matrix = to_numerical(train_3_gram, test_3_gram, min_thresh=2)

# 1. Classify using Logistic Regression


test_pred = classify_lr(train_feats_matrix, test_feats_matrix, train_labels)
print("Test Accuracy of LR:", accuracy_score(test_labels.values, test_pred))


# 2. Naive Bayes

# class priors and the one-hot label matrix, both from the training labels
P_y, data_label_onehot_matrix = \
compute_prior(train_labels, train_feats_matrix)
print('P_y:', P_y)


# per-class feature likelihoods estimated from the training set
P_xy = compute_likelihood(train_feats_matrix, data_label_onehot_matrix)

print('P_xy.shape:', P_xy.shape)


# NOTE(review): train_pred is computed but not used anywhere in this cell.
train_pred = classify_nb(train_feats_matrix, P_y, P_xy)

# test_pred is rebound here, replacing the logistic-regression predictions
test_pred = classify_nb(test_feats_matrix, P_y, P_xy)

test_acc= accuracy_score(test_labels.values, test_pred)
print("Test Accuracy of NB:", test_acc)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chinh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chinh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Test Accuracy of LR: 0.46
P_y: [0.08678304 0.09426434 0.12169576 0.29625935 0.40099751]
P_xy.shape: (3258, 5)
Test Accuracy of NB: 0.485
