# This project is contributed by Yanfeng Guo (UID:806073779),  Garvit Pugalia (UID: 504628127), Hyosang Ahn (UID: 606073544).

In [None]:
# %load main.py
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet, stopwords
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn import svm
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score, roc_curve, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
import string
import umap.umap_ as umap
import umap.plot

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
np.random.seed(42)  # make the training and testing samples the same as other groups
random.seed(42)
#################### Question 1 ########################
# Load the dataset and plot three histograms: text length, leaf labels, root labels.
# the diagrams are missing
dataset = pd.read_csv('Project1-Classification.csv')
print(dataset.shape)
# DataFrame.info() writes its summary to stdout itself; the previous
# `print(dataset.info)` printed the bound method object instead of calling it.
dataset.info()
# the total number of alphanumeric characters per data point (row) in the feature full text
# (r"\w" also counts underscores)
num = [len(re.findall(r"\w", text)) for text in dataset['full_text']]
plt.hist(num, bins=200)
plt.title('The total number of alphanumeric characters per data point')
plt.xlabel('Count')
plt.ylabel('Frequency')
plt.show()

# The column leaf label – class on the x-axis
plt.hist(dataset['leaf_label'], bins=9, rwidth=0.3)
plt.xticks(fontsize=8)
plt.xlabel('Leaf label category')
plt.ylabel('Frequency')
plt.show()

# The column root label – class on the x-axis
plt.hist(dataset['root_label'])
plt.xlabel('Root label category')
plt.ylabel('Frequency')
plt.show()

In [None]:
#################### Question 2 ########################
# Hold out 20% of the rows as the test set, keeping only the text, root label and keywords columns.
# No random_state is passed here; reproducibility presumably relies on the np.random.seed(42)
# call made before this cell — confirm if cells are re-run out of order.
train, test = train_test_split(dataset[["full_text","root_label","keywords"]], test_size=0.2)
print('train.shape:', train.shape)
print('test.shape:', test.shape)

In [None]:
#################### Question 3 ########################
def clean(text):
    """Strip URLs, HTML residue, digits and non-ASCII characters from ``text``.

    The substitutions are applied in a fixed order:
    lines starting with a URL are dropped, ``<br />`` and newlines become
    spaces, a few HTML entities are decoded, runs of spaces / ``!`` / ``?``
    are collapsed, digits are removed, non-ASCII characters are dropped and
    finally any remaining ``<...>`` tags are deleted.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Drop every line that begins with a URL, together with its line break(s).
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"", texter)
    # &#39; is the HTML entity for an apostrophe; it used to be turned into a
    # double quote, which corrupted contractions such as "it's".
    texter = re.sub('&#39;', "'", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ', texter)
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    texter = re.sub(r'\d', '', texter)  # by gyf, remove numbers
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    # Remove whatever HTML tags are left (non-greedy so each tag is matched separately).
    texter = re.sub(r'<.*?>', '', texter)
    return texter


# train_clean_text = train['full_text'].apply(lambda x:clean(x))  # clean the text of each data point
# NOTE: applymap runs clean() on every cell, so the root_label and keywords
# columns are cleaned as well as full_text.
train = train.applymap(clean)  # clean the data of each data point in the training set
test = test.applymap(clean)  # clean the data of each data point in the testing set
train_clean_text = train['full_text']
test_clean_text = test['full_text']
print(train_clean_text)
print('train_clean_text:', train_clean_text.shape)


def get_pos(tag):
    """Translate a ``pos_tag`` tag string into a WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected: ``N`` -> noun,
    ``V`` -> verb, ``J`` -> adjective, ``R`` -> adverb. Anything else
    (including an empty tag) falls back to noun.
    """
    first_letter_to_pos = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)


def lemmatization(text):
    """Return ``text`` with every token lemmatized and lower-cased.

    The text is tokenized, each token is POS-tagged, the tag is mapped to a
    WordNet part of speech via ``get_pos`` and the token is lemmatized with
    that part of speech. Lower-casing happens after lemmatization. The
    resulting tokens are joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, get_pos(pos_label)).lower()
        for token, pos_label in tagged_tokens
    )


# Lemmatize the cleaned text of both splits.
train_lemmatized_text = train_clean_text.apply(lambda x:lemmatization(x))
print(train_lemmatized_text)
test_lemmatized_text = test_clean_text.apply(lambda x:lemmatization(x))

# Bag-of-words counts: English stop words removed, terms in fewer than 3 training documents dropped.
cv = CountVectorizer(stop_words='english', min_df=3)
train_count = cv.fit_transform(train_lemmatized_text)  # use fit or fit_transform on the training set
test_count = cv.transform(test_lemmatized_text)  # use transform on the testing set
print(train_count)
print(train_count.toarray())
print(train_count.toarray().shape)

# TF-IDF weighting, fitted on the training counts only; both results are densified with toarray().
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_count)
test_tfidf = tfidf.transform(test_count)
train_tfidf_mat = train_tfidf.toarray()
print(train_tfidf_mat.shape)
test_tfidf_mat = test_tfidf.toarray()
print(test_tfidf_mat.shape)

# Binary ground truth for the root label: 'sports' -> 1, 'climate' -> 0.
# NOTE(review): a row with any other root_label appends nothing, which would leave
# the label list shorter than the feature matrix — confirm only these two values occur.
label_train_gt = []
for i in train['root_label']:
    if i == 'sports':
        label_train_gt.append(1)
    elif i == 'climate':
        label_train_gt.append(0)

# Same encoding for the test split.
label_test_gt = []
for i in test['root_label']:
    if i == 'sports':
        label_test_gt.append(1)
    elif i == 'climate':
        label_test_gt.append(0)

In [None]:
#################### Question 4 ########################
# Explained-variance ratio of LSI (TruncatedSVD) for an increasing number of components k.
k = [1, 10, 50, 100, 200, 500, 1000, 2000]
ratio = []
for i in k:
    lsi = TruncatedSVD(i, random_state=0)
    train_lsi_mat = lsi.fit_transform(train_tfidf_mat)  # fit model and perform dimensionality reduction
    ratio.append(np.sum(lsi.explained_variance_ratio_))
    print(np.sum(lsi.explained_variance_ratio_))
plt.plot(k, ratio)
plt.show()

# Refit LSI with 50 components; this overwrites the `lsi` left over from the loop above.
lsi = TruncatedSVD(50, random_state=0)  # random_state should not be changed, or the training data will change
train_lsi_mat_50 = lsi.fit_transform(train_tfidf_mat)  # train_lsi_mat_50 = U_k @ Sigma_k;
# train_lsi_mat_50: act as the training data of following sections
VT = lsi.components_
# print(train_tfidf_mat - train_lsi_mat_50 @ VT)
lsi_fn = np.linalg.norm(train_tfidf_mat - train_lsi_mat_50 @ VT, 'fro')
# The value printed as "MSE" is the squared Frobenius norm of the reconstruction
# residual (a sum of squared errors; it is not divided by the number of entries).
print('LSI MSE:', lsi_fn * lsi_fn)

# Same reconstruction error for a 50-component NMF factorization W @ H.
nmf = NMF(n_components=50, random_state=0)
train_nmf_mat_50 = nmf.fit_transform(train_tfidf_mat)  # train_nmf_mat_50 = W
H = nmf.components_
nmf_fn = np.linalg.norm(train_tfidf_mat - train_nmf_mat_50 @ H, 'fro')
print('NMF MSE:', nmf_fn * nmf_fn)

# Project the test set with the models fitted on the training set.
test_lsi_mat_50 = lsi.transform(test_tfidf_mat)
test_nmf_mat_50 = nmf.transform(test_tfidf_mat)

In [None]:
#################### Question 5 ########################
# Three linear SVMs that differ only in the penalty C (called "gamma" in the trailing comments):
# a small C gives a soft margin, larger C values give progressively harder margins.
svm_soft_margin = svm.SVC(C=0.0001, kernel='linear', random_state=42)  # gamma=0.0001
svm_hard_margin = svm.SVC(C=1000, kernel='linear', random_state=42)  # gamma=1000
svm_harder_margin = svm.SVC(C=100000, kernel='linear', random_state=42)  # gamma=100000

# Train on the 50-dimensional LSI features and predict the test split.
svm_soft_margin.fit(train_lsi_mat_50, label_train_gt)
svm_hard_margin.fit(train_lsi_mat_50, label_train_gt)
svm_harder_margin.fit(train_lsi_mat_50, label_train_gt)
svm_soft_margin_predict = svm_soft_margin.predict(test_lsi_mat_50)
svm_hard_margin_predict = svm_hard_margin.predict(test_lsi_mat_50)
svm_harder_margin_predict = svm_harder_margin.predict(test_lsi_mat_50)

# Test-set metrics for each model (positive class is 1 = 'sports').
print('Soft Margin SVM confusion matrix: ', confusion_matrix(label_test_gt, svm_soft_margin_predict))
print('Soft Margin SVM accuracy score: ', accuracy_score(label_test_gt, svm_soft_margin_predict))
print('Soft Margin SVM recall score: ', recall_score(label_test_gt, svm_soft_margin_predict))
print('Soft Margin SVM precision score: ', precision_score(label_test_gt, svm_soft_margin_predict))
print('Soft Margin SVM f1 score: ', f1_score(label_test_gt, svm_soft_margin_predict))
print('Hard Margin SVM confusion matrix: ', confusion_matrix(label_test_gt, svm_hard_margin_predict))
print('Hard Margin SVM accuracy score: ', accuracy_score(label_test_gt, svm_hard_margin_predict))
print('Hard Margin SVM recall score: ', recall_score(label_test_gt, svm_hard_margin_predict))
print('Hard Margin SVM precision score: ', precision_score(label_test_gt, svm_hard_margin_predict))
print('Hard Margin SVM f1 score: ', f1_score(label_test_gt, svm_hard_margin_predict))
print('Harder Margin SVM confusion matrix: ', confusion_matrix(label_test_gt, svm_harder_margin_predict))
print('Harder Margin SVM accuracy score: ', accuracy_score(label_test_gt, svm_harder_margin_predict))
print('Harder Margin SVM recall score: ', recall_score(label_test_gt, svm_harder_margin_predict))
print('Harder Margin SVM precision score: ', precision_score(label_test_gt, svm_harder_margin_predict))
print('Harder Margin SVM f1 score: ', f1_score(label_test_gt, svm_harder_margin_predict))

def draw_roc_curve(fpr, tpr, model_name):
    """Plot a ROC curve in a new figure and show it.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        model_name: Text used as the figure title.
    """
    plt.figure()
    plt.plot(fpr, tpr)
    # Axis labels previously read "Postive"; the spelling is corrected here.
    plt.xlabel("False Positive Rate (FPR)")
    plt.ylabel("True Positive Rate (TPR)")
    plt.title(model_name)
    plt.show()


# ROC curves from the signed distance to the separating hyperplane (decision_function).
fpr_soft, tpr_soft, _ = roc_curve(label_test_gt, svm_soft_margin.decision_function(test_lsi_mat_50))
fpr_hard, tpr_hard, _ = roc_curve(label_test_gt, svm_hard_margin.decision_function(test_lsi_mat_50))
fpr_harder, tpr_harder, _ = roc_curve(label_test_gt, svm_harder_margin.decision_function(test_lsi_mat_50))
draw_roc_curve(fpr_soft, tpr_soft, 'Soft Margin SVM ROC Curve')
draw_roc_curve(fpr_hard, tpr_hard, 'Hard Margin SVM ROC Curve')
draw_roc_curve(fpr_harder, tpr_harder, 'Harder Margin SVM ROC Curve')

# cross validation
# NOTE: `gamma` holds candidate values for the SVC penalty C (it is passed as C=i below),
# not the kernel coefficient. Each candidate is scored by mean 5-fold CV accuracy.
gamma = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 100000, 1000000]
cv_scores = []
for i in gamma:
    svm_model = svm.SVC(C=i, kernel='linear', random_state=42)
    score = cross_val_score(estimator=svm_model, X=train_lsi_mat_50, y=label_train_gt, cv=5)
    cv_score = np.mean(score)
    print(cv_score)
    cv_scores.append(cv_score)
print(cv_scores)
# index(max(...)) returns the first maximum, so ties go to the smallest C.
best_score_index = cv_scores.index(max(cv_scores))
best_gamma = gamma[best_score_index]
print('best gamma:', best_gamma)  # best gamma = 10000 (notice: must keep the random_state of LSI as 0!)

# Refit with the selected C on the full training set and evaluate on the test set.
svm_best_margin = svm.SVC(C=best_gamma, kernel='linear', random_state=42)
svm_best_margin.fit(train_lsi_mat_50, label_train_gt)
svm_best_margin_predict = svm_best_margin.predict(test_lsi_mat_50)
print('Best Margin SVM confusion matrix: ', confusion_matrix(label_test_gt, svm_best_margin_predict))
print('Best Margin SVM accuracy score: ', accuracy_score(label_test_gt, svm_best_margin_predict))
print('Best Margin SVM recall score: ', recall_score(label_test_gt, svm_best_margin_predict))
print('Best Margin SVM precision score: ', precision_score(label_test_gt, svm_best_margin_predict))
print('Best Margin SVM f1 score: ', f1_score(label_test_gt, svm_best_margin_predict))
fpr_best, tpr_best, _ = roc_curve(label_test_gt, svm_best_margin.decision_function(test_lsi_mat_50))
draw_roc_curve(fpr_best, tpr_best, 'Best Margin SVM ROC Curve')

In [None]:
#################### Question 6 ########################
# Unregularized logistic regression on the 50-dimensional LSI features:
# fit, report test metrics, then draw the ROC curve from decision_function scores.
logistic_no_reg = LogisticRegression(penalty=None, solver='lbfgs', random_state=42)  # the logistic classifier without regularization
logistic_no_reg.fit(train_lsi_mat_50, label_train_gt)
logistic_no_reg_predict = logistic_no_reg.predict(test_lsi_mat_50)
print('Logistic Classifier W/o Regularization confusion matrix: ', confusion_matrix(label_test_gt, logistic_no_reg_predict))
print('Logistic Classifier W/o Regularization accuracy score: ', accuracy_score(label_test_gt, logistic_no_reg_predict))
print('Logistic Classifier W/o Regularization recall score: ', recall_score(label_test_gt, logistic_no_reg_predict))
print('Logistic Classifier W/o Regularization precision score: ', precision_score(label_test_gt, logistic_no_reg_predict))
print('Logistic Classifier W/o Regularization f1 score: ', f1_score(label_test_gt, logistic_no_reg_predict))
fpr_logistic_no_reg, tpr_logistic_no_reg, _ = roc_curve(label_test_gt, logistic_no_reg.decision_function(test_lsi_mat_50))
draw_roc_curve(fpr_logistic_no_reg, tpr_logistic_no_reg, 'Logistic Classifier W/o Regularization ROC Curve')

# cross validation for L1 regularization
# `k` is the list of candidate C values (inverse regularization strength);
# each is scored by mean 5-fold CV accuracy. This rebinds the `k` and `cv_scores` used in earlier cells.
k = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 100000]
cv_scores = []
for i in k:
    logistic_l1_reg = LogisticRegression(penalty='l1', C=i, solver='liblinear', random_state=42)
    score = cross_val_score(estimator=logistic_l1_reg, X=train_lsi_mat_50, y=label_train_gt, cv=5)
    cv_score = np.mean(score)
    print(cv_score)
    cv_scores.append(cv_score)
print(cv_scores)
best_score_index = cv_scores.index(max(cv_scores))
best_k = k[best_score_index]  # best k for L1 = 100
print('best k for L1 regularization:', best_k)
print('best regularization strength for L1 regularization:', 1/best_k)  # C is the inverse of regularization strength

# Refit with the selected C and report test metrics.
logistic_best_l1_reg = LogisticRegression(penalty='l1', C=best_k, solver='liblinear', random_state=42)
logistic_best_l1_reg.fit(train_lsi_mat_50, label_train_gt)
logistic_best_l1_reg_predict = logistic_best_l1_reg.predict(test_lsi_mat_50)
print('Best L1 regularization confusion matrix: ', confusion_matrix(label_test_gt, logistic_best_l1_reg_predict))
print('Best L1 regularization accuracy score: ', accuracy_score(label_test_gt, logistic_best_l1_reg_predict))
print('Best L1 regularization recall score: ', recall_score(label_test_gt, logistic_best_l1_reg_predict))
print('Best L1 regularization precision score: ', precision_score(label_test_gt, logistic_best_l1_reg_predict))
print('Best L1 regularization f1 score: ', f1_score(label_test_gt, logistic_best_l1_reg_predict))

# cross validation for L2 regularization
# Same procedure as for L1: candidate C values scored by mean 5-fold CV accuracy.
# `k`, `cv_scores` and `best_k` are rebound, so the L1 selections are no longer available afterwards.
k = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 100000]
cv_scores = []
for i in k:
    logistic_l2_reg = LogisticRegression(penalty='l2', C=i, solver='liblinear', random_state=42)
    score = cross_val_score(estimator=logistic_l2_reg, X=train_lsi_mat_50, y=label_train_gt, cv=5)
    cv_score = np.mean(score)
    print(cv_score)
    cv_scores.append(cv_score)
print(cv_scores)
best_score_index = cv_scores.index(max(cv_scores))
best_k = k[best_score_index]
print('best k for L2 regularization:', best_k)  # best k for L2 = 10000
print('best regularization strength for L2 regularization:', 1/best_k)  # C is the inverse of regularization strength

# Refit with the selected C and report test metrics.
logistic_best_l2_reg = LogisticRegression(penalty='l2', C=best_k, solver='liblinear', random_state=42)
logistic_best_l2_reg.fit(train_lsi_mat_50, label_train_gt)
logistic_best_l2_reg_predict = logistic_best_l2_reg.predict(test_lsi_mat_50)
print('Best L2 regularization confusion matrix: ', confusion_matrix(label_test_gt, logistic_best_l2_reg_predict))
print('Best L2 regularization accuracy score: ', accuracy_score(label_test_gt, logistic_best_l2_reg_predict))
print('Best L2 regularization recall score: ', recall_score(label_test_gt, logistic_best_l2_reg_predict))
print('Best L2 regularization precision score: ', precision_score(label_test_gt, logistic_best_l2_reg_predict))
print('Best L2 regularization f1 score: ', f1_score(label_test_gt, logistic_best_l2_reg_predict))

In [None]:
#################### Question 7 ########################
# Gaussian Naive Bayes on the 50-dimensional LSI features.
gaussian_nb = GaussianNB()
gaussian_nb.fit(train_lsi_mat_50, label_train_gt)
gaussian_nb_predict = gaussian_nb.predict(test_lsi_mat_50)
print('GaussianNB confusion matrix: ', confusion_matrix(label_test_gt, gaussian_nb_predict))
print('GaussianNB accuracy score: ', accuracy_score(label_test_gt, gaussian_nb_predict))
print('GaussianNB recall score: ', recall_score(label_test_gt, gaussian_nb_predict))
print('GaussianNB precision score: ', precision_score(label_test_gt, gaussian_nb_predict))
print('GaussianNB f1 score: ', f1_score(label_test_gt, gaussian_nb_predict))
# The ROC curve is scored with the predicted probability of class 1 (column 1 of predict_proba).
fpr_gaussian_nb, tpr_gaussian_nb, _ = roc_curve(label_test_gt, gaussian_nb.predict_proba(test_lsi_mat_50)[:, 1])
draw_roc_curve(fpr_gaussian_nb, tpr_gaussian_nb, 'GaussianNB ROC Curve')

In [None]:
#################### Question 8 ########################
def stem(text):  # stem the cleaned text
    """Return ``text`` with every token Porter-stemmed and lower-cased.

    The text is tokenized with ``word_tokenize`` and the stemmed tokens are
    joined back together with single spaces.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token).lower() for token in word_tokenize(text))


# Stemmed variants of the cleaned text, used as the alternative to lemmatization in the grid search.
train_stemmed_text = train_clean_text.apply(lambda x:stem(x))
test_stemmed_text = test_clean_text.apply(lambda x:stem(x))
print(train_stemmed_text)

# Pipeline: counts -> TF-IDF -> dimensionality reduction -> classifier.
# The 'dim_reduce' and 'classifier' steps given here are placeholders that the grid below replaces.
steps = [('convector', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()),
         ('dim_reduce', TruncatedSVD(5, random_state=0)), ('classifier', svm.SVC(C=10000, kernel='linear', random_state=42))]
pipeline = Pipeline(steps)
# Search space: 2 min_df values x 6 reducers x 4 classifiers = 48 combinations.
# The classifier hyperparameters are hard-coded (presumably the best values found in Questions 5 and 6).
param_dict = {
    'convector__min_df': (3, 5),
    'dim_reduce': (TruncatedSVD(5, random_state=0),
                   TruncatedSVD(30, random_state=0),
                   TruncatedSVD(80, random_state=0),
                   NMF(n_components=5, random_state=0),
                   NMF(n_components=30, random_state=0),
                   NMF(n_components=80, random_state=0)),
    'classifier': (svm.SVC(C=10000, kernel='linear', random_state=42),
                   LogisticRegression(penalty='l1', C=100, solver='liblinear', random_state=42),
                   LogisticRegression(penalty='l2', C=10000, solver='liblinear', random_state=42),
                   GaussianNB())
}

# grid search for lemmatized data
# 5-fold CV accuracy over the whole parameter grid; the full results table is saved to CSV.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_dict, scoring='accuracy', cv=5, verbose=4)
print('start grid search!!!')
grid_search.fit(train_lemmatized_text, label_train_gt)
print('Best estimator for lemmatization:', grid_search.best_estimator_)
print('Best estimator\'s parameters for lemmatization:', grid_search.best_params_)
pd.DataFrame(grid_search.cv_results_).to_csv(path_or_buf="grid_results_lemma.csv", index=False, encoding='utf-8')

# grid search for stemmed data
# Same pipeline and grid, fitted on the stemmed text instead.
grid_search_2 = GridSearchCV(estimator=pipeline, param_grid=param_dict, scoring='accuracy', cv=5, verbose=4)
print('start grid search!!!')
grid_search_2.fit(train_stemmed_text, label_train_gt)
print('Best estimator for stem:', grid_search_2.best_estimator_)
print('Best estimator\'s parameters for stem:', grid_search_2.best_params_)
pd.DataFrame(grid_search_2.cv_results_).to_csv(path_or_buf="grid_results_stem.csv", index=False, encoding='utf-8')

# report the results of best 5 models
# The five configurations are hard-coded (presumably copied from the saved grid-search
# results); each one is refitted on the training set and evaluated on the test set.

# Best 1: stemmed text, min_df=3, LSI(80), L1 logistic regression (C=100).
steps_1 = [('convector', CountVectorizer(stop_words='english', min_df=3)), ('tfidf', TfidfTransformer()),
           ('dim_reduce', TruncatedSVD(80, random_state=0)),
           ('classifier', LogisticRegression(C=100, penalty='l1', random_state=42, solver='liblinear'))]
pipeline_1 = Pipeline(steps_1)
pipeline_1.fit(train_stemmed_text, label_train_gt)
pipeline_1_predict = pipeline_1.predict(test_stemmed_text)
print('Best 1 confusion matrix: ', confusion_matrix(label_test_gt, pipeline_1_predict))
print('Best 1 accuracy score: ', accuracy_score(label_test_gt, pipeline_1_predict))
print('Best 1 recall score: ', recall_score(label_test_gt, pipeline_1_predict))
print('Best 1 precision score: ', precision_score(label_test_gt, pipeline_1_predict))
print('Best 1 f1 score: ', f1_score(label_test_gt, pipeline_1_predict))

# Best 2: lemmatized text, min_df=5, LSI(80), L1 logistic regression (C=100).
steps_2 = [('convector', CountVectorizer(stop_words='english', min_df=5)), ('tfidf', TfidfTransformer()),
           ('dim_reduce', TruncatedSVD(80, random_state=0)),
           ('classifier', LogisticRegression(C=100, penalty='l1', random_state=42, solver='liblinear'))]
pipeline_2 = Pipeline(steps_2)
pipeline_2.fit(train_lemmatized_text, label_train_gt)
pipeline_2_predict = pipeline_2.predict(test_lemmatized_text)
print('Best 2 confusion matrix: ', confusion_matrix(label_test_gt, pipeline_2_predict))
print('Best 2 accuracy score: ', accuracy_score(label_test_gt, pipeline_2_predict))
print('Best 2 recall score: ', recall_score(label_test_gt, pipeline_2_predict))
print('Best 2 precision score: ', precision_score(label_test_gt, pipeline_2_predict))
print('Best 2 f1 score: ', f1_score(label_test_gt, pipeline_2_predict))

# Best 3: stemmed text, min_df=3, NMF(80), L1 logistic regression (C=100).
steps_3 = [('convector', CountVectorizer(stop_words='english', min_df=3)), ('tfidf', TfidfTransformer()),
           ('dim_reduce', NMF(80, random_state=0)),
           ('classifier', LogisticRegression(C=100, penalty='l1', random_state=42, solver='liblinear'))]
pipeline_3 = Pipeline(steps_3)
pipeline_3.fit(train_stemmed_text, label_train_gt)
pipeline_3_predict = pipeline_3.predict(test_stemmed_text)
print('Best 3 confusion matrix: ', confusion_matrix(label_test_gt, pipeline_3_predict))
print('Best 3 accuracy score: ', accuracy_score(label_test_gt, pipeline_3_predict))
print('Best 3 recall score: ', recall_score(label_test_gt, pipeline_3_predict))
print('Best 3 precision score: ', precision_score(label_test_gt, pipeline_3_predict))
print('Best 3 f1 score: ', f1_score(label_test_gt, pipeline_3_predict))

# Best 4: lemmatized text, min_df=5, LSI(80), L2 logistic regression (C=10000).
steps_4 = [('convector', CountVectorizer(stop_words='english', min_df=5)), ('tfidf', TfidfTransformer()),
           ('dim_reduce', TruncatedSVD(80, random_state=0)),
           ('classifier', LogisticRegression(C=10000, penalty='l2', random_state=42, solver='liblinear'))]
pipeline_4 = Pipeline(steps_4)
pipeline_4.fit(train_lemmatized_text, label_train_gt)
pipeline_4_predict = pipeline_4.predict(test_lemmatized_text)
print('Best 4 confusion matrix: ', confusion_matrix(label_test_gt, pipeline_4_predict))
print('Best 4 accuracy score: ', accuracy_score(label_test_gt, pipeline_4_predict))
print('Best 4 recall score: ', recall_score(label_test_gt, pipeline_4_predict))
print('Best 4 precision score: ', precision_score(label_test_gt, pipeline_4_predict))
print('Best 4 f1 score: ', f1_score(label_test_gt, pipeline_4_predict))

# Best 5: lemmatized text, min_df=3, LSI(80), L1 logistic regression (C=100).
steps_5 = [('convector', CountVectorizer(stop_words='english', min_df=3)), ('tfidf', TfidfTransformer()),
           ('dim_reduce', TruncatedSVD(80, random_state=0)),
           ('classifier', LogisticRegression(C=100, penalty='l1', random_state=42, solver='liblinear'))]
pipeline_5 = Pipeline(steps_5)
pipeline_5.fit(train_lemmatized_text, label_train_gt)
pipeline_5_predict = pipeline_5.predict(test_lemmatized_text)
print('Best 5 confusion matrix: ', confusion_matrix(label_test_gt, pipeline_5_predict))
print('Best 5 accuracy score: ', accuracy_score(label_test_gt, pipeline_5_predict))
print('Best 5 recall score: ', recall_score(label_test_gt, pipeline_5_predict))
print('Best 5 precision score: ', precision_score(label_test_gt, pipeline_5_predict))
print('Best 5 f1 score: ', f1_score(label_test_gt, pipeline_5_predict))

#################### Question 9 ########################

In [None]:
#################### Question 9 ########################
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-seed the global RNGs before drawing the multiclass split.
np.random.seed(42)  # make the training and testing samples the same as other groups
random.seed(42)

# NOTE(review): leaf_label is not referenced again in the visible code.
leaf_label = dataset[["leaf_label"]]

# The position of a name in this list is used as its integer class id by label_to_id.
leaf_labels = ['chess', 'cricket', 'hockey', 'soccer', 'football', '%22forest%20fire%22', 'flood', 'earthquake', 'drought']

# 80/20 split of the text and leaf-label columns.
leaf_train, leaf_test = train_test_split(dataset[["full_text", "leaf_label"]], test_size=0.2)
print('leaf_train.shape: ', leaf_train.shape)
print('leaf_test.shape: ', leaf_test.shape)

In [None]:
# Feature pipeline for the multiclass task: clean -> lemmatize -> counts -> TF-IDF -> LSI(50).
leaf_train_clean, leaf_test_clean = leaf_train.applymap(clean)['full_text'], leaf_test.applymap(clean)['full_text']
leaf_train_lemma = leaf_train_clean.apply(lambda x: lemmatization(x))
leaf_test_lemma = leaf_test_clean.apply(lambda x: lemmatization(x))
# NOTE: this refits the existing CountVectorizer `cv` on the new training split.
leaf_train_count, leaf_test_count = cv.fit_transform(leaf_train_lemma), cv.transform(leaf_test_lemma)
print('CountVectorizer transformation done')
# `tfidf`, `lsi` and `nmf` are rebound here, replacing the objects fitted in earlier cells.
tfidf = TfidfTransformer()
leaf_train_tfidf, leaf_test_tfidf = tfidf.fit_transform(leaf_train_count).toarray(), tfidf.transform(leaf_test_count).toarray()
print('TFIDF transformation done')
lsi = TruncatedSVD(n_components=50, random_state=42)
# NOTE(review): this NMF instance is created but never fitted in the visible code.
nmf = NMF(n_components=50, random_state=42)
leaf_train_lsi, leaf_test_lsi = lsi.fit_transform(leaf_train_tfidf), lsi.transform(leaf_test_tfidf)
print('LSI transformation done')

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
def print_info(test, pred, title='', leaf_labels=leaf_labels):
    """Plot the confusion matrix and print macro-averaged metrics.

    Args:
        test: Ground-truth class ids.
        pred: Predicted class ids.
        title: Title placed above the confusion-matrix plot.
        leaf_labels: Display names for the classes, in class-id order.
            The default is bound to the module-level ``leaf_labels`` as it
            exists when this function is defined.
    """
    # The matrix is rendered by ConfusionMatrixDisplay below; the separate
    # confusion_matrix() result that used to be computed here was never used.
    print("Confusion Matrix:\n")
    disp = ConfusionMatrixDisplay.from_predictions(test, pred, display_labels=np.array(leaf_labels), xticks_rotation='vertical')
    disp.ax_.set_title(title)
    plt.show()
    print("Accuracy Score: ", accuracy_score(test, pred))
    # Macro averaging: every class contributes equally, regardless of its size.
    print("Recall Score: ", recall_score(test, pred, average='macro'))
    print("Precision Score: ", precision_score(test, pred, average='macro'))
    print("F1 Score: ", f1_score(test, pred, average='macro'))
    print("\n")

In [None]:
def label_to_id(labels):
    """Convert leaf-label names to integer class ids.

    A label's id is the index of its first match in the module-level
    ``leaf_labels`` list. Labels that match no entry are skipped, so the
    result can be shorter than ``labels``.

    Returns:
        A NumPy array of the ids, in input order.
    """
    first_matches = (
        next((idx for idx, name in enumerate(leaf_labels) if label == name), None)
        for label in labels
    )
    return np.array([idx for idx in first_matches if idx is not None])

In [None]:
# Naive Bayes
# Multiclass Gaussian Naive Bayes on the LSI features; y_train / y_test are the
# integer leaf-label ids and are reused by the SVM cells that follow.
gaussian_nb = GaussianNB()
y_train, y_test = label_to_id(leaf_train['leaf_label']), label_to_id(leaf_test['leaf_label'])
pred = gaussian_nb.fit(leaf_train_lsi, y_train).predict(leaf_test_lsi)
print_info(y_test, pred, title='Naïve Bayes')

In [None]:
# Multiclass SVM One vs One
# One LinearSVC per pair of classes; no class_weight is set here,
# unlike the One-vs-Rest model, which uses class_weight='balanced'.
svm_vs_one = OneVsOneClassifier(svm.LinearSVC(random_state=42))
pred = svm_vs_one.fit(leaf_train_lsi, y_train).predict(leaf_test_lsi)
print_info(y_test, pred, title='One VS One')



In [None]:
# Multiclass SVM One vs Rest
# One LinearSVC per class against all others; class_weight='balanced' is set on the base estimator.
svm_vs_rest = OneVsRestClassifier(svm.LinearSVC(class_weight='balanced', random_state=42))
pred = svm_vs_rest.fit(leaf_train_lsi, y_train).predict(leaf_test_lsi)
print_info(y_test, pred, title='One VS The Rest')


In [None]:
# Leaf classes after merging 'soccer' into 'football'; list position is the class id.
new_leaf_labels = ['chess', 'cricket', 'hockey', 'football', '%22forest%20fire%22', 'flood', 'earthquake', 'drought']

def label_to_id(labels):
    """Convert leaf-label names to class ids, treating 'soccer' as 'football'.

    A label's id is the index of its first match in ``new_leaf_labels``.
    Labels that match no entry are skipped, so the result can be shorter
    than ``labels``.

    Returns:
        A NumPy array of the ids, in input order.
    """
    ids = []
    for raw_label in labels:
        merged_label = 'football' if raw_label == 'soccer' else raw_label
        class_id = next(
            (idx for idx, name in enumerate(new_leaf_labels) if merged_label == name),
            None,
        )
        if class_id is not None:
            ids.append(class_id)
    return np.array(ids)

# Recompute the class ids with the merged soccer/football label set (8 classes).
y_train, y_test = label_to_id(leaf_train['leaf_label']), label_to_id(leaf_test['leaf_label'])

print("One VS One\n")
# Multiclass SVM One vs One
# class_weight='balanced' is set on both models in this cell.
svm_vs_one = OneVsOneClassifier(svm.LinearSVC(class_weight='balanced', random_state=42))
pred = svm_vs_one.fit(leaf_train_lsi, y_train).predict(leaf_test_lsi)
print_info(y_test, pred, title='One VS One', leaf_labels=new_leaf_labels)

print("One VS Rest\n")
# Multiclass SVM One vs Rest
svm_vs_rest = OneVsRestClassifier(svm.LinearSVC(class_weight='balanced', random_state=42))
pred = svm_vs_rest.fit(leaf_train_lsi, y_train).predict(leaf_test_lsi)
print_info(y_test, pred, title='One VS The Rest', leaf_labels=new_leaf_labels)


In [None]:
#################### Question 11 #######################

# Get GLoVE embeddings from specified dimension file
def get_glove_embeddings(dimension):
    """Load ``glove.6B.<dimension>d.txt`` from the working directory.

    Each line is split on whitespace: the first field is the word and the
    remaining fields are parsed as a float32 vector.

    Returns:
        A dict mapping each word to its NumPy float32 vector.
    """
    glove_path = "glove.6B." + str(dimension) + "d.txt"
    word_vectors = {}
    # if 'r' fails with unicode error, please use 'rb'
    with open(glove_path, 'r', encoding="utf-8") as glove_file:
        for row in glove_file:
            fields = row.split()
            word_vectors[fields[0]] = np.asarray(fields[1:], "float32")
    return word_vectors

# Clean, remove stopwords, and lemmatize the full text. We have already cleaned the data earlier.
stop_words = stopwords.words('english')
# Frozen copy for O(1) membership tests: the filter below runs once per word
# of every document, and scanning the list each time is needlessly slow.
_stop_word_set = frozenset(stop_words)
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_punctuation_table = str.maketrans('', '', string.punctuation)

def clean_glove_text(text):
    """Lower-case ``text``, strip punctuation and drop English stop words.

    Args:
        text: The string to normalize.

    Returns:
        The remaining whitespace-separated words joined by single spaces.
    """
    text = text.lower()
    # Remove punctuations
    text = text.translate(_punctuation_table)
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in _stop_word_set)

# Get keywords column as an additional representation of the text
# Keywords column (already passed through clean() together with the rest of the split).
train_keywords = train['keywords']
test_keywords = test['keywords']

# NOTE: the next lines overwrite train_clean_text / train_lemmatized_text (and the test
# equivalents) from Question 3 with the lower-cased, punctuation- and stopword-free versions.
train_clean_text = train_clean_text.apply(lambda x: clean_glove_text(x))
train_lemmatized_text = train_clean_text.apply(lambda x: lemmatization(x))
train_clean_keywords = train_keywords.apply(lambda x: clean_glove_text(x))
train_lemmatized_keywords = train_clean_keywords.apply(lambda x: lemmatization(x))

test_clean_text = test_clean_text.apply(lambda x: clean_glove_text(x))
test_lemmatized_text = test_clean_text.apply(lambda x: lemmatization(x))
test_clean_keywords = test_keywords.apply(lambda x: clean_glove_text(x))
test_lemmatized_keywords = test_clean_keywords.apply(lambda x: lemmatization(x))

# Use Glove embeddings to get a vector representing the document
def get_glove_representation(text, embeddings_dict):
    """Average the embeddings of the words in ``text``.

    Args:
        text: Whitespace-separated words.
        embeddings_dict: Mapping from word to embedding vector.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``embeddings_dict``. If no word is found, a zero vector with the
        same shape as the stored embeddings is returned (an empty array if
        ``embeddings_dict`` itself is empty).
    """
    # Out-of-vocabulary words are skipped on purpose; an explicit membership
    # test replaces the former bare `except: pass`, which hid every error.
    glove_words = [embeddings_dict[word] for word in text.split() if word in embeddings_dict]

    if not glove_words:
        # The mean of nothing used to come back as a NaN scalar, which cannot be
        # stacked with real vectors afterwards; fall back to a zero vector.
        if not embeddings_dict:
            return np.array([], dtype="float32")
        return np.zeros_like(next(iter(embeddings_dict.values())))

    return np.array(glove_words).mean(axis=0)  # average across all word embeddings in the text

Classification with 300-dimensional GloVe embeddings

In [None]:
# Document features: the mean GLoVE vector of the full text averaged with the
# mean GLoVE vector of the keywords (element-wise mean of the two).
embeddings_dict = get_glove_embeddings(300)
train_glove_text = train_lemmatized_text.apply(lambda x: get_glove_representation(x, embeddings_dict))
train_glove_text_300 = train_glove_text.values.tolist()
train_glove_keywords = train_lemmatized_keywords.apply(lambda x: get_glove_representation(x, embeddings_dict))
train_glove_keywords_300 = train_glove_keywords.values.tolist()
train_glove_300 = np.mean([train_glove_text_300, train_glove_keywords_300], axis = 0)

test_glove_text = test_lemmatized_text.apply(lambda x: get_glove_representation(x, embeddings_dict))
test_glove_text_300 = test_glove_text.values.tolist()
test_glove_keywords = test_lemmatized_keywords.apply(lambda x: get_glove_representation(x, embeddings_dict))
test_glove_keywords_300 = test_glove_keywords.values.tolist()
test_glove_300 = np.mean([test_glove_text_300, test_glove_keywords_300], axis = 0)

# Run SVC with cross-validation with different hyperparameters
# (`gamma` again holds candidate values for the penalty C; this rebinds the earlier list.)
gamma = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
cv_scores = []

for i in gamma:
    svm_model = svm.SVC(C=i, kernel='linear', random_state=42)
    score = cross_val_score(estimator=svm_model, X=train_glove_300, y=label_train_gt, cv=5)
    cv_score = np.mean(score)
    cv_scores.append(cv_score)

print(cv_scores)
best_score_index = cv_scores.index(max(cv_scores))
best_gamma = gamma[best_score_index]
print('best gamma:', best_gamma)

# Evaluate the model with the best hyperparameters
svm_model = svm.SVC(C=best_gamma, kernel='linear', random_state=42)
svm_model.fit(train_glove_300, label_train_gt)
svm_model_predict = svm_model.predict(test_glove_300)
# Kept for the accuracy-versus-dimension plot in Question 12.
accuracy_glove_300 = accuracy_score(label_test_gt, svm_model_predict)
# Display names follow the binary encoding: 0 = climate, 1 = sports.
print_info(label_test_gt, svm_model_predict, leaf_labels=["climate", "sports"])

Test accuracy as a function of the number of dimensions of the GloVe embeddings

In [None]:
#################### Question 12 #######################

# Testing with different GLoVE embedding dimensions. We don't tune hyperparameters, and just use the best gamma from Q11.

# Dimensions evaluated in the loop; 300 is appended afterwards from the Question 11 result.
dimensions = [50, 100, 200]
accuracies = []

for dimension in dimensions:
    embeddings_dict = get_glove_embeddings(dimension)
    # Same feature construction as for 300 dimensions: mean of text and keyword vectors.
    train_glove_text = train_lemmatized_text.apply(lambda x: get_glove_representation(x, embeddings_dict))
    glove_text = train_glove_text.values.tolist()
    train_glove_keywords = train_lemmatized_keywords.apply(lambda x: get_glove_representation(x, embeddings_dict))
    glove_keywords = train_glove_keywords.values.tolist()
    train_glove = np.mean([glove_text, glove_keywords], axis = 0)

    test_glove_text = test_lemmatized_text.apply(lambda x: get_glove_representation(x, embeddings_dict))
    glove_text = test_glove_text.values.tolist()
    test_glove_keywords = test_lemmatized_keywords.apply(lambda x: get_glove_representation(x, embeddings_dict))
    glove_keywords = test_glove_keywords.values.tolist()
    test_glove = np.mean([glove_text, glove_keywords], axis = 0)

    # Evaluate the model with the best hyperparameters
    # (best_gamma is the C selected for 300 dimensions; it is not re-tuned per dimension)
    svm_model = svm.SVC(C=best_gamma, kernel='linear', random_state=42)
    svm_model.fit(train_glove, label_train_gt)
    svm_model_predict = svm_model.predict(test_glove)
    accuracy_glove = accuracy_score(label_test_gt, svm_model_predict)
    accuracies.append(accuracy_glove)

# Add original accuracy score
dimensions.append(300)
accuracies.append(accuracy_glove_300)

# Plot accuracy versus dimension of GLoVE embeddings
plt.plot(dimensions, accuracies)
plt.xlabel('Dimension of Embeddings')
plt.ylabel('Test Accuracy')
plt.title('Dimension of GLoVE Embeddings vs. Test Accuracy')
plt.show()

In [None]:
#################### Question 13 #######################

def create_umap_plot(x, y, title): # ref: https://umap-learn.readthedocs.io/en/latest/plotting.html
    """Fit UMAP on ``x`` and show a scatter plot coloured by class.

    Args:
        x: Feature matrix passed to ``umap.UMAP().fit``.
        y: Iterable of binary labels; 0 is shown as "climate", anything
            else as "sports".
        title: Title of the plot.
    """
    fitted_mapper = umap.UMAP().fit(x)
    class_names = np.array(["climate" if label == 0 else "sports" for label in y])
    umap.plot.points(fitted_mapper, labels=class_names, theme='viridis')
    plt.title(title)
    plt.show()

# Create UMAP plots for training and testing data with dimensions = 300
# (training set here; the testing set is plotted in the next cell)
create_umap_plot(train_glove_300, label_train_gt, "GLoVE Embeddings for Training Set")

In [None]:
# UMAP plot of the 300-dimensional GLoVE features of the testing set.
create_umap_plot(test_glove_300, label_test_gt, "GLoVE Embeddings for Testing Set")

In [None]:
# Create UMAP plot for random vectors with dimensions = 300
# Baseline: one standard-normal vector per training document, reusing the training labels for colouring.
random_x = np.random.normal(0, 1, size=(len(train_glove_300), 300))
# Normalize each vector (row) to unit length. Without axis=1 the division used the
# Frobenius norm of the whole matrix, which only rescales all rows by one constant.
normalized_x = random_x / np.linalg.norm(random_x, axis=1, keepdims=True)
create_umap_plot(normalized_x, label_train_gt, "Random Normalized Vectors")