##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets
### NOTE: This notebook runs 10-fold cross-validation on the candidate models

## Read the CSV datafiles (Train and Test)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

In [2]:
# Load the labelled training tweets (dropping the stray index column written
# by a previous export) and the unlabelled test tweets.
train_data = pd.read_csv("Train.csv").drop(columns='Unnamed: 0')
test_data = pd.read_csv("Test.csv")

# Split the frames into raw tweet text (instances) and sentiment (labels);
# the test set only provides text.
X_train_raw, Y_train = train_data['text'], train_data['sentiment']
X_test_raw = test_data['text']

# Sanity check: report how many tweets were loaded from each file.
print("Train length:", len(X_train_raw))
print("Test length:", len(X_test_raw))

Train length: 21802
Test length: 6099


## Data Preprocessing

In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
import unidecode
import contractions 

In [4]:
def contain_digit(word):
    '''
    Return True if at least one character of `word` is a digit, otherwise False.
    An empty word contains no digit, so it yields False.
    '''
    return any(ch.isdigit() for ch in word)


# Patterns used by preprocess(), compiled once instead of once per tweet.
_LINK_RE = re.compile(r'\w+:\/{2}[\w-]+(\.[\w\/-]+)*')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]')
_WHITESPACE_RE = re.compile(r'[\n\t\s]+')

# Stopwords that are kept in the text because they indicate negation.
_NEGATIONS = frozenset(['no', 'not'])

# Cache for the NLTK helpers; filled on the first preprocess() call.
_TEXT_TOOLS = {}


def _text_tools():
    '''
    Return the shared (tokenizer, stop word set, stemmer) triple, building it
    on first use so that it is not re-created for every tweet.
    '''
    if not _TEXT_TOOLS:
        # Build everything first and publish in one step, so a failure while
        # loading one resource does not leave a half-filled cache behind.
        built = {
            'tokenizer': TweetTokenizer(preserve_case=False, strip_handles=True,
                                        reduce_len=True),
            'stop_words': frozenset(stopwords.words('english')) - _NEGATIONS,
            'stemmer': SnowballStemmer("english"),
        }
        _TEXT_TOOLS.update(built)
    return _TEXT_TOOLS['tokenizer'], _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess(text):
    '''
    Preprocess the raw text data into tokenized lists of words.
    Input: a single tweet
    Output: a list of filtered terms

    Steps: expand contractions, strip links, replace non-ASCII characters and
    runs of whitespace with a single space, tokenize (lower-cased, handles
    stripped, long character runs shortened), then drop unwanted tokens and
    stem everything except hashtags.
    '''
    tokenizer, stop_words, stemmer = _text_tools()

    # expand contractions (e.g. can't -> cannot)
    revised_text = contractions.fix(text)

    # remove links from the text
    revised_text = _LINK_RE.sub('', revised_text)

    # replace non-ASCII characters with a space
    revised_text = _NON_ASCII_RE.sub(' ', revised_text)

    # collapse any run of spacing characters into a single space
    revised_text = _WHITESPACE_RE.sub(' ', revised_text)

    # tokenize the text into words
    tokens = tokenizer.tokenize(revised_text)

    revised_lst = []
    for w in tokens:
        # remove stopwords ('no' and 'not' were taken out of the stop word
        # set above, so they are kept)
        if w in stop_words:
            continue
        # remove punctuation.
        # NOTE: string.punctuation is a str, so this is a substring test: it
        # drops single punctuation marks and runs that appear consecutively in
        # string.punctuation (e.g. "()"), but keeps tokens such as "..." or
        # ":)". Kept as-is so the extracted features do not change.
        if w in string.punctuation:
            continue
        # remove words that contain numbers
        if contain_digit(w):
            continue
        # remove words that are only a single character long
        if len(w) == 1:
            continue
        # reduce words back into their stem form except hashtags
        revised_lst.append(w if w[0] == '#' else stemmer.stem(w))

    return revised_lst

## Feature Engineering

### N-gram TF-IDF Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [6]:
def n_gram_tfidf(X_train_raw, X_test_raw, n = 1):
    '''
    Vectorize raw tweets with TF-IDF over word n-grams.
    n: {1: 'unigram', 2: 'bigram', n: '1-n gram'}, default = 1

    The vectorizer is fitted on the training tweets only and then applied to
    the test tweets. Returns (X_train_tfidf, X_test_tfidf).
    '''
    if n == 1:
        # unigram: preprocess() is used directly as the analyzer
        unigram_vectorizer = TfidfVectorizer(analyzer=preprocess)
        X_train_tfidf = unigram_vectorizer.fit_transform(X_train_raw)
        X_test_tfidf = unigram_vectorizer.transform(X_test_raw)
        return X_train_tfidf, X_test_tfidf

    # For n-grams the cleaned tokens are joined back into sentences so that
    # the vectorizer can build the word n-grams itself.
    train_docs = [' '.join(preprocess(tweet)) for tweet in X_train_raw]
    test_docs = [' '.join(preprocess(tweet)) for tweet in X_test_raw]

    # n == 2 means bigrams only; any other n means every size from 1 up to n
    gram_span = (2, 2) if n == 2 else (1, n)
    ngram_vectorizer = TfidfVectorizer(ngram_range=gram_span)
    X_train_tfidf = ngram_vectorizer.fit_transform(train_docs)
    X_test_tfidf = ngram_vectorizer.transform(test_docs)
    return X_train_tfidf, X_test_tfidf

### Sampling

In [7]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [8]:
def sampling(X_train_tfidf, Y_train, sampling_method = None):
    '''
    Optionally re-sample the vectorized training data.
    sampling_method: {'under', 'over', None}, default = None
        - 'under': random under sampling
        - 'over': random over sampling
        - None (or any other value): the inputs are returned unchanged
    Returns the (instances, labels) pair after re-sampling.
    '''
    # Guard clause: nothing to do unless a known method was requested.
    if sampling_method not in ('under', 'over'):
        return X_train_tfidf, Y_train

    # Fixed random_state keeps the re-sampling reproducible between runs.
    if sampling_method == 'under':
        resampler = RandomUnderSampler(random_state=42)
    else:
        resampler = RandomOverSampler(random_state=42)
    return resampler.fit_resample(X_train_tfidf, Y_train)

### Feature Selection

In [9]:
from sklearn.feature_selection import SelectKBest, chi2

In [10]:
def kBest_chi2(i: int, X_train_smp, Y_train, X_test_tfidf):
    '''
    Keep the i best features as ranked by the Chi-square test.
    The selector is fitted on the training data only and the same feature
    subset is then applied to the test data.
    Returns (reduced training matrix, reduced test matrix).
    '''
    selector = SelectKBest(chi2, k=i)
    selector.fit(X_train_smp, Y_train)
    return selector.transform(X_train_smp), selector.transform(X_test_tfidf)

## k-fold Cross Validation

In [11]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

In [12]:
# Baseline (Zero-R) and the individual classifiers compared in cross validation.
zero_r = DummyClassifier(strategy="most_frequent")
bnb = BernoulliNB()
knn5 = KNeighborsClassifier(n_neighbors=5)
svm = SVC(kernel='rbf', C=5, decision_function_shape='ovo')
lg_clf = LogisticRegression(solver='saga', multi_class='multinomial', C=6,
                            class_weight='balanced', max_iter=1000, penalty = 'l2')

# Base learners of the stacking ensemble.
# This has to be a list of (name, estimator) tuples, as StackingClassifier
# documents. The previous set literal left the order of the base learners
# (and therefore of the meta-features) dependent on hashing.
estimators = [
    ('lg', LogisticRegression(solver='saga', multi_class='multinomial', C=6, max_iter=1000, penalty = 'l2')),
    ('svm', SVC(kernel='rbf', C=5)),
    ('bnb', BernoulliNB()),
]

# Stacking ensemble: a logistic regression combines the base learners' outputs.
stk_clf = StackingClassifier(estimators, final_estimator=LogisticRegression(solver='saga',
                            multi_class='multinomial', C=6, max_iter=10000, penalty = 'l2'))

In [13]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
def cross_validation(X_train_raw, Y_train, fold=10):
    '''
    Run k-fold cross validation for the BNB, KNN, LG, SVM and STK classifiers
    and print, for each of them, the accuracy, recall, precision and F1 score
    (macro-averaged over classes) averaged over the folds.

    X_train_raw: pandas Series with the raw tweets
    Y_train: pandas Series with the labels, aligned with X_train_raw
    fold: number of folds, default = 10

    Folds are contiguous slices of len(X_train_raw) // fold instances in the
    given order (no shuffling); instances left over after the last fold are
    only ever used for training. Returns None; results are printed.
    Raises ValueError if fold is smaller than 1.
    '''
    if fold < 1:
        raise ValueError("fold must be a positive integer")

    # (label, classifier) pairs; defined before the loop because the labels
    # are needed both for the metric buffers and for the final report.
    classifiers = [("BNB", bnb), ("KNN", knn5), ("LG", lg_clf), ("SVM", svm), ("STK", stk_clf)]
    names = [name for name, _ in classifiers]

    # One slot per fold and classifier (sized by `fold`, not a fixed 10).
    total_acc = {name: [0]*fold for name in names}
    total_recall = {name: [0]*fold for name in names}
    total_precision = {name: [0]*fold for name in names}
    total_f1 = {name: [0]*fold for name in names}

    # iteratively divide data into train/test set
    num_test_data = X_train_raw.shape[0] // fold
    for i in range(fold):
        start, stop = i*num_test_data, (i+1)*num_test_data
        X_val = X_train_raw.iloc[start:stop]
        X_train = pd.concat([X_train_raw.iloc[:start], X_train_raw.iloc[stop:]], axis=0)
        y_val = Y_train.iloc[start:stop]
        y_train = pd.concat([Y_train.iloc[:start], Y_train.iloc[stop:]], axis=0)

        X_train_tfidf, X_val_tfidf = n_gram_tfidf(X_train, X_val, n=1)  # adjust n-gram approach here
        feature_size = X_train_tfidf.shape[1]

        # sampling
        X_train_smp, Y_train_smp = sampling(X_train_tfidf, y_train, sampling_method = 'under') # adjust sampling methods here
        # sub-sample the validation set to obtain a balanced dataset
        X_val_tfidf, y_val = sampling(X_val_tfidf, y_val, sampling_method = 'under')

        # choose k best (top 10%) features using chi2 test; the selection does
        # not depend on the classifier, so it is computed once per fold
        k_best = max(1, int(0.1*feature_size))
        X_train_kBest, X_val_kBest = kBest_chi2(k_best, X_train_smp, Y_train_smp, X_val_tfidf)

        for name, clf in classifiers:
            clf.fit(X_train_kBest, Y_train_smp)
            prediction = clf.predict(X_val_kBest)

            total_acc[name][i] = accuracy_score(y_val, prediction)
            total_f1[name][i] = f1_score(y_val, prediction, average='macro')
            total_precision[name][i] = precision_score(y_val, prediction, average='macro')
            total_recall[name][i] = recall_score(y_val, prediction, average='macro')

    # report the per-classifier averages over all folds
    for name in names:
        print(name)
        print("-------------------------------------------------------")
        print("The average accuracy is: ", sum(total_acc[name])/fold)
        print("The average recall is: ", sum(total_recall[name])/fold)
        print("The average precision is: ", sum(total_precision[name])/fold)
        print("The average f1 is: ", sum(total_f1[name])/fold)
        print("\n")
    return

In [14]:
# Run cross validation on the full training set with the default number of
# folds (10); the averaged metrics for each classifier are printed below.
cross_validation(X_train_raw, Y_train)

BNB
-------------------------------------------------------
The average accuracy is:  0.6251553523262963
The average recall is:  0.6251553523262963
The average precision is:  0.619061009525415
The average f1 is:  0.6197547473181317


KNN
-------------------------------------------------------
The average accuracy is:  0.49629535197386854
The average recall is:  0.49629535197386854
The average precision is:  0.5401614117000091
The average f1 is:  0.49746491135229975


LG
-------------------------------------------------------
The average accuracy is:  0.6300765624417339
The average recall is:  0.630076562441734
The average precision is:  0.6297998011282645
The average f1 is:  0.6297496042769579


SVM
-------------------------------------------------------
The average accuracy is:  0.6217596364266736
The average recall is:  0.6217596364266736
The average precision is:  0.621470883561944
The average f1 is:  0.6212804913256159


STK
-------------------------------------------------------
T