##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets
### Note: This file is used to generate prediction CSV files for data from "Test.csv".

## Read the CSV data files (Train and Test)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load both CSV files; the 'Unnamed: 0' column of the training file is
# dropped right after reading.
train_data = pd.read_csv("Train.csv", sep=',').drop(columns='Unnamed: 0')
test_data = pd.read_csv("Test.csv", sep=',')

# Pull out the tweet text (instances) and, for the training file,
# the sentiment column (labels).
X_train_raw, Y_train = train_data['text'], train_data['sentiment']
X_test_raw = test_data['text']

# Sanity check: number of tweets read from each file
print("Train length:",len(X_train_raw))
print("Test length:",len(X_test_raw))

Train length: 21802
Test length: 6099


## Data Preprocessing

In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
import unidecode
import contractions 

In [4]:
def contain_digit(word):
    '''
    Return True when at least one character of the word is a digit,
    and False otherwise (an empty word therefore gives False).
    '''
    return any(char.isdigit() for char in word)


# Objects shared by every preprocess() call (compiled patterns, tokenizer,
# stop-word set, stemmer). Filled on first use by _preprocess_resources().
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    '''
    Build (once) and return the reusable objects needed by preprocess().
    Output: a dict with keys 'link', 'non_ascii', 'spacing', 'tokenizer',
            'stop_words' and 'stemmer'
    '''
    if not _PREPROCESS_RESOURCES:
        # Assemble everything in a local dict first so that a failure
        # part-way through does not leave a half-filled cache behind.
        resources = {
            'link': re.compile(r'\w+:\/{2}[\w-]+(\.[\w\/-]+)*'),
            'non_ascii': re.compile(r'[^\x00-\x7F]'),
            'spacing': re.compile(r'[\n\t\s]+'),
            'tokenizer': TweetTokenizer(preserve_case=False,
                                        strip_handles=True,
                                        reduce_len=True),
            # 'no' and 'not' are taken out of the stop-word set so they stay
            # in the text, as they indicate negation
            'stop_words': frozenset(stopwords.words('english')) - {'no', 'not'},
            'stemmer': SnowballStemmer("english"),
        }
        _PREPROCESS_RESOURCES.update(resources)
    return _PREPROCESS_RESOURCES


def preprocess(text):
    '''
    Preprocess the raw text data into tokenized lists of words.
    Input: a single tweet
    Output: a list of filtered terms

    The tokenizer, stop-word set and stemmer are created on the first call
    and reused afterwards instead of being rebuilt for every tweet.
    '''
    res = _preprocess_resources()

    # expand contractions (e.g. can't -> cannot)
    revised_text = contractions.fix(text)

    # remove links from the text
    revised_text = res['link'].sub('', revised_text)

    # replace non-ASCII characters with a space
    revised_text = res['non_ascii'].sub(' ', revised_text)

    # collapse any run of spacing characters into a single space
    revised_text = res['spacing'].sub(' ', revised_text)

    # tokenize the text into (lower-cased) words
    tokens = res['tokenizer'].tokenize(revised_text)

    stop_words = res['stop_words']
    stem = res['stemmer'].stem

    revised_lst = []
    for w in tokens:
        # remove stopwords ('no' and 'not' are not in the set, so they stay)
        if w in stop_words:
            continue
        # remove punctuation tokens
        # NOTE: `in` on a string is a substring test, so only tokens that
        # occur verbatim inside string.punctuation are dropped;
        # multi-character tokens such as ':)' or '...' are kept.
        if w in string.punctuation:
            continue
        # remove words that contain numbers
        if contain_digit(w):
            continue
        # remove words that are only a single character long
        if len(w) == 1:
            continue
        # reduce words back into their stem form except hashtags
        revised_lst.append(w if w[0] == '#' else stem(w))

    return revised_lst

## Feature Engineering

### N-gram TF-IDF Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [6]:
def n_gram_tfidf(X_train_raw, X_test_raw, n = 1):
    '''
    Apply n-gram algorithms while doing TF-IDF vectorization.
    n: {1: 'unigram', 2: 'bigram', n: '1-n gram'}, default = 1
    Input: raw training tweets, raw test tweets, n
    Output: (training TF-IDF matrix, test TF-IDF matrix); the vectorizer is
            fitted on the training tweets only
    Raises: ValueError if n is smaller than 1

    For n > 1 the n-grams are built directly from the tokens returned by
    preprocess(). Joining the tokens into a sentence and letting the
    vectorizer re-tokenize it with its default token pattern would strip
    the '#' from hashtags and drop punctuation-only tokens, so the n-gram
    features would no longer use the same tokens as the unigram features.
    '''
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    if n == 1:
        # unigram: preprocess() yields the final features directly
        tfidf_vectorizer = TfidfVectorizer(analyzer=preprocess)
    else:
        # bigram only when n == 2, otherwise every length from 1 up to n
        ngram_range = (2, 2) if n == 2 else (1, n)
        tfidf_vectorizer = TfidfVectorizer(
            tokenizer=preprocess,
            # preprocess() receives the raw tweet and lower-cases it itself
            lowercase=False,
            # unused when a tokenizer is supplied
            token_pattern=None,
            ngram_range=ngram_range,
        )

    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)
    X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)

    return X_train_tfidf, X_test_tfidf

### Sampling

In [7]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [8]:
def sampling(X_train_tfidf, Y_train, sampling_method = None):
    '''
    Optionally resample the vectorized training data.
    sampling_method: {'under', 'over', None}, default = None
        - 'under': random under sampling
        - 'over': random over sampling
        - None (or any other value): data is returned unchanged
    Output: (features, labels) after the chosen resampling
    '''
    # Nothing to do unless one of the two known methods was requested
    if sampling_method not in ('under', 'over'):
        return X_train_tfidf, Y_train

    if sampling_method == 'under':
        sampler_cls = RandomUnderSampler
    else:
        sampler_cls = RandomOverSampler

    # Fixed seed so the resampling is repeatable
    resampler = sampler_cls(random_state=42)
    return resampler.fit_resample(X_train_tfidf, Y_train)

### Feature Selection

In [9]:
from sklearn.feature_selection import SelectKBest, chi2

In [10]:
def kBest_chi2(i: int, X_train_smp, Y_train, X_test_tfidf):
    '''
    Keep the i best features according to the Chi-square test.
    The selector is fitted on the training data and labels only; the same
    selection is then applied to the test data.
    Output: (reduced training features, reduced test features)
    '''
    selector = SelectKBest(score_func=chi2, k=i)
    train_selected = selector.fit_transform(X_train_smp, Y_train)
    test_selected = selector.transform(X_test_tfidf)
    return train_selected, test_selected

In [11]:
# TF-IDF vectorization with unigram features (n = 1)
X_train_tfidf, X_test_tfidf = n_gram_tfidf(X_train_raw, X_test_raw, n=1)
feature_size = X_train_tfidf.shape[1]
print("Train feature space size (using TFIDF):",X_train_tfidf.shape)
print("Test feature space size (using TFIDF):",X_test_tfidf.shape)
print("\n")

# resample the training data; change the last argument to switch
# between 'under', 'over' and None
X_train_smp, Y_train_smp = sampling(X_train_tfidf, Y_train, 'under')

# keep the top 10% of the TF-IDF features, ranked by the chi2 test
X_train_kBest, X_test_kBest = kBest_chi2(
    int(0.1*feature_size),
    X_train_smp,
    Y_train_smp,
    X_test_tfidf,
)
print("Train feature space size (after feature selection):", X_train_kBest.shape)
print("\n")

Train feature space size (using TFIDF): (21802, 22333)
Test feature space size (using TFIDF): (6099, 22333)


Train feature space size (after feature selection): (11145, 2233)




### Stacking

In [12]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

# Base learners of the stacked ensemble as (name, estimator) pairs.
# This must be a list, not a set: a set has no defined iteration order, so
# the order of the base learners could change between runs.
estimators = [
    ('lg', LogisticRegression(solver='saga', multi_class='multinomial', C=6, max_iter=1000, penalty = 'l2')),
    ('svm', SVC(kernel='rbf', C=5)),
    ('bnb', BernoulliNB()),
]

# A logistic regression combines the base learners' outputs into the
# final prediction.
stk_clf = StackingClassifier(estimators, final_estimator=LogisticRegression(solver='saga', 
                            multi_class='multinomial', C=6, max_iter=10000, penalty = 'l2'))

# Train on the resampled, feature-selected training data and predict the
# sentiment of every test tweet.
stk_clf.fit(X_train_kBest, Y_train_smp)
prediction = stk_clf.predict(X_test_kBest)

# Write one row per test tweet: its id and the predicted sentiment.
result = {'id': test_data['id'].tolist(), 'sentiment': prediction}
result_df = pd.DataFrame(result)
result_df.to_csv('prediction.csv', index=False)