In [46]:
import nltk
import numpy as np
import pandas as pd
import re
import random
import spacy

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob

# Fetch the NLTK resources used by this notebook: the perceptron POS tagger,
# the Punkt tokenizer models, WordNet and the VADER sentiment lexicon.
# NOTE(review): the 'stopwords' corpus is imported above (nltk.corpus.stopwords)
# but is not downloaded here -- confirm it is available before relying on it.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Shared spaCy pipeline, loaded once; the *_spacy feature functions read this global.
nlp = spacy.load("en_core_web_sm")
# Notebook-level scaler instances with default settings.
minmax_scaler = MinMaxScaler()
maxabs_scaler = MaxAbsScaler()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hurin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hurin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hurin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hurin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<h3>Baseline Model</h3>

In [2]:
# Baseline: bag-of-words unigram counts fed into Multinomial Naive Bayes.
# Load the labelled data; 'Text' holds the sentence, 'Verdict' the class label.
train = pd.read_csv('train.csv')
X_train = train['Text']
y_train = train['Verdict']

# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split)
X_training, X_validation, y_training, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the BoW Vectorizer (unigrams only, sklearn's built-in English stop-word list)
bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# Learn the vocabulary on the training split only, then reuse it for the validation split
X_training_bow = bow_vectorizer.fit_transform(X_training)
X_validation_bow = bow_vectorizer.transform(X_validation)

# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_training_bow, y_training)

# Make predictions on the validation split
y_prediction = nb_classifier.predict(X_validation_bow)

# Evaluate the model: per-class report, then macro-F1 as the cell's displayed value
print(classification_report(y_validation, y_prediction))

f1_score(y_validation, y_prediction, average='macro')

              precision    recall  f1-score   support

          -1       0.80      0.90      0.85      2926
           0       0.50      0.22      0.30       502
           1       0.64      0.60      0.62      1073

    accuracy                           0.75      4501
   macro avg       0.64      0.57      0.59      4501
weighted avg       0.73      0.75      0.73      4501



0.5879760408201458

In [3]:
# Predict the unlabelled test set with the baseline model trained above
test = pd.read_csv('test.csv')
X_test = test['Text']

# Reuse the vocabulary fitted on the training split (transform only, no refit)
X_test_bow = bow_vectorizer.transform(X_test)

result = nb_classifier.predict(X_test_bow)

# Attach the predictions, drop the raw sentences and write the remaining columns to CSV.
# pd.Series(result) carries a default 0..n-1 index and is aligned to `test` by index.
test['Verdict'] = pd.Series(result)
test.drop(columns=['Text'], inplace=True)
test.to_csv('A0233573E_Naive_Bayes.csv', index=False)

<h3>Improving the Model with Data Pre-processing</h3>

Dataset engineering: Remove placeholder rows ('#NAME' / '#NAME?'), duplicate sentences, and identical sentences with conflicting verdict labels

In [13]:
def preprocess_dataset(train, output_path='after dataset preprocessing.csv'):
    """Clean the labelled dataset and return it sorted by sentence.

    Steps, in order:
      1. drop rows whose 'Text' is the placeholder '#NAME' or '#NAME?';
      2. drop exact duplicates of the ('Text', 'Verdict') pair;
      3. drop every row whose 'Text' appears with more than one distinct
         'Verdict' (contradictory labels);
      4. sort by 'Text' and reset the index.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'Text' and 'Verdict'. It is not modified.
    output_path : str or None, optional
        Where to save the cleaned frame as CSV (without the index). Defaults to
        the file name this notebook has always used; pass ``None`` to skip
        writing a file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh 0..n-1 index.
    """
    # Remove rows where 'Text' is one of the placeholder values
    train = train[~train['Text'].isin(['#NAME', '#NAME?'])]

    # Remove duplicate sentences with same verdict label
    train = train.drop_duplicates(subset=['Text', 'Verdict'])

    # A sentence is contradictory when it carries more than one distinct verdict.
    # Sentences occurring once have exactly one verdict, so grouping the whole
    # frame finds the same sentences as grouping only the duplicated rows.
    verdicts_per_text = train.groupby('Text')['Verdict'].nunique()
    conflicting_texts = verdicts_per_text[verdicts_per_text > 1].index

    # Remove all instances of these sentences from the dataset
    train = train[~train['Text'].isin(conflicting_texts)]

    # Sort by 'Text' for better readability
    train = train.sort_values('Text').reset_index(drop=True)

    # Save the cleaned dataset (optional side effect)
    if output_path is not None:
        train.to_csv(output_path, index=False)

    return train

Preprocess training text data

In [87]:
def preprocess_text(text):
    """Return `text` re-spaced through the NLTK word tokenizer.

    The sentence is split with ``word_tokenize`` and the resulting tokens are
    joined back together with single spaces.

    Other normalisation steps that were tried in this notebook -- lowercasing,
    punctuation stripping, spaCy tokenization, stop-word removal, Porter
    stemming and WordNet lemmatization -- are currently disabled, so none of
    them is applied here.
    """
    return ' '.join(word_tokenize(text))


Data augmentation: Create more sentences with synonym replacement

In [192]:
def synonym_replacement(sentence, replacement_rate=0.1):
    """Return a copy of `sentence` with some words swapped for WordNet synonyms.

    The sentence is tokenized with NLTK; each token is, with probability
    `replacement_rate`, replaced by a randomly chosen lemma from any of its
    WordNet synsets. Tokens with no usable synonym are kept. The tokens are
    re-joined with single spaces.

    Parameters
    ----------
    sentence : str
        Sentence to augment.
    replacement_rate : float, optional
        Per-token probability of attempting a replacement.

    Returns
    -------
    str
        The augmented sentence.
    """
    words = nltk.word_tokenize(sentence)
    new_words = []
    for word in words:
        if random.random() < replacement_rate:
            word_key = word.lower()
            synonyms = [
                # WordNet joins multi-word lemmas with underscores
                # ("ice_cream"); turn them back into ordinary text.
                lemma.name().replace('_', ' ')
                for syn in wordnet.synsets(word)
                for lemma in syn.lemmas()
                # Case-insensitive so "Good" is not "replaced" by "good".
                if lemma.name().lower() != word_key
            ]
            if synonyms:
                new_words.append(random.choice(synonyms))
                continue
        new_words.append(word)
    return ' '.join(new_words)


Data augmentation: Create more sentences by adding noise (typos) to existing sentences

In [200]:
def add_typo_noise(sentence, noise_rate=0.05):
    """Simulate typos by randomly transposing adjacent characters.

    Walks the sentence left to right. At each position a random draw decides
    (with probability `noise_rate`) whether to act; if the character currently
    at that position is alphabetic, it is swapped with its left or right
    neighbour, chosen at random. A swap that would fall outside the string is
    skipped. The result has the same length and the same multiset of
    characters as the input.
    """
    chars = list(sentence)
    total = len(chars)

    for pos in range(total):
        # Draw first, then test the character -- one draw per position.
        if not (random.random() < noise_rate):
            continue
        if not chars[pos].isalpha():
            continue

        neighbour = pos + random.choice([-1, 1])  # previous or next character
        if neighbour < 0 or neighbour >= total:
            continue
        chars[pos], chars[neighbour] = chars[neighbour], chars[pos]

    return ''.join(chars)



Data augmentation function

In [202]:
def augment_sentences(train, noise_rate=0.05, max_stalled_passes=1000):
    """Balance the verdict classes by adding typo-noised copies of sentences.

    Every verdict class smaller than the largest one is topped up to the size
    of the largest class. New sentences are produced by repeatedly passing the
    class's own sentences through ``add_typo_noise``; results identical to
    their source sentence are discarded.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'Text' and 'Verdict'. It is not modified.
    noise_rate : float, optional
        Forwarded to ``add_typo_noise``.
    max_stalled_passes : int, optional
        Safety valve: stop augmenting a class after this many consecutive
        passes over its sentences that produced nothing new (for example when
        no sentence contains a letter that can be swapped). Without it the
        loop would never terminate for such a class.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the generated ones, with a fresh
        0..n-1 index.
    """
    import warnings

    # Count sentences per verdict category
    verdict_counts = train['Verdict'].value_counts()
    target_count = verdict_counts.max()

    augmented_sentences = []
    augmented_verdicts = []

    for verdict, count in verdict_counts.items():
        # Calculate how many additional sentences are needed
        additional_needed = target_count - count
        if additional_needed <= 0:
            continue

        # Source sentences for the current verdict category
        originals = train.loc[train['Verdict'] == verdict, 'Text'].tolist()

        # Keep cycling over the class until the quota is met or we stop making progress
        stalled_passes = 0
        while additional_needed > 0 and stalled_passes < max_stalled_passes:
            produced = 0
            for original in originals:
                if additional_needed <= 0:
                    break
                augmented_sentence = add_typo_noise(original, noise_rate)
                # Avoid adding the exact original sentence
                if augmented_sentence != original:
                    augmented_sentences.append(augmented_sentence)
                    augmented_verdicts.append(verdict)
                    additional_needed -= 1
                    produced += 1
            stalled_passes = 0 if produced else stalled_passes + 1

        if additional_needed > 0:
            warnings.warn(
                f"augment_sentences: could not generate {additional_needed} more "
                f"sentence(s) for verdict {verdict!r}; the class stays under-sampled."
            )

    # Nothing generated: skip the concat with an empty frame, which could
    # otherwise affect the dtype of the result columns.
    if not augmented_sentences:
        return train.reset_index(drop=True)

    # Append the new sentences to the original DataFrame
    augmented_df = pd.DataFrame({'Text': augmented_sentences, 'Verdict': augmented_verdicts})
    return pd.concat([train, augmented_df], ignore_index=True)




<h3>Improving the Model with Feature Engineering</h3>

Sentence length and number of each type of POS tags

In [33]:
def feature_engineer_nltk(X_training, scaler=None, fit_scaler=True):
    """Build scaled [length, #nouns, #verbs, #adjectives, #adverbs] features with NLTK.

    Each sentence is tokenized and POS-tagged with NLTK. Tags are grouped by
    their two-letter Penn Treebank family, so e.g. NNS/NNP count as nouns and
    VBD/VBZ as verbs (an exact match on 'NN', 'VB', 'JJ', 'RB' would count
    only the base tags).

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 5).
    """
    pos_tags_of_interest = ['NN', 'VB', 'JJ', 'RB']  # nouns, verbs, adjectives, adverbs

    list_of_features = []
    for sentence in X_training:
        tokens = word_tokenize(sentence)

        # Count tokens per tag family (first two letters of the fine-grained tag)
        pos_counts = dict.fromkeys(pos_tags_of_interest, 0)
        for _, tag in pos_tag(tokens):
            family = tag[:2]
            if family in pos_counts:
                pos_counts[family] += 1

        # Sentence length (in tokens) followed by the POS counts
        features = [len(tokens)] + [pos_counts[tag] for tag in pos_tags_of_interest]
        list_of_features.append(features)

    features_array = np.array(list_of_features)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)

In [25]:
def feature_engineer_spacy(X_training, scaler=None, fit_scaler=True):
    """Build scaled [length, #NOUN, #VERB, #ADJ, #ADV] features with spaCy.

    Sentences are run through the notebook-level ``nlp`` pipeline; tokens are
    counted by their coarse ``pos_`` tag and the document length (in tokens)
    is prepended.

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 5).
    """
    pos_tags_of_interest = ['NOUN', 'VERB', 'ADJ', 'ADV']  # nouns, verbs, adjectives, adverbs

    features_list = []
    for doc in nlp.pipe(X_training):
        # Count the occurrences of each POS tag of interest
        pos_counts = dict.fromkeys(pos_tags_of_interest, 0)
        for token in doc:
            if token.pos_ in pos_counts:
                pos_counts[token.pos_] += 1

        # Sentence length (in tokens) followed by the POS counts
        features = [len(doc)] + [pos_counts[tag] for tag in pos_tags_of_interest]
        features_list.append(features)

    features_array = np.array(features_list)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)

Sentiment analysis with Textblob

In [36]:
def sentiment_analysis_textblob(X_training, scaler=None, fit_scaler=True):
    """Build scaled [polarity, subjectivity] features with TextBlob.

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 2).
    """
    list_of_features = []

    for sentence in X_training:
        # Analyze sentiment with TextBlob (computed once per sentence)
        sentiment = TextBlob(sentence).sentiment

        # The two sentiment scores are the only features produced here
        list_of_features.append([sentiment.polarity, sentiment.subjectivity])

    features_array = np.array(list_of_features)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)

Sentiment analysis with Vader

In [48]:
# One shared VADER analyzer for the whole notebook
sia = SentimentIntensityAnalyzer()

def sentiment_analysis_vader(X_training, scaler=None, fit_scaler=True):
    """Build scaled [pos, neg, neu, compound] VADER sentiment features.

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 4), columns in the order
        pos, neg, neu, compound.
    """
    score_keys = ('pos', 'neg', 'neu', 'compound')
    list_of_features = []

    for sentence in X_training:
        # Apply VADER sentiment analysis and keep the four scores in a fixed order
        vader_scores = sia.polarity_scores(sentence)
        list_of_features.append([vader_scores[key] for key in score_keys])

    features_array = np.array(list_of_features)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)


Unique words to sentence length ratio with NLTK

In [54]:
def unique_words_ratio_nltk(X_training, scaler=None, fit_scaler=True):
    """Build a scaled one-column feature: distinct tokens / total tokens (NLTK).

    Tokens are compared as-is (case-sensitive); a sentence with no tokens
    gets a ratio of 0.

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 1).
    """
    list_of_features = []

    for sentence in X_training:
        tokens = word_tokenize(sentence)

        # Unique words vs sentence length; guard against empty sentences
        unique_words_ratio = len(set(tokens)) / len(tokens) if tokens else 0
        list_of_features.append([unique_words_ratio])

    features_array = np.array(list_of_features)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)

Unique words to sentence length ratio with SpaCy

In [56]:
def unique_words_ratio_spacy(X_training, scaler=None, fit_scaler=True):
    """Build a scaled one-column feature: distinct tokens / total tokens (spaCy).

    Sentences are tokenized by the notebook-level ``nlp`` pipeline. Token
    texts are compared as-is (case-sensitive); an empty document gets a
    ratio of 0.

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 1).
    """
    list_of_features = []

    for doc in nlp.pipe(X_training):
        # Unique words vs sentence length; guard against empty documents
        unique_words_ratio = len(set(token.text for token in doc)) / len(doc) if doc else 0
        list_of_features.append([unique_words_ratio])

    features_array = np.array(list_of_features)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)

Sentiment analysis with Textblob + Finding ratio of unique words to length of sentence with NLTK

In [88]:
def textblob_and_unique_words_nltk(X_training, scaler=None, fit_scaler=True):
    """Build scaled [polarity, subjectivity, unique-token ratio] features.

    Combines TextBlob sentiment scores with the NLTK-based ratio of distinct
    tokens to total tokens (0 for a sentence with no tokens).

    Parameters
    ----------
    X_training : iterable of str
        Sentences to featurize (any split, despite the name).
    scaler : fitted or unfitted sklearn scaler, optional
        Scaler to use. When omitted, a fresh ``MinMaxScaler`` is fitted on this
        data alone, so each call is scaled independently of any other split.
        To scale validation/test data consistently with training data, pass
        one scaler object to every call.
    fit_scaler : bool, optional
        Only used when `scaler` is given: ``True`` fits it on this data
        (training split), ``False`` only applies it (validation/test split).
        Values outside the fitted range can then leave [0, 1]; with a model
        that needs non-negative inputs consider ``MinMaxScaler(clip=True)``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sentences, 3).
    """
    list_of_features = []

    for sentence in X_training:
        # Analyze sentiment with TextBlob (computed once per sentence)
        sentiment = TextBlob(sentence).sentiment

        # Unique words vs sentence length; guard against empty sentences
        tokens = word_tokenize(sentence)
        unique_words_ratio = len(set(tokens)) / len(tokens) if tokens else 0

        list_of_features.append(
            [sentiment.polarity, sentiment.subjectivity, unique_words_ratio]
        )

    features_array = np.array(list_of_features)

    if scaler is None:
        # Default path: fit a throwaway scaler on this data only
        scaler, fit_scaler = MinMaxScaler(feature_range=(0, 1)), True
    if fit_scaler:
        return scaler.fit_transform(features_array)
    return scaler.transform(features_array)

<h3>Main Code!</h3>

In [89]:
# Main experiment cell: preprocess text -> BoW counts + engineered features -> MultinomialNB.
# Commented-out lines are alternative configurations that were tried; exactly one
# option per stage is active.
# load the data
train = pd.read_csv('train.csv')

# train = preprocess_dataset(train)

X_train = train['Text']
y_train = train['Verdict']

# Apply augmentation
# NOTE(review): as written, augmentation would run before the train/validation split,
# so noised copies of a sentence could land on both sides of the split.
# train_augmented = augment_sentences(train)

# X_train_augmented = train_augmented['Text']
# y_train_augmented = train_augmented['Verdict']

# Tokenize and re-join every sentence (see preprocess_text)
X_train_preprocessed = X_train.apply(preprocess_text)
# X_train_preprocessed = X_train_augmented.apply(preprocess_text)

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_training, X_validation, y_training, y_validation = train_test_split(X_train_preprocessed, y_train, test_size=0.2, random_state=42)
# X_training, X_validation, y_training, y_validation = train_test_split(X_train_preprocessed, y_train_augmented, test_size=0.2, random_state=42)

# Initialize the BoW Vectorizer
bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 2))

# Fit and transform the training split, and transform the validation split
X_training_bow = bow_vectorizer.fit_transform(X_training)
X_validation_bow = bow_vectorizer.transform(X_validation)

# X_training_tfidf = tfidf_vectorizer.fit_transform(X_training)
# X_validation_tfidf = tfidf_vectorizer.transform(X_validation)

# MAX ABS
# Fit and transform the training data
# X_training_bow_scaled = maxabs_scaler.fit_transform(X_training_bow.toarray())
# Only need to transform validation data
# X_validation_bow_scaled = maxabs_scaler.transform(X_validation_bow.toarray())

# MIN MAX
# Fit and transform the training data
# X_training_bow_scaled = minmax_scaler.fit_transform(X_training_bow.toarray())
# Only need to transform validation data
# X_validation_bow_scaled = minmax_scaler.transform(X_validation_bow.toarray())

# Engineered features for the training split, appended as extra columns to the BoW counts.
# NOTE: .toarray() materialises the full dense document-term matrix in memory.
# X_training_engineered = feature_engineer_nltk(X_training)
# X_training_engineered = feature_engineer_spacy(X_training)
# X_training_engineered = sentiment_analysis_textblob(X_training)
# X_training_engineered = sentiment_analysis_vader(X_training)
# X_training_engineered = unique_words_ratio_nltk(X_training)
# X_training_engineered = unique_words_ratio_spacy(X_training)
X_training_engineered = textblob_and_unique_words_nltk(X_training)
X_training_combined = np.hstack([X_training_bow.toarray(), X_training_engineered])

# Same features for the validation split.
# NOTE(review): called with a single argument, the feature function fits its own
# MinMaxScaler on whatever data it receives, so the validation features here are
# scaled with validation min/max rather than the training min/max.
# X_validation_engineered = feature_engineer_nltk(X_validation)
# X_validation_engineered = feature_engineer_spacy(X_validation)
# X_validation_engineered = sentiment_analysis_textblob(X_validation)
# X_validation_engineered = sentiment_analysis_vader(X_validation)
# X_validation_engineered = unique_words_ratio_nltk(X_validation)
# X_validation_engineered = unique_words_ratio_spacy(X_validation)
X_validation_engineered = textblob_and_unique_words_nltk(X_validation)
X_validation_combined = np.hstack([X_validation_bow.toarray(), X_validation_engineered])

# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_training_combined, y_training)
# nb_classifier.fit(X_training_bow, y_training)
# nb_classifier.fit(X_training_bow_scaled, y_training)
# nb_classifier.fit(X_training_tfidf, y_training)

# Make predictions on the validation split
y_prediction = nb_classifier.predict(X_validation_combined)
# y_prediction = nb_classifier.predict(X_validation_bow)
# y_prediction = nb_classifier.predict(X_validation_bow_scaled)
# y_prediction = nb_classifier.predict(X_validation_tfidf)

# Evaluate the model: per-class report, then macro-F1 as the cell's displayed value
print(classification_report(y_validation, y_prediction))

f1_score(y_validation, y_prediction, average='macro')

              precision    recall  f1-score   support

          -1       0.80      0.91      0.85      2926
           0       0.49      0.20      0.29       502
           1       0.66      0.59      0.62      1073

    accuracy                           0.76      4501
   macro avg       0.65      0.57      0.59      4501
weighted avg       0.73      0.76      0.73      4501



0.5873756107527343

In [90]:
# Predict the unlabelled test set with the model trained in the previous cell.
test = pd.read_csv('test.csv')

# Don't run the dataset-level cleaning on the test set: every test row needs a prediction
# test = preprocess_dataset(test)

X_test = test['Text']

# Apply the same text preprocessing that the training/validation sentences went through
X_test_preprocessed = X_test.apply(preprocess_text)

# Reuse the vocabulary fitted on the training split (transform only, no refit)
X_test_bow = bow_vectorizer.transform(X_test_preprocessed)
# X_test_bow_scaled = maxabs_scaler.transform(X_test_bow.toarray())
# X_test_bow_scaled = minmax_scaler.transform(X_test_bow.toarray())
# X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)

# Engineered features must be computed from the *preprocessed* text, exactly as
# for the training and validation splits; feeding the raw sentences here would
# give the classifier features produced by a different pipeline than it was
# trained on.
# X_test_engineered = feature_engineer_nltk(X_test_preprocessed)
# X_test_engineered = feature_engineer_spacy(X_test_preprocessed)
# X_test_engineered = sentiment_analysis_textblob(X_test_preprocessed)
# X_test_engineered = sentiment_analysis_vader(X_test_preprocessed)
# X_test_engineered = unique_words_ratio_nltk(X_test_preprocessed)
# X_test_engineered = unique_words_ratio_spacy(X_test_preprocessed)
X_test_engineered = textblob_and_unique_words_nltk(X_test_preprocessed)
X_test_combined = np.hstack([X_test_bow.toarray(), X_test_engineered])

result = nb_classifier.predict(X_test_combined)
# result = nb_classifier.predict(X_test_bow)
# result = nb_classifier.predict(X_test_bow_scaled)
# result = nb_classifier.predict(X_test_tfidf)

# Attach the predictions positionally, drop the raw sentences and write the
# remaining columns to CSV.
test['Verdict'] = result
test.drop(columns=['Text'], inplace=True)
test.to_csv('A0233573E_Naive_Bayes_with lowercase.csv', index=False)