In [1]:
import re
import csv
import string
from string import digits
import pandas as pd
import numpy as np
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import balanced_accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, matthews_corrcoef, classification_report
from sklearn.metrics import cohen_kappa_score, precision_recall_fscore_support
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('spacytextblob')

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
en_core = spacy.load('en_core_web_sm')
from sklearn.utils import shuffle
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Paul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## NLP Project by Johanna Dahlke, Dmitry Degtyar, Daniel Neufeld, Paul Engelmann

## Reading in the original dataset

In [2]:
def binary_readin():
    """Collapse the emotion labels into positive/negative and save the result.

    Reads ``tweet_emotions.csv``, maps each value of the ``sentiment`` column to
    ``"positive"`` or ``"negative"``, drops every row whose label belongs to
    neither group, and writes the remaining rows (keeping their original row
    index) to ``tweet_emotions_binary.csv``.
    """
    positive_labels = ("fun", "enthusiasm", "love", "happiness", "relief")
    negative_labels = ("hate", "anger", "empty", "worry", "sadness", "boredom")
    label_map = {
        **dict.fromkeys(positive_labels, "positive"),
        **dict.fromkeys(negative_labels, "negative"),
    }

    data = pd.read_csv("tweet_emotions.csv")
    # Series.map yields NaN for every label that is not in label_map, so a single
    # dropna removes exactly the rows that belong to neither class. This replaces
    # a per-row .loc assignment / in-place drop loop with one vectorised pass.
    data["sentiment"] = data["sentiment"].map(label_map)
    data = data.dropna(subset=["sentiment"])

    data.to_csv('tweet_emotions_binary.csv')

In [3]:
def ternary_readin():
    """Collapse the emotion labels into three classes and save the result.

    Reads ``tweet_emotions.csv``, maps the positive and negative emotion labels
    of the ``sentiment`` column to ``"positive"`` / ``"negative"``, removes the
    rows labelled ``"surprise"``, leaves every other label unchanged, and writes
    the rows (keeping their original row index) to
    ``tweet_emotions_three_classes.csv``.
    """
    positive_labels = ("fun", "enthusiasm", "love", "happiness", "relief")
    negative_labels = ("hate", "anger", "empty", "worry", "sadness", "boredom")
    label_map = {
        **dict.fromkeys(positive_labels, "positive"),
        **dict.fromkeys(negative_labels, "negative"),
    }

    data = pd.read_csv("tweet_emotions.csv")
    # Labels missing from label_map are passed through untouched; this replaces
    # the per-row .loc assignment / in-place drop loop with one vectorised pass.
    data["sentiment"] = data["sentiment"].map(lambda label: label_map.get(label, label))
    data = data[data["sentiment"] != "surprise"]

    data.to_csv('tweet_emotions_three_classes.csv')

## Data Cleaning

In [4]:
def remove_punctuation(line_text):
    """Delete every character that is neither a word character nor whitespace.

    Underscores survive because ``\\w`` counts them as word characters.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', line_text)


def remove_numbers(line_text):
    """Return ``line_text`` with the ASCII digits 0-9 removed."""
    kept_characters = (char for char in line_text if char not in digits)
    return ''.join(kept_characters)

def remove_url(line_text):
    """Remove http:// and https:// URLs from ``line_text``.

    Only text that starts with an explicit ``http://`` or ``https://`` scheme
    is matched; everything else is returned unchanged.
    """
    # Pattern pieces, in order:
    #   (https?:\/\/)                      the scheme, http:// or https://
    #   (\s)*                              optional whitespace
    #   (www\.)?                           optional "www."
    #   (\s)*                              optional whitespace
    #   ((\w|\s)+\.)*                      zero or more dot-terminated runs of word chars/spaces
    #   ([\w\-\s]+\/)*                     zero or more '/'-terminated path segments
    #   ([\w\-]+)                          the final host or path component
    #   ((\?)?[\w\s]*=\s*[\w\%&]*)*        trailing query parameters (whitespace tolerated)
    url_pattern = re.compile(
        r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*'
    )
    return url_pattern.sub('', line_text)

def remove_username(line_text):
    """Remove @mentions, #hashtags (tag text included) and scheme:// URLs.

    Despite its name, the pattern has two alternatives:
      * ``[@#][A-Za-z0-9_]+`` - an ``@`` or ``#`` followed by word characters
      * ``\\w+:\\/\\/\\S+``     - any ``scheme://...`` token up to the next whitespace
    """
    # Raw string: the regex escapes (\w, \/, \S) are not valid Python string
    # escapes, so a plain literal triggers an invalid-escape-sequence warning.
    line_text_without_username = re.sub(r"([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)", '', line_text)
    return line_text_without_username

def remove_stopwords(line_text, ignore_case=False):
    """Drop English stop words from a whitespace-tokenised line.

    Args:
        line_text: Text to filter; it is split on whitespace.
        ignore_case: When False (default) a token is removed only if it matches
            a stop word exactly, so capitalised tokens are kept. When True the
            token is lower-cased before the lookup.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the lookup once per call instead of calling stopwords.words() for
    # every token; a set also makes each membership test constant-time.
    english_stopwords = frozenset(stopwords.words('english'))
    kept_words = [
        word for word in line_text.split()
        if (word.lower() if ignore_case else word) not in english_stopwords
    ]
    return ' '.join(kept_words)

def translate_abbreviations(line_text):
    """Expand slang abbreviations using ``slang_abbreviations.txt``.

    The file is read as ``KEY=expansion`` rows. Each whitespace-separated token
    of ``line_text`` is stripped of characters outside ``[a-zA-Z0-9-_.]``,
    upper-cased and looked up among the keys; on a hit the whole original token
    is replaced by the expansion. If a key occurs several times in the file the
    last row wins.

    Returns:
        The tokens joined by single spaces.
    """
    # Load the table into a dict first. A csv.reader is a one-shot iterator, so
    # looping over it inside the token loop would only ever translate the first
    # token. Rows without both a key and a value are skipped.
    with open("slang_abbreviations.txt", 'r') as my_csv_file:
        expansions = {
            row[0]: row[1]
            for row in csv.reader(my_csv_file, delimiter="=")
            if len(row) >= 2
        }

    text_list = line_text.split()
    for index, token in enumerate(text_list):
        lookup_key = re.sub('[^a-zA-Z0-9-_.]', '', token).upper()
        if lookup_key in expansions:
            text_list[index] = expansions[lookup_key]
    return ' '.join(text_list)

def remove_hashtags(line_text):
    """Delete every ``#`` character while keeping the text that followed it."""
    pieces_between_hashes = line_text.split('#')
    return ''.join(pieces_between_hashes)

def lemmatizing(line_text):
    """Run ``line_text`` through the ``en_core`` pipeline and join the token lemmas with spaces."""
    lemmas = []
    for token in en_core(line_text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def clean_line(line_text):
    """Apply the full cleaning pipeline to one line and return it lower-cased.

    The steps run in this fixed order: digits, usernames, URLs, punctuation,
    hashtag signs, stop words, abbreviation expansion, lemmatisation. The
    lower-casing happens last, after all of them.
    """
    cleaning_steps = (
        remove_numbers,
        remove_username,
        remove_url,
        remove_punctuation,
        remove_hashtags,
        remove_stopwords,
        translate_abbreviations,
        lemmatizing,
    )
    cleaned = line_text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned.lower()

In [5]:
def extract_binary_sentiment(corpus):
    """Label a text as positive or negative from its TextBlob polarity.

    The text is run through the module-level ``nlp`` pipeline and the
    ``doc._.blob.polarity`` value is read.

    Returns:
        ``"positive"`` when the polarity is strictly greater than 0, otherwise
        ``"negative"`` (a polarity of exactly 0 therefore counts as negative).
    """
    # The subjectivity score was previously fetched here but never used.
    polarity = nlp(corpus)._.blob.polarity
    return "positive" if polarity > 0 else "negative"
    
def extract_ternary_sentiment(corpus, neutral_band=0.01):
    """Label a text as neutral, positive or negative from its TextBlob polarity.

    The text is run through the module-level ``nlp`` pipeline and the
    ``doc._.blob.polarity`` value is read.

    Args:
        corpus: Text to classify.
        neutral_band: Half-width of the neutral zone around 0. Polarities whose
            absolute value is at most this are ``"neutral"``.

    Returns:
        ``"neutral"``, ``"positive"`` or ``"negative"``.
    """
    polarity = nlp(corpus)._.blob.polarity
    if abs(polarity) <= neutral_band:
        return "neutral"
    # One shared threshold for both sides. The previous positive cut-off (0.1)
    # did not match the neutral band (0.01), so mildly positive texts with a
    # polarity in (0.01, 0.1] fell through to "negative".
    return "positive" if polarity > 0 else "negative"

## Plots

In [6]:
def learning_curves_display(data_choice, vect_choice, classifiers, X, y, ylim=(0.1, 1.01), cv=None, n_jobs=None,
                        train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot and save one learning curve per classifier.

    Args:
        data_choice: Dataset name, forwarded for plot titles and file names.
        vect_choice: Vectoriser name, forwarded for plot titles and file names.
        classifiers: Iterable of estimators; each one gets its own figure.
        X, y: Features and labels forwarded to ``plot_learning_curve``.
        ylim: Y-axis limits for every plot. The default ``(0.1, 1.01)`` is the
            range this function previously forced for every dataset, so callers
            that do not pass ``ylim`` get the same plots as before.
        cv, n_jobs, train_sizes: Forwarded unchanged to ``plot_learning_curve``.
    """
    # ylim used to be overwritten by two identical branches on data_choice,
    # which silently discarded the caller's value; it is now honoured.
    for clf in classifiers:
        plot_learning_curve(data_choice, vect_choice, clf, X, y, ylim, cv, n_jobs, train_sizes)

In [7]:
def metrics_display(data_choice, vect_choice, classifiers, X, y):
    """Fit every classifier on a 70% training split and score it on a 15% validation split.

    The data is split 70/30 and the 30% remainder is halved into validation and
    test parts (both splits use ``random_state=42``). Only the validation part
    is used here; the test part is not touched.

    Returns:
        A dict mapping each classifier's class name to the metrics dict
        returned by ``calc_metrics``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=True
    )
    X_valid, _X_test, y_valid, _y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42, shuffle=True
    )

    return {
        type(clf).__name__: calc_metrics(data_choice, vect_choice, clf, X_train, y_train, X_valid, y_valid)
        for clf in classifiers
    }

In [8]:
def calc_metrics(data_choice, vect_choice, clf, X_train, y_train, X_valid, y_valid):
    """Fit ``clf``, save its validation confusion matrix and return its scores.

    Args:
        data_choice: Dataset name; ``"complete"`` selects a larger figure. Also
            used in the plot title and the output file name.
        vect_choice: Vectoriser name, used in the plot title and file name.
        clf: Estimator providing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels used for every metric.

    Returns:
        Dict with the keys ``accuracy`` (balanced accuracy), ``precision``,
        ``recall``, ``f1`` (all three weighted averages), ``matthew_coeff``
        and ``cohen_kappa``.

    Side effects:
        Writes ``confusion_matrix_<clf>_<data_choice>_<vect_choice>.png`` and
        prints progress messages.
    """
    figsize = (15, 12) if data_choice == "complete" else (10, 6)
    fig, axes = plt.subplots(figsize=figsize)

    clf_name = type(clf).__name__
    results = {}

    print(f"Started training: {clf_name}")

    clf.fit(X_train, y_train)

    print(f"Finished training: {clf_name}")

    # Only validation predictions are needed; training-set predictions were
    # previously computed and then discarded.
    y_pred = clf.predict(X_valid)

    # from_predictions draws immediately. Passing ax keeps the plot on our
    # figure; without it an extra stray figure was created for every
    # classifier before the matrix was drawn a second time.
    ConfusionMatrixDisplay.from_predictions(y_valid, y_pred, ax=axes)

    axes.set_title(f'{clf_name}, {data_choice.capitalize()} dataset, {vect_choice.capitalize()}')
    # White text so the saved image is readable on a dark background.
    axes.xaxis.label.set_color('white')
    axes.yaxis.label.set_color('white')
    axes.title.set_color('white')
    axes.tick_params(axis='x', colors='white')
    axes.tick_params(axis='y', colors='white')

    results["accuracy"] = balanced_accuracy_score(y_valid, y_pred)

    # pos_label only applies to average="binary"; with a weighted average it is
    # ignored, so it is no longer passed.
    precision, recall, f1, _support = precision_recall_fscore_support(y_valid, y_pred, average="weighted")
    results["precision"] = precision
    results["recall"] = recall
    results["f1"] = f1

    results["matthew_coeff"] = matthews_corrcoef(y_valid, y_pred)
    results["cohen_kappa"] = cohen_kappa_score(y_valid, y_pred)

    plt.tight_layout()
    fig.savefig(f'confusion_matrix_{clf_name}_{data_choice}_{vect_choice}.png')
    print(f'Saved to confusion_matrix_{clf_name}_{data_choice}_{vect_choice}.png')

    return results

In [9]:
def plot_learning_curve(data_choice, vect_choice, clf, X, y, ylim=(0.5, 1.01), cv=None, n_jobs=None, 
                        train_sizes=np.linspace(0.1, 1.0, 5)):
    """Compute, draw, save and show the learning curve of one classifier.

    The mean training and cross-validation scores returned by
    ``learning_curve`` are plotted against the training-set size, each with a
    shaded band of one standard deviation. The figure is written to
    ``learning_curve_<clf>_<data_choice>_<vect_choice>.png`` and then shown.

    Args:
        data_choice, vect_choice: Used in the title and the output file name.
        clf: Estimator passed to ``learning_curve``.
        X, y: Features and labels.
        ylim: Y-axis limits of the plot.
        cv, n_jobs, train_sizes: Forwarded to ``learning_curve``.
    """
    clf_name = type(clf).__name__
    output_name = f'learning_curve_{clf_name}_{data_choice}_{vect_choice}.png'

    fig, ax = plt.subplots(figsize=(10,6))
    ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.set_title(f"{clf_name}, {data_choice.capitalize()} dataset, {vect_choice.capitalize()}")

    sizes, fit_scores, cv_scores = learning_curve(clf, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for the training and the CV curve
    curves = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    )

    ax.grid()
    # Shaded +/- one standard deviation bands first ...
    for mean, std, colour, _label in curves:
        ax.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)

    # White text so the saved image is readable on a dark background.
    for text_item in (ax.xaxis.label, ax.yaxis.label, ax.title):
        text_item.set_color('white')
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, colors='white')

    # ... then the mean curves on top.
    for mean, _std, colour, label in curves:
        ax.plot(sizes, mean, "o-", color=colour, label=label)
    ax.legend(loc="best")

    fig.savefig(output_name)
    print(f'Saved to {output_name}')
    plt.show()

In [10]:
def get_binary_df():
    """Load ``tweet_emotions_binary_cleaned.csv`` into a DataFrame."""
    binary_frame = pd.read_csv("tweet_emotions_binary_cleaned.csv")
    return binary_frame

def get_ternary_df():
    """Load ``tweet_emotions_three_classes_cleaned.csv`` into a DataFrame."""
    ternary_frame = pd.read_csv("tweet_emotions_three_classes_cleaned.csv")
    return ternary_frame

def get_complete_df():
    """Load ``tweet_emotions_cleaned.csv`` into a DataFrame."""
    complete_frame = pd.read_csv("tweet_emotions_cleaned.csv")
    return complete_frame

In [11]:
#train the models for the learning curves and metrics
def train_models(classifiers, data_choice="binary", vect_choice="tfidf", text="text", labels="sentiment"):
    """Vectorise a cleaned dataset, plot learning curves and collect metrics.

    Args:
        classifiers: Estimators to evaluate.
        data_choice: ``"binary"``, ``"ternary"`` or ``"complete"``. Any other
            value falls back to the binary dataset.
        vect_choice: ``"tfidf"`` (TF-IDF features) or ``"spacy"`` (document
            vectors from the module-level ``nlp`` pipeline).
        text: Name of the column holding the text to vectorise. The column must
            exist in the loaded CSV; ``clean_all`` writes the cleaned text to
            ``cleaned_content``.
        labels: Name of the label column (always ``"sentiment"`` for the
            complete dataset).

    Returns:
        DataFrame with one column per classifier class name and one row per
        metric, built from the dict returned by ``metrics_display``.

    Raises:
        ValueError: If ``vect_choice`` is not ``"tfidf"`` or ``"spacy"``.
    """
    if data_choice == "ternary":
        df = get_ternary_df()
    elif data_choice == "complete":
        df = get_complete_df()
        labels = "sentiment"
    else:
        # "binary" and any unrecognised data_choice both load the binary data.
        df = get_binary_df()

    # Filter on the same columns that are used below.
    df = remove_empty_rows(df, text=text, labels=labels)

    if vect_choice == "tfidf":
        vectorizer = TfidfVectorizer()
        # Fit and transform the same column. The vocabulary used to be fitted
        # on a hard-coded "cleaned_content" while df[text] was transformed.
        # NOTE(review): the vectoriser is fitted on the full dataset before the
        # train/validation split made in metrics_display.
        X = vectorizer.fit_transform(df[text])
        feature_names = vectorizer.get_feature_names_out()
        print("Amount of features: ", len(feature_names))
        print("Features: ", feature_names)
    elif vect_choice == "spacy":
        X = [nlp(document).vector for document in df[text]]
    else:
        # Fail here with a clear message instead of passing X=None downstream.
        raise ValueError(f"Unknown vect_choice {vect_choice!r}; expected 'tfidf' or 'spacy'")

    y = df[labels]

    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    learning_curves_display(data_choice, vect_choice, classifiers, X, y, cv=cv, n_jobs=4)

    results = metrics_display(data_choice, vect_choice, classifiers, X, y)

    df_results = pd.DataFrame(results)

    plt.show()

    return df_results

    

In [12]:
#bar plot for accuracy and f1 comparison
def create_bar_plot(data_choice, vect_choices, score_type):
    """Draw a grouped bar chart comparing one score across vectorisers.

    For every entry of ``vect_choices`` the row ``score_type`` is read from
    ``<data_choice>_<vect_choice>_results_raw.csv``. The rows are plotted side
    by side, each bar is annotated with its value rounded to two decimals, and
    the figure is saved to ``<score_type>_scores_comparison_<data_choice>.png``
    and shown.
    """
    fig, ax = plt.subplots(figsize=(8 , 4))

    per_vectorizer = {}
    lowest = 10000000000
    highest = 0
    for vect_choice in vect_choices:
        raw_results = pd.read_csv(f"{data_choice}_{vect_choice}_results_raw.csv", index_col=0)
        score_row = raw_results.loc[score_type]
        per_vectorizer[vect_choice] = score_row
        lowest = min(lowest, min(score_row))
        highest = max(highest, max(score_row))

    comparison = pd.DataFrame.from_dict(per_vectorizer)

    # Tighten the y-range around the observed scores so differences are visible.
    ax.set_ylim((lowest-0.01, highest+0.03))
    ax.set_title(f'{score_type.capitalize()} Comparison, {data_choice.capitalize()} Dataset')
    ax.set_ylabel(f'{score_type.capitalize()} Scores')
    # White text so the saved image is readable on a dark background.
    for text_item in (ax.xaxis.label, ax.yaxis.label, ax.title):
        text_item.set_color('white')
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, colors='white')

    comparison.plot.bar(ax=ax)
    plt.xticks(rotation=0)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Write each bar's rounded height just above it.
    current_ax = plt.gca()
    for bar in current_ax.patches:
        bar_height = bar.get_height()
        current_ax.text(bar.get_x() + bar.get_width()/2, bar_height+0.0025, str(round(bar_height, 2)),
                        ha='center', va='baseline', rotation=0 ,color='black', fontsize=10)

    output_name = f'{score_type}_scores_comparison_{data_choice}.png'
    fig.tight_layout()
    fig.savefig(output_name)
    print(f'Saved to {output_name}')

    plt.show()

In [13]:
#if any rows were completely empty after cleaning them, this removes them
def remove_empty_rows(df, text="cleaned_content", labels="sentiment"):
    """Drop rows whose text or label is an empty string or missing.

    Args:
        df: DataFrame to filter. It is modified in place.
        text: Name of the text column.
        labels: Name of the label column.

    Returns:
        The same ``df`` object, with the offending rows removed.
    """
    # Assign the replaced column back to the frame. Calling
    # df[col].replace(..., inplace=True) is chained assignment: it warns on
    # pandas 2.x and leaves df untouched under Copy-on-Write.
    for column in (text, labels):
        df[column] = df[column].replace("", np.nan)
    df.dropna(subset=[text, labels], inplace=True)
    return df

In [14]:
#plot matthew correlation table
def plot_matthew(data_choice, vect_choices, score_type="matthew_coeff"):
    """Render one score as a table (rows: vectorisers, columns: classifiers).

    For every entry of ``vect_choices`` the row ``score_type`` is read from
    ``<data_choice>_<vect_choice>_results_raw.csv``. Values are rounded to two
    decimals, and the table is saved to ``<data_choice>_matthews_corr.png`` and
    shown.

    Args:
        data_choice: Dataset name used in the input and output file names.
        vect_choices: Vectoriser names; one table row each.
        score_type: Row label to read from the result files.
    """
    score_names = {"matthew_coeff": "Matthews Correlation Coefficient"}
    # Short column headers for the classifiers used in this notebook; any other
    # classifier is shown under its class name instead of being mislabelled.
    short_names = {"LinearSVC": "SVM", "RandomForestClassifier": "RF", "MLPClassifier": "MLP"}

    fig, ax = plt.subplots(figsize=(5, 5))
    fig.patch.set_visible(False)
    ax.axis('off')

    scores = {}
    for vect_choice in vect_choices:
        df = pd.read_csv(f"{data_choice}_{vect_choice}_results_raw.csv", index_col=0)
        scores[vect_choice] = df.loc[score_type]

    table_df = pd.DataFrame.from_dict(scores).round(2).transpose()
    col_labels = [short_names.get(name, name) for name in table_df.columns]

    # Fall back to the raw score name instead of raising KeyError for scores
    # that have no display title.
    ax.set_title(score_names.get(score_type, score_type), y=0.575)
    ax.title.set_color('white')
    ax.table(cellText=table_df.values, colLabels=col_labels, rowLabels=list(table_df.index),
             loc='center', cellLoc="center")

    fig.tight_layout()
    fig.savefig(f'{data_choice}_matthews_corr.png')
    print(f'Saved to {data_choice}_matthews_corr.png')

    plt.show()
    
# plots the accuracy, precision, recall and f1 score as a table
def plot_f1(data_choice, vect_choices):
    """Render accuracy, precision, recall and F1 as one table per vectoriser.

    For every entry of ``vect_choices`` the rows ``accuracy``, ``precision``,
    ``recall`` and ``f1`` are read from
    ``<data_choice>_<vect_choice>_results_raw.csv`` and rounded to two
    decimals. The tables are drawn side by side, saved to
    ``<data_choice>_prec_recall_f1.png`` and shown.

    Args:
        data_choice: Dataset name used in the input and output file names.
        vect_choices: Vectoriser names; one table panel each.
    """
    metric_rows = ["accuracy", "precision", "recall", "f1"]
    row_labels = ("Accuracy", "Precision", "Recall", "F1")
    # Short column headers for the classifiers used in this notebook; any other
    # classifier is shown under its class name.
    short_names = {"LinearSVC": "SVM", "RandomForestClassifier": "RF", "MLPClassifier": "MLP"}

    # One panel per vectoriser (3.5 inches wide each, i.e. 7x5 for two) instead
    # of a fixed pair, so a different number of vectorisers no longer leaves a
    # blank axis or raises IndexError.
    n_panels = max(len(vect_choices), 1)
    fig, axes = plt.subplots(1, n_panels, figsize=(3.5 * n_panels, 5), squeeze=False)
    fig.patch.set_visible(False)

    for panel, vect_choice in zip(axes[0], vect_choices):
        panel.axis('off')
        df = pd.read_csv(f"{data_choice}_{vect_choice}_results_raw.csv", index_col=0)
        # Select the metrics by label rather than by position so the row
        # labels below always match the values.
        df = df.loc[metric_rows].round(2)
        col_labels = [short_names.get(name, name) for name in df.columns]
        panel.set_title(f"{vect_choice.capitalize() }", y=0.61)
        panel.title.set_color('white')
        panel.table(cellText=df.values, colLabels=col_labels, loc='center', rowLabels=row_labels, cellLoc="center")

    fig.tight_layout()
    fig.savefig(f'{data_choice}_prec_recall_f1.png')
    print(f'Saved to {data_choice}_prec_recall_f1.png')

    plt.show()

In [15]:
#Cleans all data with the previously defined functions.
#This has already been done for the included CSV files, so there is no need to clean them again;
#the function is kept here only for completeness' sake.
def clean_all():
    """Relabel and clean the three datasets and write the ``*_cleaned.csv`` files.

    * ``tweet_emotions.csv`` keeps its labels and only gets a
      ``cleaned_content`` column.
    * ``tweet_emotions_binary.csv`` and ``tweet_emotions_three_classes.csv``
      have their ``sentiment`` column overwritten with the label derived from
      the raw ``content`` by ``extract_binary_sentiment`` /
      ``extract_ternary_sentiment``, then get ``cleaned_content`` as well;
      columns whose name starts with ``Unnamed`` are dropped.

    Rows that end up with empty text or label are removed from all three
    frames before they are written.
    """
    df_multiple = pd.read_csv("tweet_emotions.csv")

    df_binary = pd.read_csv("tweet_emotions_binary.csv")
    df_binary['sentiment'] = df_binary['content'].apply(extract_binary_sentiment)

    df_three_classes = pd.read_csv("tweet_emotions_three_classes.csv")
    df_three_classes['sentiment'] = df_three_classes['content'].apply(extract_ternary_sentiment)

    df_multiple['cleaned_content'] = df_multiple.content.apply(clean_line)
    df_multiple = remove_empty_rows(df_multiple)

    df_binary['cleaned_content'] = df_binary.content.apply(clean_line)
    named_binary_columns = ~df_binary.columns.str.contains('^Unnamed')
    df_binary = remove_empty_rows(df_binary.loc[:, named_binary_columns])

    df_three_classes['cleaned_content'] = df_three_classes.content.apply(clean_line)
    named_ternary_columns = ~df_three_classes.columns.str.contains('^Unnamed')
    df_three_classes = remove_empty_rows(df_three_classes.loc[:, named_ternary_columns])

    df_binary.to_csv('tweet_emotions_binary_cleaned.csv')
    df_three_classes.to_csv('tweet_emotions_three_classes_cleaned.csv')
    df_multiple.to_csv('tweet_emotions_cleaned.csv')

In [17]:
# Fixed seeds make the stochastic estimators reproducible across runs; 42 is
# the seed already used for the train/validation split in this notebook.
classifiers = [
    svm.LinearSVC(random_state=42),
    RandomForestClassifier(random_state=42),
    MLPClassifier(max_iter=50, early_stopping=True, random_state=42),
]
data_choices = ["binary", "ternary", "complete"]
vect_choices = ["tfidf", "spacy"] 

for data_choice in data_choices:
    # Train and score every classifier once per vectoriser and persist the raw
    # metrics, which the comparison plots below read back from disk.
    for vect_choice in vect_choices:
        df_results = train_models(classifiers, data_choice, vect_choice, text="cleaned_content", labels="sentiment")
        df_results.to_csv(f"{data_choice}_{vect_choice}_results_raw.csv")
          
    create_bar_plot(data_choice, vect_choices, "f1")
    create_bar_plot(data_choice, vect_choices, "accuracy")
        
    plot_matthew(data_choice, vect_choices, "matthew_coeff")
    plot_f1(data_choice, vect_choices)

