In [34]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import nltk

# Bag of words and Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.preprocessing import label_binarize

# Classification methods
from sklearn.naive_bayes import MultinomialNB   # Naive Bayes
from sklearn.tree import DecisionTreeClassifier  # Decision Tree
from sklearn.neural_network import MLPClassifier # Multi-Layer Perceptron
from sklearn.neighbors import KNeighborsClassifier # KNN


# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import classification_report

Test Data

In [54]:
data_path = 'Full_dataset/'
rdfTrain = pd.read_csv(data_path + 'train.tsv', sep = '\t', header = None, names = ['text', 'label', 'id'])
rdfTrain.head()

Unnamed: 0,text,label,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


Validation Data

In [56]:
# validation set
data_path = 'Full_dataset/'
rdfDev = pd.read_csv(data_path + 'dev.tsv', sep = '\t', header = None, names = ['text', 'label', 'id'])
rdfDev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5426 non-null   object
 1   label   5426 non-null   object
 2   id      5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


Test Data

In [58]:
# test set
data_path = 'Full_dataset/'
rdfTest = pd.read_csv(data_path + 'test.tsv', sep = '\t', header = None, names = ['text', 'label', 'id'])
rdfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5427 non-null   object
 1   label   5427 non-null   object
 2   id      5427 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


In [59]:
def label_neutral(row):
    if row['label'] == '27':
        return 1
    else:
        return 0

In [60]:
pos_labels = ['admiration','approval', 'amusement', 'caring', 'desire', 
              'excitement', 'gratitude', 'joy', 'love','optimism', 'pride', 'relief']
neg_labels = ['anger', 'annoyance', 'disappointment', 'disapproval', 'disgust',
              'embarrassment','fear', 'grief', 'nervousness', 'remorse', 'sadness']
ambi_labels = ['confusion', 'curiosity', 'realization', 'surprise']

In [61]:
emotion_to_idx = {
    'admiration' : '0',
    'amusement' : '1',
    'anger' : '2',
    'annoyance' : '3',
    'approval' : '4',
    'caring' : '5',
    'confusion' : '6',
    'curiosity' : '7',
    'desire' : '8',
    'disappointment' : '9',
    'disapproval' : '10',
    'disgust' : '11',
    'embarrassment' : '12',
    'excitement' : '13',
    'fear' : '14',
    'gratitude' : '15',
    'grief' : '16',
    'joy' : '17',
    'love' : '18',
    'nervousness' : '19',
    'optimism' : '20',
    'pride' : '21',
    'realization' : '22',
    'relief' : '23',
    'remorse' : '24',
    'sadness' : '25',
    'surprise' : '26',
    'neutral' : '27'
}

In [62]:
label_to_emotion = {j:i for i, j in emotion_to_idx.items()}
label_to_emotion

{'0': 'admiration',
 '1': 'amusement',
 '2': 'anger',
 '3': 'annoyance',
 '4': 'approval',
 '5': 'caring',
 '6': 'confusion',
 '7': 'curiosity',
 '8': 'desire',
 '9': 'disappointment',
 '10': 'disapproval',
 '11': 'disgust',
 '12': 'embarrassment',
 '13': 'excitement',
 '14': 'fear',
 '15': 'gratitude',
 '16': 'grief',
 '17': 'joy',
 '18': 'love',
 '19': 'nervousness',
 '20': 'optimism',
 '21': 'pride',
 '22': 'realization',
 '23': 'relief',
 '24': 'remorse',
 '25': 'sadness',
 '26': 'surprise',
 '27': 'neutral'}

In [63]:
def label_pos_neg_neutral(row):
    '''
    Sentiment Analysis label:
    0 - Negative
    1 - Positive
    2 - Neutral/Ambigous
    '''
    sentiment = [0, 0, 0]
    labels = str(row['label']).split(",")
    for l in labels:
        label = label_to_emotion[l] 
        if label in pos_labels:
            sentiment[1] += 1
        elif label in neg_labels:
            sentiment[0] += 1
        else:
            sentiment[2] += 1
    return np.argmax(np.array(sentiment))

In [64]:
anger_list = [ "anger", "annoyance", "disapproval", "disgust"]
fear_list = ["fear", "nervousness"]
joy_list = ["joy", "amusement", "approval", "excitement", "gratitude","love", "optimism", "relief", "pride", "admiration", "desire", "caring"]
sadness_list = ["sadness", "disappointment", "embarrassment", "grief", "remorse"]
surprise_list = ["surprise", "realization", "confusion", "curiosity"]

In [65]:
def label_emotion_group(row):
    '''
    Groupping Emotion Label:
    0 - Anger, 1 - Fear, 2- Joy, 
    3 - Sadness, 4 - Surprise, 5 - Neutral/Ambigous
    '''
    sentiment = [0]*6
    labels = str(row['label']).split(",")
    for l in labels:
        if l == '27': 
            return 5 # Neutral
        
        label = label_to_emotion[l]
        if label in anger_list:
            sentiment[0] += 1
        elif label in fear_list:
            sentiment[1] += 1
        elif label in joy_list:
            sentiment[2] += 1
        elif label in sadness_list:
            sentiment[3] += 1
        elif label in surprise_list:
            sentiment[4] += 1
        else:
            sentiment[5] += 1
    return np.argmax(np.array(sentiment))

In [66]:
label_idx = {
    '0' :  '0', # admiration, desire
    '10' :  '1', # disapproval, disgust, disappointment, embarrassment 
    '2' : '2',  # anger, annoyance 
    '13' : '3', # excitement, amusement
    '18' : '4', # love, caring
    '4' : '5',  # approval
    '15' : '6', # gratitude
    '7' : '7',  # curiosity
    '25' : '8', # sadness , grief, remorse 
    '17' : '9', # joy , pride, relief
    '20' : '10', # optimism
    '6' : '11', # confusion
    '22' : '12', # realization
    '26' : '13', # surprise
    '14' :  '14' # fear, nervousness
}

In [67]:
# Ignoring neutral and merging emotions 
def multi_class(df):
    data = []
    for idx, row in df.iterrows():
        labels = str(row['label']).split(",")
        for l in labels:
            l_txt = label_to_emotion[l]
            if l_txt == 'neutral':
                continue
            elif l_txt == 'desire':
                l = emotion_to_idx['admiration']
            elif l_txt == 'amusement':
                l = emotion_to_idx['excitement']
            elif l_txt == 'pride' or l_txt == 'relief':
                l = emotion_to_idx['joy']
            elif l_txt == 'caring':
                l = emotion_to_idx['love']
            elif l_txt == 'embarrassment' or l_txt == 'disgust' or l_txt == 'disappointment':
                l = emotion_to_idx['disapproval']
            elif l_txt == 'nervousness':
                l = emotion_to_idx['fear']
            elif l_txt == 'remorse' or l_txt == 'grief':
                l = emotion_to_idx['sadness']
            elif l_txt == 'annoyance':
                l = emotion_to_idx['anger']
            idx = label_idx[l]
            data.append([row['text'],idx])
    data = np.array(data)
    new_df = pd.DataFrame(data=data, columns=['text','labels'])
    return new_df

In [68]:
def emotion_label(df):
    data = []
    for idx, row in df.iterrows():
        labels = str(row['label']).split(",")
        for l in labels:
            data.append([row['text'],l])
    data = np.array(data)
    new_df = pd.DataFrame(data=data, columns=['text','labels'])
    return new_df

In [69]:
def transformData(rdfTrain, rdfDev, rdfTest, n_categories = 2):
    
    dfTrain, dfDev, dfTest = rdfTrain, rdfDev, rdfTest
    if n_categories == 2:
        dfTrain["labels"] = dfTrain.apply(lambda row: label_neutral(row), axis = 1)
        dfDev["labels"] = dfDev.apply(lambda row: label_neutral(row), axis = 1)
        dfTest["labels"] = dfTest.apply(lambda row: label_neutral(row), axis = 1)
    
    if n_categories == 3:
        dfTrain["labels"] = dfTrain.apply(lambda row: label_pos_neg_neutral(row), axis = 1)
        dfDev["labels"] = dfDev.apply(lambda row: label_pos_neg_neutral(row), axis = 1)
        dfTest["labels"] = dfTest.apply(lambda row: label_pos_neg_neutral(row), axis = 1)
        
    if n_categories == 6:
        dfTrain["labels"] = dfTrain.apply(lambda row: label_emotion_group(row), axis = 1)
        dfDev["labels"] = dfDev.apply(lambda row: label_emotion_group(row), axis = 1)
        dfTest["labels"] = dfTest.apply(lambda row: label_emotion_group(row), axis = 1)
    
    if n_categories == 28:
        dfTrain = multi_class(dfTrain)
        print(dfTrain.head())
        dfDev = multi_class(dfDev)
        dfTest = multi_class(dfTest)
    
    print("Training distribution: ", dfTrain.labels.value_counts())
    print("Dev data distribution: ", dfDev.labels.value_counts())
    print("Test data distribution: ", dfTest.labels.value_counts())
        
    return dfTrain, dfDev, dfTest

In [102]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 2)

Training distribution:  labels
0    30587
1    12823
Name: count, dtype: int64
Dev data distribution:  labels
0    3834
1    1592
Name: count, dtype: int64
Test data distribution:  labels
0    3821
1    1606
Name: count, dtype: int64


In [72]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 3)

Training distribution:  labels
2    17021
1    16628
0     9761
Name: count, dtype: int64
Dev data distribution:  labels
1    2106
2    2096
0    1224
Name: count, dtype: int64
Test data distribution:  labels
2    2147
1    2026
0    1254
Name: count, dtype: int64


In [73]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 6)

Training distribution:  labels
2    16327
5    14219
0     5829
4     3888
3     2489
1      658
Name: count, dtype: int64
Dev data distribution:  labels
2    2067
5    1766
0     748
4     466
3     289
1      90
Name: count, dtype: int64
Test data distribution:  labels
2    1977
5    1787
0     777
4     494
3     304
1      88
Name: count, dtype: int64


In [70]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 28)

                                                text labels
0                     WHY THE FUCK IS BAYLESS ISOING      2
1                        To make her feel threatened     14
2                             Dirty Southern Wankers      2
3  OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...     13
4  Yes I heard abt the f bombs! That has to be wh...      6
Training distribution:  labels
0     4771
1     4387
2     4037
3     3181
4     3173
5     2939
6     2662
7     2191
8     1948
9     1716
10    1581
11    1368
12    1110
13    1060
14     760
Name: count, dtype: int64
Dev data distribution:  labels
1     587
0     565
2     498
4     405
3     399
5     397
6     358
7     248
8     224
10    209
9     205
11    152
13    129
12    127
14    111
Name: count, dtype: int64
Test data distribution:  labels
0     587
1     578
2     518
4     373
3     367
6     352
5     351
7     284
8     218
9     188
10    186
11    153
12    145
13    141
14    101
Name: count, dtype: int64


In [103]:
dfTrain = dfTrain[['text', 'labels']]
dfDev = dfDev[['text', 'labels']]
dfTest = dfTest[['text', 'labels']]

In [104]:
data_path = 'Data/'
dfTrain.to_csv(data_path + 'train.tsv', sep='\t', header=False, index=False)
dfDev.to_csv(data_path + 'dev.tsv', sep='\t', header=False, index=False)
dfTest.to_csv(data_path + 'test.tsv', sep='\t', header=False, index=False)

Splitting Data

In [105]:
def splitData(trainFeatures, devFeatures, dfTrain, dfDev):
    xTrain, yTrain = trainFeatures, dfTrain['labels']
    xDev, yDev = devFeatures, dfDev['labels']
    
    print("Dev : ", xDev.shape, yDev.shape)
    print("Train : ", xTrain.shape, yTrain.shape)
    
    return xTrain, yTrain, xDev, yDev

TF-IDF & Bag of Words

In [106]:
def featureGeneration(dfTrain, dfDev, method = 'BOW'):
    if method == 'BOW':
        #tokenizer to remove unwanted elements from out data like symbols and numbers
        #token = RegexpTokenizer(r'[a-zA-Z0-9]+')
        token = TweetTokenizer(strip_handles=True, reduce_len=True)
        cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
        trainFeatures = cv.fit_transform(dfTrain['text'])
        devFeatures = cv.transform(dfDev['text'])
    
    if method == 'TF-IDF':
        tf = TfidfVectorizer()
        trainFeatures = tf.fit_transform(dfTrain['text'])
        devFeatures = tf.transform(dfDev['text'])
        
    return trainFeatures, devFeatures

Naive-Bayes

In [107]:
def naiveBayes(xTrain, yTrain, xDev, yDev):
    # Model Generation Using Multinomial Naive Bayes
    clf = MultinomialNB().fit(xTrain, yTrain)
    predicted= clf.predict(xDev)
    print("MultinomialNB Accuracy:", metrics.accuracy_score(yDev, predicted))

In [129]:
def model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes'):
    
    num_classes = len(yTrain.unique())
    print("Num classes: ", num_classes)
    
    if method == 'Naive Bayes':
        clf = MultinomialNB()
    
    if method == 'Decision Trees':
        clf = DecisionTreeClassifier()

    if method == 'MLP':
        clf = MLPClassifier()
    
    if method == 'KNN':
        clf = KNeighborsClassifier(n_neighbors=5)
        
    clf = clf.fit(xTrain, yTrain)
    predicted= clf.predict(xDev)
    
#   Evaluation metrics

    Y_test = label_binarize(yDev, classes=[0, 1, 2])
    Y_score = label_binarize(predicted, classes=[0, 1, 2])
    
    
    target_names = [str(i) for i in range(num_classes)]
    print(classification_report(yDev, predicted, target_names=target_names))
    acc = metrics.accuracy_score(Y_test, Y_score)
    precision = metrics.precision_score(Y_test, Y_score, average='macro')
    recall = metrics.recall_score(Y_test, Y_score,average='macro' )
    #roc_auc = metrics.roc_auc_score(Y_test, Y_score,average='macro')
    f1 = metrics.f1_score(Y_test, Y_score, average='macro')
    confusion_matrix = metrics.confusion_matrix(yDev, predicted)

    #print(str(acc) + "\t" + str(precision) + "\t" + str(recall) + "\t" + str(f1) + "\t" + str(roc_auc))
    print(str(acc) + "\t" + str(precision) + "\t" + str(recall) + "\t" + str(f1))
    print(confusion_matrix)
        
    print("Accuracy:", acc)
    #     print("Precision:", precision)
    #     print("Recall:", recall)
    #     print("ROC AUC: ", roc_auc)
    #     print("f1-score:", f1)
    #     print("Confusion Matrix: \n", confusion_matrix)
    return clf

Grouping emotions to 3 - Naive Bayes, DT, MLP, KNN

In [86]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 3)

Training distribution:  labels
2    17021
1    16628
0     9761
Name: count, dtype: int64
Dev data distribution:  labels
1    2106
2    2096
0    1224
Name: count, dtype: int64
Test data distribution:  labels
2    2147
1    2026
0    1254
Name: count, dtype: int64


In [87]:
trainFeatures, devFeatures = featureGeneration(dfTrain, dfDev, method = 'BOW')
xTrain, yTrain, xDev, yDev = splitData(trainFeatures, devFeatures, dfTrain, dfDev)



Dev :  (5426, 27774) (5426,)
Train :  (43410, 27774) (43410,)


In [88]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes')

Num classes:  3
              precision    recall  f1-score   support

           0       0.60      0.42      0.50      1224
           1       0.67      0.78      0.72      2106
           2       0.63      0.64      0.63      2096

    accuracy                           0.64      5426
   macro avg       0.64      0.61      0.62      5426
weighted avg       0.64      0.64      0.64      5426

0.6444894950239587	0.6350129472893912	0.6132510170796605	0.6174241641027124	0.7134700171986642
[[ 515  273  436]
 [ 107 1637  362]
 [ 231  520 1345]]
Accuracy: 0.6444894950239587


In [89]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Decision Trees')

Num classes:  3
              precision    recall  f1-score   support

           0       0.50      0.45      0.47      1224
           1       0.70      0.69      0.70      2106
           2       0.60      0.64      0.62      2096

    accuracy                           0.62      5426
   macro avg       0.60      0.59      0.59      5426
weighted avg       0.61      0.62      0.61      5426

0.6162919277552524	0.5984983702851232	0.5927899425068013	0.5949064650316477	0.6977563793690509
[[ 548  236  440]
 [ 177 1462  467]
 [ 368  394 1334]]
Accuracy: 0.6162919277552524


In [90]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'MLP')

Num classes:  3
              precision    recall  f1-score   support

           0       0.56      0.48      0.52      1224
           1       0.72      0.72      0.72      2106
           2       0.63      0.68      0.65      2096

    accuracy                           0.65      5426
   macro avg       0.64      0.63      0.63      5426
weighted avg       0.65      0.65      0.65      5426

0.6516771102100996	0.6366019399760511	0.6281485369134424	0.631184819132013	0.7242738440035495
[[ 591  217  416]
 [ 157 1525  424]
 [ 313  363 1420]]
Accuracy: 0.6516771102100996


In [91]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'KNN')

Num classes:  3
              precision    recall  f1-score   support

           0       0.39      0.25      0.31      1224
           1       0.66      0.51      0.58      2106
           2       0.50      0.72      0.59      2096

    accuracy                           0.53      5426
   macro avg       0.52      0.49      0.49      5426
weighted avg       0.54      0.53      0.52      5426

0.5333579063767048	0.5179227356682982	0.49409991808210635	0.4909140688000709	0.6251523304782824
[[ 306  226  692]
 [ 220 1074  812]
 [ 253  329 1514]]
Accuracy: 0.5333579063767048


Grouping emotions into 6 - Naive Bayes, DT, MLP, KNN

In [113]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 6)

Training distribution:  labels
2    16327
5    14219
0     5829
4     3888
3     2489
1      658
Name: count, dtype: int64
Dev data distribution:  labels
2    2067
5    1766
0     748
4     466
3     289
1      90
Name: count, dtype: int64
Test data distribution:  labels
2    1977
5    1787
0     777
4     494
3     304
1      88
Name: count, dtype: int64


In [114]:
trainFeatures, devFeatures = featureGeneration(dfTrain, dfDev, method = 'BOW')
xTrain, yTrain, xDev, yDev = splitData(trainFeatures, devFeatures, dfTrain, dfDev)



Dev :  (5426, 27774) (5426,)
Train :  (43410, 27774) (43410,)


In [115]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes')

Num classes:  6
              precision    recall  f1-score   support

           0       0.55      0.27      0.36       748
           1       1.00      0.01      0.02        90
           2       0.60      0.83      0.70      2067
           3       0.63      0.09      0.16       289
           4       0.56      0.12      0.19       466
           5       0.51      0.58      0.54      1766

    accuracy                           0.56      5426
   macro avg       0.64      0.32      0.33      5426
weighted avg       0.56      0.56      0.52      5426

0.6291927755252488	0.7139981126140297	0.37263246885888396	0.3603987969531645	0.6224049948391865
[[ 204    0  222    4    7  311]
 [  12    1   41    1    2   33]
 [  25    0 1724    2    6  310]
 [  13    0  137   27    0  112]
 [  18    0  160    1   54  233]
 [ 102    0  606    8   28 1022]]
Accuracy: 0.6291927755252488


In [116]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Decision Trees')

Num classes:  6
              precision    recall  f1-score   support

           0       0.39      0.35      0.37       748
           1       0.43      0.26      0.32        90
           2       0.69      0.71      0.70      2067
           3       0.45      0.36      0.40       289
           4       0.41      0.39      0.40       466
           5       0.52      0.56      0.54      1766

    accuracy                           0.56      5426
   macro avg       0.48      0.44      0.45      5426
weighted avg       0.55      0.56      0.55      5426

0.6391448580906746	0.5021654666184694	0.4380319314836518	0.4625710230531997	0.6707497415247259
[[ 262    9  147   28   43  259]
 [  15   23   18    2    5   27]
 [ 101    7 1464   42   64  389]
 [  41    1   50  104    7   86]
 [  37    6   76    7  181  159]
 [ 213    8  370   48  143  984]]
Accuracy: 0.6391448580906746


In [117]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'MLP')

Num classes:  6
              precision    recall  f1-score   support

           0       0.45      0.42      0.44       748
           1       0.47      0.47      0.47        90
           2       0.70      0.73      0.71      2067
           3       0.46      0.40      0.43       289
           4       0.39      0.33      0.36       466
           5       0.53      0.55      0.54      1766

    accuracy                           0.57      5426
   macro avg       0.50      0.48      0.49      5426
weighted avg       0.57      0.57      0.57      5426

0.6607077036490969	0.5403835431682017	0.5401689998249376	0.5400464512075439	0.7227040943988747
[[ 317   15  125   23   35  233]
 [  10   42   17    2    2   17]
 [  99   12 1509   49   49  349]
 [  40    4   45  115   14   71]
 [  32    4   74   14  155  187]
 [ 208   12  385   47  143  971]]
Accuracy: 0.6607077036490969


In [118]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'KNN')

Num classes:  6
              precision    recall  f1-score   support

           0       0.33      0.18      0.24       748
           1       0.40      0.02      0.04        90
           2       0.65      0.53      0.59      2067
           3       0.51      0.12      0.20       289
           4       0.39      0.21      0.28       466
           5       0.42      0.70      0.52      1766

    accuracy                           0.48      5426
   macro avg       0.45      0.30      0.31      5426
weighted avg       0.50      0.48      0.46      5426

0.5842241061555473	0.46054222797866	0.2463336371771444	0.2881349944169438	0.5838940121498468
[[ 137    0  152    8   17  434]
 [   5    2   17    1    6   59]
 [ 103    1 1103   16   28  816]
 [  14    1   55   35    7  177]
 [  35    0   70    3  100  258]
 [ 121    1  296    5  100 1243]]
Accuracy: 0.5842241061555473


Grouping emotions into 28 - Naive Bayes, DT, MLP, KNN

In [122]:
dfTrain, dfDev, dfTest = transformData(rdfTrain, rdfDev, rdfTest, n_categories = 28)

                                                text labels
0                     WHY THE FUCK IS BAYLESS ISOING      2
1                        To make her feel threatened     14
2                             Dirty Southern Wankers      2
3  OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...     13
4  Yes I heard abt the f bombs! That has to be wh...      6
Training distribution:  labels
0     4771
1     4387
2     4037
3     3181
4     3173
5     2939
6     2662
7     2191
8     1948
9     1716
10    1581
11    1368
12    1110
13    1060
14     760
Name: count, dtype: int64
Dev data distribution:  labels
1     587
0     565
2     498
4     405
3     399
5     397
6     358
7     248
8     224
10    209
9     205
11    152
13    129
12    127
14    111
Name: count, dtype: int64
Test data distribution:  labels
0     587
1     578
2     518
4     373
3     367
6     352
5     351
7     284
8     218
9     188
10    186
11    153
12    145
13    141
14    101
Name: count, dtype: int64


In [123]:
trainFeatures, devFeatures = featureGeneration(dfTrain, dfDev, method = 'BOW')
xTrain, yTrain, xDev, yDev = splitData(trainFeatures, devFeatures, dfTrain, dfDev)



Dev :  (4614, 21975) (4614,)
Train :  (36884, 21975) (36884,)


In [130]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Naive Bayes')

Num classes:  15
              precision    recall  f1-score   support

           0       0.36      0.73      0.48       565
           1       0.31      0.56      0.40       587
           2       0.52      0.12      0.20       209
           3       0.52      0.07      0.13       152
           4       0.50      0.01      0.02       127
           5       0.50      0.03      0.06       129
           6       0.75      0.05      0.10       111
           7       0.43      0.51      0.47       498
           8       0.53      0.51      0.52       399
           9       0.50      0.49      0.49       405
          10       0.34      0.18      0.24       397
          11       0.67      0.70      0.68       358
          12       0.57      0.40      0.47       248
          13       0.64      0.37      0.47       224
          14       0.49      0.10      0.16       205

    accuracy                           0.43      4614
   macro avg       0.51      0.32      0.33      4614
weighted 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [131]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'Decision Trees')

Num classes:  15
              precision    recall  f1-score   support

           0       0.43      0.59      0.50       565
           1       0.35      0.43      0.39       587
           2       0.42      0.44      0.43       209
           3       0.31      0.33      0.32       152
           4       0.19      0.13      0.15       127
           5       0.36      0.38      0.37       129
           6       0.55      0.33      0.42       111
           7       0.42      0.36      0.39       498
           8       0.55      0.51      0.53       399
           9       0.48      0.50      0.49       405
          10       0.28      0.21      0.24       397
          11       0.80      0.72      0.76       358
          12       0.46      0.54      0.50       248
          13       0.54      0.46      0.50       224
          14       0.38      0.26      0.31       205

    accuracy                           0.44      4614
   macro avg       0.43      0.41      0.42      4614
weighted 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [None]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'MLP')

In [132]:
clf = model(xTrain, yTrain, xDev, yDev, method = 'KNN')

Num classes:  15
              precision    recall  f1-score   support

           0       0.31      0.54      0.40       565
           1       0.24      0.40      0.30       587
           2       0.31      0.11      0.17       209
           3       0.17      0.09      0.12       152
           4       0.16      0.03      0.05       127
           5       0.15      0.09      0.11       129
           6       0.21      0.05      0.07       111
           7       0.36      0.26      0.30       498
           8       0.53      0.31      0.39       399
           9       0.37      0.52      0.43       405
          10       0.20      0.12      0.15       397
          11       0.68      0.76      0.72       358
          12       0.31      0.48      0.38       248
          13       0.47      0.27      0.34       224
          14       0.34      0.16      0.21       205

    accuracy                           0.34      4614
   macro avg       0.32      0.28      0.28      4614
weighted 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
