In [1]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier

In [4]:
import csv
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import parent_modules
import preprocessor

%load_ext autoreload
%load_ext nb_black
%autoreload 2

from definitions import *

# Read training data
train_index = list()
y_train = list()    
with open(os.path.join(DATA_DIR, "train.csv"), 'r') as f:
    for line in f:
        t = line.split(',')
        train_index.append(int(t[0]))
        y_train.append(int(t[1]))

# Read test data
test_index = list()  
with open(os.path.join(DATA_DIR, "test.csv"), 'r') as f:
    for line in f:
        t = line.split(',')
        test_index.append(int(t[0]))

# Load the textual content of the messages into the dictionary "posts"
loaded_posts = dict()
with open(os.path.join(DATA_DIR, "posts.tsv"), 'r') as f:
    for line in f:
        t = line.split('\t')
        loaded_posts[int(t[0])] = t[2][:-1]



<IPython.core.display.Javascript object>

In [5]:
total_entries = len(y_train)
class_counts = {i: len(list(filter(lambda x: x == i, y_train))) for i in range(15)}
class_weights = {
    cls: np.round((1 / class_counts[cls]) * total_entries / 2, 4) for cls in range(15)
}
class_weights

{0: 1.1758,
 1: 11.5366,
 2: 2.781,
 3: 4.7218,
 4: 6.207,
 5: 36.725,
 6: 70.3245,
 7: 367.25,
 8: 54.1844,
 9: 58.5,
 10: 13.1946,
 11: 20.4028,
 12: 22.1087,
 13: 31.9348,
 14: 20.2776}

<IPython.core.display.Javascript object>

In [6]:
class_counts

{0: 5622,
 1: 573,
 2: 2377,
 3: 1400,
 4: 1065,
 5: 180,
 6: 94,
 7: 18,
 8: 122,
 9: 113,
 10: 501,
 11: 324,
 12: 299,
 13: 207,
 14: 326}

<IPython.core.display.Javascript object>

In [7]:
# Data Preprocessing
import re


def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().split()


posts = {idx: " ".join(clean_str(post)) for idx, post in loaded_posts.items()}

<IPython.core.display.Javascript object>

In [11]:
print("---Before\n----", loaded_posts[0], "\n---After---\n", posts[0])

---Before
---- Since everyone is on #lockdown because of #COVID19, I want to spread some positivity and surprise some people! 🥰  I am giving five people that retweets this a free #AnimalCrossing Nintendo Switch Bundle, must be following me for a DM! 🎁  Good luck &amp; ! ❤️ #ACNH https://t.co/sIAkfuxZhK 
---After---
 Since everyone is on lockdown because of COVID19 , I want to spread some positivity and surprise some people ! I am giving five people that retweets this a free AnimalCrossing Nintendo Switch Bundle , must be following me for a DM ! Good luck amp ! ACNH https t co sIAkfuxZhK


<IPython.core.display.Javascript object>

In [7]:
# Create 2 lists: one containing the messages of the training set and the other containing the messages of the
# test set
train_posts = [posts[idx] for idx in train_index]
test_posts = [posts[idx] for idx in test_index]

# Create the training matrix. Each row corresponds to a message and each column to a word present in at least 5
# messages of the training set. The value of each entry in a row is equal to the tf-idf weight of that word in the
# corresponding message
vectorizer = TfidfVectorizer(stop_words="english", min_df=5)
X_train = vectorizer.fit_transform(train_posts)

# Create the test matrix following the same approach as in the case of the training matrix
X_test = vectorizer.transform(test_posts)

print("Train matrix dimensionality: ", X_train.shape)
print("Test matrix dimensionality: ", X_test.shape)

<1x5699 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

<IPython.core.display.Javascript object>

In [8]:
X_train_dev, X_test_dev, y_train_dev, y_test_dev = train_test_split(
    X_train, y_train, test_size=0.2
)

<IPython.core.display.Javascript object>

In [9]:
clf_lin = LogisticRegression(
    solver="newton-cg", multi_class="multinomial", class_weight=class_weights
)
clf_rd_frst = RandomForestClassifier(
    max_depth=150, criterion="entropy", class_weight=class_weights
)
models = list()
models.append(("clf_lin", clf_lin))
models.append(("clf_rd_frst", clf_rd_frst))
ensemble = VotingClassifier(estimators=models, voting="soft")

<IPython.core.display.Javascript object>

## Evaluation on dev

In [10]:
dev_ensemble = ensemble.fit(X_train_dev, y_train_dev)
vot_pred_dev = ensemble.predict_proba(X_test_dev)
log = log_loss(y_test_dev, vot_pred_dev)
print("log loss lin:", log)

log loss lin: 1.1189103394551934


<IPython.core.display.Javascript object>

## Prediction on test

In [11]:
full_ensemble = ensemble.fit(X_train, y_train)
vot_pred_train = full_ensemble.predict_proba(X_train)
vot_pred_test = full_ensemble.predict_proba(X_test)

<IPython.core.display.Javascript object>

In [14]:
# Write predictions to a file
ml_classification_train_csv = os.path.join(
    ML_CLASSIFIERS_DIR, "text_train_predictions.csv"
)
ml_classification_pred_csv = os.path.join(ML_CLASSIFIERS_DIR, "text_predictions.csv")

with open(ml_classification_train_csv, "w+") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    for i, idx in enumerate(train_index):
        lst = vot_pred_train[i, :].tolist()
        writer.writerow(lst)


# Write predictions to a file
with open(ml_classification_pred_csv, "w+") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    for i, idx in enumerate(test_index):
        lst = vot_pred_test[i, :].tolist()
        writer.writerow(lst)

<IPython.core.display.Javascript object>