In [2]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [3]:
import pandas as pd

import torch
import torch.optim as optim

import torchtext
from torchtext import data
import spacy
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

import argparse
import os


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Baseline(nn.Module):

    def __init__(self, embedding_dim, vocab):
        super(Baseline, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(vocab.vectors)
        self.fc = nn.Linear(embedding_dim, 6)

    def forward(self, x, lengths=None):
        embedded = self.embedding(x)

        average = embedded.mean(0)
        output = self.fc(average)
        output = nn.functional.sigmoid(output)

        return output

In [5]:
!unzip processed_data.zip

Archive:  processed_data.zip
  inflating: processed_data/valid.csv  
  inflating: processed_data/test.csv  
  inflating: processed_data/train.csv  


In [6]:
def evaluate(model, data_iter, loss_fnc):
    sumloss = 0
    sumcorrect = 0
    for i, texts in enumerate(data_iter):
        words, length = texts.comment_text
        labels = []
        for i in range(len(texts.toxic)):
            labels += [[int(texts.toxic[i]), int(texts.severe_toxic[i]), int(texts.obscene[i]), int(texts.threat[i]), int(texts.insult[i]), int(texts.identity_hate[i])]]
        out = model(words, length)
        temploss = loss_fnc(input=out.squeeze(), target=torch.FloatTensor(labels))
        sumloss += temploss
        temp = ((out > 0.5).squeeze().long() == torch.FloatTensor(labels))
        correct = 0
        for i in temp:
          if torch.equal(i, torch.BoolTensor([True, True, True, True, True, True])):
            correct += 1
    loss = torch.mean(float(sumloss) / (i + 1))
    return float(correct)/len(data_iter.dataset), loss

In [7]:
def plot_data(train_acc, valid_acc, test_acc, train_loss, valid_loss, test_loss):

    plt.figure()
    plt.title("Accuracy")
    plt.plot(train_acc, label="Training")
    plt.plot(valid_acc, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.figtext(0.5, -0.05, "Training Final Accuracy: " + str(train_acc[-1]), wrap=True, horizontalalignment='center', fontsize=12)
    plt.figtext(0.5, -0.1, "Validation Final Accuracy: " + str(valid_acc[-1]), wrap=True, horizontalalignment='center', fontsize=12)
    plt.show()

    plt.figure()
    plt.title("Loss")
    plt.plot(train_loss, label="Training")
    plt.plot(valid_loss, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

In [8]:
from torch.autograd import Variable

def baselinetrain(batch_size, lr, epochs, emb_dim):
    ######
    # 3.2 Processing of the data
    # the code below assumes you have processed and split the data into
    # the three files, train.tsv, validation.tsv and test.tsv
    # and those files reside in the folder named "data".
    ######

    COMMENT_TEXT    = data.Field(sequential=True,lower=True, tokenize='spacy', include_lengths=True)
    TOXIC           = data.Field(sequential=False, use_vocab=False)
    SEVERE_TOXIC    = data.Field(sequential=False, use_vocab=False)
    OBSCENE         = data.Field(sequential=False, use_vocab=False)
    THREAT          = data.Field(sequential=False, use_vocab=False)
    INSULT          = data.Field(sequential=False, use_vocab=False)
    IDENTITY_HATE   = data.Field(sequential=False, use_vocab=False)

    train_data, val_data, test_data = data.TabularDataset.splits(
            path='processed_data/', train='train.csv',
            validation='valid.csv', test='test.csv', format='csv',
            skip_header=True, fields=[('id', None), ('comment_text', COMMENT_TEXT), ('toxic', TOXIC), ('severe_toxic', SEVERE_TOXIC), ('obscene', OBSCENE), ('threat', THREAT), ('insult', INSULT), ('identity_hate', IDENTITY_HATE)])

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
          (train_data, val_data, test_data), batch_sizes=(batch_size, batch_size, batch_size),
	sort_key=lambda x: len(x.comment_text), device=None, sort_within_batch=True, repeat=False)

    COMMENT_TEXT.build_vocab(train_data, val_data, test_data)

    COMMENT_TEXT.vocab.load_vectors(torchtext.vocab.GloVe(name='6B', dim=100))
    vocab = COMMENT_TEXT.vocab

    print("Shape of Vocab:",COMMENT_TEXT.vocab.vectors.shape)

    filters = (2, 4)
    loss_fnc = torch.nn.MSELoss()
    model = Baseline(emb_dim, COMMENT_TEXT.vocab)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_acc = []
    train_loss = []
    valid_acc = []
    valid_loss = []
    test_acc = []
    test_loss = []
    for epoch in range(epochs):
        for i, texts in enumerate(train_iter):
            words, length = texts.comment_text
            print(words)
            labels = []
            for i in range(len(texts.toxic)):
              labels += [[int(texts.toxic[i]), int(texts.severe_toxic[i]), int(texts.obscene[i]), int(texts.threat[i]), int(texts.insult[i]), int(texts.identity_hate[i])]]
            optimizer.zero_grad()
            out = model(words, length)
            temploss = loss_fnc(input=out.squeeze(), target=torch.FloatTensor(labels)
            temploss.backward()
            optimizer.step()

        train = evaluate(model, train_iter, loss_fnc)
        train_acc.append(train[0])
        train_loss.append(train[1].numpy())
        val = evaluate(model, val_iter, loss_fnc)
        valid_acc.append(val[0])
        valid_loss.append(val[1].numpy())
        test = evaluate(model, test_iter, loss_fnc)
        test_acc.append(test[0])
        test_loss.append(test[1].numpy())
        print("Epoch: {}| Train acc: {} | Train loss: {} |  Valid acc: {} |  Valid loss: {}".format(epoch + 1, train_acc[epoch], train_loss[epoch],valid_acc[epoch], valid_loss[epoch]))

    plot_data(train_acc, valid_acc, test_acc, train_loss, valid_loss, test_loss)


In [None]:
baselinetrain(64, 0.1, 30, 100)

In [29]:
import pandas as pd
import re
from torch.autograd import Variable
from torch import nn
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.nn import init
from torch import optim
import numpy as np


class MLP(nn.Module):
    def __init__(self, feat_size):
        super(MLP, self).__init__()
        self.pipeline = nn.Sequential(
            nn.Linear(in_features=feat_size, out_features=20),
            nn.ReLU(inplace=True),
            nn.Dropout(p=.2),
            nn.Linear(in_features=20, out_features=10),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(in_features=10, out_features=6)
        )

    def forward(self, x):
        logits = self.pipeline(x)
        probs = torch.sigmoid(logits)
        return probs


def prepocess():
    train = pd.read_csv('/content/processed_data/train.csv')
    test = pd.read_csv('/content/processed_data/test.csv')
    print(train.shape, test.shape)

    train_rows = train.shape[0]

    y_train = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
    train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)

    data = pd.concat([train, test])
    del train
    del test

    # stop_words = set(nltk.corpus.stopwords.words('english'))

    def preprocess_input(comment):
        comment = comment.strip()
        comment = comment.lower()
        words = re.split('\s', comment)
        # words = [word for word in words if not word in stop_words]
        sentence = ' '.join(words)
        return sentence

    data.comment_text = data.comment_text.apply(lambda row: preprocess_input(row))
    # from nltk.tokenize import word_tokenize

    """min_df: 3--->10; result:.9724--->.9735"""
    vect = TfidfVectorizer(min_df=10, max_df=0.7,
                           analyzer='word',
                           ngram_range=(1, 2),
                           strip_accents='unicode',
                           smooth_idf=True,
                           sublinear_tf=True,
                           max_features=30000
                           )

    vect = vect.fit(data['comment_text'])

    # print(vect.vocabulary_)
    # print(len(vect.vocabulary_))
    # exit()

    data_tranformed = vect.transform(data['comment_text'])
    test = data_tranformed[train_rows:]
    train = data_tranformed[:train_rows]
    y_train = np.array(y_train).astype(np.float32)

    return train, y_train, test

    # print("train.shape ", train.shape)
    #
    #
    # cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    # y_pred = pd.read_csv('./data/sample_submission.csv')
    #
    # for c in cols:
    #     clf = LogisticRegression(C=4, solver='sag')
    #     clf.fit(train, y_train[c])
    #     y_pred[c] = clf.predict_proba(test)[:,1]
    #     pred_train = clf.predict_proba(train)[:,1]
    #     print(c, '--> log loss:', log_loss(y_train[c], pred_train))
    #
    #
    # y_pred.to_csv('my_submission.csv', index=False)


def init_param(self):
    if isinstance(self, (nn.Conv2d, nn.Linear)):
        init.xavier_uniform(self.weight.data)
        init.constant(self.bias.data, 0)


def batch_generator(batch_size, batch_x, batch_y=None, shuffle=True):
    num_examples = batch_x.shape[0]
    indices = list(range(num_examples))
    if shuffle:
        np.random.shuffle(indices)
    counter = 0
    mini_batch_x = []
    mini_batch_y = []
    for idx in indices:
        mini_batch_x.append(batch_x[idx].toarray())
        if batch_y is not None:
            mini_batch_y.append(batch_y[idx])
        counter += 1
        if counter == batch_size:
            counter = 0
            xs = np.concatenate(mini_batch_x, axis=0)
            ys = None if batch_y is None else np.stack(mini_batch_y, axis=0)
            yield xs, ys
            mini_batch_x = []
            mini_batch_y = []

    if len(mini_batch_x) > 0:
        xs = np.concatenate(mini_batch_x, axis=0)
        ys = None if batch_y is None else np.stack(mini_batch_y, axis=0)
        yield xs, ys


if __name__ == '__main__':
    # prepare data
    train, y_train, test = prepocess()

    mlp = MLP(feat_size=100)
    mlp.apply(init_param)
    mlp.cuda()
    optimizer = optim.SGD(params=mlp.parameters(), lr=3e-3, momentum=.9, weight_decay=1e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=300, gamma=.9)

    criterion = nn.BCELoss()
    min_loss = 10
    for i in range(100):
        losses = []
        mlp.train()
        print("training....")
        for xs, ys in batch_generator(100, train, y_train):
            print(xs.shape)
            xs = Variable(torch.FloatTensor(xs).cuda())
            ys = Variable(torch.FloatTensor(ys).cuda())
            prob = mlp(xs)
            loss = criterion(prob, ys)
            losses.append(loss.data)
            lr_scheduler.step()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss_ = torch.mean(torch.cat(losses))
        print("epoch {}, loss {}".format(i, loss_))
        mlp.eval()

        probs = []
        print("testing...")
        for xs, _ in batch_generator(100, test):
            xs = Variable(torch.FloatTensor(xs).cuda(), volatile=True)
            prob = mlp(xs)
            probs.append(prob.data)
        probs = torch.cat(probs, dim=0).cpu().numpy()
        print(probs.shape)
        predictions = pd.read_csv("./data/sample_submission.csv")

        for j, c in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
            predictions[c] = probs[:, j]
        if loss_ < min_loss:
            min_loss = loss_
            predictions.to_csv("submission_tf_idf_mlp_%d.csv" % i, index=False)
        print("submission saved!")
        print("******************************************")

(25960, 8) (3246, 8)
training....
(100, 24177)




RuntimeError: ignored