In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


# Import Libraries

In [2]:
import os
from pathlib import Path

import torch
import pandas as pd
import gensim.downloader as api
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_sequence

from dataset import *
from model import *
from trainer import Trainer


# Seed PyTorch's default random number generator with a fixed value.
torch.manual_seed(42)

<torch._C.Generator at 0x7f0cc937eb30>

# Read Data

In [3]:
# Directory (relative to the notebook) that holds the CSV files read here and below.
path = "../"

# Labelled training frame and the frame that predictions are produced for.
train = pd.read_csv(Path(path, "train.csv"))
test = pd.read_csv(Path(path, "test.csv"))

# Show the first rows of the training frame as the cell output.
train.head()

Unnamed: 0,rate,text
0,4,Очень понравилось. Были в начале марта с соба...
1,5,В целом магазин устраивает.\nАссортимент позво...
2,5,"Очень хорошо что открылась 5 ка, теперь не над..."
3,3,Пятёрочка громко объявила о том как она заботи...
4,3,"Тесно, вечная сутолока, между рядами трудно ра..."


# Label encoding

In [4]:
# Fit the encoder on the `rate` column, then overwrite that column with the encoded ids.
# `le` is kept so predictions can be mapped back with `le.inverse_transform` later.
# NOTE: `train` is rewritten in place, so re-running only this cell would re-fit `le`
# on already-encoded values; re-run the data-loading cell first.
le = LabelEncoder().fit(train["rate"])
train["rate"] = le.transform(train["rate"])

train.head()

Unnamed: 0,rate,text
0,3,Очень понравилось. Были в начале марта с соба...
1,4,В целом магазин устраивает.\nАссортимент позво...
2,4,"Очень хорошо что открылась 5 ка, теперь не над..."
3,2,Пятёрочка громко объявила о том как она заботи...
4,2,"Тесно, вечная сутолока, между рядами трудно ра..."


# Build the tokenizer and vocabulary

In [5]:
# Tokenize every training text with the project's Tokenizer.
tok = Tokenizer()
tok_texts = list(map(tok.tokenize, train["text"].values))

# Build the vocabulary from the tokenized training texts, passing max_vocab_size=30000.
vocab = Vocab(tok_texts, max_vocab_size=30000)

# Splitting Data

In [6]:
# `train_test_split` is not sklearn's (never imported here); it arrives through one of
# the star imports and returns texts and labels for the train and validation parts.
train_texts, train_labels, val_texts, val_labels = train_test_split(train)

# Wrap each part's tokenized texts and labels in a TextDataset that shares `vocab`.
train_dataset = TextDataset(list(map(tok.tokenize, train_texts)), train_labels, vocab)
val_dataset = TextDataset(list(map(tok.tokenize, val_texts)), val_labels, vocab)

# Load pretrained word2vec embeddings and build the embedding matrix

In [7]:
# Keep the downloaded gensim model next to the notebook instead of in the default
# per-user directory.
# gensim.downloader reads GENSIM_DATA_DIR into its module-level BASE_DIR when the module
# is imported, and that import already ran in the imports cell. Setting only the
# environment variable here would be too late, so BASE_DIR is overridden as well.
os.environ["GENSIM_DATA_DIR"] = str(Path.cwd())
api.BASE_DIR = os.environ["GENSIM_DATA_DIR"]

# Fetch (or reuse the cached copy of) the pretrained vectors and build the embedding
# matrix for `vocab` with the project's helper.
gensim_model = api.load("word2vec-ruscorpora-300")
emb_matrix = prepare_emb_matrix(gensim_model, vocab)

# Init Model and Config

In [8]:
# Options handed to RecurrentClassifier together with the vocabulary and embedding matrix.
config = dict(
    freeze=False,
    cell_type="LSTM",
    cell_dropout=0.2,
    num_layers=2,
    hidden_size=128,
    out_activation="relu",
    bidirectional=True,
    out_dropout=0.2,
    out_sizes=[200],
)

# Options handed to Trainer; `batch_size` is also reused when building the DataLoaders.
# The device falls back to CPU when CUDA is not available.
trainer_config = dict(
    lr=3e-3,
    n_epochs=5,
    weight_decay=1e-6,
    batch_size=128,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

clf_model = RecurrentClassifier(config, vocab, emb_matrix)

# Create Dataloaders and Train

In [9]:
# Training batches are drawn in shuffled order; each loader collates with its own
# dataset's `collate_fn`.
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=train_dataset.collate_fn,
    batch_size=trainer_config["batch_size"],
    num_workers=0,
    shuffle=True,
)

# Validation batches keep the dataset order.
val_dataloader = DataLoader(
    val_dataset,
    collate_fn=val_dataset.collate_fn,
    batch_size=trainer_config["batch_size"],
    num_workers=0,
    shuffle=False,
)

# Fit the classifier; the value returned by `fit` is displayed as the cell output.
t = Trainer(trainer_config)
t.fit(clf_model, train_dataloader, val_dataloader)

Epoch 1/5


  0%|          | 0/324 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Epoch 2/5


  0%|          | 0/324 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Epoch 3/5


  0%|          | 0/324 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Epoch 4/5


  0%|          | 0/324 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Epoch 5/5


  0%|          | 0/324 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

RecurrentClassifier(
  (embeddings): Embedding(30003, 300, padding_idx=0)
  (cell): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (out_dropout): Dropout(p=0.2, inplace=False)
  (out_proj): Sequential(
    (0): Linear(in_features=512, out_features=200, bias=True)
    (1): Linear(in_features=200, out_features=6, bias=True)
  )
)

# Save Model

In [10]:
# Save through the trainer's own `save` method, using a relative checkpoint path.
t.save("baseline_model.ckpt")

# Load the saved model checkpoint

In [11]:
# Rebind `t` to a trainer restored from the checkpoint written above; the cells below
# use this instance (`t.device`, `t.predict`).
t = Trainer.load("baseline_model.ckpt")

# Define predict function

In [12]:
def predict(model, text):
    """Predict the encoded rating class for a single raw text.

    Relies on the notebook globals `tok` (tokenizer), `vocab` (vocabulary) and
    `t` (trainer, for its `device`).

    Args:
        model: A `torch.nn.Module` that accepts a `PackedSequence` holding one
            sequence of token ids and returns class scores.
        text: Raw text to classify.

    Returns:
        int: Index of the highest-scoring output. This is the encoded label;
        use `le.inverse_transform` to map it back to the original rating.
    """
    tok_text = tok.tokenize(text)
    indexed_text = torch.tensor(vocab.vectorize(tok_text)).to(t.device)

    # Run inference in eval mode so dropout is disabled, and without building an
    # autograd graph. The previous train/eval mode is restored afterwards so that
    # calling this helper does not change the model's state for the caller.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores = model(pack_sequence([indexed_text]))
    finally:
        model.train(was_training)

    label = scores.argmax().item()
    return label

# Get testset predictions

In [13]:
# The test frame has no labels, so every row gets a -1 placeholder label to satisfy the
# TextDataset constructor. The comprehension variable is named `text` so it does not
# visually shadow the trainer `t` used below.
test_dataset = TextDataset(
    [tok.tokenize(text) for text in test.text.values],
    [-1] * test.shape[0],
    vocab,
)

# Collate with the test dataset's own `collate_fn`, like the train and validation
# loaders do, instead of borrowing the validation dataset's. Order is kept
# (shuffle=False) so predictions line up with the rows of `test`.
test_dataloader = DataLoader(test_dataset,
                             batch_size=trainer_config["batch_size"],
                             shuffle=False,
                             num_workers=0,
                             collate_fn=test_dataset.collate_fn)

predictions = t.predict(test_dataloader)

# Create submission

In [14]:
# Start from the sample submission file so the output keeps its layout.
sample_submission = pd.read_csv(Path(path, "sample_submission.csv"))

# Store the encoded predictions, then map them back to the original rating values
# with the encoder fitted on the training labels.
sample_submission["rate"] = predictions
sample_submission["rate"] = le.inverse_transform(sample_submission["rate"])

sample_submission.head()

Unnamed: 0,rate
0,5
1,5
2,5
3,5
4,5


In [15]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
sample_submission.to_csv("submission.csv", index=False)