# NLI with Multi-variable Classification and TF-IDF

## 1. import packages

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

## 2. preprocessing

### 2-1. Target indexing

In [None]:
# Work with only the first 1000 rows of the training file.
df = pd.read_csv("train_data.csv").head(1000)

In [None]:
# Join every premise with its hypothesis into one string, separated by " | ".
premise_text, hypothesis_text = df["premise"], df["hypothesis"]
df["pre_hypo"] = premise_text + " | " + hypothesis_text

In [None]:
# NOTE(review): DataFrame.dropna() returns a new frame, so this call would have no
# effect unless its result were assigned; it is left disabled.
#df.dropna()

In [None]:
# Distinct label names present in the training rows (most frequent first) and
# how many of them there are.
label_counts = df["label"].value_counts()
label_list = label_counts.index.tolist()
num_labels = len(label_list)

In [None]:
def label_encoder(x):
  """Map an NLI label name to its integer class id.

  Args:
    x: Label value. It is compared by its string form, so non-string inputs
      are accepted.

  Returns:
    0 for "contradiction", 1 for "neutral", 2 for "entailment". Any other
    value prints a warning and falls back to 0.
  """
  label_to_id = {"contradiction": 0, "neutral": 1, "entailment": 2}
  name = str(x)
  if name not in label_to_id:
    # Build the message from the string form: concatenating a non-string x
    # directly would raise TypeError before the warning could be shown.
    print("error!! " + name + " label does not exist in our dataset!")
    return 0
  return label_to_id[name]

def label_decoder(x):
  """Translate an integer class id back into its NLI label name.

  Ids 0, 1 and 2 map to "contradiction", "neutral" and "entailment". Any
  other value prints a warning and yields the fallback "neutral".
  """
  names = ("contradiction", "neutral", "entailment")
  for class_id, name in enumerate(names):
    if x == class_id:
      return name
  print("error!! "+str(x)+ "label does not exist in our dataset!")
  return "neutral"

In [None]:
# Peek at the raw string labels before they are encoded as integers.
df["label"]

0      contradiction
1      contradiction
2         entailment
3            neutral
4            neutral
           ...      
995       entailment
996       entailment
997          neutral
998       entailment
999    contradiction
Name: label, Length: 1000, dtype: object

In [None]:
# Encode each label name (via its string form) as an integer class id.
df["label_num"] = df["label"].map(lambda raw_label: label_encoder(str(raw_label)))

In [None]:
# Show the first two rows to check the new pre_hypo and label_num columns.
df.head(2)

Unnamed: 0,index,premise,hypothesis,label,pre_hypo,label_num
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",0
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",0


### 2-2. BoW (TF-IDF) encoding for inputs

In [None]:
# Fit TF-IDF on the combined premise/hypothesis strings, then store each row's
# dense weight vector in the frame as a plain Python list.
vectorizer = TfidfVectorizer()
corpus = df["pre_hypo"]
X = vectorizer.fit_transform(corpus)
df["pre_hypo_vector"] = X.toarray().tolist()

In [None]:
# Vocabulary terms learned by the vectorizer; their count is the input width
# handed to the model.
input_labels = vectorizer.get_feature_names_out()
num_inputs = input_labels.shape[0]

## 3. Build Dataset

In [None]:
class MyNLILDataset(Dataset):
  """Map-style dataset over a DataFrame of TF-IDF encoded NLI samples.

  The frame must provide the columns "index" (sample id), "pre_hypo_vector"
  (one list of floats per row) and "label_num" (integer class id).
  """

  def __init__(self, data):
    # Zero-argument super() runs the base initializer; super(MyNLILDataset)
    # alone only builds an unbound proxy object.
    super().__init__()
    self.df = data
    self.sample_num = data["index"]
    self.x_data = data["pre_hypo_vector"].tolist()
    self.y_data = data["label_num"]

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return the sample at positional `index` as a dict of tensors.

    Keys: "sample_num" (LongTensor, shape (1,)), "samples" (FloatTensor
    holding the feature vector) and "targets" (LongTensor, shape (1,)).
    """
    # .iloc keeps the Series lookups positional, matching the plain-list
    # x_data; label-based [] fails or picks a different row when the frame
    # does not carry a default 0..n-1 index.
    a_index = torch.LongTensor([int(self.sample_num.iloc[index])])
    a_sample = torch.FloatTensor(self.x_data[index])
    a_target = torch.LongTensor([int(self.y_data.iloc[index])])
    return {"sample_num": a_index, "samples": a_sample, "targets": a_target}



## 4. Build Model

In [None]:
class MyLinearModel(nn.Module):
  """Single affine layer mapping feature vectors to unnormalized class scores."""

  def __init__(self, num_input, num_output):
    super().__init__()
    # No softmax layer is registered: forward() returns the raw linear output.
    self.linear = nn.Linear(num_input, num_output)

  def forward(self, x):
    scores = self.linear(x)
    return scores

## 5. Declare the datasets, model and optimizer

In [None]:
# Wrap the training frame in a dataset and iterate it in mini-batches of 4.
# NOTE(review): shuffle is left at the DataLoader default, so batches follow frame order.
train_dataset = MyNLILDataset(data=df)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4)

# Fresh linear classifier over the TF-IDF features, optimized with plain SGD.
model = MyLinearModel(num_input=num_inputs, num_output=num_labels)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.5)


## 6. Training

In [None]:
num_epoch = 100

model.train()
for epoch in range(num_epoch):
  # Per-epoch accumulators; they are reset at the start of every epoch.
  total_loss = []
  total_sample_num = []
  total_predict = []
  total_correct = 0
  total_samples = 0

  # Wrap the loader itself (not enumerate) so tqdm can read its length and
  # show real progress instead of an unbounded counter.
  for batch in tqdm(train_dataloader):
    inputs = batch["samples"]
    # Flatten (batch, 1) -> (batch,). A bare squeeze() would also drop the
    # batch axis of a single-sample batch and make cross_entropy fail.
    target = batch["targets"].reshape(-1)

    y_hat = model(inputs)
    loss = F.cross_entropy(y_hat, target)
    total_loss.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Bookkeeping for the per-epoch report only.
    predicted = y_hat.argmax(dim=1)
    total_sample_num.append(batch["sample_num"])
    total_predict.append(predicted.tolist())
    total_correct += (predicted == target).sum().item()
    total_samples += len(target)

  # Loss is the sum of per-batch losses; accuracy is measured on the training
  # batches as they were seen during this epoch.
  print("Loss: "+ str(sum(total_loss)), "Accuracy: " + str(round(total_correct/total_samples, 4)))


0it [00:00, ?it/s]

Loss: 258.62143510580063 Accuracy: 0.493


0it [00:00, ?it/s]

Loss: 239.82137322425842 Accuracy: 0.646


0it [00:00, ?it/s]

Loss: 222.80801820755005 Accuracy: 0.769


0it [00:00, ?it/s]

Loss: 207.42388927936554 Accuracy: 0.867


0it [00:00, ?it/s]

Loss: 193.51507604122162 Accuracy: 0.919


0it [00:00, ?it/s]

Loss: 180.93499845266342 Accuracy: 0.95


0it [00:00, ?it/s]

Loss: 169.54683044552803 Accuracy: 0.964


0it [00:00, ?it/s]

Loss: 159.2248299717903 Accuracy: 0.972


0it [00:00, ?it/s]

Loss: 149.8548131287098 Accuracy: 0.979


0it [00:00, ?it/s]

Loss: 141.33402779698372 Accuracy: 0.984


0it [00:00, ?it/s]

KeyboardInterrupt: ignored

## 7. Evaluation

In [None]:
# Read the test file, build the same "premise | hypothesis" text as for training,
# and add a placeholder label_num of 0 for every row.
df = pd.read_csv("test_data.csv").assign(
    pre_hypo=lambda frame: frame["premise"] + " | " + frame["hypothesis"],
    label_num=0,
)

In [37]:
# Preview the first two rows of the test frame.
# NOTE(review): this cell's execution count (37) and its saved output (which already
# lists pre_hypo_vector) suggest it was last run after later cells; re-run in order.
df.head(2)

Unnamed: 0,index,premise,hypothesis,label,pre_hypo,label_num,pre_hypo_vector
0,0,다만 조금 좁아서 케리어를 펼치기 불편합니다.,케리어를 펼치기에 공간이 충분했습니다.,entailment,다만 조금 좁아서 케리어를 펼치기 불편합니다. | 케리어를 펼치기에 공간이 충분했습니다.,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,그리고 위치가 시먼역보다는 샤오난먼역에 가까워요,시먼역보다는 샤오난먼역에 먼저 도착할 수 있어요,entailment,그리고 위치가 시먼역보다는 샤오난먼역에 가까워요 | 시먼역보다는 샤오난먼역에 먼저 ...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Encode the test text with the already-fitted vectorizer (transform only, no
# refit) and store each row's dense vector as a plain Python list.
corpus = df["pre_hypo"]
X = vectorizer.transform(corpus)
df["pre_hypo_vector"] = X.toarray().tolist()

In [None]:
# Wrap the test frame in a dataset and read it one sample per batch.
test_dataset = MyNLILDataset(data=df)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1)

In [None]:
# Parallel lists: one sample id and one predicted class id per test row.
total_sample_num = []
total_predict = []

model.eval()
# Inference needs no gradients, so autograd is disabled once for the whole pass.
with torch.no_grad():
  # Wrap the loader itself (not enumerate) so tqdm can read its length and
  # show real progress instead of an unbounded counter.
  for batch in tqdm(test_dataloader):
    y_hat = model(batch["samples"])

    # Flatten the (batch, 1) id tensor so both lists hold one entry per sample.
    total_sample_num += batch["sample_num"].reshape(-1).tolist()
    total_predict += y_hat.argmax(dim=1).tolist()


0it [00:00, ?it/s]

In [None]:
# Store the predictions, replacing the placeholder label_num values; this relies on
# the test loader yielding rows in frame order.
df["label_num"] = total_predict

In [None]:
# Turn each predicted class id back into its label name.
df["label"] = df["label_num"].map(lambda class_id: label_decoder(int(class_id)))

In [None]:
# Show the first two rows with the predicted label_num and decoded label.
df.head(2)

Unnamed: 0,index,premise,hypothesis,label,pre_hypo,label_num,pre_hypo_vector
0,0,다만 조금 좁아서 케리어를 펼치기 불편합니다.,케리어를 펼치기에 공간이 충분했습니다.,entailment,다만 조금 좁아서 케리어를 펼치기 불편합니다. | 케리어를 펼치기에 공간이 충분했습니다.,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,그리고 위치가 시먼역보다는 샤오난먼역에 가까워요,시먼역보다는 샤오난먼역에 먼저 도착할 수 있어요,entailment,그리고 위치가 시먼역보다는 샤오난먼역에 가까워요 | 시먼역보다는 샤오난먼역에 먼저 ...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Write only the sample id and predicted label columns; the row index is omitted.
submission_columns = ["index", "label"]
df[submission_columns].to_csv("sample_submission.csv", index=False)