In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch 
from torch.optim import Adam

from transformers import DistilBertConfig,DistilBertTokenizer,DistilBertModel
import transformers
from transformers import AutoModel, AutoTokenizer

import torch.nn as nn
import torch.nn.functional as F 
from torch.utils.data import Dataset, DataLoader


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
file_id_vs_txt = {}
dataset_dict = {}

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        if ".txt" in filename:
            file_id = filename[0:filename.find(".txt")]
            file_id_vs_txt[file_id] = open(os.path.join(dirname, filename), 'r').read()
        elif ".csv" in  "train.csv":
            dataset_dict[filename[0: filename.find(".csv")]] = pd.read_csv(os.path.join(dirname, filename))
            

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
texts_list = [file_id_vs_txt[row[1]["essay_id"]] for row in dataset_dict["train"].iterrows()] 
dataset_dict["train"]["text"] = texts_list

texts_list = [file_id_vs_txt[row[1]["essay_id"]] for row in dataset_dict["test"].iterrows()] 
dataset_dict["test"]["text"] = texts_list


In [3]:
id_to_vec = {
    "Effective": [0, 0 , 1],
    "Adequate": [0, 1, 0],
    "Ineffective": [1, 0, 0]
}

label_vecs_train = []

dataset_dict["train"]["discourse_effectiveness_vec"] = dataset_dict["train"]["discourse_effectiveness"].map(id_to_vec)


In [4]:
# dataset_dict["train"]['discourse_effectiveness'].unique()
print(len(dataset_dict["train"]))

36765


In [5]:
!pip install transformers

[0m

In [6]:

model_path_or_name = '/kaggle/distilbert-base-uncased'

# instantiate model & tokenizer
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
#distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
model.save_pretrained(model_path_or_name)
tokenizer.save_pretrained(model_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
model = AutoModel.from_pretrained(model_path_or_name)

#tokenizer.save_pretrained(model_path_or_name)
#model.save_pretrained(model_path_or_name)


config = DistilBertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,dropout=0.1,num_labels=3,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
max_seq_length = 120

class ClassifierModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_labels = config.num_labels

        self.distilbert = model
        self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)
        self.softmax = nn.Softmax()
        
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids=None, attention_mask=None, head_mask=None, labels=None):
        distilbert_output = self.distilbert(input_ids=input_ids,
                                            attention_mask=attention_mask,
                                            head_mask=head_mask)
        hidden_state = distilbert_output[0]
        pooled_output = hidden_state[:, 0]                   
        pooled_output = self.pre_classifier(pooled_output)   
        pooled_output = nn.ReLU()(pooled_output)             
        pooled_output = self.dropout(pooled_output)        
        
        logits = self.classifier(pooled_output) 
        logits = self.softmax(logits)   
        return logits

        #nn.init.xavier_normal_(self.classifier.weight)


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
class ClassifierDataset(Dataset):
    def __init__(self, df, mode="train"):
        self.df = df
        self.mode = mode
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        X = f"{self.df.at[idx, 'text']} || {self.df.at[idx, 'discourse_text']} || {self.df.at[2, 'discourse_type']}"
        
        tokenized_comment = tokenizer.tokenize(X)

        if len(tokenized_comment) > max_seq_length:
            tokenized_comment = tokenized_comment[:max_seq_length]

        ids_review  = tokenizer.convert_tokens_to_ids(tokenized_comment)

        padding = [0] * (max_seq_length - len(ids_review))

        ids_review += padding

        assert len(ids_review) == max_seq_length

        #print(ids_review)
        ids_review = torch.tensor(ids_review)
        if self.mode == "train":
            y = self.df.at[idx, "discourse_effectiveness_vec"] 
            
            sample = {
                "X": ids_review,
                "y": torch.tensor(y).float()
            }
        else:
            sample = {
                "X": ids_review,
                "discourse_id": self.df.at[idx, "discourse_id"]
            }
        
        return sample

model = ClassifierModel(config)
train_dataset = ClassifierDataset(dataset_dict["train"],mode="train")
test_dataset = ClassifierDataset(dataset_dict["test"], mode="test")


In [8]:
lr = 0.0001
num_epochs = 40
optimizer = Adam(model.parameters(), lr)
loss = nn.CrossEntropyLoss() 

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

ClassifierModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [10]:
dataset_dict["train"]

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,text,discourse_effectiveness_vec
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","[0, 1, 0]"
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","[0, 1, 0]"
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","[0, 1, 0]"
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","[0, 1, 0]"
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","[0, 1, 0]"
...,...,...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,Some people may ask multiple people for advice...,"[0, 1, 0]"
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,Some people may ask multiple people for advice...,"[0, 1, 0]"
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,Some people may ask multiple people for advice...,"[0, 1, 0]"
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,Some people may ask multiple people for advice...,"[1, 0, 0]"


In [11]:

ops = model(input_ids=train_dataset[0]["X"][None, :].to(device))

train_dataloader = DataLoader(train_dataset, 30, shuffle=True)

for epoch in range(num_epochs):
    for bidx, sample in enumerate(train_dataloader):
        with torch.set_grad_enabled(True):
            y_hat = model(input_ids=sample["X"].to(device))
            y = sample["y"].to(device)
            optimizer.zero_grad()
            crentr_loss = loss(y, y_hat)
            crentr_loss.backward()
            optimizer.step()

            print(f"Finished with batch idx {bidx},  {epoch} epochs")
    print(f"Finished with {epoch} epochs")
    


Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


Finished with batch idx 0,  0 epochs
Finished with batch idx 1,  0 epochs
Finished with batch idx 2,  0 epochs
Finished with batch idx 3,  0 epochs
Finished with batch idx 4,  0 epochs
Finished with batch idx 5,  0 epochs
Finished with batch idx 6,  0 epochs
Finished with batch idx 7,  0 epochs
Finished with batch idx 8,  0 epochs
Finished with batch idx 9,  0 epochs
Finished with batch idx 10,  0 epochs
Finished with batch idx 11,  0 epochs
Finished with batch idx 12,  0 epochs
Finished with batch idx 13,  0 epochs
Finished with batch idx 14,  0 epochs
Finished with batch idx 15,  0 epochs
Finished with batch idx 16,  0 epochs
Finished with batch idx 17,  0 epochs
Finished with batch idx 18,  0 epochs
Finished with batch idx 19,  0 epochs
Finished with batch idx 20,  0 epochs
Finished with batch idx 21,  0 epochs
Finished with batch idx 22,  0 epochs
Finished with batch idx 23,  0 epochs
Finished with batch idx 24,  0 epochs
Finished with batch idx 25,  0 epochs
Finished with batch id

In [12]:
test_dataloader = DataLoader(test_dataset, 64)
results_list = []

for sample in test_dataloader:
    y_hat = model(input_ids=sample["X"].to(device))
    y_hat = y_hat.tolist()
    for idx, lis in enumerate(y_hat):
        lis.insert(0, sample["discourse_id"][idx])
    # print(y_hat)
    results_list.extend(y_hat)

df = pd.DataFrame(results_list, columns=["discourse_id" ,"Ineffective", "Adequate" ,"Effective"]) 
df.to_csv("submission.csv",  index=False)

