In [1]:
!pip install contractions
!pip install evaluate
!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

import pandas as pd
import contractions
import re
import spacy
import warnings
import torch
import numpy as np
import os
import datasets

from datasets import Dataset, DatasetDict
from transformers import get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
pd.set_option('display.max_colwidth', 400)

os.makedirs("/kaggle/model")

[0mjupyter_http_over_ws extension initialized. Listening on /http_over_websocket
[32m[I 22:05:07.244 NotebookApp][m Skipped non-installed server(s): bash-language-server, dockerfile-language-server-nodejs, javascript-typescript-langserver, jedi-language-server, julia-language-server, pyright, python-language-server, r-languageserver, sql-language-server, texlab, typescript-language-server, unified-language-server, vscode-css-languageserver-bin, vscode-html-languageserver-bin, vscode-json-languageserver-bin, yaml-language-server
[35m[C 22:05:09.103 NotebookApp][m Running as root is not recommended. Use --allow-root to bypass.
/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv
/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv


In [2]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased' , num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased" , num_labels=3);

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
df_train = pd.read_csv('/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv')
df_valid = pd.read_csv('/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv')

df_train = df_train.filter({'Title' , 'Body' , 'Tags' , 'Y'})
df_train['Y'] = df_train['Y'].map({'LQ_CLOSE':0, 'LQ_EDIT': 1, 'HQ':2})

df_valid = df_valid.filter({'Title' , 'Body' , 'Tags' , 'Y'})
df_valid['Y'] = df_valid['Y'].map({'LQ_CLOSE':0, 'LQ_EDIT': 1, 'HQ':2})

In [5]:
class Preprocessing:
    def __get_tag_col(self , text):
        text = text.replace('<' , ' ')
        text = text.replace('>' , ' ')
        text = text.split()
        text = ' '.join(text)
        return text.split()

    def __get_body_tag_text(self , text):
        soup = BeautifulSoup(text ,features='xml')
        return soup.get_text()
    
    def __handle_contractions(self , text):
        c = []
        for word in text.split():
            c.append(contractions.fix(word)) 
        c = ' '.join(c)
        return c

    def __get_ents(self , text):
        remove_ent = ['CARDINAL' , 'PERSON' , 'TIME' , 'DATE']
        doc = nlp(text)
        ent_list = []

        for t in doc.ents:
            if t.label_ not in remove_ent:
                ent_list.append((t.text , t.label_))
                
        return ent_list
        
    def __lower_and_punc_removal(self , text):
        text = text.lower()
        text = re.sub(r'[^(a-zA-Z)\s]','', text)
        return text

    def __init__(self , df):
        self.df = df
        self.df['Tags_list'] = self.df['Tags'].apply(self.__get_tag_col) # treat as entities
        self.tag_list = []
        for tags in self.df['Tags_list']:
            for e in tags:
                self.tag_list.append(e)
        self.tag_list = pd.Series(self.tag_list)
        self.df['Body_Between_Tags'] = self.df['Body'].apply(self.__get_body_tag_text)
        self.df['Body_ENTS'] = self.df['Body_Between_Tags'].apply(self.__get_ents)
        self.df['Title_ENTS'] = self.df['Title'].apply(self.__get_ents)
        self.df['Body_Text_Cleaned'] = self.df['Body_Between_Tags'].apply(self.__handle_contractions)
        self.df['Final_clean'] = self.df['Body_Between_Tags'].apply(self.__lower_and_punc_removal)

In [6]:
PreprocessedObject_train = Preprocessing(df=df_train)
PreprocessedObject_valid = Preprocessing(df=df_valid)

PreprocessedObject_train.df.dropna(inplace=True)
PreprocessedObject_valid.df.dropna(inplace=True)
PreprocessedObject_train.df.reset_index(inplace=True)
PreprocessedObject_valid.df.reset_index(inplace=True)

df_train_final = PreprocessedObject_train.df
df_valid_final = PreprocessedObject_valid.df

In [7]:
df_train_final.drop(['index' , 'Tags', 'Body', 'Title', 'Tags_list', 'Body_Between_Tags', 'Body_ENTS', 'Title_ENTS', 'Body_Text_Cleaned'] , inplace=True , axis=1)

In [8]:
df_valid_final.drop(['index' , 'Tags', 'Body', 'Title', 'Tags_list', 'Body_Between_Tags', 'Body_ENTS', 'Title_ENTS', 'Body_Text_Cleaned'], inplace=True , axis=1)

In [9]:
datasets_train_test = DatasetDict({
    "train": Dataset.from_pandas(df_train_final),
    "test": Dataset.from_pandas(df_valid_final)
    })

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["Final_clean"], padding="max_length", truncation=True)
tokenized_datasets = datasets_train_test.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(['Final_clean'])
tokenized_datasets = tokenized_datasets.rename_column("Y", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15000
    })
})

In [12]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


model.train()
for epoch in range(num_epochs):
    print('Epoch ' , epoch + 1 , '/' , num_epochs)
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

Epoch  1 / 1


100%|██████████| 5625/5625 [1:10:05<00:00,  1.34it/s]


In [13]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.8750666666666667}