### imports 

In [1]:

# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
from sklearn.model_selection import train_test_split

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [2]:
train_test_ratio = 0.10
train_valid_ratio = 0.80

first_n_words = 350

destination_folder = "./data"

In [3]:
def trim_string(x):
    x = x.split(maxsplit=first_n_words)
    x = ' '.join(x[:first_n_words])

    return x

### preprocess data 

In [4]:
df_raw = pd.read_csv("./data/raw_train.csv")

In [10]:
# df_raw[df_raw["OpenStatus"] != "open"]
df_raw.nunique()


PostId                                 3370528
PostCreationDate                       3295724
OwnerUserId                             622689
OwnerCreationDate                       620219
ReputationAtPostCreation                 27151
OwnerUndeletedAnswerCountAtPostTime       2857
Title                                  3363198
BodyMarkdown                           3370209
Tag1                                     20059
Tag2                                     30027
Tag3                                     32954
Tag4                                     31243
Tag5                                     27850
PostClosedDate                           70070
OpenStatus                                   5
dtype: int64

In [None]:
not_open

In [31]:
df_short = df_raw.loc[0:10000]
df_short[df_short["OpenStatus"] != "open"]


#does the tite match the question
# using open status to give reccomendations to a person 
# generate a better title based on the body markdwn 


Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus
7,23,08/01/2008 12:09:41,48,08/01/2008 13:25:15,1,0,Latest information on PHP upcoming releases,I'm trying to track the progress of PHP 5.3 an...,php,,,,,05/18/2012 11:12:42,not constructive
30,126,08/01/2008 16:10:30,58,08/01/2008 13:56:33,11,1,How would you access Object properties from wi...,"What is the ""purist"" or ""correct"" way to acces...",oo,java,php,theory,,05/08/2012 18:11:27,not constructive
31,129,08/01/2008 16:22:42,48,08/01/2008 13:25:15,11,1,How to export data from SQL Server to MySQL,I've been banging my head against SQL Server 2...,csv,ansi,sql,php,mssql,07/03/2012 14:30:16,off topic
37,173,08/01/2008 18:33:08,83,08/01/2008 16:31:56,16,4,How do I version my MS SQL database in SVN?,I've been wanting to get my databases under ve...,subversion,svn,sql,mssql,versioncontrol,06/29/2012 15:08:28,not constructive
41,177,08/01/2008 18:37:55,83,08/01/2008 16:31:56,16,4,How do I programmatically create a PDF in my ....,Please recommend a good library for programmat...,pdf,.net,,,,04/25/2012 11:32:29,not constructive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9822,84799,09/17/2008 15:58:44,7938,09/15/2008 14:44:30,66,18,What is the single best free Eclipse plugin fo...,Some Eclipse plugins are mandated by your envi...,java,eclipse-plugin,eclipse,ide,,09/20/2011 02:06:42,not constructive
9860,85114,09/17/2008 16:36:57,3957,09/01/2008 02:50:44,447,26,How do you structure a development sprint?,So I have a backlog of features and we are abo...,sprint,agile,team,teamwork,,05/05/2012 18:53:11,off topic
9918,85636,09/17/2008 17:31:29,15054,09/17/2008 05:31:20,43,3,How do I erase my disk? (In a secure way) (Ubu...,I mean really erase. In a secure way. (It's Ub...,linux,security,ubuntu,disk,,01/25/2012 19:28:04,off topic
9972,86177,09/17/2008 18:31:46,5802,09/11/2008 12:26:41,240,24,Books for an ASP.NET Developer moving to Java,Similar to this question http://stackoverflow....,books,java,ap.net,asp.net-mvc,nhibernate,09/28/2011 11:33:16,not constructive


In [26]:

df_short['label'] = (df_short['OpenStatus'] == 'open').astype('int')
df_short['titletext'] = df_short['Title'] + ". " + df_short['BodyMarkdown']

# Trim text and titletext to first_n_words
df_short['BodyMarkdown'] = df_short['BodyMarkdown'].apply(trim_string)
df_short['titletext'] = df_short['titletext'].apply(trim_string) 


# Split according to label
df_open = df_short[df_short['label'] == 1]
df_closed = df_short[df_short['label'] == 0]
# Train-test split
df_open_full_train, df_open_test = train_test_split(df_open, train_size = train_test_ratio, random_state = 1)
df_closed_full_train, df_closed_test = train_test_split(df_closed, train_size = train_test_ratio, random_state = 1)

# Train-valid split
df_open_train, df_open_valid = train_test_split(df_open_full_train, train_size = train_valid_ratio, random_state = 1)
df_closed_train, df_closed_valid = train_test_split(df_closed_full_train, train_size = train_valid_ratio, random_state = 1)

# Concatenate splits of different labels
df_train = pd.concat([df_open_train, df_closed_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_open_valid, df_closed_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_open_test, df_closed_test], ignore_index=True, sort=False)

# Write preprocessed data
df_train.to_csv("./data" + '/train.csv', index=False)
df_valid.to_csv(destination_folder + '/valid.csv', index=False)
df_test.to_csv(destination_folder + '/test.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['label'] = (df_short['OpenStatus'] == 'open').astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['titletext'] = df_short['Title'] + ". " + df_short['BodyMarkdown']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['BodyMarkdown'] = df_short['BodyMarkdown'].apply(tr

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameter
MAX_SEQ_LEN = 500
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('label', label_field), ('title', text_field), ('text', text_field), ('titletext', text_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path="./data", train='train.csv', validation='valid.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text), train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                             train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, train=False, shuffle=False, sort=False)

In [29]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea

In [30]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    
    if load_path==None:
        return
    
    state_dict = torch.load(load_path)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [None]:
# Training Function

def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, title, text, titletext), _ in train_loader:
            labels = labels.type(torch.LongTensor)           
            labels = labels.to(device)
            titletext = titletext.type(torch.LongTensor)  
            titletext = titletext.to(device)
            output = model(titletext, labels)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    

                    # validation loop
                    for (labels, title, text, titletext), _ in valid_loader:
                        labels = labels.type(torch.LongTensor)           
                        labels = labels.to(device)
                        titletext = titletext.type(torch.LongTensor)  
                        titletext = titletext.to(device)
                        output = model(titletext, labels)
                        loss, _ = output
                        
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

model = BERT()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

https://www.tensorflow.org/text/tutorials/classify_text_with_bert

word embeddings + LSTM https://towardsdatascience.com/text-classification-on-disaster-tweets-with-lstm-and-word-embedding-df35f039c1db

https://www.kaggle.com/c/predict-closed-questions-on-stack-overflow/data

In [None]:
# # Prepare columns
# df_raw['label'] = (df_raw['OpenStatus'] == 'open').astype('int')
# df_raw['titletext'] = df_raw['Title'] + ". " + df_raw['BodyMarkdown']
# # df_raw = df_raw.reindex(columns=['label', 'Title', 'BodyMarkdown', 'titletext'])

# # # Drop rows with empty text
# # df_raw.drop( df_raw[df_raw.text.str.len() < 5].index, inplace=True)

# # Trim text and titletext to first_n_words
# df_raw['BodyMarkdown'] = df_raw['BodyMarkdown'].apply(trim_string)
# df_raw['titletext'] = df_raw['titletext'].apply(trim_string) 

# # Split according to label
# df_open = df_raw[df_raw['label'] == 1]
# df_closed = df_raw[df_raw['label'] == 0]

# # Train-test split
# df_open_full_train, df_open_test = train_test_split(df_open, train_size = train_test_ratio, random_state = 1)
# df_closed_full_train, df_closed_test = train_test_split(df_closed, train_size = train_test_ratio, random_state = 1)

# # Train-valid split
# df_open_train, df_open_valid = train_test_split(df_open_full_train, train_size = train_valid_ratio, random_state = 1)
# df_closed_train, df_closed_valid = train_test_split(df_closed_full_train, train_size = train_valid_ratio, random_state = 1)

# # Concatenate splits of different labels
# df_train = pd.concat([df_open_train, df_closed_train], ignore_index=True, sort=False)
# df_valid = pd.concat([df_open_valid, df_closed_valid], ignore_index=True, sort=False)
# df_test = pd.concat([df_open_test, df_closed_test], ignore_index=True, sort=False)

# # Write preprocessed data
# df_train.to_csv("./data" + '/train.csv', index=False)
# df_valid.to_csv(destination_folder + '/valid.csv', index=False)
# df_test.to_csv(destination_folder + '/test.csv', index=False)

