### imports 

In [16]:

# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
from sklearn.model_selection import train_test_split

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [17]:
train_test_ratio = 0.10
train_valid_ratio = 0.80

first_n_words = 350

destination_folder = "./data"

In [18]:
def trim_string(x):

    x = x.split(maxsplit=first_n_words)
    x = ' '.join(x[:first_n_words])

    return x

### preprocess data 

In [19]:

df_raw = pd.read_csv("./data/raw_train.csv")

In [23]:
df_raw

Unnamed: 0,label,title,text,titletext
0,1,,,"Decimal vs Double?. I'm new to C#, and I want ..."
1,1,,,Percentage width child in absolutely positione...
2,1,,,Tools for porting J# code to C#. Are there any...
3,1,,,How do I calculate someone's age in c#?. Given...
4,1,,,retrieve data from NSUserDefaults to TableView...
...,...,...,...,...
3370523,1,,,Dividing an array by filter function. I have a...
3370524,1,,,JavaScript Link Extractor. I am interested in ...
3370525,1,,,Selenium Remote Webdriver insane memory usage....
3370526,1,,,Searching through an array of dictionaries. I'...


In [20]:
# Prepare columns
df_raw['label'] = (df_raw['OpenStatus'] == 'open').astype('int')
df_raw['titletext'] = df_raw['Title'] + ". " + df_raw['BodyMarkdown']
df_raw = df_raw.reindex(columns=['label', 'title', 'text', 'titletext'])

# Drop rows with empty text
df_raw.drop( df_raw[df_raw.text.str.len() < 5].index, inplace=True)

# Trim text and titletext to first_n_words
df_raw['text'] = df_raw['text'].apply(trim_string)
df_raw['titletext'] = df_raw['titletext'].apply(trim_string) 

# Split according to label
df_open = df_raw[df_raw['label'] == 1]
df_closed = df_raw[df_raw['label'] == 0]

# Train-test split
df_open_full_train, df_open_test = train_test_split(df_open, train_size = train_test_ratio, random_state = 1)
df_closed_full_train, df_closed_test = train_test_split(df_closed, train_size = train_test_ratio, random_state = 1)

# Train-valid split
df_open_train, df_open_valid = train_test_split(df_open_full_train, train_size = train_valid_ratio, random_state = 1)
df_closed_train, df_closed_valid = train_test_split(df_closed_full_train, train_size = train_valid_ratio, random_state = 1)

# Concatenate splits of different labels
df_train = pd.concat([df_open_train, df_closed_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_open_valid, df_closed_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_open_test, df_closed_test], ignore_index=True, sort=False)

# Write preprocessed data
df_train.to_csv("./data" + '/train.csv', index=False)
df_valid.to_csv(destination_folder + '/valid.csv', index=False)
df_test.to_csv(destination_folder + '/test.csv', index=False)



AttributeError: Can only use .str accessor with string values!

In [None]:
df["BodyMarkdown"]

0          I'm new to C#, and I want to use a trackbar fo...
1          I've got an absolutely positioned div containi...
2          Are there any conversion tools for porting Vis...
3          Given a DateTime representing their birthday, ...
4          I save values of two labels through NSUserDefa...
                                 ...                        
3370523    I have a Javascript array that I would like to...
3370524    I am interested in extracting links from sites...
3370525    I've created a small python script to run test...
3370526    I'm making an iPhone app which displays inform...
3370527    First try\r\n\r\n    Dim holdValues() As Integ...
Name: BodyMarkdown, Length: 3370528, dtype: object

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameter
MAX_SEQ_LEN = 500
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('label', label_field), ('title', text_field), ('text', text_field), ('titletext', text_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path=source_folder, train='train.csv', validation='valid.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)

https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

https://www.tensorflow.org/text/tutorials/classify_text_with_bert

word embeddings + LSTM https://towardsdatascience.com/text-classification-on-disaster-tweets-with-lstm-and-word-embedding-df35f039c1db
