In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2022-04-13 23:13:16.771989: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-13 23:13:16.772119: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
labels = {'Democrat': 0,'Republic': 1}

class Dataset(torch.utils.data.Dataset):
  def __init__(self, df):
    self.labels = []
    self.texts = []

    # get labels and texts
    self.labels = [labels[label] for label in df.iloc[:,0]]
    self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,
                              return_tensors="pt") for text in df.iloc[:,1]]
      

  def classes(self):
    return self.labels

  def __len__(self):
    return len(self.labels)

  def get_batch_labels(self, idx):
    # Fetch a batch of labels
    return np.array(self.labels[idx])

  def get_batch_texts(self, idx):
    # Fetch a batch of inputs
    return self.texts[idx]

  def __getitem__(self, idx):
    batch_texts = self.get_batch_texts(idx)
    batch_y = self.get_batch_labels(idx)

    return batch_texts, batch_y

In [4]:
class DistillBertClassifier(nn.Module):
  def __init__(self, dropout=0.5):
    super().__init__()
    config = DistilBertConfig(dropout=0.5, attention_dropout=0.5, output_hidden_states=True)
    self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 2)
    self.relu = nn.ReLU()

  def forward(self, input_id, mask):
    pooled_output = self.bert(input_ids = input_id, attention_mask=mask, return_dict=False)
    dropout_output = self.dropout(pooled_output[0])
    linear_output = self.linear(dropout_output)
    final_layer = self.relu(linear_output)
    result = final_layer.view(1, 1024)

    return result

In [10]:
EPOCHS = 5
LR = 1e-6

np.random.seed(112)
df = pd.read_csv('party_transcript.csv')
# df = df.drop(columns=['Handle'])
# df = df.sample(n=3000, random_state=2)
dataset = Dataset(df)

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

In [11]:
model = DistillBertClassifier()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
      model = model.cuda()
      criterion = criterion.cuda()

    for epoch_num in range(epochs):

      total_acc_train = 0
      total_loss_train = 0

      for train_input, train_label in tqdm(train_dataloader):

        train_label = train_label.to(device)
        mask = train_input['attention_mask'].to(device)
        input_id = train_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)
        batch_loss = criterion(output, train_label)
        total_loss_train += batch_loss.item()
        
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
            
      total_acc_val = 0
      total_loss_val = 0

      with torch.no_grad():
        for val_input, val_label in val_dataloader:
          val_label = val_label.to(device)
          mask = val_input['attention_mask'].to(device)
          input_id = val_input['input_ids'].squeeze(1).to(device)

          output = model(input_id, mask)

          batch_loss = criterion(output, val_label)
          total_loss_val += batch_loss.item()
          
          acc = (output.argmax(dim=1) == val_label).sum().item()
          total_acc_val += acc
      
      print(
          f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
          | Train Accuracy: {total_acc_train / len(train_data): .3f} \
          | Val Loss: {total_loss_val / len(val_data): .3f} \
          | Val Accuracy: {total_acc_val / len(val_data): .3f}')

In [13]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
  
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
train(model, df_train, df_val, LR, EPOCHS)

In [None]:
evaluate(model, df_test)
torch.save(model.state_dict(), "./model_state")

In [23]:
# test on our own data
model = DistillBertClassifier()
model.load_state_dict(torch.load('./model_state', map_location=torch.device('cpu')))
evaluate(model, df_test)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.547


In [64]:
df = pd.read_csv('sentence_fip.csv')
texts = df.iloc[:, 1]
texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for text in texts]
test_dataloader = torch.utils.data.DataLoader(texts, batch_size=1)

In [None]:
import csv

labels = {0:1, 1:0}

with torch.no_grad():
    line = 0
    f = open('./prediction', 'w')
    writer = csv.writer(f)
    writer.writerow(['FIPS', 'Pred'])
    for test_input in test_dataloader:
        mask = test_input['attention_mask'].to("cuda")
        input_id = test_input['input_ids'].squeeze(1).to("cuda")
        predict = model(input_id, mask).argmax(dim=1)
        writer.writerow([df.iloc[line, 0], labels[int(predict)]])
        line += 1
    f.close()