In [2]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import torch
from torch.utils import data
import transformers as ppb 
from transformers import glue_convert_examples_to_features, InputExample
from tqdm import tqdm
import os
from collections import defaultdict

In [4]:
# Read the train and dev splits, then discard the 'Unnamed: 0' column that
# both CSV files contain.
df_train = pd.read_csv('task2_data/train.csv', sep=',')
df_train = df_train.drop(columns=['Unnamed: 0'])

df_dev = pd.read_csv('task2_data/dev.csv', sep=',')
df_dev = df_dev.drop(columns=['Unnamed: 0'])

In [6]:
# Checkpoint name and the transformers classes used to load it.
pretrained_weights = 'bert-base-uncased'
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer

# Tokenizer and encoder are both loaded from the same checkpoint name.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def text_tokenization(df):
    """Tokenize the three free-text columns of ``df``.

    Rows with a missing value in any of ``company_profile``, ``description``,
    ``requirements`` or ``fraudulent`` are dropped first, so all three
    encodings are built from the same set of rows. Each column is then passed
    to the module-level ``tokenizer`` as a list, with padding, truncation and
    PyTorch tensors requested.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four columns named above.

    Returns
    -------
    tuple
        ``(profile, description, requirements)`` tokenizer outputs, in that
        order.
    """
    text_columns = ('company_profile', 'description', 'requirements')
    complete_rows = df[[*text_columns, 'fraudulent']].dropna()

    def encode(column):
        # One tokenizer call per column, always with the same options.
        texts = complete_rows[column].values.tolist()
        return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    return tuple(encode(column) for column in text_columns)

In [8]:
# Tokenize the text columns of the training split.
(train_profile,
 train_description,
 train_requirements) = text_tokenization(df_train)

In [9]:
# Tokenize the text columns of the dev split.
(dev_profile,
 dev_description,
 dev_requirements) = text_tokenization(df_dev)

In [10]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [11]:
# Move the encoder to the selected device and switch it to evaluation mode.
# Both calls return the module itself, so the chained expression still ends
# the cell with the model (and its repr is displayed).
model.to(device).eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [14]:
# Create the output directories for the saved embeddings.
# os.makedirs(..., exist_ok=True) is the portable equivalent of `mkdir -p`:
# it works on every platform, does not depend on IPython shell escapes, and
# is a no-op when the directory already exists.
for tensor_dir in ('train_tensors', 'dev_tensors'):
    os.makedirs(tensor_dir, exist_ok=True)

In [None]:
# Compute one embedding per row for every tokenized training text field and
# save it under train_tensors/. The embedding is the first output of the
# model, taken at sequence position 0 (the [CLS] position for BertTokenizer
# inputs).
#
# Two safeguards compared with a single `model(all_rows)` call:
#   * torch.no_grad(): no autograd graph is recorded, so intermediate
#     activations are freed immediately and the saved tensors do not carry
#     requires_grad.
#   * mini-batching: only EMBED_BATCH_SIZE rows go through the encoder at a
#     time instead of the whole split, which can exhaust memory.
EMBED_BATCH_SIZE = 32  # rows per forward pass; lower it if memory is still tight

train_cls = {}
with torch.no_grad():
    for field, encoded in (('profile', train_profile),
                           ('description', train_description),
                           ('requirements', train_requirements)):
        n_rows = encoded['input_ids'].shape[0]
        chunks = []
        for start in range(0, n_rows, EMBED_BATCH_SIZE):
            stop = start + EMBED_BATCH_SIZE
            output = model(encoded['input_ids'][start:stop].to(device),
                           attention_mask=encoded['attention_mask'][start:stop].to(device))
            # output[0] is indexed as (row, position, feature); keep position 0.
            chunks.append(output[0][:, 0, :])
        # Stitch the per-batch results back into one (n_rows, hidden) tensor.
        train_cls[field] = torch.cat(chunks)
        torch.save(train_cls[field], f'train_tensors/train_{field}_pred.pt')

# Keep the original variable names available for later cells.
train_profile_pred = train_cls['profile']
train_description_pred = train_cls['description']
train_requirements_pred = train_cls['requirements']

In [None]:
# Compute one embedding per row for every tokenized dev text field and save
# it under dev_tensors/. The embedding is the first output of the model,
# taken at sequence position 0 (the [CLS] position for BertTokenizer inputs).
#
# Two safeguards compared with a single `model(all_rows)` call:
#   * torch.no_grad(): no autograd graph is recorded, so intermediate
#     activations are freed immediately and the saved tensors do not carry
#     requires_grad.
#   * mini-batching: only EMBED_BATCH_SIZE rows go through the encoder at a
#     time instead of the whole split, which can exhaust memory.
EMBED_BATCH_SIZE = 32  # rows per forward pass; lower it if memory is still tight

dev_cls = {}
with torch.no_grad():
    for field, encoded in (('profile', dev_profile),
                           ('description', dev_description),
                           ('requirements', dev_requirements)):
        n_rows = encoded['input_ids'].shape[0]
        chunks = []
        for start in range(0, n_rows, EMBED_BATCH_SIZE):
            stop = start + EMBED_BATCH_SIZE
            output = model(encoded['input_ids'][start:stop].to(device),
                           attention_mask=encoded['attention_mask'][start:stop].to(device))
            # output[0] is indexed as (row, position, feature); keep position 0.
            chunks.append(output[0][:, 0, :])
        # Stitch the per-batch results back into one (n_rows, hidden) tensor.
        dev_cls[field] = torch.cat(chunks)
        torch.save(dev_cls[field], f'dev_tensors/dev_{field}_pred.pt')

# Keep the original variable names available for later cells.
dev_profile_pred = dev_cls['profile']
dev_description_pred = dev_cls['description']
dev_requirements_pred = dev_cls['requirements']