!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import time
import sys
import copy
import torch 
from scipy.sparse import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pyarrow as pa

import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset,DataLoader
from transformers import DistilBertConfig,DistilBertTokenizer,DistilBertModel

In [2]:
# Single seed shared by every RNG used in this notebook.
RANDOM_SEED = 0

# Seed PyTorch and NumPy so repeated runs draw the same random numbers.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Read the reviews CSV into a DataFrame, then preview its first ten rows
# (the bare last expression is rendered as the cell output).
df = pd.read_csv(filepath_or_buffer="data/amazon-product-reviews/Reviews.csv")
df.head(n=10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [14]:
# Peek at the 'Summary' value of the second row (position 1).
df['Summary'].iloc[1]

'Not as Advertised'

In [4]:
# Pull out the review body and its rating (rating between 1-5).
texts, scores = df['Text'], df['Score']

In [5]:
# Materialise the review texts as a NumPy array. `.to_numpy()` is the accessor
# pandas recommends over `.values` because it always returns an ndarray,
# regardless of the column's backing dtype.
X_train = texts.to_numpy()

In [6]:
# Build a DistilBertConfig using the library's default hyperparameters
# (no arguments are overridden here).
config = DistilBertConfig()
# Bare expression: the notebook renders the config as this cell's output.
config

DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.6.0",
  "vocab_size": 30522
}

In [7]:
# Tokenizer loaded from the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Fixed token length used when encoding reviews (read by `tokenize`).
max_seq_length = 256

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
def tokenize(x):
    """Encode one text into fixed-length DistilBERT inputs.

    The text is encoded with the module-level ``tokenizer``: special tokens
    are added, the sequence is truncated to ``max_seq_length`` and then padded
    up to exactly ``max_seq_length``.

    Args:
        x: The text to encode.

    Returns:
        A ``(input_ids, attention_mask)`` tuple of flattened (1-D) tensors.
    """
    encoding = tokenizer.encode_plus(
        x,
        add_special_tokens=True,
        max_length=max_seq_length,
        truncation=True,
        # `padding='max_length'` is the supported replacement for the
        # deprecated `pad_to_max_length=True` flag.
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # The tokenizer returns tensors with a leading batch axis of size 1;
    # flatten them so callers receive plain 1-D sequences.
    return encoding['input_ids'].flatten(), encoding['attention_mask'].flatten()

In [9]:

#attention_mask

In [10]:
class DistilBertSequence(nn.Module):
    """Pretrained DistilBERT encoder that returns one vector per input sequence.

    The module loads the 'distilbert-base-uncased' checkpoint and, in
    ``forward``, returns the hidden state at position 0 of every sequence.
    No classification head is attached; the raw encoder feature is returned.
    """

    def __init__(self, config):
        """Load the pretrained encoder.

        Args:
            config: Accepted for interface compatibility; it is not read here,
                the encoder is built entirely from the pretrained checkpoint.
        """
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

    def forward(self, input_ids=None, attention_mask=None):
        """Encode a batch and return the position-0 hidden state.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            ``hidden[:, 0]`` where ``hidden`` is the first element of the
            encoder output (presumably shaped (batch, seq_len, dim) - TODO confirm).
        """
        encoder_outputs = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        last_hidden = encoder_outputs[0]
        # Keep only the first position of each sequence as its summary vector.
        return last_hidden[:, 0]

In [11]:
# Tokenize the first review and encode it with DistilBERT.
# Inference only, so gradient tracking is disabled for the whole block.
with torch.no_grad():
    token_ids, attention_mask = tokenize(X_train[0])
    model = DistilBertSequence(config)
    # Make inference mode explicit on the wrapper (disables dropout layers).
    model.eval()

    # `tokenize` returns flat 1-D tensors; add a leading batch axis of size 1.
    x = token_ids.reshape(1, len(token_ids))
    # The mask must carry the same (batch, seq_len) layout as the ids;
    # `attention_mask` itself is left 1-D for any later cell that reads it.
    batched_mask = attention_mask.reshape(1, len(attention_mask))

    # Call the module itself rather than `.forward` so registered hooks run.
    out = model(input_ids=x, attention_mask=batched_mask)
    print(out)



Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[-3.5643e-02, -3.5159e-02,  1.5530e-02, -2.3242e-01,  2.2430e-02,
         -7.0673e-02,  1.9809e-01,  3.2892e-01, -6.6505e-02, -1.9768e-01,
          1.2504e-01,  1.3313e-01,  1.4475e-01,  1.1562e-01, -3.0118e-01,
          1.3435e-01,  3.3235e-02,  5.0848e-01,  1.9977e-01,  1.1225e-01,
          8.9052e-02, -6.2093e-01,  1.2021e-01,  3.3409e-01,  5.6048e-02,
          1.2262e-01,  2.0878e-01,  1.4484e-01,  2.3509e-01, -1.4653e-01,
          3.7198e-01,  1.2625e-01, -1.4292e-01, -2.4212e-01,  2.0895e-01,
         -1.8431e-02,  1.6187e-01, -2.4050e-01, -2.8422e-01,  5.2403e-02,
         -1.2108e-01,  2.8366e-01,  6.1746e-02,  1.5687e-01, -9.4009e-02,
         -1.7512e-01, -3.0254e+00, -1.1415e-01,  1.1723e-01, -1.5747e-01,
          1.9896e-01, -1.5239e-01,  1.0354e-02,  2.8528e-01,  4.2127e-01,
          5.2762e-01, -6.2068e-01,  2.3257e-01, -2.4997e-01, -5.2612e-02,
          1.0061e-01,  1.5803e-01, -3.2713e-02,  1.8643e-01, -1.3009e-01,
          3.0736e-01, -1.2108e-01,  4.