In [1]:
import os
from glob import glob
from datetime import datetime
from collections import namedtuple
from pickle import dump, load

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import transformers
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
import optuna

from arabert.preprocess import ArabertPreprocessor

In [2]:
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
model = AutoModelForSequenceClassification.from_pretrained("models/finalized_models/2021-09-30-train-0.8921535648994515",output_hidden_states=True)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
def word_index(sentence, word):
     return sentence.split(" ").index(word)
 
 
def get_hidden_states(encoded, token_ids_word, model, layers):
     with torch.no_grad():
         output = model(**encoded)
 
     states = output.hidden_states
     output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
     word_tokens_output = output[token_ids_word]
 
     return word_tokens_output.mean(dim=0)
 
 
def get_embeddings(sentence, idx, tokenizer, model, layers):
     encoded = tokenizer.encode_plus(sentence, return_tensors="pt")
     token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
 
     return get_hidden_states(encoded, token_ids_word, model, layers)
   
layers = [-4, -3, -2, -1]
sentence = "هلا كيف الحال" 
idx = word_index(sentence, "هلا")
word_embedding = get_embeddings(sentence, idx, tokenizer, model, layers)

word_embedding  

tensor([-7.1698e-01,  2.8024e-02, -7.8250e-01, -4.1261e-01, -2.0064e+00,
        -1.8946e+00,  6.1162e-02,  3.5752e-01, -9.4477e-01,  4.2975e-01,
        -2.6864e+00,  7.0915e-01, -5.5534e-01,  6.2803e-01, -2.3906e+00,
         7.3605e-01, -6.6854e-01, -7.3385e-01,  4.4926e-01, -8.7689e-01,
         3.2938e-02, -1.2178e-01,  1.6147e+00,  1.5512e+00, -2.1013e-01,
        -2.1563e+00,  7.0385e-01, -1.3519e+00,  2.0527e+00, -1.3488e+00,
         9.2105e-01, -1.4275e+00, -7.1441e-01, -8.2480e-01,  2.1275e-01,
        -1.1187e+00,  1.0586e+00, -5.6239e-01,  1.3157e+00, -6.0936e-01,
         5.2843e-01, -1.0474e+00,  1.4777e-01, -4.1643e-01, -6.2539e-01,
        -1.3893e+00,  4.1672e+00,  8.2867e-01, -4.6922e-01, -7.5671e-01,
         9.8978e-01, -1.1431e-01, -2.8773e-01, -1.1870e+00, -2.1438e+00,
        -1.3279e+00, -2.2794e+00, -3.0283e-02,  3.1005e-01,  1.5610e+00,
        -5.0476e-01,  8.8375e-02,  1.2676e+00,  5.8302e-01, -6.5239e-01,
         4.2560e+00,  5.1048e-01,  3.7323e-01, -1.6

In [5]:
word_embedding.shape

torch.Size([768])

In [6]:
model.base_model.embeddings(np.array("هلا"))

TypeError: 'int' object is not callable

In [12]:
model.get_input_embeddings()

Embedding(64000, 768, padding_idx=0)