In [7]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from torch import nn
import os

In [8]:
# Define the model
class BertForMultivariateRegression(nn.Module):
    def __init__(self, model_name_or_path, num_labels=5):
        super(BertForMultivariateRegression, self).__init__()
        self.num_labels = num_labels
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.bert = AutoModel.from_pretrained(model_name_or_path, config=self.config)
        self.regression = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels),
            #nn.ReLU(),
            #nn.Dropout(0.1),
            #nn.Linear(128, ),
        )
        self.init_weights()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

    def init_weights(self):
        for module in self.regression:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        logits = self.regression(pooler_output)

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits, labels)

        return (loss, logits) if loss is not None else logits

    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        self.config.save_pretrained(save_directory)
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

    @classmethod
    def from_pretrained(cls, save_directory, model_name_or_path, num_labels=5):
        config = AutoConfig.from_pretrained(save_directory)
        model = cls(model_name_or_path, num_labels=num_labels)
        state_dict = torch.load(os.path.join(save_directory, "pytorch_model.bin"), map_location=torch.device('cpu'))
        model.load_state_dict(state_dict)
        return model

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model
model = BertForMultivariateRegression.from_pretrained("./saved_model", model_name_or_path="nlpaueb/bert-base-greek-uncased-v1", num_labels=5)
model = model.to(device)
model.eval()


Using device: cuda


config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/454M [00:00<?, ?B/s]

BertForMultivariateRegression(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")

# Load the prediction data
prediction_data = pd.read_csv("cleaned_draw2texts.csv", sep=",", engine='python')

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/530k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
len(prediction_data)

119

In [12]:
prediction_data

Unnamed: 0,text
0,"Και πριν αυτος παυση λαλων , ιδου , εξηρχετο η..."
1,"Και την τριτην ημεραν , ημεραν των γενεθλιων τ..."
2,Οι οφθαλμοι σας ειδον τι εκαμεν ο Κυριος εξ αι...
3,Και εβοησεν ο Αχιμαας και ειπε προς τον βασιλε...
4,Και ηλθε Σαφαν ο γραμματευς προς τον βασιλεα κ...
...,...
114,Με την πρόταση αυτή η Επιτροπή επιχειρεί να πρ...
115,Αυτές οι προτάσεις για μεγαλύτερη διαφάνεια στ...
116,"Κυρία Πρόεδρε , κύριε Πρόεδρε του Συμβουλίου M..."
117,Αυτό αποτελεί μια σημαντική ευκαιρία να τεθούν...


In [18]:
# Load the scaling parameters
#with open("scaling_params.json", "r") as f:
#    scaling_params = json.load(f)

# Function to check if text exceeds token limit
def exceeds_token_limit(text, max_length=512):
    return len(tokenizer.encode(text)) > max_length

In [14]:
# Filter out texts exceeding token limit
valid_mask = prediction_data['text'].apply(lambda x: not exceeds_token_limit(x))
filtered_data = prediction_data[valid_mask]

# Tokenize the valid input texts
inputs = tokenizer(filtered_data['text'].tolist(), padding=True, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

In [15]:
# Make predictions
with torch.no_grad():
    predictions = model(**inputs)
    predictions = predictions.cpu().numpy()

# Create a DataFrame with the predictions
predicted_data = pd.DataFrame(predictions, columns=['1', '2', '3', '4', '5'])

In [19]:
scaling_params = {'min_vals': [0.9014031435288774,
  -0.20944372492040664,
  0.3389173248677211,
  0.5175195569772773,
  0.9111702659370386],
 'scale': [0.09026005554938518,
  0.11147839708128467,
  0.05688428995404781,
  0.0417984150486127,
  0.014517997301216296],
 'data_min': [-21.0658317453454,
  -7.0915647854459,
  -23.5375588927861,
  -36.3056722416953,
  -131.641453451498],
 'data_max': [1.09236423433371,
  10.8491309220973,
  11.6215333911404,
  11.5430320135724,
  6.11859419863091]}

In [20]:
# Scale the predictions to their original ranges
for i in range(5):
    scaler = MinMaxScaler(feature_range=(scaling_params['data_min'][i], scaling_params['data_max'][i]))
    predicted_data[[str(i+1)]] = scaler.fit_transform(predicted_data[[str(i+1)]])

# Add the original text to the predictions
predicted_data['text'] = filtered_data['text'].reset_index(drop=True)

In [17]:
# Create a DataFrame for all original data, with NaN for removed texts
full_predicted_data = pd.DataFrame(index=prediction_data.index, columns=['text', '1', '2', '3', '4', '5'])
full_predicted_data.loc[valid_mask, ['1', '2', '3', '4', '5']] = predicted_data[['1', '2', '3', '4', '5']]
full_predicted_data['text'] = prediction_data['text']

In [18]:
# Display the first few rows of the predictions, including removed texts
print(full_predicted_data.head(10))

# Print the number of removed texts
num_removed = (~valid_mask).sum()
print(f"\nNumber of texts removed due to exceeding token limit: {num_removed}")

# Save the results to a CSV file
full_predicted_data.to_csv("cleaned_draw2text_predicted.csv", index=False)
print("\nPredictions saved to 'cleaned_draw2text_predicted.csv'")

                                                text          1         2  \
0  Και πριν αυτος παυση λαλων , ιδου , εξηρχετο η... -16.040192  0.746675   
1  Και την τριτην ημεραν , ημεραν των γενεθλιων τ... -10.927856 -0.993885   
2  Οι οφθαλμοι σας ειδον τι εκαμεν ο Κυριος εξ αι... -11.083191 -0.825089   
3  Και εβοησεν ο Αχιμαας και ειπε προς τον βασιλε... -13.251968 -0.657799   
4  Και ηλθε Σαφαν ο γραμματευς προς τον βασιλεα κ... -10.781921  -0.68861   
5  Τοτε εσηκωθησαν οι αρχηγοι των πατριων του Ιου... -10.287888 -1.609157   
6  Η Εσθηρ δεν εφανερωσε την συγγενειαν αυτης ουτ... -11.145554 -1.151065   
7  Και ευρον οτι πικροτερα ειναι παρα θανατον η γ... -13.259445  2.831925   
8  Και ουτω θελει προσμεινει ο Κυριος δια να σας ... -15.395737  0.030364   
9  Και θελω συντριψει αυτους μετ αλληλων , και το... -11.052994  0.568834   

          3          4          5  
0  8.190218  -9.984833 -26.493286  
1   5.70701  -8.580276 -30.511108  
2  4.103956 -13.183258 -22.870483  
3  4.665

In [19]:
num_removed = (~valid_mask).sum()
print(f"Number of texts removed due to exceeding token limit: {num_removed}")

Number of texts removed due to exceeding token limit: 0


In [20]:
removed_texts = prediction_data[~valid_mask]
print(removed_texts)

Empty DataFrame
Columns: [text]
Index: []


In [21]:
token_lengths = prediction_data['text'].apply(lambda x: len(tokenizer.encode(x)))
print("Texts exceeding token limit:")
print(prediction_data[token_lengths > 512])

Texts exceeding token limit:
Empty DataFrame
Columns: [text]
Index: []


In [22]:
print(full_predicted_data.tail())

                                                  text         1         2  \
114  Με την πρόταση αυτή η Επιτροπή επιχειρεί να πρ... -3.500702 -6.479919   
115  Αυτές οι προτάσεις για μεγαλύτερη διαφάνεια στ... -3.174026 -5.971175   
116  Κυρία Πρόεδρε , κύριε Πρόεδρε του Συμβουλίου M... -7.844177 -3.853869   
117  Αυτό αποτελεί μια σημαντική ευκαιρία να τεθούν... -1.122894    -5.959   
118  Μια ακόμα κρίσιμης σημασίας παράμετρος είναι ,...  0.580551 -5.980046   

             3          4          5  
114 -17.953266  -8.618385 -55.333374  
115   -15.6772 -11.498337 -49.696289  
116  -8.865986 -11.874023 -73.174744  
117 -16.702127 -12.272057 -73.485474  
118 -15.609471  -9.517403 -43.477783  


In [23]:
print("Shape of filtered_data:", filtered_data.shape)
print("Shape of predictions:", predictions.shape)

Shape of filtered_data: (119, 1)
Shape of predictions: (119, 5)


In [41]:
print("Sum of valid_mask:", valid_mask.sum())
print("Number of rows in filtered_data:", len(filtered_data))

Sum of valid_mask: 119
Number of rows in filtered_data: 119


In [42]:
print("Shape of prediction_data:", prediction_data.shape)
print("Shape of filtered_data:", filtered_data.shape)
print("Shape of predictions:", predictions.shape)
print("Shape of predicted_data:", predicted_data.shape)
print("Shape of full_predicted_data:", full_predicted_data.shape)

Shape of prediction_data: (120, 2)
Shape of filtered_data: (119, 2)
Shape of predictions: (119, 5)
Shape of predicted_data: (119, 6)
Shape of full_predicted_data: (120, 6)
