# 1. IMPORT LIBRARIES

In [12]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, OPTForCausalLM, GPT2Tokenizer
import torch

In [1]:
import pandas as pd
import numpy as np
import csv

# 2. IMPORT FILES

In [2]:
# Experiment result files, listed from experiment 10 down to experiment 1.
file_paths = [f'embeddings_1.3b_exp_{run}.csv' for run in range(10, 0, -1)]
file_paths

['embeddings_1.3b_exp_10.csv',
 'embeddings_1.3b_exp_9.csv',
 'embeddings_1.3b_exp_8.csv',
 'embeddings_1.3b_exp_7.csv',
 'embeddings_1.3b_exp_6.csv',
 'embeddings_1.3b_exp_5.csv',
 'embeddings_1.3b_exp_4.csv',
 'embeddings_1.3b_exp_3.csv',
 'embeddings_1.3b_exp_2.csv',
 'embeddings_1.3b_exp_1.csv']

# 3. READ FILES AND FIND INDEXES COMMON TO ALL FILES

In [6]:
# Collect, for each experiment file, the set of distinct values in its 'Index' column.
# Only that column is needed here, so usecols skips parsing the rest of each CSV.
indexes = [
    set(pd.read_csv(path, usecols=['Index'])['Index'])
    for path in file_paths
]

In [7]:
# Keep only the 'Index' values that occur in every per-file set collected in `indexes`.
# NOTE: set.intersection(*indexes) raises TypeError when `indexes` is empty (no files listed).
common_indexes = set.intersection(*indexes)
common_indexes

{2,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 13,
 14,
 16,
 18,
 19,
 21,
 34,
 35,
 45,
 50,
 51,
 116,
 133,
 440,
 864,
 1948,
 3216,
 4236,
 21402,
 24303,
 25522,
 45152,
 49463}

# 4. RANK AND ORDER TOKENS BY TOTAL WEIGHT VARIATION ACROSS EXPERIMENTS

In [16]:
# Load every experiment file into its own DataFrame.
dataframes = [pd.read_csv(path) for path in file_paths]

# 'Index' values that appear in all of the loaded frames.
common_indexes = set.intersection(*[set(frame['Index']) for frame in dataframes])

# Stack the rows whose 'Index' is shared by every file into one long frame.
filtered_data = pd.concat(
    [frame.loc[frame['Index'].isin(common_indexes)] for frame in dataframes]
)

# Total 'Value' per 'Index' across all files, largest total first, renumbered from 0.
result_df = (
    filtered_data
    .groupby('Index')['Value']
    .sum()
    .reset_index()
    .sort_values('Value', ascending=False)
    .reset_index(drop=True)
)
result_df

Unnamed: 0,Index,Value
0,50,3.034466
1,440,2.815514
2,49463,2.338238
3,3216,2.263357
4,1948,2.234558
5,116,2.192341
6,21402,1.952436
7,35,1.822328
8,4236,1.750215
9,2,1.735335


In [17]:
# Tokenizer for the facebook/opt-1.3b checkpoint, loaded through the GPT2Tokenizer class.
opt_checkpoint = "facebook/opt-1.3b"
OPT_tokenizer = GPT2Tokenizer.from_pretrained(opt_checkpoint)

In [18]:
def _decode_token_id(token_id):
    """Decode one vocabulary id with OPT_tokenizer; returns batch_decode's list result."""
    # batch_decode is given a 1-element tensor built from the single id.
    id_batch = torch.tensor(token_id).unsqueeze(0)
    return OPT_tokenizer.batch_decode(id_batch)


# Attach the decoded text for each 'Index' as a new 'Token' column (each cell holds a list).
result_df['Token'] = result_df['Index'].apply(_decode_token_id)
result_df

Unnamed: 0,Index,Value,Token
0,50,3.034466,[ or]
1,440,2.815514,[ No]
2,49463,2.338238,[.}]
3,3216,2.263357,[ Yes]
4,1948,2.234558,[ answer]
5,116,2.192341,[?]
6,21402,1.952436,[�]
7,35,1.822328,[:]
8,4236,1.750215,[ �]
9,2,1.735335,[</s>]
