In [9]:
import pandas as pd
import glob
import datasets
import numpy as np

In [10]:
# Read the Wikidata property list and keep only the four columns used for the
# Title/Description lookups further down.
wiki_data = pd.read_csv("../../data/wikidata/wikidata-property-list.csv").loc[
    :, ["Title", "ID", "Datatype", "Description"]
]
wiki_data

Unnamed: 0,Title,ID,Datatype,Description
0,head of government,6,item,"head of the executive power of a town, city, m..."
1,brother,7,item,subject has the object as their brother
2,sister,9,item,subject has the object as their sister (female...
3,video,10,Commons file,relevant video
4,highway marker,14,Commons file,graphic representing the highway
...,...,...,...,...
1193,catholic.ru ID,1453,string,identifier on the site catholic.ru
1194,legal form,1454,item,legal form of an organization
1195,list of works,1455,item,link to the article with the works of a person
1196,list of monuments,1456,item,link to the list of heritage monuments in the ...


In [16]:
# Per-language, per-relation distribution shift between the full Fact-Completion
# dataset and the rows listed in the llama-30b error-analysis logs.
results_dict = {
    "language": [],
    "relation": [],
    "percentage change": [],
    "new ratio of rows": [],
    "old ratio of rows": [],
}

hf_df = datasets.load_dataset("CalibraGPT/Fact-Completion")
# glob returns matches in arbitrary (filesystem) order; sorting makes the row
# order -- and therefore the order of tied rows after the sort below -- reproducible.
file_names = sorted(
    glob.glob("../../data/result_logs/llama-30b/error-analysis/*.csv")
)

# Set to True to print, per language, the percentage of dataset rows that are NOT
# in the error log, to confirm the data is grabbed correctly against the LLaMa figure.
PRINT_ACCURACY_CHECK = False

# calculate the distribution shift by language by relation present in the errors
for file in file_names:
    # The language is the last "-"-separated token before ".csv"; its capitalized
    # form is also the key used to index hf_df.
    language = file.split(".csv")[0].split("-")[-1].capitalize()
    error_df = pd.read_csv(file)
    full_hf_df = hf_df[language].to_pandas()

    if PRINT_ACCURACY_CHECK:
        print(
            language,
            ":",
            np.round((len(full_hf_df) - len(error_df)) / len(full_hf_df) * 100, 3),
            "%",
        )

    # Only relations that occur in the error log are reported (sorted, unique).
    relations = np.unique(np.array(error_df.relation.tolist()))
    for relation in relations:
        # Share of rows carrying this relation among the errors vs. the full data.
        new_ratio = len(error_df[error_df["relation"] == relation]) / len(error_df)
        old_ratio = len(full_hf_df[full_hf_df["relation"] == relation]) / len(
            full_hf_df
        )
        results_dict["relation"].append(relation)
        results_dict["language"].append(language)
        # Relative change of the relation's share, in percent.
        results_dict["percentage change"].append(
            np.round(100 * (new_ratio - old_ratio) / old_ratio, 3)
        )
        results_dict["new ratio of rows"].append(np.round(new_ratio, 3))
        results_dict["old ratio of rows"].append(np.round(old_ratio, 3))

# add wiki data in and save
df = pd.DataFrame.from_dict(results_dict)
quant_error_analysis = df.sort_values(
    by=["percentage change"], ascending=False
).reset_index(drop=True)

# Look up every relation's Wikidata title/description in one indexed pass instead
# of filtering wiki_data twice per row. "P264" -> 264. Duplicate IDs keep their
# first row (same result as taking the first match); an ID missing from
# wiki_data raises KeyError.
wiki_by_id = wiki_data.drop_duplicates(subset="ID").set_index("ID")
property_ids = [
    int(relation_id.split("P")[-1])
    for relation_id in quant_error_analysis["relation"]
]
titles = wiki_by_id.loc[property_ids, "Title"].tolist()
descriptions = wiki_by_id.loc[property_ids, "Description"].tolist()

quant_error_analysis["Title"] = titles
quant_error_analysis["Description"] = descriptions

quant_error_analysis.to_csv(
    "../../data/error_analysis/canonical_quant_error_analysis.csv"
)
quant_error_analysis

Found cached dataset parquet (/Users/ashbhat/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact-Completion-24a24a1e4bf6e4a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,language,relation,percentage change,new ratio of rows,old ratio of rows,Title,Description
0,German,P39,244.905,0.026,0.008,position held,subject currently or formerly holds the object...
1,English,P264,221.038,0.065,0.020,record label,brand and trademark associated with the market...
2,Danish,P264,201.396,0.069,0.023,record label,brand and trademark associated with the market...
3,Slovenian,P1303,179.534,0.027,0.010,instrument,instrument that a person plays
4,German,P264,172.157,0.065,0.024,record label,brand and trademark associated with the market...
...,...,...,...,...,...,...,...
612,Czech,P103,-86.926,0.003,0.022,native language,language that a person learned natively
613,Slovenian,P1412,-87.183,0.001,0.008,languages spoken,language(s) that a person speaks. Primarily us...
614,English,P17,-87.568,0.004,0.032,country,sovereign state of this item
615,English,P103,-90.718,0.003,0.035,native language,language that a person learned natively


In [17]:
# Average each relation's percentage change across all languages and rank the
# relations from most to least over-represented among the errors.
grouped_quant_error_analysis = (
    quant_error_analysis.groupby("relation")["percentage change"]
    .mean()
    .rename("mean percentage change across langs")
    .reset_index()  # turn the "relation" index back into a regular column
    .sort_values("mean percentage change across langs", ascending=False)
    .reset_index(drop=True)
)

# Look up every relation's Wikidata title/description in one indexed pass instead
# of filtering wiki_data twice per row. "P264" -> 264. Duplicate IDs keep their
# first row (same result as taking the first match); an ID missing from
# wiki_data raises KeyError.
wiki_by_id = wiki_data.drop_duplicates(subset="ID").set_index("ID")
property_ids = [
    int(relation_id.split("P")[-1])
    for relation_id in grouped_quant_error_analysis["relation"]
]
titles = wiki_by_id.loc[property_ids, "Title"].tolist()
descriptions = wiki_by_id.loc[property_ids, "Description"].tolist()

grouped_quant_error_analysis["Title"] = titles
grouped_quant_error_analysis["Description"] = descriptions

grouped_quant_error_analysis.to_csv(
    "../../data/error_analysis/aggregate_quant_error_analysis.csv"
)
grouped_quant_error_analysis

Unnamed: 0,relation,mean percentage change across langs,Title,Description
0,P264,113.51705,record label,brand and trademark associated with the market...
1,P101,76.44105,field of work,"fields of work related to this item (physics, ..."
2,P39,55.84905,position held,subject currently or formerly holds the object...
3,P449,49.0586,original network,network(s) the radio or television show was or...
4,P136,44.37375,genre,genre of a creative work or genre in which an ...
5,P1303,36.10595,instrument,instrument that a person plays
6,P106,30.5725,occupation,"occupation of a person; see also ""field of wor..."
7,P407,25.76575,language,language of a work (not necessarily original l...
8,P138,21.891312,named after,entity or event that inspired the subject's na...
9,P364,20.3578,original language,language in which a work was originally created


In [18]:
# Keep only the rows with a strictly positive percentage change, restricted to the
# three columns used by the per-language / per-relation summaries.
columns_to_keep = ['language', 'Title', 'percentage change']
has_positive_change = quant_error_analysis['percentage change'] > 0
filtered_df = quant_error_analysis.loc[has_positive_change, columns_to_keep]
# Optional wide view (left disabled): filtered_df.pivot('Title','language').fillna('0')

In [19]:
# Minimum percentage change a row must exceed to be listed
min_pct_change = 0

# Keep only the rows of filtered_df whose percentage change exceeds min_pct_change
filtered_df_lang = filtered_df[filtered_df['percentage change'] > min_pct_change]

# One entry per (Title, language) pair; the mean itself is redundant and is only
# there to give a Series that can be walked title by title.
grouped_df = filtered_df_lang.groupby(['Title', 'language'])['percentage change'].mean()

# For each relation title, print the languages whose percentage change passed the
# filter. Iterating the groups (rather than index.levels) only visits titles that
# are actually present.
for relation, per_language in grouped_df.groupby(level='Title'):
    languages = per_language.index.get_level_values('language').tolist()
    # The title is quoted on both sides (the opening quote used to be missing).
    print(f"'{relation}': {languages}")

continent': ['Bulgarian', 'Dutch', 'French', 'German', 'Portuguese', 'Romanian', 'Serbian', 'Swedish', 'Ukrainian']
country of origin': ['Catalan', 'Croatian', 'Czech', 'Danish', 'English', 'German', 'Italian', 'Romanian', 'Spanish', 'Ukrainian']
developer': ['Serbian']
employer': ['Bulgarian', 'Russian', 'Ukrainian']
field of work': ['Bulgarian', 'Catalan', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'French', 'German', 'Hungarian', 'Italian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Serbian', 'Slovenian', 'Spanish', 'Swedish', 'Ukrainian']
formation location': ['Czech', 'Romanian', 'Serbian', 'Spanish']
genre': ['Bulgarian', 'Catalan', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'French', 'German', 'Hungarian', 'Italian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Serbian', 'Spanish', 'Swedish', 'Ukrainian']
headquarters location': ['Croatian', 'Czech', 'Polish', 'Russian', 'Slovenian', 'Spanish', 'Ukrainian']
instrument': ['Bulgarian', 'Croatian', 'Czech', 

In [20]:
# Minimum percentage change a row must exceed to be kept
min_pct_change = 0

# Keep only the rows of filtered_df whose percentage change exceeds min_pct_change
filtered_df = filtered_df[filtered_df['percentage change'] > min_pct_change]

# One entry per (language, Title) pair; the mean itself is redundant and is only
# there to give a Series indexed by language and relation title.
grouped_df = filtered_df.groupby(['language', 'Title'])['percentage change'].mean()

# (language, relation title) pairs with a positive percentage change, in index
# order. NOTE(review): this list is not used again in this cell -- confirm whether
# a later cell needs it.
results = [
    {'language': language, 'relation': relation}
    for (language, relation), pct_change in grouped_df.items()
    if pct_change > 0
]

# Rank all pairs by percentage change, show one block per language, and save.
grouped_df = grouped_df.sort_values(ascending=False)
grouped_by_language = grouped_df.groupby('language')
for language, language_changes in grouped_by_language:
    # Name each displayed Series after its language, as GroupBy.apply did.
    display(language_changes.rename(language))
grouped_df.to_csv('../../data/error_analysis/grouped_by_language.csv')

language   Title                                      
Bulgarian  record label                                   74.292
           field of work                                  70.348
           occupation                                     47.758
           sport                                          40.051
           position played on team                        34.497
           genre                                          31.090
           instrument                                     25.649
           place of birth                                 23.340
           continent                                      22.471
           languages spoken                               22.156
           work location                                  18.947
           language                                       13.277
           located in                                     10.937
           employer                                       10.776
           place of death          

language  Title                  
Catalan   field of work              100.139
          record label                93.568
          languages spoken            83.089
          genre                       65.548
          sport                       56.389
          original network            47.536
          language                    40.124
          position held               36.156
          position played on team     28.197
          official language           26.563
          named after                 22.877
          occupation                  20.636
          original language           20.382
          country of origin           19.497
          manufacturer                 6.416
          located in                   2.333
Name: Catalan, dtype: float64

language  Title                  
Croatian  record label               110.362
          original network            97.410
          headquarters location       90.684
          instrument                  77.336
          position played on team     55.184
          field of work               45.242
          shares border with          43.013
          position held               41.859
          language                    41.769
          country of origin           17.010
          genre                       12.067
          original language            2.553
          occupation                   1.597
Name: Croatian, dtype: float64

language  Title                  
Czech     instrument                 129.374
          record label               109.582
          headquarters location      106.441
          position held               93.374
          field of work               55.517
          country of origin           38.890
          original network            31.831
          formation location          16.671
          named after                 16.543
          position played on team     12.137
          genre                        9.930
          occupation                   7.710
          shares border with           5.400
          sport                        0.021
          language                     0.007
Name: Czech, dtype: float64

language  Title            
Danish    record label         201.396
          field of work         93.569
          position held         83.891
          original language     83.040
          occupation            56.891
          genre                 50.102
          instrument            46.096
          original network      25.800
          language              20.283
          country of origin     18.378
          named after           15.925
          languages spoken       9.702
Name: Danish, dtype: float64

language  Title                  
Dutch     field of work              149.638
          record label               115.448
          original network            74.417
          position held               72.506
          continent                   60.721
          genre                       48.042
          instrument                  45.614
          occupation                  37.287
          language                    31.526
          manufacturer                18.272
          named after                 11.252
          position played on team      6.267
Name: Dutch, dtype: float64

language  Title            
English   record label         221.038
          field of work        136.864
          genre                 98.796
          occupation            81.299
          country of origin     77.730
          manufacturer          44.404
          original network      40.281
          position held         33.115
          named after           25.388
          place of birth         3.359
          place of death         3.332
Name: English, dtype: float64

language  Title            
French    record label         147.040
          continent            127.965
          field of work        117.976
          original language     78.293
          genre                 56.528
          original network      55.973
          position held         49.715
          named after           31.461
          manufacturer          25.287
          languages spoken      22.652
          language               9.936
          occupation             8.789
          owned by               4.597
Name: French, dtype: float64

language  Title                  
German    position held              244.905
          record label               172.157
          original network           123.804
          field of work               99.910
          instrument                  82.800
          genre                       74.295
          occupation                  51.843
          language                    27.941
          manufacturer                21.751
          continent                   17.644
          languages spoken            13.507
          country of origin           10.867
          position played on team      6.287
          owned by                     1.731
Name: German, dtype: float64

language   Title                  
Hungarian  position played on team    117.478
           instrument                  83.834
           original network            64.894
           native language             59.243
           sport                       54.588
           record label                51.516
           position held               42.696
           occupation                  26.430
           genre                       13.667
           language                    11.238
           original language            6.135
           field of work                1.266
Name: Hungarian, dtype: float64

language  Title            
Italian   record label         126.880
          field of work         91.210
          language              83.409
          languages spoken      67.113
          genre                 62.020
          original network      61.251
          country of origin     58.106
          position held         55.463
          manufacturer          42.673
          occupation            35.475
          named after           31.208
          original language     30.469
Name: Italian, dtype: float64

language  Title                  
Polish    sport                      151.886
          position held              108.532
          record label                84.567
          genre                       51.999
          field of work               38.261
          original language           37.018
          original network            33.579
          language                    29.530
          occupation                  23.356
          named after                 19.230
          headquarters location       18.249
          instrument                  13.949
          position played on team     12.470
          work location                3.868
Name: Polish, dtype: float64

language    Title            
Portuguese  record label         123.366
            position held         65.348
            field of work         58.378
            genre                 56.686
            original network      54.171
            language              49.560
            named after           49.419
            occupation            35.987
            original language     26.760
            continent             17.843
            manufacturer          17.536
            sport                  3.823
            located in             2.172
Name: Portuguese, dtype: float64

language  Title             
Romanian  original language     137.821
          language              125.495
          field of work          99.107
          record label           95.129
          named after            50.254
          genre                  47.791
          original network       46.324
          country of origin      45.779
          continent              36.520
          position held          22.549
          official language      17.370
          occupation              6.826
          formation location      4.152
          languages spoken        0.669
Name: Romanian, dtype: float64

language  Title                                      
Russian   record label                                   89.429
          position played on team                        77.466
          is in the administrative territorial entity    44.472
          field of work                                  38.843
          occupation                                     27.127
          original network                               26.879
          genre                                          26.599
          sport                                          13.513
          headquarters location                          10.074
          instrument                                     10.074
          official language                               8.393
          employer                                        3.535
Name: Russian, dtype: float64

language  Title                  
Serbian   formation location         66.697
          genre                      41.097
          continent                  36.389
          original network           35.852
          record label               33.003
          instrument                 25.023
          field of work              22.209
          language                   18.693
          occupation                 17.278
          position played on team    14.775
          owned by                   13.657
          sport                      12.090
          manufacturer                9.207
          developer                   8.772
          original language           7.163
Name: Serbian, dtype: float64

language   Title                  
Slovenian  instrument                 179.534
           record label               103.600
           position played on team     93.554
           headquarters location       84.702
           shares border with          69.130
           position held               65.556
           field of work               59.921
           named after                 28.173
           original network            17.319
           language                     9.621
           owned by                     9.099
           sport                        2.098
           occupation                   1.201
Name: Slovenian, dtype: float64

language  Title                
Spanish   record label             113.131
          field of work             73.020
          original language         72.562
          original network          54.527
          genre                     48.968
          native language           39.671
          official language         38.204
          located in                28.307
          occupation                26.155
          named after               25.376
          languages spoken          23.961
          manufacturer              23.078
          position held             18.308
          place of birth             7.281
          formation location         6.917
          headquarters location      2.581
          country of origin          2.311
          owned by                   1.784
Name: Spanish, dtype: float64

language  Title           
Swedish   record label        147.333
          field of work       130.640
          position held        72.376
          place of death       66.281
          original network     64.409
          genre                59.981
          occupation           55.172
          named after          37.566
          instrument           17.320
          continent            16.877
          language             10.375
Name: Swedish, dtype: float64

language   Title                                      
Ukrainian  position played on team                        122.208
           instrument                                     111.626
           record label                                    57.504
           sport                                           52.917
           field of work                                   46.763
           occupation                                      42.633
           genre                                           32.925
           language                                        20.057
           original network                                19.971
           is in the administrative territorial entity     17.198
           headquarters location                           13.371
           original language                               10.383
           position held                                    9.205
           employer                                         9.003
           continent 