In [25]:
import numpy as np
import json
import pandas as pd
from numpy.linalg import norm
from scipy import spatial
from zipfile import ZipFile as zf
import gram2vec.lang2vec.lang2vec as l2v
import pickle
import pandas as pd
import scipy.stats as ss
import sklearn.metrics as sm
from statistics import mean
import os
import pkg_resources
import pyconll
from iso639 import Lang
import os
import fnmatch

In [18]:
gb_language_file = "/Users/CitronVert/Desktop/grambank-grambank-analysed-fcf971a/grambank/cldf/languages.csv"
langs = pd.read_csv(gb_language_file)
lang_names = list(langs["Name"])
gram_codes = list(langs["ID"])


In [None]:
gram2iso = {}
for i in range(len(lang_names)):
    name = lang_names[i]
    id = gram_codes[i]
    try:
        code = Lang(name).pt3
        gram2iso[id] = code
    except:
        print(f"couldn't find {name}")
        


In [21]:
with open("gram2iso_dict.pkl", 'wb') as file:
    pickle.dump(gram2iso, file)

In [35]:
def find_folder_with_string(parent_directory, string):
    folders = []
    for root, dirs, _ in os.walk(parent_directory):
        for dir_name in dirs:
            if fnmatch.fnmatch(dir_name, f'*{string}*'):
                folders.append(os.path.join(root, dir_name))
    return folders
def find_files_with_string(parent_directory, string):
    files = []
    for root, _, filenames in os.walk(parent_directory):
        for filename in filenames:
            if fnmatch.fnmatch(filename, f'*{string}*'):
                files.append(os.path.join(root, filename))
    return files

In [43]:
PARENT_DIR = "/Users/CitronVert/Desktop/ud/ud-treebanks-v2.2"
#search through the local repo for UD-language name-something and select the first one (for now)
lang2path ={}
for lang in lang_names:
    lang_dirs = find_folder_with_string(PARENT_DIR, lang)
    if lang_dirs == []: 
        continue
    else:
        for dir in lang_dirs:
            train = find_files_with_string(dir, "train.conllu")
            if not train == []:
                dev = find_files_with_string(dir, "dev.conllu")
                test = find_files_with_string(dir, "test.conllu")
                break
    if not train == [] and not dev == [] and not test == []:
        lang2path[lang] = [train[0], dev[0], test[0]]

In [49]:
for language in lang2path.keys():
    train_path = lang2path[language][0]
    train = pyconll.load_from_file(train_path)
    train = train[0:500]
    path = f"./conllu/{Lang(language).pt3}_train.conllu"
    with open(path, 'w', encoding='utf-8') as f:
        train.write(f)

In [57]:
with open("/Users/CitronVert/Desktop/langrank/resources/pos_languages.txt", 'w') as file:
    isos = [Lang(language).pt3 for language in lang2path.keys()]
    file.writelines('\n'.join(str(i) for i in isos))

In [50]:
for language in lang2path.keys():
    test_path = lang2path[language][2]
    dev_path = lang2path[language][1]
    test = pyconll.load_from_file(test_path)
    dev = pyconll.load_from_file(dev_path)
    test_path = f"./conllu/{Lang(language).pt3}_test.conllu"
    dev_path = f"./conllu/{Lang(language).pt3}_dev.conllu"
    with open(test_path, 'w', encoding='utf-8') as f:
        test.write(f)
    with open(dev_path, 'w', encoding='utf-8') as f:
        dev.write(f)

In [2]:
'''
converts all lang codes to 3 letter iso codes
'''
def to_iso(lang):
    return Language.make(lang).to_alpha3()

''' 
processes gold ranking files to convert all 2 letter iso codes to 3 letters
expects "train/target" to be in (0,0)
'''
def convert_datafile_isos(datafile, savepath):

    df = pd.read_csv(datafile)
    new_columns = {}
    for column in df.columns:
        if len(column) == 2:  
            new_columns[column] = to_iso(column)
        
    df = df.rename(columns=new_columns)
    for index, row in df.iterrows():
        if len(row['train/target']) == 2:
            df.at[index, 'train/target'] = to_iso(row['train/target'])


    df.to_csv(savepath, index=False)

In [143]:
convert_datafile_isos("./golds/xpos_scores.csv", "./golds/xpos_scores_cleaned.csv")

In [20]:
'''
takes a list of 3 letter iso codes and removes all 3 letter isocodes from ranking data that are not present in the list
'''
def match_langs(match, data, column_name = "train/target"):
    filtered_data = data[data[column_name].isin(match)] #removes train language data not present in list
    filtered_data = filtered_data[[column_name] + [col for col in filtered_data.columns[1:] if col in match]] #removes target language data not present in list


def align_and_remove_unique_columns(df1, df2):
    # Find columns unique to each dataframe
    unique_cols_df1 = set(df1.columns) - set(df2.columns)
    unique_cols_df2 = set(df2.columns) - set(df1.columns)
    
    # Drop unique columns from each dataframe
    df1 = df1.drop(columns=unique_cols_df1)
    df2 = df2.drop(columns=unique_cols_df2)

    # Get the column order of df1
    column_order = df1.columns.tolist()
    
    # Reorder columns of df2 to match df1
    df2 = df2[column_order]

    return df1, df2

def align_and_remove_unique_rows(df1, df2, column_name="train/target"):
    # Find values unique to each dataframe in the specified column
    unique_values_df1 = set(df1[column_name]) - set(df2[column_name])
    unique_values_df2 = set(df2[column_name]) - set(df1[column_name])
    
    # Drop rows with unique values in the specified column from each dataframe
    df1 = df1[~df1[column_name].isin(unique_values_df1)]
    df2 = df2[~df2[column_name].isin(unique_values_df2)]
     # Use the values in the first column as the index
    df1_indexed = df1.set_index('train/target')
    df2_indexed = df2.set_index('train/target')
    
    # Reindex df2 to match the order of df1's index
    df2 = df2_indexed.reindex(df1_indexed.index)
    
    return df1, df2.reset_index()

'''
Given two organized datasets (all iso codes of 3 letters with "train/test" in (0,0)) 
reindex to match row and column order and save to file
'''
def match_mtt_xpos(xtt_path, xpos_path, data_dir):
    xtt_path = data_dir + xtt_path
    xpos_path = data_dir + xpos_path
    xtt = pd.read_csv(xtt_path)
    xpos = pd.read_csv(xpos_path)
    xtt, xpos = align_and_remove_unique_columns(xtt, xpos)
    xtt, xpos = align_and_remove_unique_rows(xtt, xpos)
    xtt.to_csv(data_dir + "aligned_xtt.csv", index = False)
    xpos.to_csv(data_dir + "aligned_xpos.csv", index = False)

'''
For whatever reason, langrank includes languages in the raw data that are not actually present in their POS datasets. 
This removes them so they don't cause issues during the creation of training data
'''
def remove_no_data_langs(data, task = "POS", column_name = "train/target"):
    target_langs =  list(data.columns)[1:]
    train_langs = list(data[column_name])
    datasets_dict = lr.map_task_to_data(task)
    for dt in datasets_dict:
        fn = pkg_resources.resource_filename(__name__, os.path.join('indexed', task, datasets_dict[dt]))
        features = np.load(fn, encoding='latin1', allow_pickle=True).item()
    missing = set()
    languages = set(target_langs + train_langs)
    for lang in languages:
        code = (lr.PREFIXES[task] + lang)
        if not code in features:
            code = lr.PREFIXES[task] + Lang(lang).pt1
        if task == "POS":
            code = [a for a in list(features.keys()) if a.startswith(lr.PREFIXES["POS"] + Lang(lang).pt1)]
            if len(code) > 0:
                code = code[0]
        if (code == []) or (not code in features): 
          missing.add(lang)
    data = data[~data[column_name].isin(missing)]
    missing_cols = missing.intersection(set(target_langs))
    data = data.drop(columns=missing_cols)
    return data


'''
Given a language and an organized dataset, returns 
ranked: a list of integers where the value at index i is the ranking of the corresponding language (based on order in the dataset) at index i 
'''
def numerical_rank(language, data):
    values = list(data[language])
    values = [-float(a) for a in values]
    ranked =  ss.rankdata(values, method = "ordinal") 
    ranked = [i - 1 for i in ranked]
    return ranked

'''
Given an organized dataset, returns 
langs_ranked: a dictionary where 
keys = language and 
values = a tuple of indices, ranked where the ranking at index i of ranked corresponds to the language at index i of indices
'''
def make_golds(data, col_name = "train/target"):
    test = list(data.columns)[1:]
    langs_ranked = {}
    for language in test: 
        df = data.copy()
        df = df.drop(data.loc[data[col_name].isin([language])].index) #for fairness, remove target language from potential training lanugages
        indices = list(df[col_name])
        ranked = numerical_rank(language, df)
        langs_ranked[language] = (indices, ranked)
    return langs_ranked

'''
Given an organized dataset and a language, returns 
a tuple of (indices, rankings)
indices: a list of ranked languages (full set of test languages - leave one out language)
rankings: a dictionary (as in make_golds); full set of test languages ranked with current lang removed
'''
def make_ranked(data, target_lang, col_name = "train/target"):
    remove = [target_lang]
    # removes current language from ranking (for leave one out)
    data = data.drop(remove,axis = 1) 
    data = data.drop(data.loc[data[col_name].isin(remove)].index) 
    test = list(data.columns)[1:]
    indices = list(data[col_name])
    rankings = {}
    for test_language in test:
        ranked = numerical_rank(test_language, data)
        rankings[test_language] = ranked
    return (indices, rankings)

'''
Given a language, task and an organized dataset, saves gold rankings and leave-one-out ranked training data to pickle 
'''
def make_data_pickles(data_path, data_dir, task, training_dir = "./training-data/"):
    data = pd.read_csv(data_dir + data_path)
    data = remove_no_data_langs(data)
    golds = make_golds(data)
    languages = list(data.columns)[1:]
    ranks = {language: make_ranked(data.copy(), language) for language in languages}
    save_path = f"{training_dir}{task}_POS_"
    with open(save_path + "golds.pkl", 'wb') as f:
        pickle.dump(golds, f)
    with open(save_path + "ranked.pkl", 'wb') as f:
        pickle.dump(ranks, f)

'''
Prepares a reference list for looking up rank scores; 
returns a list of the same length as ranking but ranks gives a score instead
if gamma max were 3 then [0, 5 , 3, 4, 2, 1] would become [3, 0, 0, 0, 1, 2]
'''
def scores_ranking(ranking, gamma_max = 10):
    scores_by_index = [0] * len(ranking)
    for i in range(len(ranking)): 
        if ranking[i] <= gamma_max:
            scores_by_index[i] = gamma_max - (ranking[i])
    return scores_by_index


# def compute_ndcg(lang, ranked_langs, predicted, gamma_max= 9, k=3):
#     ranking_langs = ranked_langs[lang][0] # list of languages for looking up index in ranking vector
#     # gives position in ranking based on index (if ranking[0] = 4 then the 0th language [ranking_langs[0]] is the 5th best)
#     ranking = ranked_langs[lang][1] 
#     print(ranking)
#     # creates vector to look up the relevance score of a given language by index
#     scores_by_index = [0] * len(ranking)
#     for i in range(len(ranking)): 
#         if ranking[i] <= gamma_max:
#             scores_by_index[i] = gamma_max - (ranking[i] -1)
#     ideal_score = [i for i in reversed(range(1, gamma_max + 1))] + [0] * (len(ranking) - gamma_max)
#     print(ideal_score)
#     predicted_score = [0] * len(ranking)
#     for j in range(len(predicted)): #for each language in ranking
#         code = predicted[j]
#         index = ranking_langs.index(code)
#         score = scores_by_index[index] #finds the true relevance of each language
#         predicted_score[j] = score
#     print(predicted_score)
#     return sm.ndcg_score(np.asarray([ideal_score]), np.asarray([predicted_score]),k=k)


# >>> # we have groud-truth relevance of some answers to a query:
# >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
# >>> # we predict some scores (relevance) for the answers
# >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
# >>> ndcg_score(true_relevance, scores)
# 0.69...
'''
Computes ncdg for a single pair of rankings. ranking and compare_ranking MUST be indexed the same
'''
def compute_ncdg(indices, ranking, compare_ranking, gamma_max = 10 , k = 3):
    # gives position in ranking based on index (if ranking[0] = 4 then the 0th language [ranking_langs[0]] is the 5th best)
    ranking = ranking 
    # creates vector to look up the relevance score of a given language by index
    golds_scores = scores_ranking(ranking, gamma_max)
    compare_scores = scores_ranking(compare_ranking, gamma_max)
    # ideal_score = [i for i in reversed(range(1, gamma_max + 1))] #creates an ideal score list up to gamma-max 
    # scores =  []
    # for i in range(gamma_max, 0, -1): 
    #     a = compare_scores.index(i) #gets the index of the predicted top n languages
    #     scores.append(golds_scores[a]) #appends the actual relevance score of each predicted language (based on gold rankings)
    return sm.ndcg_score(np.asarray([golds_scores]), np.asarray([compare_scores]),k=k)


'''
Given two gold data pickles; compute the average ncdg@3 (for comparing xpos and mtt)
'''
def compute_ncdg_from_golds(data_path, compare_path, training_dir= './training-data/'):
    golds = pickle.load(open(f'{training_dir}{data_path}', 'rb'))
    compare = pickle.load(open(f'{training_dir}{compare_path}', 'rb'))
    assert list(golds.keys()) == list(compare.keys())
    scores = []
    for lang in golds.keys():
        gold_indices = golds[lang][0]
        compare_indices = compare[lang][0]
        assert gold_indices == compare_indices
        gold_ranking = golds[lang][1]
        compare_ranking = compare[lang][1]
        assert len(compare_ranking) == len(gold_indices)
        scores.append(compute_ncdg(gold_indices,gold_ranking, compare_ranking))
    return str(mean(scores))
    
'''
Given a pandas dataframe of sorted data, returns a dictionary of the top 3 languages 
'''
def get_topk_training_languages(data , k=3):
    top_train_languages = {}
    df = data.copy()
    # Iterate over each target language column
    for target_language in df.columns[1:]:
        # Sort the dataframe by the values of the current target language column
        sorted_df = df.sort_values(by=target_language, ascending=False)

        # Get the top 3 train languages for the current target language
        top_train = sorted_df['train/target'].head(k).tolist()

        # Store the top 3 train languages in the dictionary
        top_train_languages[target_language] = top_train

    return top_train_languages

def ud_gram_overlap():
    

In [21]:
match_mtt_xpos("xtt_pos_scores_clean.csv", "xpos_scores_cleaned.csv", "/Users/CitronVert/Desktop/langrank/golds/")

In [3]:
make_data_pickles("aligned_xtt.csv", "./golds/", "mtt")
make_data_pickles("aligned_xpos.csv", "./golds/", "xpos")

In [7]:
train = pickle.load(open("/projects/enri8153/langrank/training-data/mtt_POS_ranked.pkl", 'rb'))
gold =  pickle.load(open("/projects/enri8153/langrank/training-data/mtt_POS_golds.pkl", 'rb'))
print(len(train["pcm"][0]))
print(len(gold["pcm"][0]))
# b = pickle.load(open("/projects/enri8153/langrank/training-data/xpos_POS_ranked.pkl", 'rb'))

53
53


In [3]:
truth = [len(a[lang][0]) == len(b[lang][0]) for lang in a.keys()]
assert False not in truth

In [79]:
compute_ncdg_from_golds("xtt_golds.pkl", "xpos_golds.pkl")


'0.41797699874321753'

In [41]:
xpos = pickle.load(open('./training-data/xpos_golds.pkl', 'rb'))
xtt =  pickle.load(open('./training-data/xtt_golds.pkl', 'rb'))
assert(list(xpos.keys()) == list(xtt.keys()))
assert xtt["pcm"][1].index(0) == xtt["pcm"][0].index("eng")

In [26]:
xpos_csv = pd.read_csv(open('./golds/aligned_xpos.csv', 'rb'))
xtt_csv =  pd.read_csv(open('./golds/aligned_xtt.csv', 'rb'))
top_xpos = get_topk_training_languages(xpos_csv, k=10 )
top_xtt = get_topk_training_languages(xtt_csv, k=10)
print(top_xpos["afr"])
print(top_xtt["afr"])

FileNotFoundError: [Errno 2] No such file or directory: './training-data/xpos_golds.pkl'

legacy below this point

In [None]:
def make_gold_xpos(data):
    indices = data.columns 
    indices = list(indices[1:]) # creates final list of languages
    langs_ranked_ties = {}
    # creates a ranking for each language (with the language itself removed from consideration)
    for language in indices: 
        values = list(data[language])
        values = [-float(a) for a in values]
        ranked =  ss.rankdata(values, method = "ordinal") #ranks remaining languages (ties are assigned the minimum ranking that would be assigned if no ties were presen)
        langs_ranked_ties[language] = (indices, ranked)
    return langs_ranked_ties

def make_xpos_rank(data, lang):
    remove = [lang]
    data = data.drop(remove,axis = 1) #columns that match undesirable language codes
    data = data.drop(data.loc[data["lang_train"].isin(remove)].index) # removes rows that match undesirable language codes
    indices = data.columns 
    indices = list(indices[1:]) # creates final list of languages
    rankings = {}
    for lang in indices:
        values = list(data[lang])
        values = [-float(a) for a in values]
        ranked =  ss.rankdata(values, method = "ordinal") #ranks remaining languages (ties are assigned the minimum ranking that would be assigned if no ties were presen)
        ranked = [i - 1 for i in ranked]
        rankings[lang] = ranked
    return (indices, rankings)

In [17]:
gram_golds = make_gold_xpos(filtered_data)
with open("./training-data/POS_XLMR_gram_golds_no_ties.pkl", 'wb') as f:
    pickle.dump(gram_golds, f)

training = {}
for language in filtered_data.columns[1:]:
    ranking = make_xpos_rank(filtered_data, language)
    training[language] = ranking
with open("./training-data/POS_XLMR_gram_ranked_train_no_ties.pkl", 'wb') as f:
    pickle.dump(training, f)


NameError: name 'make_gold_xpos' is not defined

In [11]:
xtt = pickle.load(open("/Users/CitronVert/Desktop/langrank/training-data/POS_gram_golds_no_ties.pkl", 'rb'))
xlmr = pickle.load(open("/Users/CitronVert/Desktop/langrank/training-data/POS_XLMR_gram_golds_no_ties.pkl", 'rb'))

In [13]:
xlmr = {lang: [xlmr[lang][0][i-1] for i in xlmr[lang][1]] for lang in list(xlmr.keys())}

In [21]:
import csv

# Function to parse the CSV file
def parse_csv(filename):
    data = {}
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        header = next(reader)  # Get header
        for row in reader:
            test_lang = row[0]
            train_langs = header[1:]
            accuracies = [float(accuracy) for accuracy in row[1:]]
            data[test_lang] = dict(zip(train_langs, accuracies))
    return data

# Function to get top-3 training languages for each testing language
def get_top3_training_languages(data):
    top3_training_languages = {}
    for test_lang, train_data in data.items():
        top_train_langs = sorted(train_data, key=train_data.get, reverse=True)[:3]
        top3_training_languages[test_lang] = top_train_langs
    return top3_training_languages

# Function to write results to a new CSV file
def write_to_csv(data, filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Testing Language', '1st Best Training Language', '2nd Best Training Language', '3rd Best Training Language'])
        for test_lang, top_langs in data.items():
            writer.writerow([test_lang] + top_langs)


input_filename = '/Users/CitronVert/Desktop/langrank/golds/xpos_scores.csv'
output_filename = 'best_xlmr_train.csv'

# Parse CSV file
data = parse_csv(input_filename)

# Get top-3 training languages for each testing language
top3_training_languages = get_top3_training_languages(data)

# Write results to a new CSV file
write_to_csv(top3_training_languages, output_filename)




In [14]:
xlmr

{'bel': ['bel', 'hye', 'lit', 'gle', 'hun', 'tam', 'mar'],
 'gle': ['lit', 'bel', 'hye', 'gle', 'hun', 'tam', 'mar'],
 'hun': ['lit', 'hun', 'bel', 'gle', 'hye', 'tam', 'mar'],
 'hye': ['gle', 'tam', 'hun', 'bel', 'hye', 'mar', 'lit'],
 'lit': ['gle', 'mar', 'hye', 'hun', 'bel', 'tam', 'lit'],
 'mar': ['hun', 'mar', 'gle', 'hye', 'lit', 'bel', 'tam'],
 'tam': ['hun', 'mar', 'hye', 'bel', 'lit', 'tam', 'gle']}

In [16]:
import sklearn.metrics as sm
from statistics import mean

def compute_ndcg(lang, ranked_langs, predicted, gamma_max= 9, k=3):
    ranking_langs = ranked_langs[lang][0] # list of languages for looking up index in ranking vector
    # gives position in ranking based on index (if ranking[0] = 4 then the 0th language [ranking_langs[0]] is the 5th best)
    ranking = ranked_langs[lang][1] 
    print(ranking)
    # creates vector to look up the relevance score of a given language by index
    scores_by_index = [0] * len(ranking)
    for i in range(len(ranking)): 
        if ranking[i] <= gamma_max:
            scores_by_index[i] = gamma_max - (ranking[i] -1)
    ideal_score = [i for i in reversed(range(1, gamma_max + 1))] + [0] * (len(ranking) - gamma_max)
    predicted_score = [0] * len(ranking)
    for j in range(len(predicted)): #for each language in ranking
        code = predicted[j]
        index = ranking_langs.index(code)
        score = scores_by_index[index] #finds the true relevance of each language
        predicted_score[j] = score
    return sm.ndcg_score(np.asarray([ideal_score]), np.asarray([predicted_score]),k=k)



predictions = xlmr
rankings = xtt
languages = list(predictions.keys())
ndcg = {lang: compute_ndcg(lang, rankings, predictions[lang]) for lang in languages}
score = str(mean(ndcg.values()))
print(score)


[2, 3, 5, 1, 4, 0, 8, 6, 7]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[9, 7, 5, 4, 3, 2, 6, 0, 0]
[6, 1, 4, 7, 5, 8, 2, 0, 3]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[6, 3, 9, 10, 7, 8, 5, 0, 0]
[1, 4, 8, 7, 3, 6, 2, 5, 0]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[2, 10, 3, 5, 6, 8, 7, 0, 0]
[0, 1, 2, 5, 7, 6, 4, 8, 3]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[2, 6, 7, 5, 9, 3, 8, 0, 0]
[0, 1, 4, 2, 7, 5, 6, 8, 3]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[2, 3, 9, 7, 8, 4, 6, 0, 0]
[5, 0, 4, 8, 2, 6, 7, 3, 1]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[9, 8, 7, 10, 6, 2, 3, 0, 0]
[0, 5, 7, 6, 2, 8, 1, 4, 3]
[9, 8, 7, 6, 5, 4, 3, 2, 1]
[7, 8, 5, 4, 3, 9, 6, 0, 0]
0.7551847984556894


In [68]:
numeric_df = data.drop(columns=['lang_train']).apply(pd.to_numeric)


In [87]:
df = data.copy()

for column in df.columns:
    # Set entries to null where the value in 'lang_train' matches the column name
    df.loc[df['lang_train'] == column, column] = np.nan

numeric_df = df.drop(columns=['lang_train']).apply(pd.to_numeric)


ranges = numeric_df.max(skipna=True) - numeric_df.min(skipna=True)# Find the column with the maximum range
column_with_max_span = ranges.idxmax()

print("Column with the greatest span of values:", column_with_max_span)
min_val = numeric_df[column_with_max_span].min()
max_val = numeric_df[column_with_max_span].max()
min_index = data.loc[data[column_with_max_span] == min_val, "lang_train"].iloc[0]
max_index = data.loc[data[column_with_max_span] == max_val, "lang_train"].iloc[0]
print(f"{column_with_max_span}: Min Value: {min_val} (lang: {min_index}), Max Value: {max_val} (lang: {max_index})")

Column with the greatest span of values: cat
cat: Min Value: 25.72064229057845 (Index: san), Max Value: 95.65196362932868 (Index: spa)


In [None]:
# data = data[:54] #cuts off the top 3 data from the bottom of the csv (MT)
# data = data[:31] # (DEP)
# data = data[:60] # (POS)
# data = data[:54] #(EL)


def generate_rankings(data, remove):
    remove = list(set(remove) & set(data.columns))
    data = data.drop(remove, axis = 1) # drop test languages (columns) that match undesirable language codes
    data = data.drop(data.loc[data['v extra v \ trg >>'].isin(remove)].index) # removes rows that match undesirable language codes
    indices = data.columns 
    indices = list(indices[1:]) # creates final list of languages
    langs_ranked_ties = {}
    # creates a ranking for each language (with the language itself removed from consideration)
    for language in indices: 
        values = list(data[language])
        values = [-float(a) for a in values]
        ranked =  ss.rankdata(values, method = "ordinal") #ranks remaining languages (ties are assigned the minimum ranking that would be assigned if no ties were presen)
        langs_ranked_ties[language] = (indices, ranked)
    return langs_ranked_ties


def generate_ranking(data, language, r):
    # remove.append(language)
    remove = r.copy()
    remove.append(language)
    remove = list(set(remove) & set(data.columns))
    data = data.drop(remove,axis = 1) #columns that match undesirable language codes
    data = data.drop(data.loc[data['v extra v \ trg >>'].isin(remove)].index) # removes rows that match undesirable language codes
    indices = data.columns 
    indices = list(indices[1:]) # creates final list of languages
    langs_ranked_ties = {}
    # creates a ranking for each language (with the language itself removed from consideration)
    rankings = {}
    for lang in indices:
        values = list(data[lang])
        values = [-float(a) for a in values]
        ranked =  ss.rankdata(values, method = "ordinal") #ranks remaining languages (ties are assigned the minimum ranking that would be assigned if no ties were presen)
        ranked = [i - 1 for i in ranked]
        rankings[lang] = ranked
    return (indices, rankings)

In [None]:
data = data.rename(columns={'v pivot v \ test >>':'v extra v \ trg >>'}) #stupid POS/EL inconsistency 
# data = data.rename(columns={'Unnamed: 0':'v extra v \ trg >>'}) #stupid POS/EL inconsistency 
    


In [None]:
# finds the intersection of the grambank languages and task languages
# finds the rankings for each language with non-intersection languages removed and saves to a pickle file

import pickle
import gram2vec.lang2vec.lang2vec as l2v

gram = l2v.available_gram()


col_langs = to_iso(list(data.columns)[1:])
print("there  were {l} test languages originally. Those languages were {col}.".format(l = len(col_langs), col = col_langs))

lang = list(data['v extra v \ trg >>'])
lang = to_iso(lang)
missing = [a for a in col_langs if not a in lang]
print("there were  {l} languages excluded for not being present in ranking data. Those languages were {missing}.".format(l = len(missing), missing = missing))


data['v extra v \ trg >>'] = lang #reassigns ranking langs to ISO-3 codes
data.columns = ['v extra v \ trg >>'] + col_langs #reassigns test languages to ISO-3 codes
data = data.drop(missing,axis = 1) #drop test languages (columns) that aren't present in ranking languages (for now)

col_langs = list(data.columns)[1:]  #reassigns column headers to remaining test languages
print("After dropping missing languages, there are {l} test languages remaining. Those languages were {col}.".format(l = len(col_langs), col = col_langs))


common_test = (set(gram) & set(col_langs)) # final set of test languages (in grambank AND in ranking languages)

total_langs = lang + col_langs

common= (set(gram) & set(total_langs))
remove = list(set(total_langs) ^ common) # languages that should be removed (not in grambank)
print("there were {l} total languages excluded for not being present in grambank. Those languages were {missing}.".format(l = len(remove), missing = remove))

training = {}
gram_golds = generate_rankings(data, remove)
with open("./training-data/MT_gram_golds_no_ties.pkl", 'wb') as f:
    pickle.dump(gram_golds, f)

for language in common_test:
    ranking = generate_ranking(data, language, remove)
    training[language] = ranking
with open("./training-data/MT_ranked_train_no_ties.pkl", 'wb') as f:
    pickle.dump(training, f)



In [2]:
#Loads in list of languages that have precomputed distances in lang2vec
with open("./gram2vec/lang2vec/data/distances/distances_languages.txt") as f:
    dist_langs = f.read()
dist_langs =  dist_langs.split(",")

In [3]:
# Extract ISO6393 codes from... somewhere
with open('/Users/CitronVert/Downloads/resourcemap.json') as f:
  
# returns JSON object as 
# a dictionary
    data = json.load(f)

def get_iso6393(identifiers):
    for identifier in identifiers:
        if identifier.get("type") == "iso639-3":
            return identifier.get("identifier")
    return None

# Use list comprehension to extract the required fields
glotto2iso = {
        resource.get("id"): get_iso6393(resource.get("identifiers", []))
        for resource in data.get("resources", [])
        if get_iso6393(resource.get("identifiers", [])) is not None
    }

iso2glotto = {v: k for k, v in glotto2iso.items()}


In [4]:
# replace language codes in the vector data with ISO codes

from pycldf.dataset import Dataset
# metadata = "/Users/CitronVert/Desktop/grambank-grambank-analysed-fcf971a/grambank/cldf/StructureDataset-metadata.json"
imputed = "/Users/CitronVert/Desktop/NALA/grambank-grambank-analysed-fcf971a/R_grambank/output/GB_wide/GB_wide_imputed_binarized.tsv"
v = pd.read_csv(imputed, sep='\t')
for lang in v["Language_ID"]:
    if lang in glotto2iso.keys():
        v.loc[ v["Language_ID"] == lang, "Language_ID"] = glotto2iso[lang]
    else:
        v.drop(v[v['Language_ID'] == lang].index, inplace = True)


In [5]:
feats = list(v.columns[1:])


In [6]:
categories = "/Users/CitronVert/Downloads/feature_grouping_with_alternatives.csv"
gram_categories = pd.read_csv(categories)
cats = {}
groups = {}
for feat in feats:
    if not len(feat) == 5:
        feat = feat[:5]
    domain = gram_categories.loc[gram_categories["Feature_ID"] == feat, 'Main_domain'].item()
    fine = gram_categories.loc[gram_categories["Feature_ID"] == feat, 'Finer_grouping'].item()
    if fine in groups.keys():
        groups[fine].append(feat)
    else:
        groups[fine] = [feat]
    if domain in cats.keys():
        cats[domain].append(feat)
    else:
        cats[domain] = [feat]


In [7]:
#data: pandas array with grambank vector data (imputed and converted to iso codes)
#iso: string ISO code for a language
#reuturns the grambank feature vector
def get_vector(data, iso, sub_feats=None):
    uriel = np.array(l2v.get_features(iso, "syntax_knn", sub_feats=sub_feats)[iso])
    # gram = np.array(data[data['Language_ID'] == iso].values.tolist()[0][1:])
    # concat = np.concatenate((uriel, gram))
    return uriel

In [8]:
#finds the languages that are present in both lang2vec and grambank and reports the length
with open("/Users/CitronVert/Desktop/NALA/bankrank/gram2vec/lang2vec/data/distances/intersection_langs.txt", 'r') as file:
    intersection = file.read()
intersection = intersection.split(",")
d =len(intersection)

In [9]:
#read in csv
fine_cat ="/Users/CitronVert/Desktop/NALA/bankrank data/fine_categorization.csv"
cats = pd.read_csv(fine_cat)

In [10]:
np_order = cats["Uriel"][12].split(",")
clause = cats["Uriel"][11].split(",")


In [11]:
np_order=[a.strip() for a in np_order]
clause = [a.strip() for a in clause]

print(len(np_order))
print(len(clause))


16
41


In [12]:
feat_vecs = [get_vector(v, iso) for iso in intersection]
sub_np_feat_vecs = [get_vector(v, iso, np_order) for iso in intersection]
sub_clause_feat_vecs = [get_vector(v, iso, clause) for iso in intersection]

In [13]:
sub_np_feat_vecs[1]

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0.,
       0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
       0., 0.])

In [14]:
feat_vecs[1]

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0.,
       1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       0.])

In [15]:
feat_arr = np.stack(feat_vecs)
sub_np_feat_arr =  np.stack(sub_np_feat_vecs)
sub_clause_feat_arr =  np.stack(sub_clause_feat_vecs)

# print(sub_np_feat_arr.shape)
# print(sub_clause_feat_arr.shape)

In [18]:
import sklearn.metrics.pairwise as sk
import scipy.sparse as sparse

cosine_similarities = sk.cosine_similarity(feat_arr)
knn = sparse.csr_matrix(1-cosine_similarities)
sparse.save_npz("./URIEL_knn_distances.npz", knn)



In [3]:
import scipy.sparse as sparse

np = sparse.load_npz("/Users/CitronVert/Desktop/NALA/bankrank/gram2vec/lang2vec/data/distances/URIEL_knn_distances_np_order_ablation.npz")
clause = sparse.load_npz("/Users/CitronVert/Desktop/NALA/bankrank/gram2vec/lang2vec/data/distances/URIEL_knn_distances_clause_ablation.npz")
reg = sparse.load_npz("/Users/CitronVert/Desktop/NALA/bankrank/gram2vec/lang2vec/data/distances/URIEL_knn_distances.npz")


In [21]:
print(reg)


  (0, 0)	-2.220446049250313e-16
  (0, 1)	0.5890025317366068
  (0, 2)	0.05370837673721851
  (0, 3)	0.4856555001263603
  (0, 4)	0.3043344070000654
  (0, 5)	0.48550424457247354
  (0, 6)	0.3617152614957745
  (0, 7)	0.4645503299035949
  (0, 8)	0.6284197321912309
  (0, 9)	0.41666666666666663
  (0, 10)	0.5068030380839282
  (0, 11)	0.3036893761772086
  (0, 12)	0.06840573860297583
  (0, 13)	0.34927599213080796
  (0, 14)	0.5628071751074061
  (0, 15)	0.452003375648809
  (0, 16)	0.571253537143728
  (0, 17)	0.4246035444312495
  (0, 18)	0.6712020253892854
  (0, 19)	0.2602045571258924
  (0, 20)	0.04100590738541632
  (0, 21)	0.38888888888888873
  (0, 22)	0.6055946811266921
  (0, 23)	0.09850212828958171
  (0, 24)	0.5463035642189196
  :	:
  (1469, 1445)	0.4997835966139753
  (1469, 1446)	0.5454545454545454
  (1469, 1447)	0.4848725736710717
  (1469, 1448)	0.4276361929678575
  (1469, 1449)	0.4197411468143405
  (1469, 1450)	0.4997835966139753
  (1469, 1451)	0.5067799747921895
  (1469, 1452)	0.28454524120982

In [None]:
GRAMBANK_LANGUAGES = l2v.available_grambank()
GRAMBANK_DISTANCES =  "/Users/CitronVert/Desktop/langrank/gram2vec/lang2vec/data/learned.npy"
with open(GRAMBANK_DISTANCES, "rb") as f:
    feat_pred = np.load(GRAMBANK_DISTANCES, allow_pickle=True)

In [None]:
feat_pred

In [None]:
#for every pair of languages that is present in both grambank and lang2vec,
#find the cosine difference between their feature vectors
import scipy.sparse as sparse

res = np.zeros((d,d))
for x in range(d):
    l1 = intersection[x]
    v1 = get_vector(v, l1)
    for y in range(x,d):
        l2 = intersection[y]
        v2 = get_vector(v, l2)
        diff = spatial.distance.cosine(v1, v2)
        res[x][y] = diff


# #saves the distance measures as a sparse matrix (for compatability with lang2vec)
sA = sparse.csr_matrix(res)
sparse.save_npz("./concat_distances.npz", sA)
               

In [None]:
intersection

In [None]:
x = intersection.index("mij")
y = intersection.index("sed")

In [None]:
res[y][x]

In [None]:
#saves a list of the languages for which we have distance values
data_string = ",".join(str(item) for item in intersection)

    # Open the file in write mode
with open("./URIEL_distance_langs.txt", 'w') as file:
    # Write the data string to the file
    file.write(data_string)

In [None]:
# training code for reference
# def test_train():
#     langs = ["aze", "ben", "fin"]
#     datasets = [os.path.join(root, "sample-data", "ted-train.orig.{}".format(l)) for l in langs]
#     seg_datasets = [os.path.join(root, "sample-data", "ted-train.orig.spm8000.{}".format(l)) for l in langs]
#     rank = [[0, 1, 2], [1, 0, 2], [2, 1, 0]] # random
#     tmp_dir = "tmp"
#     prepare_train_file(datasets=datasets, segmented_datasets=seg_datasets,
#                        langs=langs, rank=rank, tmp_dir=tmp_dir, task="MT")
#     output_model = "{}/model.txt".format(tmp_dir)
#     train(tmp_dir=tmp_dir, output_model=output_model)
#     assert os.path.isfile(output_model)

In [None]:
TED_PATH = "/Users/CitronVert/Desktop/langrank/indexed/MT/ted.npy"
def create_MT_dataset(data_path, lang): 
    data = np.load(data_path, encoding='latin1', allow_pickle=True).item()
    filename = "ted-train.orig."+lang
    


def train(langs, datasets, seg_datasets, rank): 
    tmp_dir = "tmp"
    prepare_train_file(datasets=datasets, segmented_datasets=seg_datasets,
                       langs=langs, rank=rank, tmp_dir=tmp_dir, task="MT")
    output_model = "{}/model.txt".format(tmp_dir)
    train(tmp_dir=tmp_dir, output_model=output_model)
    assert os.path.isfile(output_model)

In [None]:
#function to compute the ndcg score
#lang: ISO code for task language
#ranked_langs: gold rankings for task
#predicted: predicted ranking for lang
#gamma_max: hyper param from lin et al.

import sklearn.metrics as sm
import numpy as np
def compute_ndcg(lang, ranked_langs, predicted, gamma_max= 10):
    ranking_langs = ranked_langs[lang][0] # list of languages for looking up index in ranking vector
    # gives position in ranking based on index (if ranking[0] = 4 then the 0th language [ranking_langs[0]] is the 4th best)
    ranking = ranked_langs[lang][1] 
    # creates vector to look up the relevance score of a given language by index
    scores_by_index = [0] * len(ranking)
    for i in range(len(ranking)): 
        if ranking[i] <= gamma_max:
            scores_by_index[i] = gamma_max - (ranking[i] - 1)
    print("scores by index")
    print(scores_by_index)
    ideal_score = [i for i in reversed(range(1, gamma_max + 1))] + [0] * (len(ranking) - gamma_max)
    print("IDEAL")
    print(ideal_score)
    predicted_score = [0] * len(ranking)
    for j in range(len(predicted)): #for each language in ranking
        code = predicted[j]
        print(code)
        index = ranking_langs.index(code)
        print(ranking[index])
        score = scores_by_index[index] #finds the true relevance of each language
        predicted_score[j] = score
    return sm.ndcg_score(np.asarray([ideal_score]), np.asarray([predicted_score]))

In [None]:
# a small ndcg test
# remember-- we don't exclude the test language
dummy = {
    "aze": (["ben", "fin", "deu", "eng"],[1, 2, 3, 4]),
    "ben": (["aze","fin", "deu", "eng"],[1, 3, 2, 4]),
    "fin": (["aze", "ben", "deu", "eng"],[3, 4, 2, 1]),
    "deu": (["aze", "ben", "fin", "eng"],[4, 3, 2, 1]),
    "eng": (["aze", "ben", "fin", "deu"],[4, 3, 2, 1])
}
lang = "eng"
predicted = ["fin","deu","aze","ben"]
gamma_max = 2 
print(compute_ndcg(lang, dummy, predicted, gamma_max))

In [None]:
# Modify the code to accept params instead of freakin data
# pull list of MT languages from ... somewhere
# run write a function to run langrank_predict with the all model for all langs in list and return ndcg
# average list
# is it same??

In [None]:
import langrank as lr
import pickle
with open("./MT_ranked_ties.pkl", 'rb') as f:
    rankings = pickle.load(f)
languages = list(rankings.keys())

predicted = {}
for lang in languages:
    prepared = lr.prepare_featureset(lang)
    predicted[lang] = lr.rank(prepared, task="MT", candidates="all", return_langs = True, uriel = True)




In [None]:
with open("./MT_full_ranked_no_ties.pkl", 'rb') as f:
    rankings = pickle.load(f)
languages = list(rankings.keys())
rank = rankings["aze"][1]
gold = [0]*len(languages)
for i in range(len(languages)):
    gold[rank[i]-1] = languages[i]
print(gold)

In [None]:
import pickle
with open("./uriel_predictions.pkl", 'rb') as f:
    predictions = pickle.load(f)
print(predictions["aze"])

In [None]:
with open("./MT_full_ranked_ties.pkl", 'rb') as f:
    rankings = pickle.load(f)

In [None]:
lang = "aze"
ranking_langs = rankings[lang][0] # list of languages for looking up index in ranking vector
ranking = rankings[lang][1] 


In [None]:
langs = [""] * len(ranking_langs)
for i in range(len(ranking_langs)):
    langs[ranking[i] - 1] = ranking_langs[i]

In [None]:
langs

In [None]:
import numpy as np

In [None]:
tasks = {"MT", "EL", "DEP", "POS"}
cases = {"uriel", "gram"}
path = "/Users/CitronVert/Desktop/langrank/results"
for task in tasks:
    for case in cases:
        p = "{base}/{task}/{task}_{case}_predictions.pkl".format(base = path, task = task, case = case)
        with open(p, 'rb') as f:
            predictions = pickle.load(f)
        n = "{base}/{task}/{task}_{case}_ndcg.pkl".format(base = path, task = task, case = case)
        with open(n, 'rb') as f:
            scores = pickle.load(f)
        vals = [i for i in list(scores.values())]
        # vals
        predict = pd.DataFrame.from_dict(predictions)
        scores_d = pd.DataFrame( columns = list(scores.keys()))
        scores_d.loc[len(scores_d)]= vals 
        predict.to_csv(r"{base}/{task}/{task}_{case}_predictions.csv".format(base = path, task = task, case = case))
        scores_d.to_csv(r"{base}/{task}/{task}_{case}_ndcg.csv".format(base = path, task = task, case = case))



In [None]:
scores_d.loc[len(scores_d)]= vals 

In [None]:
import math
def percent_missing(lang, path):
    v = pd.read_csv(path, sep='\t')
    glotto = iso2glotto[lang]
    vector = list(get_vector(v, glotto))
    a = sum(math.isnan(x) for x in vector)
    return a/len(vector)

    

In [None]:
def get_task_langs(task):
    with open("./training-data/{}_ranked_train_no_ties.pkl".format(task), 'rb') as f:
        rankings = pickle.load(f)
    languages = list(rankings.keys())
    return languages

In [None]:
PATH = "/Users/CitronVert/Desktop/grambank-grambank-analysed-fcf971a/R_grambank/output/GB_wide/GB_cropped_for_missing.tsv"
tasks = ["MT", "DEP", "POS", "EL"]
for task in tasks:
    d = {}
    langs = get_task_langs(task)
    for lang in langs: 
        missing = percent_missing(lang, PATH)
        d[lang] = missing
    with open(r"./{task}_percent_missing.txt".format(task = task), 'w') as fp:
        for item in d.keys():
            percent = d[item]
            # write each item on a new line
            fp.write("{lang}: {percent}\n".format(lang = item, percent = percent))
        print('Done')


In [None]:
with open(r'./uriel_features.txt', 'w') as fp:
    for item in features:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')