In [297]:
import pandas as pd
import os

In [298]:
# Columns that are selected from every loaded frame and combined by the
# ensembling helpers below. Kept as a list because it is used as a `.loc`
# column selector.
TARGET_NAMES = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [299]:
def read_dir(dir_path):
    """Load every regular file in ``dir_path`` as a DataFrame.

    Entries are read with ``pd.read_csv`` in sorted filename order, so the
    position of each frame in the returned list is reproducible between runs
    (``os.listdir`` itself returns names in arbitrary order). Sub-directories,
    such as the ``.ipynb_checkpoints`` folder Jupyter creates, are skipped
    instead of being handed to ``pd.read_csv``.

    Parameters
    ----------
    dir_path : str
        Directory containing the CSV files to load.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, ordered by filename.
    """
    file_paths = [os.path.join(dir_path, file_name)
                  for file_name in sorted(os.listdir(dir_path))]

    # Only regular files can be parsed; directories would make read_csv raise.
    return [pd.read_csv(file_path) for file_path in file_paths
            if os.path.isfile(file_path)]
    
    

In [300]:
# Read the contents of the LDA ensemble directory into a list of DataFrames
# (read_dir parses each directory entry with pd.read_csv).
dfs = read_dir('../enesmbles/300_glove_6b_ensemble_lda/lda/')

In [301]:
def simple_ensemble(dfs):
    """Average the ``TARGET_NAMES`` columns of several frames.

    The frames are added element-wise (aligned on index and column labels),
    divided by the number of frames, and the result is clipped to ``[0, 1]``.
    The input list and its frames are left untouched, so ``dfs`` can still be
    used by later cells after this call.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each contain every column in ``TARGET_NAMES``.

    Returns
    -------
    pandas.DataFrame
        A frame holding only the ``TARGET_NAMES`` columns with the averaged,
        clipped values.

    Raises
    ------
    IndexError
        If ``dfs`` is empty.
    """
    # Seed the running sum with the last frame (read, not popped) and add the
    # others in list order; only target columns take part, so non-numeric
    # columns such as an id never reach the arithmetic.
    total = dfs[-1].loc[:, TARGET_NAMES]

    for df in dfs[:-1]:
        total = total.add(df.loc[:, TARGET_NAMES])

    ensemble = total / len(dfs)

    return ensemble.clip(lower=0, upper=1)

In [302]:
# Sanity check: show how many frames are currently in `dfs`.
print(len(dfs))

2


In [303]:
# Average the target columns across the loaded frames.
ensemble = simple_ensemble(dfs)

In [304]:
# Place the `id` column of the first frame in front of the ensemble columns.
# NOTE(review): assumes every frame lists its rows in the same order as
# dfs[0] — confirm against the input files.
ensemble_df = pd.concat([dfs[0].id.to_frame(), ensemble], axis=1)

In [305]:
# Write the id + ensemble columns to CSV, leaving out the DataFrame index.
ensemble_df.to_csv('300_glove_ensemble_lda.csv', index=False)

In [283]:
# Correlation of each target column between every pair of frames in `dfs`.
# rankings[target]['<high>_<low>'] holds the correlation of that target
# between dfs[high] and dfs[low]; the larger position always comes first in
# the key, so each unordered pair has exactly one entry.
rankings = {name: {} for name in TARGET_NAMES}

n_frames = len(dfs)

for low in range(n_frames):
    for high in range(low + 1, n_frames):
        # Visit each pair once, and restrict corrwith to the target columns
        # so non-numeric columns (e.g. an id) are never correlated.
        corr = dfs[high][TARGET_NAMES].corrwith(dfs[low][TARGET_NAMES])
        key_index = '{}_{}'.format(high, low)

        for name in TARGET_NAMES:
            rankings[name][key_index] = corr[name]
                    
    

In [284]:
import operator

def get_sorted_rankings(rankings_dict):
    """Sort each inner mapping of ``rankings_dict`` by value, ascending.

    Parameters
    ----------
    rankings_dict : dict
        Maps an outer key to a dict of ``{pair_key: value}``.

    Returns
    -------
    dict
        Same outer keys; each value is a list of ``(pair_key, value)`` tuples
        ordered from the smallest value to the largest. Entries with equal
        values keep their original relative order.
    """
    sorted_by_key = {}

    for outer_key, correlation_dict in rankings_dict.items():
        pairs = list(correlation_dict.items())
        # Position 1 of each (pair_key, value) tuple is the value to rank on.
        pairs.sort(key=operator.itemgetter(1))
        sorted_by_key[outer_key] = pairs

    return sorted_by_key

In [285]:
# For every target, order the frame pairs by correlation, lowest first.
sorted_rankings = get_sorted_rankings(rankings)

In [286]:
def get_top_n(sorted_rankings, dfs, n):
    """Average, per target, the frames named by the ``n`` top-ranked pairs.

    For each target in ``TARGET_NAMES`` the first ``n`` entries of
    ``sorted_rankings[target]`` are taken. Every entry's key has the form
    ``'<i>_<j>'`` and names two positions in ``dfs``. The target column of
    each distinct frame named this way is summed and divided by the number of
    frames summed, and the final frame is clipped to ``[0, 1]``.

    Parameters
    ----------
    sorted_rankings : dict
        Maps each target name to a list of ``(pair_key, value)`` tuples,
        already ordered by preference.
    dfs : list of pandas.DataFrame
        Frames addressed by the integer positions found in the pair keys.
    n : int
        Number of leading pairs to use per target; must be positive.

    Returns
    -------
    pandas.DataFrame
        One column per target holding the averaged, clipped values.

    Raises
    ------
    AssertionError
        If ``n`` is not greater than 0.
    ValueError
        If a target has no ranked pairs to select from.
    """
    assert n > 0, 'n must be greater than 0!'

    ensemble_df = {}

    for name in TARGET_NAMES:
        # Take exactly n pairs: slicing with n - 1 would drop one pair and
        # leave nothing to average for n == 1.
        df_ids = set()
        for key, _ in sorted_rankings[name][:n]:
            first, second = key.split('_')
            df_ids.add(int(first))
            df_ids.add(int(second))

        if not df_ids:
            raise ValueError(
                'no ranked pairs available for target {!r}'.format(name))

        # Sorted so the summation order does not depend on set iteration.
        df_ids = sorted(df_ids)

        total = dfs[df_ids[0]][name]
        for df_id in df_ids[1:]:
            total = total.add(dfs[df_id][name])

        # Divide by every frame that went into the sum, including the first.
        ensemble_df[name] = total.divide(len(df_ids))

    ensemble = pd.DataFrame(ensemble_df)

    return ensemble.clip(lower=0, upper=1)


In [287]:
# Build the correlation-based ensemble from the leading entries of each
# target's ranking (n=3).
ensemble = get_top_n(sorted_rankings, dfs, 3)

In [288]:
# Place the `id` column of the first frame in front of the ensemble columns.
# NOTE(review): `ensemble` is rebound to its own output, so running this cell
# a second time prepends another `id` column — re-run the previous cell first.
ensemble = pd.concat([dfs[0].id.to_frame(), ensemble], axis=1)

In [289]:
# Write the id + ensemble columns to CSV, leaving out the DataFrame index.
ensemble.to_csv('corr_ensemble_lda.csv', index=False)