In [11]:
import os
import pandas as pd
import numpy as np
import seaborn as sb

In [12]:
# Column positions loaded from each algorithm's result files; the compilation
# loop passes these lists to `pd.read_csv(usecols=...)`.
# 'opf' and 'opf_snn' deliberately share the very same list object.
tmp = [2, 3, 4, *range(6, 19)]

dict_usecols = dict(
    kmeans=list(range(2, 16)),
    opf=tmp,
    opf_snn=tmp,
    user_knn=list(range(2, 15)),
    dbscan=list(range(2, 18)),
)

# Pieces of the '../<output_dir>/<dataset>/<algorithm>' directories scanned
# by the compilation loop.
output_dir = 'out'
algorithms = ['kmeans', 'opf', 'opf_snn', 'user_knn', 'dbscan']
datasets = ['amzmagazinesubs', 'ml100k', 'mlsmall']

### Creates a CSV file that unifies, for each dataset, the results of all algorithms

In [13]:
# Per-dataset settings, read positionally by the compilation loop:
#   [0] value written to the 'n_neighbors' column (last kmeans row, dbscan row),
#   [1] value written to the last row of the 'distance' column (user_knn),
#   [2] prefix of the unified '<prefix>_results.csv' output file.
params_ds = {
    'amzmagazinesubs': [30, 'jaccard', 'amzms'],
    'ml100k': [50, 'cosine', 'ml100k'],
    'mlsmall': [60, 'squared_euclidean', 'mlls'],
}

In [109]:
# For every dataset, read each algorithm's result files from
# '../<output_dir>/<dataset>/<algorithm>/', bring them to a common column
# layout, and write one unified '../<output_dir>/<prefix>_results.csv'.
print("Compiling results for:", algorithms)

for ds in datasets:

    print("Compiling results of '", ds, "'...", end="")

    # Positional settings for this dataset:
    # [0] n_neighbors value, [1] distance name, [2] output-file prefix.
    ds_params = params_ds[ds]

    # One normalised frame per result file; concatenated once after the loops
    # instead of growing a DataFrame with repeated pd.concat calls.
    frames = []

    for algo in algorithms:

        # The context manager releases the directory handle. Sorting by name
        # makes the row order of the compiled file reproducible, because
        # os.scandir yields entries in arbitrary order. Non-file entries
        # (e.g. sub-directories) are skipped since read_csv cannot load them.
        with os.scandir('/'.join(['..', output_dir, ds, algo])) as entries:
            result_files = sorted(
                (entry for entry in entries if entry.is_file()),
                key=lambda entry: entry.name,
            )

        for f in result_files:

            out_file = pd.read_csv(f.path, usecols=dict_usecols.get(algo), engine='c')
            n_rows = out_file.shape[0]
            out_file.insert(0, "algorithm", [algo] * n_rows)

            if algo == 'kmeans':
                # 10, 20, ..., 60 followed by the dataset's own setting; insert()
                # raises ValueError unless the file holds exactly len(x) rows.
                x = list(range(10, 70, 10)) + [ds_params[0]]

                out_file.insert(1, "n_neighbors", x)
                out_file.insert(2, "n_neighbors_best", [0] * len(x))

            elif algo in ('opf', 'opf_snn'):
                # Align the OPF column names with the other algorithms.
                out_file = out_file.rename(columns={'kmax': 'n_neighbors',
                                                    'kbest': 'n_neighbors_best',
                                                    'nclusters': 'n_clusters'})

            elif algo == 'user_knn':
                # NOTE(review): no 'n_neighbors' column is inserted here, so
                # column 1 of the file presumably already holds it - confirm.
                out_file.insert(2, "n_neighbors_best", [0] * n_rows)
                out_file.insert(3, "n_clusters", [0] * n_rows)

                # 0 on every row except the last, which carries the dataset's
                # distance name.
                out_file.insert(4, "distance", [0] * (n_rows - 1) + [ds_params[1]])

            elif algo == 'dbscan':
                out_file = out_file.drop(columns=['eps', 'min_samples'])
                # Length-1 lists: insert() raises ValueError unless the file
                # holds exactly one row.
                out_file.insert(1, "n_neighbors", [ds_params[0]])
                out_file.insert(2, "n_neighbors_best", [0])

            frames.append(out_file)

    # pd.concat rejects an empty list, so fall back to an empty frame when no
    # result file was found. Each file's own row index is kept and written.
    df = pd.concat(frames) if frames else pd.DataFrame()

    f_name = '_'.join([ds_params[2], 'results.csv'])
    df.to_csv("/".join(['..', output_dir, f_name]))

    print("... done.")

Compiling results for: ['kmeans', 'opf', 'opf_snn', 'user_knn', 'dbscan']
Compiling results of ' amzmagazinesubs '...... done.
Compiling results of ' ml100k '...... done.
Compiling results of ' mlsmall '...... done.
