In [1]:
import pandas as pd
import numpy as np
import json
import ast
import pickle
import glob

In [2]:
def _read_rank_list(filename):
    with open(filename, 'rb') as f:
        rank_list = pickle.load(f)
        
    rank_list = pd.DataFrame(rank_list)
    return rank_list


def _create_sparse_df(df):
    # # ***** create sparse one-hot encoded dataset *******
    X = pd.DataFrame(0, index=range(df.shape[0]), 
                     columns=['col_{}'.format(i) for i in range(70)])

    def set_values(row):
        A = row['A']
        B = row['B']
        C = row['C']
        X.loc[row.name, 'col_{}'.format(A)] = 1
        X.loc[row.name, 'col_{}'.format(B)] = 1
        X.loc[row.name, 'col_{}'.format(C)] = 1

    df.apply(set_values, axis=1)

    X = pd.get_dummies(df[['A', 'B', 'C']], columns=['A', 'B', 'C'], prefix='col')
    X = X.groupby(level=0, axis=1).max()

#     X['feasibility'] = df[2].copy()
    return X

## Read and concat all selected cuts

In [3]:
# Collect every pickled rank list in temp_files/ (files named rank_list_*.pickle).
# glob returns paths in an unspecified order, so sort them to keep the row order
# of the concatenated frame reproducible from run to run.
pickle_files = sorted(glob.glob("temp_files/rank_list_*.pickle"))
if not pickle_files:
    # Fail with a message that names the pattern instead of pd.concat's
    # generic "No objects to concatenate".
    raise FileNotFoundError("no files match temp_files/rank_list_*.pickle")

# One DataFrame per pickle file.
dfs = [_read_rank_list(f) for f in pickle_files]

# Stack the per-file frames; each keeps its own row labels at this point.
rank_list = pd.concat(dfs)
rank_list.shape

(2000, 6)

In [4]:
# Keep only columns 1 and 2, and renumber the rows: the frame was concatenated
# without ignore_index, so it can still carry repeated row labels.
rank_list = rank_list.drop(columns=[0, 3, 4, 5]).reset_index(drop=True)

# Expand the sequence stored in column 1 into the columns A, B, C with a single
# constructor call instead of building one pd.Series per row.
# (assumes every entry of column 1 holds exactly three items - TODO confirm)
rank_list[['A', 'B', 'C']] = pd.DataFrame(rank_list[1].tolist(), index=rank_list.index)

# Store column 1 as text from here on.
rank_list[1] = rank_list[1].astype(str)
rank_list.head(2)

Unnamed: 0,1,2,A,B,C
0,"[9, 18, 41]",0.5,9,18,41
1,"[22, 52, 53]",0.724745,22,52,53


In [5]:
# Relabel the integer column 2 as the string '2' so it can serve as a merge key
# alongside a frame loaded from CSV, whose column labels are strings.
rank_list = rank_list.rename({2: '2'}, axis='columns')

In [6]:
# Column 1 holds text at this point, so the ordering is lexicographic
# (for example "[10, ..." sorts before "[9, ...").
rank_list.sort_values(by=[1])

Unnamed: 0,1,2,A,B,C
1089,"[0, 10, 20]",0.018487,0,10,20
1985,"[0, 10, 37]",0.236417,0,10,37
1212,"[0, 10, 44]",0.015237,0,10,44
593,"[0, 10, 44]",0.094791,0,10,44
1662,"[0, 10, 49]",0.236417,0,10,49
...,...,...,...,...,...
91,"[9, 55, 62]",0.572551,9,55,62
659,"[9, 56, 62]",0.054476,9,56,62
710,"[9, 62, 63]",0.023185,9,62,63
79,"[9, 62, 63]",0.630901,9,62,63


## Read the generated DataFrame and match it against the selected cuts

In [7]:
# Load the generated table from CSV and discard its column '5'.
df = pd.read_csv('temp_files/full_df_20.csv').drop(columns='5')
df.head()

Unnamed: 0,1,2,A,B,C
0,"[0, 2, 17]",0.000921,0,2,17
1,"[0, 2, 39]",0.014385,0,2,39
2,"[0, 8, 54]",0.002592,0,8,54
3,"[0, 9, 12]",6.6e-05,0,9,12
4,"[0, 10, 20]",0.002446,0,10,20


In [8]:
# (rows, columns) of the loaded table, before any de-duplication.
df.shape

(66799, 5)

In [9]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

12450

In [10]:
# Keep the first occurrence of each fully identical row, then show the new size.
df = df.drop_duplicates(keep='first')
df.shape

(54349, 5)

In [11]:
# Inner join: keep the rows of df whose A, B, C and '2' all match a row of rank_list.
# NOTE(review): '2' looks like a float column; joining on exact float equality
# can miss rows if a value changed slightly on its way through the CSV - confirm.
merged = pd.merge(df, rank_list, how='inner', on=['A', 'B', 'C', '2'])

In [12]:
# Display the joined rows (df's text column '1' next to rank_list's column 1).
merged

Unnamed: 0,1,2,A,B,C,1.1
0,"[1, 25, 59]",0.017215,1,25,59,"[1, 25, 59]"
1,"[11, 35, 44]",0.047974,11,35,44,"[11, 35, 44]"
2,"[22, 44, 51]",0.022566,22,44,51,"[22, 44, 51]"
3,"[38, 46, 47]",0.027219,38,46,47,"[38, 46, 47]"
4,"[45, 49, 56]",0.026894,45,49,56,"[45, 49, 56]"
...,...,...,...,...,...,...
704,"[40, 51, 52]",0.630901,40,51,52,"[40, 51, 52]"
705,"[40, 52, 63]",0.630901,40,52,63,"[40, 52, 63]"
706,"[41, 52, 63]",0.630901,41,52,63,"[41, 52, 63]"
707,"[51, 52, 59]",0.630901,51,52,59,"[51, 52, 59]"


In [13]:
# NOTE(review): this always raises ZeroDivisionError; it appears to be a
# deliberate stop so that "Run All" does not continue into the cells below - confirm.
1/0

ZeroDivisionError: division by zero

In [None]:
# Rows of df whose text column '1' equals the entry in column 1 of rank_list's row 0.
first_cut = rank_list.loc[0, 1]
df.loc[df['1'] == first_cut]

In [None]:
# Make sure column 1 is text (same conversion as in the cell that creates A/B/C).
rank_list = rank_list.astype({1: str})

In [None]:
# Rows of df whose text column '1' also occurs in column 1 of rank_list.
in_rank_list = df['1'].isin(rank_list[1])
df[in_rank_list]

In [None]:
# Indicator table built from the A/B/C columns of rank_list.
X = rank_list.pipe(_create_sparse_df)
X.head(2)

In [None]:
# calculate the sum of every column
col_sum = X.sum()
pd.DataFrame(col_sum).sort_values(by=0, ascending=False)

In [None]:
# NOTE(review): `zzz` is not defined anywhere in the visible notebook, so this
# cell raises NameError; it looks like a leftover scratch marker - consider deleting it.
zzz