In [4]:
import pandas as pd
from pandas import DataFrame, Series, to_datetime, to_numeric
import os

def get_variable_types(df: DataFrame) -> dict[str, list]:
    """Group the columns of ``df`` into numeric, binary, date and symbolic.

    A column with exactly two distinct non-missing values is binary, whatever
    its dtype. Any other column is numeric when ``to_numeric`` accepts every
    value, and symbolic otherwise. The "date" list is part of the result but
    nothing in this function ever fills it.

    Args:
        df: Frame whose columns are classified. It is not modified.

    Returns:
        Dict with the keys "numeric", "binary", "date" and "symbolic", each
        mapping to a list of column labels in the frame's column order.
    """
    variable_types: dict = {"numeric": [], "binary": [], "date": [], "symbolic": []}

    # Distinct non-missing values per column (NaN is not counted).
    nr_values: Series = df.nunique(axis=0, dropna=True)
    for c in df.columns:
        if nr_values[c] == 2:
            variable_types["binary"].append(c)
            continue
        try:
            # Only used as a probe: the converted values are not kept.
            to_numeric(df[c], errors="raise")
        except (ValueError, TypeError):
            # ValueError: unparsable strings. TypeError: objects to_numeric
            # cannot handle at all (e.g. tuples). Both mean "not numeric".
            variable_types["symbolic"].append(c)
        else:
            variable_types["numeric"].append(c)

    return variable_types

# Dataset name -> single-element list holding the column that the chart cells
# treat as the class variable (it is removed from the histogram variable lists).
# The keys double as the CSV base names looked up under `directory`.
_CLASS_COLUMN_BY_DATASET = (
    ('adult', 'income'),
    ('BankNoteAuthentication', 'class'),
    ('Breast_Cancer', 'diagnosis'),
    ('Churn_Modelling', 'Exited'),
    ('diabetes', 'Outcome'),
    ('heart', 'target'),
    ('Iris', 'Species'),
    ('Titanic', 'Survived'),
    ('Wine', 'Class'),
    ('WineQT', 'quality'),
    ('vehicle', 'target'),
    ('apple_quality', 'Quality'),
    ('loan_data', 'Loan_Status'),
    ('credit_customers', 'class'),
    ('smoking_drinking', 'DRK_YN'),
    ('sky_survey', 'class'),
    ('weatherAUS', 'RainTomorrow'),
    ('Dry_Bean_Dataset', 'Class'),
    ('abalone', 'Sex'),
    ('car_insurance', 'is_claim'),
    ('Covid_Data', 'CLASSIFICATION'),
    ('customer_segmentation', 'Segmentation'),
    ('detect_dataset', 'Output'),
    ('e-commerce', 'ReachedOnTime'),
    ('Employee', 'LeaveOrNot'),
    ('Hotel_Reservations', 'booking_status'),
    ('Liver_Patient', 'Selector'),
    ('maintenance', 'Machine_failure'),
    ('ObesityDataSet', 'NObeyesdad'),
    ('phone', 'price_range'),
    ('Placement', 'status'),
    ('StressLevelDataset', 'stress_level'),
    ('urinalysis_tests', 'Diagnosis'),
    ('water_potability', 'Potability'),
)

# Every value is its own fresh list, exactly as in a hand-written literal.
classes = {name: [column] for name, column in _CLASS_COLUMN_BY_DATASET}

# Dataset name -> [first split condition, second split condition].
# The decision-tree chart cell uses element 0 as the key of the root node and
# element 1 as the key of the node(s) one level below it.
conditions = {
    'adult': ['hours-per-week <= 41.5', 'capital-loss <= 1820.5'],
    'BankNoteAuthentication': ['skewness <= 5.16', 'curtosis <= 0.19'],
    'Breast_Cancer': ['perimeter_mean <= 90.47', 'texture_worst <= 27.89'],
    'Churn_Modelling': ['Age <= 42.5', 'NumOfProducts <= 2.5'],
    'diabetes': ['BMI <= 29.85', 'Age <= 27.5'],
    'heart': ['slope <= 1.5', 'restecg <= 0.5'],
    'Titanic': ['Pclass <= 2.5', 'Parch <= 0.5'],
    'vehicle': ['MAJORSKEWNESS <= 74.5', 'CIRCULARITY <= 49.5'],
    'Wine': ['Total phenols <= 2.36', 'Proanthocyanins <= 1.58'],
    'WineQT': ['density <= 1.0', 'chlorides <= 0.08'],
    'apple_quality': ['Juiciness <= -0.3', 'Crunchiness <= 2.25'],
    'loan_data': ['Loan_Amount_Term <= 420.0', 'ApplicantIncome <= 1519.0'],
    'credit_customers': ['existing_credits <= 1.5', 'residence_since <= 3.5'],
    'smoking_drinking': ['SMK_stat_type_cd <= 1.5', 'gamma_GTP <= 35.5'],
    'sky_survey': ['dec <= 22.21', 'mjd <= 55090.5'],
    'weatherAUS': ['Rainfall <= 0.1', 'Pressure3pm <= 1009.65'],
    'Dry_Bean_Dataset': ['Area <= 39172.5', 'AspectRation <= 1.86'],
    'abalone': ['Height <= 0.13', 'Diameter <= 0.45'],
    'car_insurance': ['displacement <= 1196.5', 'height <= 1519.0'],
    'Covid_Data': ['CARDIOVASCULAR <= 50.0', 'ASHTMA <= 1.5'],
    'customer_segmentation': ['Family_Size <= 2.5', 'Work_Experience <= 9.5'],
    'detect_dataset': ['Ic <= 71.01', 'Vb <= -0.37'],
    'e-commerce': ['Prior_purchases <= 3.5', 'Customer_care_calls <= 4.5'],
    'Employee': ['JoiningYear <= 2017.5', 'ExperienceInCurrentDomain <= 3.5'],
    'Hotel_Reservations': ['lead_time <= 151.5', 'no_of_special_requests <= 2.5'],
    'Liver_Patient': ['Alkphos <= 211.5', 'Sgot <= 26.5'],
    'maintenance': ['Rotational speed [rpm] <= 1381.5', 'Torque [Nm] <= 65.05'],
    'ObesityDataSet': ['FAF <= 2.0', 'Height <= 1.72'],
    'phone': ['int_memory <= 30.5', 'mobile_wt <= 91.5'],
    'Placement': ['ssc_p <= 60.09', 'hsc_p <= 70.24'],
    'StressLevelDataset': ['basic_needs <= 3.5', 'bullying <= 1.5'],
    'urinalysis_tests': ['Age <= 0.1', 'pH <= 5.5'],
    'water_potability': ['Hardness <= 278.29', 'Chloramines <= 6.7'],
    'Iris': ['PetalWidthCm <= 0.7', 'PetalWidthCm <= 1.75'],
}

# Dataset name -> leaf sample counts, stored as strings. The decision-tree
# chart cell converts them with int() and uses them, in list order, as the
# 'samples' of the tree leaves (four leaves, or three for 'Iris'); inner-node
# 'samples' are sums of these.
#
# NOTE(review): this literal used to list 'Wine' twice. Python keeps the last
# value for a repeated key, so ['305','109','53','125'] was silently replaced
# by ['49','12','2','60']. Only the effective entry is kept below, at the
# position the key originally occupied, so the resulting dict is unchanged.
# Confirm which dataset the dropped counts ['305','109','53','125'] were meant
# for; note that 'vehicle' still holds what looks like a placeholder.
neighbors = {
    'abalone': ['932', '117', '683', '1191'],
    'adult': ['21974', '541', '9274', '434'],
    'apple_quality': ['148', '784', '1625', '243'],
    'BankNoteAuthentication': ['214', '436', '179', '131'],
    'Breast_Cancer': ['184', '50', '20', '144'],
    'car_insurance': ['2141', '686', '774', '3813'],
    'Churn_Modelling': ['4831', '114', '1931', '124'],
    'Covid_Data': ['173', '7971', '16', '46'],
    'credit_customers': ['264', '183', '146', '107'],
    'customer_segmentation': ['249', '524', '723', '11'],
    'detect_dataset': ['797', '6394', '3', '1206'],
    'diabetes': ['111', '98', '167', '161'],
    'Dry_Bean_Dataset': ['760', '2501', '4982', '1284'],
    'e-commerce': ['3657', '906', '1540', '1596'],
    'Employee': ['1781', '1215', '44', '217'],
    'heart': ['202', '181', '137', '197'],
    'Hotel_Reservations': ['10612', '9756', '4955', '69'],
    'Iris': ['35', '38', '32'],
    'Liver_Patient': ['77', '125', '109', '94'],
    'loan_data': ['3', '204', '2', '6'],
    'maintenance': ['943', '46', '21', '5990'],
    'ObesityDataSet': ['840', '370', '107', '160'],
    'phone': ['469', '209', '86', '636'],
    'Placement': ['16', '20', '68', '46'],
    'sky_survey': ['208', '11119', '945', '1728'],
    'smoking_drinking': ['7218', '1796', '3135', '2793'],
    'StressLevelDataset': ['271', '240', '223', '36'],
    'Titanic': ['181', '72', '188', '57'],
    'urinalysis_tests': ['3', '23', '215', '763'],
    'vehicle': ['1', '2', '3', '4'],
    'Wine': ['49', '12', '2', '60'],
    'water_potability': ['8', '1388', '5', '6'],
    'weatherAUS': ['1154', '1686', '251', '608'],
    'WineQT': ['154', '27', '172', '447'],
}

# Folder that holds "<dataset name>.csv" for the dataset names used as keys of
# `classes`; later cells build paths as `directory + name + ".csv"`.
# The location can be overridden with the CHART_DATASETS_DIR environment
# variable; without it the original absolute path is used. Joining with ''
# guarantees the trailing separator that the string concatenation relies on.
directory = os.path.join(
    os.environ.get("CHART_DATASETS_DIR", '/home/eduvedras/tese/templates/datasets/'),
    '',
)

# Not read by any code visible in this notebook; kept in case other cells use it.
variable_types = {}

# Chart table read with ';' as separator; the next cell reads its
# 'Id', 'Data' and 'Chart' columns row by row.
data = pd.read_csv("chartdata.csv",sep=";")

In [20]:
# Rebuild the 'Data' field of every chart row from the dataset the chart was
# drawn from. Rows whose chart type is not handled below are copied unchanged.


def _dataset_name_from_chart(chart_name):
    """Return the key of `classes` that prefixes `chart_name`.

    The chart name is split on "_" and progressively longer prefixes are
    tried, shortest first, so one- and two-token names resolve as before and
    longer names such as 'Dry_Bean_Dataset' resolve too.

    Raises:
        ValueError: no prefix of `chart_name` is a known dataset name.
    """
    tokens = chart_name.split("_")
    for n_tokens in range(1, len(tokens) + 1):
        candidate = "_".join(tokens[:n_tokens])
        if candidate in classes:
            return candidate
    raise ValueError(f"no known dataset name prefixes chart {chart_name!r}")


# Dataset name -> (frame, variable types), so each CSV is read and classified
# once instead of once per chart row.
_loaded_datasets = {}


def _dataset_and_types(name):
    """Return `(frame, get_variable_types(frame))` for dataset `name`, cached."""
    if name not in _loaded_datasets:
        frame = pd.read_csv(directory + f"{name}.csv")
        _loaded_datasets[name] = (frame, get_variable_types(frame))
    return _loaded_datasets[name]


def _leaf(samples):
    """Return an unlabelled leaf node holding `samples`."""
    return {'samples': samples, 'value': [], 'class': ''}


def _split_node(condition, samples, true_branch, false_branch):
    """Return an unlabelled inner node keyed by its split `condition`."""
    return {condition: {'samples': samples, 'value': [], 'class': '',
                        'True': true_branch, 'False': false_branch}}


def _tree_template(split_conditions, leaf_counts):
    """Build the nested decision-tree dict for one dataset.

    Args:
        split_conditions: root condition followed by the second-level condition.
        leaf_counts: three or four leaf sample counts (strings or ints), in
            left-to-right leaf order.

    Raises:
        ValueError: `leaf_counts` has neither three nor four entries.
    """
    counts = [int(count) for count in leaf_counts]
    root_condition, child_condition = split_conditions[0], split_conditions[1]
    if len(counts) == 4:
        true_branch = _split_node(child_condition, counts[0] + counts[1],
                                  _leaf(counts[0]), _leaf(counts[1]))
        false_branch = _split_node(child_condition, counts[2] + counts[3],
                                   _leaf(counts[2]), _leaf(counts[3]))
    elif len(counts) == 3:
        # The root's True side is already a leaf; only the False side splits again.
        true_branch = _leaf(counts[0])
        false_branch = _split_node(child_condition, counts[1] + counts[2],
                                   _leaf(counts[1]), _leaf(counts[2]))
    else:
        raise ValueError(f"expected 3 or 4 leaf counts, got {len(counts)}")
    return _split_node(root_condition, sum(counts), true_branch, false_branch)


# Collected as plain records and turned into a frame once at the end.
records = []
for index, row in data.iterrows():
    chart = row['Chart']
    if 'boxplots' in chart:
        filename = _dataset_name_from_chart(chart)
        dataset, types = _dataset_and_types(filename)
        # join() keeps the text well formed even when there are no variables.
        body = ",".join(f"{var}:{{Outliers:,Balanced:}}" for var in types["numeric"])
        aux = "[F,{" + body + "}]"
    elif 'histograms_numeric' in chart:
        filename = _dataset_name_from_chart(chart)
        dataset, types = _dataset_and_types(filename)
        target = classes[filename][0]
        # Filter into a new list: the cached `types` lists must stay intact.
        numeric = [var for var in types["numeric"] if var != target]
        parts = []
        for var in numeric:
            # Named lowest/highest so the builtins min/max are not shadowed.
            lowest = dataset[var].min()
            highest = dataset[var].max()
            parts.append(f"{var}:{{Outliers:,Balanced:,Range:[{lowest},{highest}]}}")
        aux = "{" + ",".join(parts) + "}"
    elif 'histograms_symbolic' in chart:
        filename = _dataset_name_from_chart(chart)
        dataset, types = _dataset_and_types(filename)
        target = classes[filename][0]
        symbolic = [var for var in types['symbolic'] + types['binary'] if var != target]
        parts = []
        for var in symbolic:
            # Distinct values in order of appearance, without missing values.
            values = [x for x in dataset[var].unique().tolist() if str(x) != 'nan']
            parts.append(f"{var}:{values}")
        aux = "{" + ",".join(parts) + "}"
    elif 'mv' in chart:
        filename = _dataset_name_from_chart(chart)
        dataset, types = _dataset_and_types(filename)
        # int() keeps the dict's text form as "{'col': 3}"; a numpy integer
        # would be rendered as "np.int64(3)" by NumPy >= 2.
        aux = {var: int(count) for var, count in dataset.isna().sum().items() if count > 0}
    elif 'overfitting' not in chart and 'decision_tree' in chart:
        filename = _dataset_name_from_chart(chart)
        aux = _tree_template(conditions[filename], neighbors[filename])
    else:
        aux = row['Data']
    records.append({'Id': row['Id'], 'Data': aux, 'Chart': chart})

new_dataset = pd.DataFrame(records, columns=['Id', 'Data', 'Chart'])

#new_dataset.to_csv("chartdatatest.csv",sep=";",index=False)

In [8]:
import pandas as pd
# Replace every "Range:[lo,hi]" field of the numeric-histogram rows with an
# "Ordinal:T" / "Ordinal:F" flag and write the result to example.csv.
# All other rows are copied unchanged.

# Columns read from every input row and written to the output.
CHART_COLUMNS = ['Id', 'Data', 'Chart']
# A variable with at most this many distinct non-missing values is flagged 'T'.
ORDINAL_MAX_DISTINCT = 10


def _chart_dataset_name(chart_name):
    """Return the key of `classes` that the chart name starts with.

    Tries the first "_"-separated token, then the 'Dry' -> 'Dry_Bean_Dataset'
    special case, then progressively longer token prefixes.

    Raises:
        ValueError: the chart name starts with no known dataset name.
    """
    tokens = chart_name.split("_")
    if tokens[0] in classes:
        return tokens[0]
    if tokens[0] == 'Dry':
        return 'Dry_Bean_Dataset'
    for n_tokens in range(2, len(tokens) + 1):
        candidate = "_".join(tokens[:n_tokens])
        if candidate in classes:
            return candidate
    raise ValueError(f"no known dataset name prefixes chart {chart_name!r}")


def _range_to_ordinal(data_text, flags):
    """Replace the Range fields of `data_text`, left to right, with `flags`.

    Each flag consumes the first remaining "Range:[...]" field and puts
    "Ordinal:<flag>" in its place.

    Raises:
        ValueError: `data_text` has fewer complete Range fields than flags.
    """
    for flag in flags:
        head, marker, tail = data_text.partition("Range:[")
        _, closing, rest = tail.partition("]")
        if not marker or not closing:
            raise ValueError("Data has fewer 'Range:[...]' fields than numeric variables")
        data_text = head + "Ordinal:" + flag + rest
    return data_text


# NOTE(review): the output saved with this cell is "KeyError: 'Id'". Presumably
# chartdatatest.csv had no 'Id' column when the cell last ran (the to_csv call
# that would write that file is commented out in the previous cell) -- confirm.
# The check below reports that situation with the columns actually found.
chart_rows = pd.read_csv("chartdatatest.csv", sep=";")
missing_columns = [col for col in CHART_COLUMNS if col not in chart_rows.columns]
if missing_columns:
    raise KeyError(
        f"chartdatatest.csv lacks column(s) {missing_columns}; "
        f"found {list(chart_rows.columns)}"
    )

# Dataset name -> (frame, variable types); each CSV is read once, and only for
# rows that actually need it.
_ordinal_datasets = {}

records = []
for index, row in chart_rows.iterrows():
    chart = row['Chart']
    aux = row['Data']
    if 'histograms_numeric' in chart:
        filename = _chart_dataset_name(chart)
        if filename not in _ordinal_datasets:
            frame = pd.read_csv(directory + f"{filename}.csv")
            _ordinal_datasets[filename] = (frame, get_variable_types(frame))
        dataset, types = _ordinal_datasets[filename]

        target = classes[filename][0]
        # Same variable list, in the same order, as the Range fields in Data.
        numeric = [var for var in types["numeric"] if var != target]
        flags = []
        for var in numeric:
            distinct = [x for x in dataset[var].unique().tolist() if str(x) != 'nan']
            flags.append('T' if len(distinct) <= ORDINAL_MAX_DISTINCT else 'F')
        aux = _range_to_ordinal(aux, flags)
    records.append({'Id': row['Id'], 'Data': aux, 'Chart': chart})

new_dataset = pd.DataFrame(records, columns=CHART_COLUMNS)

new_dataset.to_csv("example.csv",sep=";",index=False)

KeyError: 'Id'