### KE-TALENT Dataset Preprocessing

- This notebook preprocesses the following 11 datasets from the TALENT benchmark:
    - Abalone_reg
    - Diamonds
    - Parkinsons_Telemonitoring
    - archive_r56_Portuguese
    - communities_and_crime
    - Bank_Customer_Churn_Dataset
    - statlog
    - taiwanese_bankruptcy_prediction
    - ASP-POTASSCO-classification
    - internet_usage
    - predict_students_dropout_and_academic_success
- This notebook:
    1. Converts columns that are incorrectly labeled as numerical or categorical into the appropriate type.
    1. If a dataset has categorical columns, one-hot encodes them and names each new one-hot column after its original column and category (e.g. `Sex is M`).
       The output will be saved in the directory (`talent/{dataset_name}/onehot`).
- To add more datasets, define the column conversion information (`num_col_conversion` and `cat_col_conversion`), then run the remaining cells.

In [1]:
# Run this cell exactly once per kernel session: it moves the working directory
# one level up, to the project root, so that the relative paths used below
# (e.g. `data/talent`) resolve. Running it again would move up one more level.
%cd ..

/usr2/juyongk/graph-concept-prior


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import collections
import json
import math
import os
import shutil
from copy import deepcopy
from pathlib import Path

import numpy as np
import pandas as pd

- Run the next three cells to back up the original datasets and to remove any previously generated working copies

In [3]:
# Root directory of the TALENT data (relative to the project root) and the
# names of the 11 datasets handled by this notebook.
talent_root = 'data/talent'
talent_dataset_names = [
    'Abalone_reg',
    'Diamonds',
    'Parkinsons_Telemonitoring',
    'archive_r56_Portuguese',
    'communities_and_crime',
    'Bank_Customer_Churn_Dataset',
    'statlog',
    'taiwanese_bankruptcy_prediction',
    'ASP-POTASSCO-classification',
    'internet_usage',
    'predict_students_dropout_and_academic_success',
]

In [4]:
# Initial backup: move every original dataset directory to `<name>_backup`.
# Datasets that already have a backup are left untouched, so re-running this
# cell is safe.
for dataset_name in talent_dataset_names:
    dataset_dir = os.path.join(talent_root, dataset_name)
    new_dataset_dir = os.path.join(talent_root, dataset_name+'_backup')
    if os.path.exists(new_dataset_dir):
        print(dataset_name, 'already backed up')
        continue

    # shutil.move raises on a missing source, which would abort the loop and
    # leave the remaining datasets without a backup; report it and keep going.
    if not os.path.exists(dataset_dir):
        print(dataset_dir, 'not found. nothing to back up')
        continue

    print('move', dataset_dir, '->', new_dataset_dir)
    shutil.move(dataset_dir, new_dataset_dir)

move data/talent/Abalone_reg -> data/talent/Abalone_reg_backup
move data/talent/Diamonds -> data/talent/Diamonds_backup
move data/talent/Parkinsons_Telemonitoring -> data/talent/Parkinsons_Telemonitoring_backup
move data/talent/archive_r56_Portuguese -> data/talent/archive_r56_Portuguese_backup
move data/talent/communities_and_crime -> data/talent/communities_and_crime_backup
move data/talent/Bank_Customer_Churn_Dataset -> data/talent/Bank_Customer_Churn_Dataset_backup
move data/talent/statlog -> data/talent/statlog_backup
move data/talent/taiwanese_bankruptcy_prediction -> data/talent/taiwanese_bankruptcy_prediction_backup
move data/talent/ASP-POTASSCO-classification -> data/talent/ASP-POTASSCO-classification_backup
move data/talent/internet_usage -> data/talent/internet_usage_backup
move data/talent/predict_students_dropout_and_academic_success -> data/talent/predict_students_dropout_and_academic_success_backup


In [5]:
# Remove existing working copies of the datasets. Only `dataset_dir` is ever
# deleted; the `_backup` directories are not touched by this cell.
for dataset_name in talent_dataset_names:
    dataset_dir = os.path.join(talent_root, dataset_name)
    new_dataset_dir = os.path.join(talent_root, dataset_name+'_backup')
    if not os.path.exists(new_dataset_dir):
        print(new_dataset_dir, 'does not exists. backup first.')
        # Without a backup the working directory may be the only copy of the
        # data, so it must not be deleted.
        continue

    if os.path.exists(dataset_dir):
        print('remove', dataset_dir)
        shutil.rmtree(dataset_dir)
    else:
        print(dataset_dir, 'does not exists. skip')

data/talent/Abalone_reg does not exists. skip
data/talent/Diamonds does not exists. skip
data/talent/Parkinsons_Telemonitoring does not exists. skip
data/talent/archive_r56_Portuguese does not exists. skip
data/talent/communities_and_crime does not exists. skip
data/talent/Bank_Customer_Churn_Dataset does not exists. skip
data/talent/statlog does not exists. skip
data/talent/taiwanese_bankruptcy_prediction does not exists. skip
data/talent/ASP-POTASSCO-classification does not exists. skip
data/talent/internet_usage does not exists. skip
data/talent/predict_students_dropout_and_academic_success does not exists. skip


In [6]:
# Helper functions
def load_json(path):
    """Read the file at `path` and return its parsed JSON content.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's locale encoding.
    """
    return json.loads(Path(path).read_text(encoding='utf-8'))

# Pre-processing functions
# def split_camel_case_numbers(name):
#     # Step 1: Insert space between lowercase->uppercase and uppercase->lowercase transitions
#     name = re.sub(r'([a-z])([A-Z])|([A-Z])([A-Z][a-z])', r'\1\3 \2\4', name)
#     # Step 2: Insert space before and after numbers
#     name = re.sub(r'(\d+)', r' \1 ', name)  # Adds spaces around numbers
#     return ' '.join(name.split())  # Removes any extra spaces

# def format_name(name):
#     name = name.replace('_', ' ')
#     name = split_camel_case_numbers(name)
#     name = name.lower()
#     return name

def get_full_desc_sentences(col_name_descs):
    """Turn (name, description) pairs into one text per column.

    For each pair, in order:
      * the name alone, when the description is empty or equal to the name;
      * the description alone, when the name is empty or contains
        'attribute' (case-insensitive);
      * otherwise "<name> : <description>".

    Raises AssertionError when both the name and the description are empty.
    """
    def to_sentence(name, desc):
        assert name or desc
        if not desc or name == desc:
            # return format_name(name)
            return name
        if not name or 'attribute' in name.lower():
            return desc
        return name + " : " + desc

    return [to_sentence(name, desc) for name, desc in col_name_descs]

# Placeholder category that stands in for missing values.
new_category = '~new~'
# Integer sentinel just below the int64 maximum.
# NOTE(review): not referenced in the visible part of the notebook — confirm its use downstream.
unknown_value = np.iinfo('int64').max - 3


def _normalize_category(value, missing_strs):
    """Map one raw cell value to its category.

    NaN/None and any value listed in `missing_strs` become `new_category`;
    strings are stripped of surrounding whitespace (after the `missing_strs`
    check, as before); every other value is returned unchanged.
    """
    if not isinstance(value, str) and pd.isna(value):
        return new_category
    if value in missing_strs:
        return new_category
    if isinstance(value, str):
        # return format_name(value)
        return value.strip()
    return value


def get_categories(col_train, missing_strs):
    """Collect the categories of a training column and its most frequent one.

    Args:
        col_train: column object exposing `.tolist()`.
        missing_strs: values to be treated as missing.

    Returns:
        (categories, mode_val): the sorted distinct normalized values and the
        most common normalized value.

    Raises:
        IndexError: if the column is empty.
    """
    col_norm = [_normalize_category(c, missing_strs) for c in col_train.tolist()]
    col_counter = collections.Counter(col_norm)
    # Sort non-strings before strings so that numeric categories mixed with the
    # '~new~' placeholder do not raise TypeError; for homogeneous columns the
    # order is the same as a plain sort.
    categories = sorted(col_counter.keys(), key=lambda v: (isinstance(v, str), v))
    mode_val = col_counter.most_common(1)[0][0]
    return categories, mode_val


def index_cat_ord(col, categories, mode_val, missing_strs):
    """Encode a column as indices into `categories`.

    Values are normalized like in `get_categories`. A value that is not in
    `categories` is reported on stdout and encoded as the index of `mode_val`.

    Raises:
        ValueError: if `mode_val` is not in `categories`.
    """
    ret = []
    mode_idx = categories.index(mode_val)
    for c in col.tolist():
        v = _normalize_category(c, missing_strs)
        try:
            idx = categories.index(v)
        except ValueError:
            print(v, "not found ->", mode_val)
            idx = mode_idx
        ret.append(idx)
    return ret

def expand_col_name(col_name, categories, cat_conv_map=None):
    """Build one "<col_name> is <label>" name per category.

    Args:
        col_name: name of the source column.
        categories: iterable of category values.
        cat_conv_map: optional dict mapping a category to its label.

    Label lookup, when `cat_conv_map` is a dict: the category itself, then its
    string form, then (for integral floats such as 1.0) the string of its
    integer value. A category missing from the map no longer raises KeyError;
    it gets the default name instead.

    Default names: `new_category` -> "unknown", 0 / 0.0 / '0' -> "no",
    1 / 1.0 / '1' -> "yes", anything else -> the category itself.
    """
    def default_name(category):
        if category == new_category:
            return f"{col_name} is unknown"
        if category in [0, 0.0, '0']:
            return f"{col_name} is no"
        if category in [1, 1.0, '1']:
            return f"{col_name} is yes"
        return f"{col_name} is {category}"

    use_map = isinstance(cat_conv_map, dict)
    ret = []
    # col_name = format_name(col_name)
    for category in categories:
        if use_map:
            # Maps are often keyed by strings even when the column holds numeric codes.
            candidate_keys = [category, str(category)]
            if isinstance(category, float) and category.is_integer():
                candidate_keys.append(str(int(category)))
            for key in candidate_keys:
                if key in cat_conv_map:
                    ret.append(f"{col_name} is {cat_conv_map[key]}")
                    break
            else:
                ret.append(default_name(category))
            continue

        ret.append(default_name(category))
    return ret

- Run one of the cells below to select a dataset

In [7]:
# Column-conversion settings for the Abalone_reg dataset.
dataset_name = 'Abalone_reg'
# No extra missing-value markers for this dataset.
# NOTE(review): presumably handed to get_categories / index_cat_ord as `missing_strs` — confirm in the processing cell.
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
num_col_conversion = [
    ('Length', 'num', 'Longest shell measurement'),
    ('Diameter', 'num', 'perpendicular to length'),
    ('Height', 'num', 'with meat in shell'),
    ('Whole_weight', 'num', 'whole abalone'),
    ('Shucked_weight', 'num', 'weight of meat'),
    ('Viscera_weight', 'num', 'gut weight (after bleeding)'),
    ('Shell_weight', 'num', 'after being dried'),
]

# One tuple per categorical column: (column name, type tag, description, category-label map or None).
# NOTE(review): None presumably selects the default naming of expand_col_name — confirm at the call site.
cat_col_conversion = [
    ('Sex', 'cat', '', None),
]

In [12]:
# Column-conversion settings for the Diamonds dataset.
dataset_name = 'Diamonds'
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
# `depth` and `table` are percentages, not sizes in millimeters; only x, y and
# z are physical dimensions, and each one gets its own description.
num_col_conversion = [
    ('carat', 'num', 'A measure of diamond weight. One carat is equivalent to 0.2 grams.'),
    ('depth', 'num', 'Total depth percentage of the diamond, computed as z / mean(x, y).'),
    ('table', 'num', 'Width of the top of the diamond relative to its widest point, as a percentage.'),
    ('x', 'num', 'Length of the diamond, in millimeters.'),
    ('y', 'num', 'Width of the diamond, in millimeters.'),
    ('z', 'num', 'Depth of the diamond, in millimeters.'),
]

# One tuple per categorical column: (column name, type tag, description, category-label map or None).
cat_col_conversion = [
    ('Cut', 'cat', 'How a rough diamond is shaped into a finished diamond. Better cuts create more symmetrical and luminous diamonds.', None),
    ('Color', 'cat', 'The color of the diamond represented by different letters. Colorless diamonds are considered better than diamonds with a yellow tint.', None),
    ('Clarity', 'cat', 'How clear a diamond is in 8 ordered levels. The fewer and less noticeable a diamond’s imperfections, the better its clarity.', None),
]

In [17]:
# Column-conversion settings for the Parkinsons_Telemonitoring dataset.
dataset_name = 'Parkinsons_Telemonitoring'
# No extra missing-value markers for this dataset.
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
# Columns of the same measurement family share one description.
num_col_conversion = [
    ('age', 'num', 'Subject age'),
    ('test_time', 'num', 'Time since recruitment into the trial. The integer part is the number of days since recruitment. '),
    ('Jitter(%)', 'num', 'Several measures of variation in fundamental frequency'),
    ('Jitter(Abs)', 'num', 'Several measures of variation in fundamental frequency'),
    ('Jitter:RAP', 'num', 'Several measures of variation in fundamental frequency'),
    ('Jitter:PPQ5', 'num', 'Several measures of variation in fundamental frequency'),
    ('Jitter:DDP', 'num', 'Several measures of variation in fundamental frequency'),
    ('Shimmer', 'num', 'Several measures of variation in amplitude'),
    ('Shimmer(dB)', 'num', 'Several measures of variation in amplitude'),
    ('Shimmer:APQ3', 'num', 'Several measures of variation in amplitude'),
    ('Shimmer:APQ5', 'num', 'Several measures of variation in amplitude'),
    ('Shimmer:APQ11', 'num', 'Several measures of variation in amplitude'),
    ('Shimmer:DDA', 'num', 'Several measures of variation in amplitude'),
    ('NHR', 'num', 'Two measures of ratio of noise to tonal components in the voice'),
    ('HNR', 'num', 'Two measures of ratio of noise to tonal components in the voice'),
    ('RPDE', 'num', 'A nonlinear dynamical complexity measure'),
    ('DFA', 'num', 'Signal fractal scaling exponent'),
    ('PPE', 'num', 'A nonlinear measure of fundamental frequency variation '),
]

# One tuple per categorical column: (column name, type tag, description, category-label map or None).
cat_col_conversion = [
    ('sex', 'cat', 'Subject sex', None),
]

In [22]:
# Column-conversion settings for the archive_r56_Portuguese dataset.
dataset_name = 'archive_r56_Portuguese'
# No extra missing-value markers for this dataset.
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
num_col_conversion = [
    ("student's age", 'num', ""),
]

# One tuple per column: (column name, type tag, description, category-label map or None).
# A dict maps each raw category code to a readable label.
# NOTE(review): the entries tagged 'num' presumably re-label columns stored as
# categorical into numerical ones (see the notebook introduction) — confirm in
# the processing cell.
cat_col_conversion = [
    ("student's school", 'cat', "", {
        'GP': 'Gabriel Pereira',
        'MS': 'Mousinho da Silveira',
    }),
    ("student's sex", 'cat', "", {
        'F': 'female',
        'M': 'male',
    }),
    ("student's home address type", 'cat', "", {
        'R': 'rural',
        'U': 'urban',
    }),
    ("family size", 'cat', "", {
        'GT3': 'greater than 3',
        'LE3': 'less or equal to 3',
    }),
    ("parent's cohabitation status", 'cat', "", {
        'A': 'apart',
        'T': 'living together',
    }),
    # NOTE(review): the education maps are keyed by the strings '0'-'4'; this
    # assumes the raw column holds string codes — confirm.
    ("mother's education", 'cat', "", {
        '0': 'none',
        '1': 'primary education (4th grade)',
        '2': '5th to 9th grade',
        '3': 'secondary education',
        '4': 'higher education',
    }),
    ("father's education", 'cat', "", {
        '0': 'none',
        '1': 'primary education (4th grade)',
        '2': '5th to 9th grade',
        '3': 'secondary education',
        '4': 'higher education',
    }),
    ("mother's job", 'cat', "", {
        'at_home': 'at home',
        'health': 'health care related',
        'other': 'other',
        'services': 'civil services (e.g. administrative or police)',
        'teacher': 'teacher',
    }),
    ("father's job", 'cat', "", {
        'at_home': 'at home',
        'health': 'health care related',
        'other': 'other',
        'services': 'civil services (e.g. administrative or police)',
        'teacher': 'teacher',
    }),
    ("reason to choose this school", 'cat', "", {
        'course': 'course preference',
        'home': 'close to home',
        'other': 'other',
        'reputation': 'school reputation',
    }),
    ("student's guardian", 'cat', "", {
        'father': 'father',
        'mother': 'mother',
        'other': 'other',
    }),
    # Ordinal / count columns tagged 'num'; the description carries the scale.
    ("home to school travel time", 'num', "(1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)", None),
    ("weekly study time", 'num', "(1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)", None),
    ("number of past class failures", 'num', "", None),
    # Categorical columns without an explicit label map.
    ("extra educational support", 'cat', "", None),
    ("family educational support", 'cat', "", None),
    ("extra paid classes within the course subject", 'cat', "", None),
    ("extra-curricular activities", 'cat', "", None),
    ("attended nursery school", 'cat', "", None),
    ("wants to take higher education", 'cat', "", None),
    ("Internet access at home", 'cat', "", None),
    ("with a romantic relationship", 'cat', "", None),
    # 1-5 rating columns and the absence count, tagged 'num'.
    ("quality of family relationships", 'num', "(from 1 - very bad to 5 - excellent)", None),
    ("free time after school", 'num', "(from 1 - very low to 5 - very high)", None),
    ("going out with friends", 'num', "(from 1 - very low to 5 - very high)", None),
    ("workday alcohol consumption", 'num', "(from 1 - very low to 5 - very high)", None),
    ("weekend alcohol consumption", 'num', "(from 1 - very low to 5 - very high)", None),
    ("current health status", 'num', "(from 1 - very bad to 5 - very good)", None),
    ("number of school absences", 'num', "", None),
]

In [27]:
# Column-conversion settings for the communities_and_crime dataset.
dataset_name = 'communities_and_crime'
# No extra missing-value markers for this dataset.
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
# NOTE(review): 'state' and 'fold' are tagged 'num' although their own
# descriptions call them nominal / not predictive — confirm this is intended.
num_col_conversion = [
    ('state', 'num', 'US state (by number) - not counted as predictive above, but if considered, should be consided nominal'),
    ('fold', 'num', 'fold number for non-random 10 fold cross validation, potentially useful for debugging, paired tests - not predictive'),
    ('population', 'num', 'population for community'),
    ('householdSize', 'num', 'mean people per household'),
    ('racePctblack', 'num', 'percentage of population that is african american'),
    ('racePctWhite', 'num', 'percentage of population that is caucasian'),
    ('racePctAsian', 'num', 'percentage of population that is of asian heritage'),
    ('racePctHisp', 'num', 'percentage of population that is of hispanic heritage'),
    ('agePct12t21', 'num', 'percentage of population that is 12-21 in age'),
    ('agePct12t29', 'num', 'percentage of population that is 12-29 in age'),
    ('agePct16t24', 'num', 'percentage of population that is 16-24 in age'),
    ('agePct65up', 'num', 'percentage of population that is 65 and over in age'),
    ('numbUrban', 'num', 'number of people living in areas classified as urban'),
    ('pctUrban', 'num', 'percentage of people living in areas classified as urban'),
    ('medIncome', 'num', 'median household income'),
    ('pctWWage', 'num', 'percentage of households with wage or salary income in 1989'),
    ('pctWFarmSelf', 'num', 'percentage of households with farm or self employment income in 1989'),
    ('pctWInvInc', 'num', 'percentage of households with investment / rent income in 1989'),
    ('pctWSocSec', 'num', 'percentage of households with social security income in 1989'),
    ('pctWPubAsst', 'num', 'percentage of households with public assistance income in 1989'),
    ('pctWRetire', 'num', 'percentage of households with retirement income in 1989'),
    ('medFamInc', 'num', 'median family income (differs from household income for non-family households)'),
    ('perCapInc', 'num', 'per capita income'),
    ('whitePerCap', 'num', 'per capita income for caucasians'),
    ('blackPerCap', 'num', 'per capita income for african americans'),
    ('indianPerCap', 'num', 'per capita income for native americans'),
    ('AsianPerCap', 'num', 'per capita income for people with asian heritage'),
    ('OtherPerCap', 'num', 'per capita income for people with \'other\' heritage'),
    ('HispPerCap', 'num', 'per capita income for people with hispanic heritage'),
    ('NumUnderPov', 'num', 'number of people under the poverty level'),
    ('PctPopUnderPov', 'num', 'percentage of people under the poverty level'),
    ('PctLess9thGrade', 'num', 'percentage of people 25 and over with less than a 9th grade education'),
    ('PctNotHSGrad', 'num', 'percentage of people 25 and over that are not high school graduates'),
    ('PctBSorMore', 'num', 'percentage of people 25 and over with a bachelors degree or higher education'),
    ('PctUnemployed', 'num', 'percentage of people 16 and over, in the labor force, and unemployed'),
    ('PctEmploy', 'num', 'percentage of people 16 and over who are employed'),
    ('PctEmplManu', 'num', 'percentage of people 16 and over who are employed in manufacturing'),
    ('PctEmplProfServ', 'num', 'percentage of people 16 and over who are employed in professional services'),
    ('PctOccupManu', 'num', 'percentage of people 16 and over who are employed in manufacturing'),
    ('PctOccupMgmtProf', 'num', 'percentage of people 16 and over who are employed in management or professional occupations'),
    ('MalePctDivorce', 'num', 'percentage of males who are divorced'),
    ('MalePctNevMarr', 'num', 'percentage of males who have never married'),
    ('FemalePctDiv', 'num', 'percentage of females who are divorced'),
    ('TotalPctDiv', 'num', 'percentage of population who are divorced'),
    ('PersPerFam', 'num', 'mean number of people per family'),
    ('PctFam2Par', 'num', 'percentage of families (with kids) that are headed by two parents'),
    ('PctKids2Par', 'num', 'percentage of kids in family housing with two parents'),
    ('PctYoungKids2Par', 'num', 'percent of kids 4 and under in two parent households'),
    ('PctTeen2Par', 'num', 'percent of kids age 12-17 in two parent households'),
    ('PctWorkMomYoungKids', 'num', 'percentage of moms of kids 6 and under in labor force'),
    ('PctWorkMom', 'num', 'percentage of moms of kids under 18 in labor force'),
    ('NumIlleg', 'num', 'number of kids born to never married'),
    ('PctIlleg', 'num', 'percentage of kids born to never married'),
    ('NumImmig', 'num', 'total number of people known to be foreign born'),
    ('PctImmigRecent', 'num', 'percentage of _immigrants_ who immigated within last 3 years'),
    ('PctImmigRec5', 'num', 'percentage of _immigrants_ who immigated within last 5 years'),
    ('PctImmigRec8', 'num', 'percentage of _immigrants_ who immigated within last 8 years'),
    ('PctImmigRec10', 'num', 'percentage of _immigrants_ who immigated within last 10 years'),
    ('PctRecentImmig', 'num', 'percent of _population_ who have immigrated within the last 3 years'),
    ('PctRecImmig5', 'num', 'percent of _population_ who have immigrated within the last 5 years'),
    ('PctRecImmig8', 'num', 'percent of _population_ who have immigrated within the last 8 years'),
    ('PctRecImmig10', 'num', 'percent of _population_ who have immigrated within the last 10 years'),
    ('PctSpeakEnglOnly', 'num', 'percent of people who speak only English'),
    ('PctNotSpeakEnglWell', 'num', 'percent of people who do not speak English well'),
    ('PctLargHouseFam', 'num', 'percent of family households that are large (6 or more)'),
    ('PctLargHouseOccup', 'num', 'percent of all occupied households that are large (6 or more people)'),
    ('PersPerOccupHous', 'num', 'mean persons per household'),
    ('PersPerOwnOccHous', 'num', 'mean persons per owner occupied household'),
    ('PersPerRentOccHous', 'num', 'mean persons per rental household'),
    ('PctPersOwnOccup', 'num', 'percent of people in owner occupied households'),
    ('PctPersDenseHous', 'num', 'percent of persons in dense housing (more than 1 person per room)'),
    ('PctHousLess3BR', 'num', 'percent of housing units with less than 3 bedrooms'),
    ('MedNumBR', 'num', 'median number of bedrooms'),
    ('HousVacant', 'num', 'number of vacant households'),
    ('PctHousOccup', 'num', 'percent of housing occupied'),
    ('PctHousOwnOcc', 'num', 'percent of households owner occupied'),
    ('PctVacantBoarded', 'num', 'percent of vacant housing that is boarded up'),
    ('PctVacMore6Mos', 'num', 'percent of vacant housing that has been vacant more than 6 months'),
    ('MedYrHousBuilt', 'num', 'median year housing units built'),
    ('PctHousNoPhone', 'num', 'percent of occupied housing units without phone (in 1990, this was rare!)'),
    ('PctWOFullPlumb', 'num', 'percent of housing without complete plumbing facilities'),
    ('OwnOccLowQuart', 'num', 'owner occupied housing - lower quartile value'),
    ('OwnOccMedVal', 'num', 'owner occupied housing - median value'),
    ('OwnOccHiQuart', 'num', 'owner occupied housing - upper quartile value'),
    ('RentLowQ', 'num', 'rental housing - lower quartile rent'),
    ('RentMedian', 'num', 'rental housing - median rent (Census variable H32B from file STF1A)'),
    ('RentHighQ', 'num', 'rental housing - upper quartile rent'),
    ('MedRent', 'num', 'median gross rent (Census variable H43A from file STF3A - includes utilities)'),
    ('MedRentPctHousInc', 'num', 'median gross rent as a percentage of household income'),
    ('MedOwnCostPctInc', 'num', 'median owners cost as a percentage of household income - for owners with a mortgage'),
    ('MedOwnCostPctIncNoMtg', 'num', 'median owners cost as a percentage of household income - for owners without a mortgage'),
    ('NumInShelters', 'num', 'number of people in homeless shelters'),
    ('NumStreet', 'num', 'number of homeless people counted in the street'),
    ('PctForeignBorn', 'num', 'percent of people foreign born'),
    ('PctBornSameState', 'num', 'percent of people born in the same state as currently living'),
    ('PctSameHouse85', 'num', 'percent of people living in the same house as in 1985 (5 years before)'),
    ('PctSameCity85', 'num', 'percent of people living in the same city as in 1985 (5 years before)'),
    ('PctSameState85', 'num', 'percent of people living in the same state as in 1985 (5 years before)'),
    ('LandArea', 'num', 'land area in square miles'),
    ('PopDens', 'num', 'population density in persons per square mile'),
    ('PctUsePubTrans', 'num', 'percent of people using public transit for commuting'),
    ('LemasPctOfficDrugUn', 'num', 'percent of officers assigned to drug units'),
]

# No categorical columns are converted for this dataset.
cat_col_conversion = [
    
]

In [32]:
# Column-conversion settings for the Bank_Customer_Churn_Dataset dataset.
dataset_name = 'Bank_Customer_Churn_Dataset'
# No extra missing-value markers for this dataset.
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
# Descriptions are left empty, so only the column names are available as text.
num_col_conversion = [
    ('credit_score', 'num', ''),
    ('age', 'num', ''),
    ('tenure', 'num', ''),
    ('balance', 'num', ''),
    ('products_number', 'num', ''),
    ('estimated_salary', 'num', ''),
]

# One tuple per categorical column: (column name, type tag, description, category-label map or None).
cat_col_conversion = [
    ('country', 'cat', '', None),
    ('credit_card', 'cat', '', None),
    ('active_member', 'cat','', None),
    ('gender', 'cat', '', None),
]

In [37]:
# Column-conversion settings for the statlog dataset.
dataset_name = 'statlog'
# No extra missing-value markers for this dataset.
missing_strs = []

# One tuple per numerical column: (column name, type tag, description).
num_col_conversion = [
    ('Duration', 'num', ''),
    ('Credit amount', 'num', ''),
    ('Installment rate in percentage of disposable income', 'num', ''),
    ('Present residence since', 'num', ''),
    ('Age', 'num', ''),
    ('Number of existing credits at this bank', 'num', ''),
    ('Number of people being liable to provide maintenance for', 'num', ''),
]
# Alternative definition keyed by the raw `AttributeN` column names, with the
# readable name as the description (disabled):
# num_col_conversion = [
#     ('Attribute2', 'num', 'Duration'),
#     ('Attribute5', 'num', 'Credit amount'),
#     ('Attribute8', 'num', 'Installment rate in percentage of disposable income'),
#     ('Attribute11', 'num', 'Present residence since'),
#     ('Attribute13', 'num', 'Age'),
#     ('Attribute16', 'num', 'Number of existing credits at this bank'),
#     ('Attribute18', 'num', 'Number of people being liable to provide maintenance for'),
# ]

# One tuple per categorical column: (column name, type tag, description, category-label map).
# Each dict maps a raw `A<n>` category code to its readable meaning.
cat_col_conversion = [
    ('Status of existing checking account', 'cat', '', {
        'A11': '... < 0 DM',
        'A12': '0 <= ... <  200 DM',
        'A13': '... >= 200 DM / salary assignments for at least 1 year',
        'A14': 'no checking account',
    }),
    ('Credit history', 'cat', '', {
        'A30': 'no credits taken / all credits paid back duly',
        'A31': 'all credits at this bank paid back duly',
        'A32': 'existing credits paid back duly till now',
        'A33': 'delay in paying off in the past',
        'A34': 'critical account / other credits existing (not at this bank)',
    }),
    ('Purpose', 'cat', '', {
        'A40': 'car (new)',
        'A41': 'car (used)',
        'A42': 'furniture/equipment',
        'A43': 'radio/television',
        'A44': 'domestic appliances',
        'A45': 'repairs',
        'A46': 'education',
        'A47': '(vacation - does not exist?)',
        'A48': 'retraining',
        'A49': 'business',
        'A410': 'others',
    }),
    ('Savings account/bonds', 'cat', '', {
        'A61': '... <  100 DM',
        'A62': '100 <= ... <  500 DM',
        'A63': '500 <= ... < 1000 DM',
        'A64': '.. >= 1000 DM',
        'A65': 'unknown / no savings account',
    }),
    ('Present employment since', 'cat', '', {
        'A71': 'unemployed',
        'A72': '... < 1 year',
        'A73': '1  <= ... < 4 years ',
        'A74': '4  <= ... < 7 years',
        'A75': '... >= 7 years',
    }),
    ('Personal status and sex', 'cat', '', {
        'A91': 'male   : divorced/separated',
        'A92': 'female : divorced/separated/married',
        'A93': 'male   : single',
        'A94': 'male   : married/widowed',
        'A95': 'female : single',
    }),
    ('Other debtors / guarantors', 'cat', '', {
        'A101': 'none',
        'A102': 'co-applicant',
        'A103': 'guarantor',
    }),
    ('Property', 'cat', '', {
        'A121': 'real estate',
        'A122': 'building society savings agreement/ life insurance',
        'A123': 'car or other, not in attribute 6',
        'A124': 'unknown / no property',
    }),
    ('Other installment plans', 'cat', '', {
        'A141': 'bank',
        'A142': 'stores',
        'A143': 'none',
    }),
    ('Housing', 'cat', '', {
        'A151': 'rent',
        'A152': 'own',
        'A153': 'for free',
    }),
    ('Job', 'cat', '', {
        'A171': 'unemployed / unskilled  - non-resident',
        'A172': 'unskilled - resident',
        'A173': 'skilled employee / official',
        'A174': 'management / self-employed / highly qualified employee / officer',
    }),
    ('Telephone', 'cat', '', {
        'A191': 'none',
        'A192': 'yes, registered under the customers name',
    }),
    ('foreign worker', 'cat', '', {
        'A201': 'yes',
        'A202': 'no',
    }),
]
# cat_col_conversion = [
#     ('Attribute1', 'cat', 'Status of existing checking account', {
#         'A11': '... < 0 DM',
#         'A12': '0 <= ... <  200 DM',
#         'A13': '... >= 200 DM / salary assignments for at least 1 year',
#         'A14': 'no checking account',
#     }),
#     ('Attribute3', 'cat', 'Credit history', {
#         'A30': 'no credits taken / all credits paid back duly',
#         'A31': 'all credits at this bank paid back duly',
#         'A32': 'existing credits paid back duly till now',
#         'A33': 'delay in paying off in the past',
#         'A34': 'critical account / other credits existing (not at this bank)',
#     }),
#     ('Attribute4', 'cat', 'Purpose', {
#         'A40': 'car (new)',
#         'A41': 'car (used)',
#         'A42': 'furniture/equipment',
#         'A43': 'radio/television',
#         'A44': 'domestic appliances',
#         'A45': 'repairs',
#         'A46': 'education',
#         'A47': '(vacation - does not exist?)',
#         'A48': 'retraining',
#         'A49': 'business',
#         'A410': 'others',
#     }),
#     ('Attribute6', 'cat', 'Savings account/bonds', {
#         'A61': '... <  100 DM',
#         'A62': '100 <= ... <  500 DM',
#         'A63': '500 <= ... < 1000 DM',
#         'A64': '.. >= 1000 DM',
#         'A65': 'unknown / no savings account',
#     }),
#     ('Attribute7', 'cat', 'Present employment since', {
#         'A71': 'unemployed',
#         'A72': '... < 1 year',
#         'A73': '1  <= ... < 4 years ',
#         'A74': '4  <= ... < 7 years',
#         'A75': '... >= 7 years',
#     }),
#     ('Attribute9', 'cat', 'Personal status and sex', {
#         'A91': 'male   : divorced/separated',
#         'A92': 'female : divorced/separated/married',
#         'A93': 'male   : single',
#         'A94': 'male   : married/widowed',
#         'A95': 'female : single',
#     }),
#     ('Attribute10', 'cat', 'Other debtors / guarantors', {
#         'A101': 'none',
#         'A102': 'co-applicant',
#         'A103': 'guarantor',
#     }),
#     ('Attribute12', 'cat', 'Property', {
#         'A121': 'real estate',
#         'A122': 'building society savings agreement/ life insurance',
#         'A123': 'car or other, not in attribute 6',
#         'A124': 'unknown / no property',
#     }),
#     ('Attribute14', 'cat', 'Other installment plans', {
#         'A141': 'bank',
#         'A142': 'stores',
#         'A143': 'none',
#     }),
#     ('Attribute15', 'cat', 'Housing', {
#         'A151': 'rent',
#         'A152': 'own',
#         'A153': 'for free',
#     }),
#     ('Attribute17', 'cat', 'Job', {
#         'A171': 'unemployed / unskilled  - non-resident',
#         'A172': 'unskilled - resident',
#         'A173': 'skilled employee / official',
#         'A174': 'management / self-employed / highly qualified employee / officer',
#     }),
#     ('Attribute19', 'cat', 'Telephone', {
#         'A191': 'none',
#         'A192': 'yes, registered under the customers name',
#     }),
#     ('Attribute20', 'cat', 'foreign worker', {
#         'A201': 'yes',
#         'A202': 'no',
#     }),
# ]

In [42]:
dataset_name = 'ASP-POTASSCO-classification'
missing_strs = []

num_col_conversion = [
    ('repetition', 'num', ''),
    ('Frac_Neg_Body', 'num', ''),
    ('Frac_Pos_Body', 'num', ''),
    ('Frac_Unary_Rules', 'num', ''),
    ('Frac_Binary_Rules', 'num', ''),
    ('Frac_Ternary_Rules', 'num', ''),
    ('Frac_Integrity_Rules', 'num', ''),
    ('Tight', 'num', ''),
    ('Problem_Variables', 'num', ''),
    ('Free_Problem_Variables', 'num', ''),
    ('Assigned_Problem_Variables', 'num', ''),
    ('Constraints', 'num', ''),
    ('Constraints.Vars', 'num', ''),
    ('Created_Bodies', 'num', ''),
    ('Program_Atoms', 'num', ''),
    ('SCCS', 'num', ''),
    ('Nodes_in_Positive_BADG', 'num', ''),
    ('Rules', 'num', ''),
    ('Normal_Rules', 'num', ''),
    ('Cardinality_Rules', 'num', ''),
    ('Choice_Rules', 'num', ''),
    ('Weight_Rules', 'num', ''),
    ('Frac_Normal_Rules', 'num', ''),
    ('Frac_Cardinality_Rules', 'num', ''),
    ('Frac_Choice_Rules', 'num', ''),
    ('Frac_Weight_Rules', 'num', ''),
    ('Equivalences', 'num', ''),
    ('Atom.Atom_Equivalences', 'num', ''),
    ('Body.Body_Equivalences', 'num', ''),
    ('Other_Equivalences', 'num', ''),
    ('Frac_Atom.Atom_Equivalences', 'num', ''),
    ('Frac_Body.Body_Equivalences', 'num', ''),
    ('Frac_Other_Equivalences', 'num', ''),
    ('Binary_Constraints', 'num', ''),
    ('Ternary_Constraints', 'num', ''),
    ('Other_Constraints', 'num', ''),
    ('Frac_Binary_Constraints', 'num', ''),
    ('Frac_Ternary_Constraints', 'num', ''),
    ('Frac_Other_Constraints', 'num', ''),
    ('Choices.1', 'num', ''),
    ('Conflicts.Choices.1', 'num', ''),
    ('Avg_Conflict_Levels.1', 'num', ''),
    ('Avg_LBD_Levels.1', 'num', ''),
    ('Learnt_from_Conflict.1', 'num', ''),
    ('Learnt_from_Loop.1', 'num', ''),
    ('Frac_Learnt_from_Conflict.1', 'num', ''),
    ('Frac_Learnt_from_Loop.1', 'num', ''),
    ('Literals_in_Conflict_Nogoods.1', 'num', ''),
    ('Literals_in_Loop_Nogoods.1', 'num', ''),
    ('Frac_Literals_in_Conflict_Nogoods.1', 'num', ''),
    ('Frac_Literals_in_Loop_Nogoods.1', 'num', ''),
    ('Removed_Nogoods.1', 'num', ''),
    ('Learnt_Binary.1', 'num', ''),
    ('Learnt_Ternary.1', 'num', ''),
    ('Learnt_Others.1', 'num', ''),
    ('Frac_Removed_Nogood.1', 'num', ''),
    ('Frac_Learnt_Binary.1', 'num', ''),
    ('Frac_Learnt_Ternary.1', 'num', ''),
    ('Frac_Learnt_Others.1', 'num', ''),
    ('Skipped_Levels_while_Backjumping.1', 'num', ''),
    ('Avg_Skipped_Levels_while_Backjumping.1', 'num', ''),
    ('Longest_Backjumping.1', 'num', ''),
    ('Running_Avg_Conflictlevel.1', 'num', ''),
    ('Running_Avg_LBD.1', 'num', ''),
    ('Choices.2', 'num', ''),
    ('Conflicts.Choices.2', 'num', ''),
    ('Avg_Conflict_Levels.2', 'num', ''),
    ('Avg_LBD_Levels.2', 'num', ''),
    ('Learnt_from_Conflict.2', 'num', ''),
    ('Learnt_from_Loop.2', 'num', ''),
    ('Frac_Learnt_from_Conflict.2', 'num', ''),
    ('Frac_Learnt_from_Loop.2', 'num', ''),
    ('Literals_in_Conflict_Nogoods.2', 'num', ''),
    ('Literals_in_Loop_Nogoods.2', 'num', ''),
    ('Frac_Literals_in_Conflict_Nogoods.2', 'num', ''),
    ('Frac_Literals_in_Loop_Nogoods.2', 'num', ''),
    ('Removed_Nogoods.2', 'num', ''),
    ('Learnt_Binary.2', 'num', ''),
    ('Learnt_Ternary.2', 'num', ''),
    ('Learnt_Others.2', 'num', ''),
    ('Frac_Removed_Nogood.2', 'num', ''),
    ('Frac_Learnt_Binary.2', 'num', ''),
    ('Frac_Learnt_Ternary.2', 'num', ''),
    ('Frac_Learnt_Others.2', 'num', ''),
    ('Skipped_Levels_while_Backjumping.2', 'num', ''),
    ('Avg_Skipped_Levels_while_Backjumping.2', 'num', ''),
    ('Longest_Backjumping.2', 'num', ''),
    ('Running_Avg_Conflictlevel.2', 'num', ''),
    ('Running_Avg_LBD.2', 'num', ''),
    ('Choices.3', 'num', ''),
    ('Conflicts.Choices.3', 'num', ''),
    ('Avg_Conflict_Levels.3', 'num', ''),
    ('Avg_LBD_Levels.3', 'num', ''),
    ('Learnt_from_Conflict.3', 'num', ''),
    ('Learnt_from_Loop.3', 'num', ''),
    ('Frac_Learnt_from_Conflict.3', 'num', ''),
    ('Frac_Learnt_from_Loop.3', 'num', ''),
    ('Literals_in_Conflict_Nogoods.3', 'num', ''),
    ('Literals_in_Loop_Nogoods.3', 'num', ''),
    ('Frac_Literals_in_Conflict_Nogoods.3', 'num', ''),
    ('Frac_Literals_in_Loop_Nogoods.3', 'num', ''),
    ('Removed_Nogoods.3', 'num', ''),
    ('Learnt_Binary.3', 'num', ''),
    ('Learnt_Ternary.3', 'num', ''),
    ('Learnt_Others.3', 'num', ''),
    ('Frac_Removed_Nogood.3', 'num', ''),
    ('Frac_Learnt_Binary.3', 'num', ''),
    ('Frac_Learnt_Ternary.3', 'num', ''),
    ('Frac_Learnt_Others.3', 'num', ''),
    ('Skipped_Levels_while_Backjumping.3', 'num', ''),
    ('Avg_Skipped_Levels_while_Backjumping.3', 'num', ''),
    ('Longest_Backjumping.3', 'num', ''),
    ('Running_Avg_Conflictlevel.3', 'num', ''),
    ('Running_Avg_LBD.3', 'num', ''),
    ('Choices.4', 'num', ''),
    ('Conflicts.Choices.4', 'num', ''),
    ('Avg_Conflict_Levels.4', 'num', ''),
    ('Avg_LBD_Levels.4', 'num', ''),
    ('Learnt_from_Conflict.4', 'num', ''),
    ('Learnt_from_Loop.4', 'num', ''),
    ('Frac_Learnt_from_Conflict.4', 'num', ''),
    ('Frac_Learnt_from_Loop.4', 'num', ''),
    ('Literals_in_Conflict_Nogoods.4', 'num', ''),
    ('Literals_in_Loop_Nogoods.4', 'num', ''),
    ('Frac_Literals_in_Conflict_Nogoods.4', 'num', ''),
    ('Frac_Literals_in_Loop_Nogoods.4', 'num', ''),
    ('Removed_Nogoods.4', 'num', ''),
    ('Learnt_Binary.4', 'num', ''),
    ('Learnt_Ternary.4', 'num', ''),
    ('Learnt_Others.4', 'num', ''),
    ('Frac_Removed_Nogood.4', 'num', ''),
    ('Frac_Learnt_Binary.4', 'num', ''),
    ('Frac_Learnt_Ternary.4', 'num', ''),
    ('Frac_Learnt_Others.4', 'num', ''),
    ('Skipped_Levels_while_Backjumping.4', 'num', ''),
    ('Avg_Skipped_Levels_while_Backjumping.4', 'num', ''),
    ('Longest_Backjumping.4', 'num', ''),
    ('Running_Avg_Conflictlevel.4', 'num', ''),
    ('Running_Avg_LBD.4', 'num', ''),
    ('runtime', 'num', ''),
]

# Single categorical column; entry layout is (column name, target type, description, category mapping).
# No category-name mapping is supplied (None).
cat_col_conversion = [('runstatus', 'cat', '', None)]

In [47]:
# Configuration for 'taiwanese_bankruptcy_prediction'.
# Every listed column is kept numerical and no categorical column is declared.
dataset_name = 'taiwanese_bankruptcy_prediction'
missing_strs = []

# Entry layout: (column name, target type, description). Descriptions are left empty here.
# The order of the names must match the column order of the stored numerical matrix.
num_col_conversion = [
    (col_name, 'num', '')
    for col_name in (
        'ROA(C) before interest and depreciation before interest',
        'ROA(A) before interest and % after tax',
        'ROA(B) before interest and depreciation after tax',
        'Operating Gross Margin',
        'Realized Sales Gross Margin',
        'Operating Profit Rate',
        'Pre-tax net Interest Rate',
        'After-tax net Interest Rate',
        'Non-industry income and expenditure/revenue',
        'Continuous interest rate (after tax)',
        'Operating Expense Rate',
        'Research and development expense rate',
        'Cash flow rate',
        'Interest-bearing debt interest rate',
        'Tax rate (A)',
        'Net Value Per Share (B)',
        'Net Value Per Share (A)',
        'Net Value Per Share (C)',
        'Persistent EPS in the Last Four Seasons',
        'Cash Flow Per Share',
        'Revenue Per Share (Yuan ¥)',
        'Operating Profit Per Share (Yuan ¥)',
        'Per Share Net profit before tax (Yuan ¥)',
        'Realized Sales Gross Profit Growth Rate',
        'Operating Profit Growth Rate',
        'After-tax Net Profit Growth Rate',
        'Regular Net Profit Growth Rate',
        'Continuous Net Profit Growth Rate',
        'Total Asset Growth Rate',
        'Net Value Growth Rate',
        'Total Asset Return Growth Rate Ratio',
        'Cash Reinvestment %',
        'Current Ratio',
        'Quick Ratio',
        'Interest Expense Ratio',
        'Total debt/Total net worth',
        'Debt ratio %',
        'Net worth/Assets',
        'Long-term fund suitability ratio (A)',
        'Borrowing dependency',
        'Contingent liabilities/Net worth',
        'Operating profit/Paid-in capital',
        'Net profit before tax/Paid-in capital',
        'Inventory and accounts receivable/Net value',
        'Total Asset Turnover',
        'Accounts Receivable Turnover',
        'Average Collection Days',
        'Inventory Turnover Rate (times)',
        'Fixed Assets Turnover Frequency',
        'Net Worth Turnover Rate (times)',
        'Revenue per person',
        'Operating profit per person',
        'Allocation rate per person',
        'Working Capital to Total Assets',
        'Quick Assets/Total Assets',
        'Current Assets/Total Assets',
        'Cash/Total Assets',
        'Quick Assets/Current Liability',
        'Cash/Current Liability',
        'Current Liability to Assets',
        'Operating Funds to Liability',
        'Inventory/Working Capital',
        'Inventory/Current Liability',
        'Current Liabilities/Liability',
        'Working Capital/Equity',
        'Current Liabilities/Equity',
        'Long-term Liability to Current Assets',
        'Retained Earnings to Total Assets',
        'Total income/Total expense',
        'Total expense/Assets',
        'Current Asset Turnover Rate',
        'Quick Asset Turnover Rate',
        'Working capitcal Turnover Rate',
        'Cash Turnover Rate',
        'Cash Flow to Sales',
        'Fixed Assets to Assets',
        'Current Liability to Liability',
        'Current Liability to Equity',
        'Equity to Long-term Liability',
        'Cash Flow to Total Assets',
        'Cash Flow to Liability',
        'CFO to Assets',
        'Cash Flow to Equity',
        'Current Liability to Current Assets',
        'Liability-Assets Flag',
        'Net Income to Total Assets',
        'Total assets to GNP price',
        'No-credit Interval',
        'Gross Profit to Sales',
        "Net Income to Stockholder's Equity",
        'Liability to Equity',
        'Degree of Financial Leverage (DFL)',
        'Interest Coverage Ratio (Interest expense to EBIT)',
        'Net Income Flag',
        'Equity to Liability',
    )
]

cat_col_conversion = []

In [52]:
# Configuration for 'internet_usage'.
# The dataset declares no originally-numerical columns; everything is listed under cat_col_conversion.
dataset_name = 'internet_usage'
# Strings treated as missing values when a categorical column is converted to numerical.
missing_strs = ["Not_Say", "Not_say", "Dont_Know", "Dont_know"]

num_col_conversion = []

# Entry layout: (column name, target type, description, category mapping or None).
# 'Age' is stored with the categorical columns but is converted to a numerical column.
cat_col_conversion = [('Age', 'num', '', None)]
# All remaining columns stay categorical, in stored column order, with no name mapping.
cat_col_conversion += [
    (col_name, 'cat', '', None)
    for col_name in (
        'Community_Building',
        'Community_Membership_Family',
        'Community_Membership_Hobbies',
        'Community_Membership_None',
        'Community_Membership_Other',
        'Community_Membership_Political',
        'Community_Membership_Professional',
        'Community_Membership_Religious',
        'Community_Membership_Support',
        'Country',
        'Disability_Cognitive',
        'Disability_Hearing',
        'Disability_Motor',
        'Disability_Not_Impaired',
        'Disability_Not_Say',
        'Disability_Vision',
        'Education_Attainment',
        'Falsification_of_Information',
        'Gender',
        'Household_Income',
        'How_You_Heard_About_Survey_Banner',
        'How_You_Heard_About_Survey_Friend',
        'How_You_Heard_About_Survey_Mailing_List',
        'How_You_Heard_About_Survey_Others',
        'How_You_Heard_About_Survey_Printed_Media',
        'How_You_Heard_About_Survey_Remebered',
        'How_You_Heard_About_Survey_Search_Engine',
        'How_You_Heard_About_Survey_Usenet_News',
        'How_You_Heard_About_Survey_WWW_Page',
        'Major_Geographical_Location',
        'Major_Occupation',
        'Marital_Status',
        'Most_Import_Issue_Facing_the_Internet',
        'Opinions_on_Censorship',
        'Primary_Computing_Platform',
        'Primary_Language',
        'Primary_Place_of_WWW_Access',
        'Race',
        'Not_Purchasing_Bad_experience',
        'Not_Purchasing_Bad_press',
        'Not_Purchasing_Cant_find',
        'Not_Purchasing_Company_policy',
        'Not_Purchasing_Easier_locally',
        'Not_Purchasing_Enough_info',
        'Not_Purchasing_Judge_quality',
        'Not_Purchasing_Never_tried',
        'Not_Purchasing_No_credit',
        'Not_Purchasing_Not_applicable',
        'Not_Purchasing_Not_option',
        'Not_Purchasing_Other',
        'Not_Purchasing_Prefer_people',
        'Not_Purchasing_Privacy',
        'Not_Purchasing_Receipt',
        'Not_Purchasing_Security',
        'Not_Purchasing_Too_complicated',
        'Not_Purchasing_Uncomfortable',
        'Not_Purchasing_Unfamiliar_vendor',
        'Registered_to_Vote',
        'Sexual_Preference',
        'Web_Ordering',
        'Web_Page_Creation',
        'Who_Pays_for_Access_Dont_Know',
        'Who_Pays_for_Access_Other',
        'Who_Pays_for_Access_Parents',
        'Who_Pays_for_Access_School',
        'Who_Pays_for_Access_Self',
        'Who_Pays_for_Access_Work',
        'Willingness_to_Pay_Fees',
        'Years_on_Internet',
    )
]

In [57]:
# Configuration for 'predict_students_dropout_and_academic_success'.
#
# Entry layouts:
#   num_col_conversion: (column name, target type, description)
#   cat_col_conversion: (column name, target type, description, {raw code: category name} or None)
# Entry i describes column i of the stored numerical / categorical matrix, so the order matters.
# Columns listed under cat_col_conversion with target type 'num' are moved to the numerical block;
# their mapping slot is unused and therefore None.
dataset_name = 'predict_students_dropout_and_academic_success'
missing_strs = []

num_col_conversion = [
    ('Previous qualification (grade)', 'num', 'Grade of previous qualification (between 0 and 200)'),
    ('Admission grade', 'num', 'Admission grade (between 0 and 200)'),
    ('Unemployment rate', 'num', 'Unemployment rate (%)'),
    ('Inflation rate', 'num', 'Inflation rate (%)'),
    ('GDP', 'num', 'GDP'),
]

cat_col_conversion = [
    ('Marital Status', 'cat', '', {
        1.0: "single",
        2.0: "married",
        3.0: "widower",
        4.0: "divorced",
        5.0: "facto union",
        6.0: "legally separated"
    }),
    ('Application mode', 'cat', '', {
        1.0: '1st phase - general contingent',
        2.0: 'Ordinance No. 612/93',
        5.0: '1st phase - special contingent (Azores Island)',
        7.0: 'Holders of other higher courses',
        10.0: 'Ordinance No. 854-B/99',
        15.0: 'International student (bachelor)',
        16.0: '1st phase - special contingent (Madeira Island)',
        17.0: '2nd phase - general contingent',
        18.0: '3rd phase - general contingent',
        26.0: 'Ordinance No. 533-A/99, item b2) (Different Plan)',
        27.0: 'Ordinance No. 533-A/99, item b3 (Other Institution)',
        39.0: 'Over 23 years old',
        42.0: 'Transfer',
        43.0: 'Change of course',
        44.0: 'Technological specialization diploma holders',
        51.0: 'Change of institution/course',
        53.0: 'Short cycle diploma holders',
        57.0: 'Change of institution/course (International)',
    }),
    ('Application order', 'num', '', None),
    ('Course', 'cat', '', {
        33.0: 'Biofuel Production Technologies',
        171.0: 'Animation and Multimedia Design',
        8014.0: 'Social Service (evening attendance)',
        9003.0: 'Agronomy',
        9070.0: 'Communication Design',
        9085.0: 'Veterinary Nursing',
        9119.0: 'Informatics Engineering',
        9130.0: 'Equinculture',
        9147.0: 'Management',
        9238.0: 'Social Service',
        9254.0: 'Tourism',
        9500.0: 'Nursing',
        9556.0: 'Oral Hygiene',
        9670.0: 'Advertising and Marketing Management',
        9773.0: 'Journalism and Communication',
        9853.0: 'Basic Education',
        9991.0: 'Management (evening attendance)'
    }),
    ('Daytime/evening attendance', 'cat', '', {
        1.0: 'daytime',
        0.0: 'evening',
    }),
    ('Previous qualification', 'cat', '', {
        1.0: "Secondary education",
        2.0: "Higher education - bachelor's degree",
        3.0: "Higher education - degree",
        4.0: "Higher education - master's",
        5.0: "Higher education - doctorate",
        6.0: "Frequency of higher education",
        9.0: "12th year of schooling - not completed",
        10.0: "11th year of schooling - not completed",
        12.0: "Other - 11th year of schooling",
        14.0: "10th year of schooling",
        15.0: "10th year of schooling - not completed",
        19.0: "Basic education 3rd cycle (9th/10th/11th year) or equiv.",
        38.0: "Basic education 2nd cycle (6th/7th/8th year) or equiv.",
        39.0: "Technological specialization course",
        40.0: "Higher education - degree (1st cycle)",
        42.0: "Professional higher technical course",
        43.0: "Higher education - master (2nd cycle)"
    }),
    ('Nationality', 'cat', '', {
        1.0: 'Portuguese',
        2.0: 'German',
        6.0: 'Spanish',
        11.0: 'Italian',
        13.0: 'Dutch',
        14.0: 'English',
        17.0: 'Lithuanian',
        21.0: 'Angolan',
        22.0: 'Cape Verdean',
        24.0: 'Guinean',
        25.0: 'Mozambican',
        26.0: 'Santomean',
        32.0: 'Turkish',
        41.0: 'Brazilian',
        62.0: 'Romanian',
        100.0: 'Moldova (Republic of)',
        101.0: 'Mexican',
        103.0: 'Ukrainian',
        105.0: 'Russian',
        108.0: 'Cuban',
        109.0: 'Colombian'
    }),
    ("Mother's qualification", 'cat', '', {
        1.0: "Secondary Education - 12th Year of Schooling or Eq.",
        2.0: "Higher Education - Bachelor's Degree",
        3.0: "Higher Education - Degree",
        4.0: "Higher Education - Master's",
        5.0: "Higher Education - Doctorate",
        6.0: "Frequency of Higher Education",
        9.0: "12th Year of Schooling - Not Completed",
        10.0: "11th Year of Schooling - Not Completed",
        11.0: "7th Year (Old)",
        12.0: "Other - 11th Year of Schooling",
        14.0: "10th Year of Schooling",
        18.0: "General commerce course",
        19.0: "Basic Education 3rd Cycle (9th/10th/11th Year) or Equiv.",
        22.0: "Technical-professional course",
        26.0: "7th year of schooling",
        27.0: "2nd cycle of the general high school course",
        29.0: "9th Year of Schooling - Not Completed",
        30.0: "8th year of schooling",
        34.0: "Unknown",
        35.0: "Can't read or write",
        36.0: "Can read without having a 4th year of schooling",
        37.0: "Basic education 1st cycle (4th/5th year) or equiv.",
        38.0: "Basic Education 2nd Cycle (6th/7th/8th Year) or Equiv.",
        39.0: "Technological specialization course",
        40.0: "Higher education - degree (1st cycle)",
        41.0: "Specialized higher studies course",
        42.0: "Professional higher technical course",
        43.0: "Higher Education - Master (2nd cycle)",
        44.0: "Higher Education - Doctorate (3rd cycle)"
    }),
    ("Father's qualification", 'cat', '', {
        1.0: "Secondary Education - 12th Year of Schooling or Eq.",
        2.0: "Higher Education - Bachelor's Degree",
        3.0: "Higher Education - Degree",
        4.0: "Higher Education - Master's",
        5.0: "Higher Education - Doctorate",
        6.0: "Frequency of Higher Education",
        9.0: "12th Year of Schooling - Not Completed",
        10.0: "11th Year of Schooling - Not Completed",
        11.0: "7th Year (Old)",
        12.0: "Other - 11th Year of Schooling",
        13.0: "2nd year complementary high school course",
        14.0: "10th Year of Schooling",
        18.0: "General commerce course",
        19.0: "Basic Education 3rd Cycle (9th/10th/11th Year) or Equiv.",
        20.0: "Complementary High School Course",
        22.0: "Technical-professional course",
        25.0: "Complementary High School Course - not concluded",
        26.0: "7th year of schooling",
        27.0: "2nd cycle of the general high school course",
        29.0: "9th Year of Schooling - Not Completed",
        30.0: "8th year of schooling",
        31.0: "General Course of Administration and Commerce",
        33.0: "Supplementary Accounting and Administration",
        34.0: "Unknown",
        35.0: "Can't read or write",
        36.0: "Can read without having a 4th year of schooling",
        37.0: "Basic education 1st cycle (4th/5th year) or equiv.",
        38.0: "Basic Education 2nd Cycle (6th/7th/8th Year) or Equiv.",
        39.0: "Technological specialization course",
        40.0: "Higher education - degree (1st cycle)",
        41.0: "Specialized higher studies course",
        42.0: "Professional higher technical course",
        43.0: "Higher Education - Master (2nd cycle)",
        44.0: "Higher Education - Doctorate (3rd cycle)"
    }),
    ("Mother's occupation", 'cat', '', {
        0.0: "Student",
        1.0: "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
        2.0: "Specialists in Intellectual and Scientific Activities",
        3.0: "Intermediate Level Technicians and Professions",
        4.0: "Administrative staff",
        5.0: "Personal Services, Security and Safety Workers and Sellers",
        6.0: "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
        7.0: "Skilled Workers in Industry, Construction and Craftsmen",
        8.0: "Installation and Machine Operators and Assembly Workers",
        9.0: "Unskilled Workers",
        10.0: "Armed Forces Professions",
        90.0: "Other Situation",
        99.0: "(blank)",
        122.0: "Health professionals",
        123.0: "teachers",
        125.0: "Specialists in information and communication technologies (ICT)",
        131.0: "Intermediate level science and engineering technicians and professions",
        132.0: "Technicians and professionals, of intermediate level of health",
        134.0: "Intermediate level technicians from legal, social, sports, cultural and similar services",
        141.0: "Office workers, secretaries in general and data processing operators",
        143.0: "Data, accounting, statistical, financial services and registry-related operators",
        144.0: "Other administrative support staff",
        151.0: "personal service workers",
        152.0: "sellers",
        153.0: "Personal care workers and the like",
        171.0: "Skilled construction workers and the like, except electricians",
        173.0: "Skilled workers in printing, precision instrument manufacturing, jewelers, artisans and the like",
        175.0: "Workers in food processing, woodworking, clothing and other industries and crafts",
        191.0: "cleaning workers",
        192.0: "Unskilled workers in agriculture, animal production, fisheries and forestry",
        193.0: "Unskilled workers in extractive industry, construction, manufacturing and transport",
        194.0: "Meal preparation assistants",
    }),
    ("Father's occupation", 'cat', '', {
        0.0: "Student",
        1.0: "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
        2.0: "Specialists in Intellectual and Scientific Activities",
        3.0: "Intermediate Level Technicians and Professions",
        4.0: "Administrative staff",
        5.0: "Personal Services, Security and Safety Workers and Sellers",
        6.0: "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
        7.0: "Skilled Workers in Industry, Construction and Craftsmen",
        8.0: "Installation and Machine Operators and Assembly Workers",
        9.0: "Unskilled Workers",
        10.0: "Armed Forces Professions",
        90.0: "Other Situation",
        99.0: "(blank)",
        101.0: "Armed Forces Officers",
        102.0: "Armed Forces Sergeants",
        103.0: "Other Armed Forces personnel",
        112.0: "Directors of administrative and commercial services",
        114.0: "Hotel, catering, trade and other services directors",
        121.0: "Specialists in the physical sciences, mathematics, engineering and related techniques",
        122.0: "Health professionals",
        123.0: "teachers",
        124.0: "Specialists in finance, accounting, administrative organization, public and commercial relations",
        131.0: "Intermediate level science and engineering technicians and professions",
        132.0: "Technicians and professionals, of intermediate level of health",
        134.0: "Intermediate level technicians from legal, social, sports, cultural and similar services",
        135.0: "Information and communication technology technicians",
        141.0: "Office workers, secretaries in general and data processing operators",
        143.0: "Data, accounting, statistical, financial services and registry-related operators",
        144.0: "Other administrative support staff",
        151.0: "personal service workers",
        152.0: "sellers",
        153.0: "Personal care workers and the like",
        154.0: "Protection and security services personnel",
        161.0: "Market-oriented farmers and skilled agricultural and animal production workers",
        163.0: "Farmers, livestock keepers, fishermen, hunters and gatherers, subsistence",
        171.0: "Skilled construction workers and the like, except electricians",
        172.0: "Skilled workers in metallurgy, metalworking and similar",
        174.0: "Skilled workers in electricity and electronics",
        175.0: "Workers in food processing, woodworking, clothing and other industries and crafts",
        181.0: "Fixed plant and machine operators",
        182.0: "assembly workers",
        183.0: "Vehicle drivers and mobile equipment operators",
        192.0: "Unskilled workers in agriculture, animal production, fisheries and forestry",
        193.0: "Unskilled workers in extractive industry, construction, manufacturing and transport",
        194.0: "Meal preparation assistants",
        195.0: "Street vendors (except food) and street service providers"
    }),
    ('Displaced', 'cat', '', {1.0: "yes", 0.0: "no"}),
    ('Educational special needs', 'cat', '', {1.0: "yes", 0.0: "no"}),
    ('Debtor', 'cat', '', {1.0: "yes", 0.0: "no"}),
    ('Tuition fees up to date', 'cat', '', {1.0: "yes", 0.0: "no"}),
    # Gender is not a yes/no flag: the UCI data dictionary codes it as 1 = male, 0 = female.
    ('Gender', 'cat', '', {1.0: "male", 0.0: "female"}),
    ('Scholarship holder', 'cat', '', {1.0: "yes", 0.0: "no"}),
    # Converted to numerical, so no category mapping applies.
    ('Age at enrollment', 'num', '', None),
    ('International', 'cat', '', {1.0: "yes", 0.0: "no"}),
    ('Curricular units 1st sem (credited)', 'num', 'Number of curricular units credited in the 1st semester', None),
    ('Curricular units 1st sem (enrolled)', 'num', 'Number of curricular units enrolled in the 1st semester', None),
    ('Curricular units 1st sem (evaluations)', 'num', 'Number of evaluations to curricular units in the 1st semester', None),
    ('Curricular units 1st sem (approved)', 'num', 'Number of curricular units approved in the 1st semester', None),
    ('Curricular units 1st sem (without evaluations)', 'num', 'Number of curricular units without evalutions in the 1st semester', None),
    ('Curricular units 2nd sem (credited)', 'num', 'Number of curricular units credited in the 2nd semester', None),
    ('Curricular units 2nd sem (enrolled)', 'num', 'Number of curricular units enrolled in the 2nd semester', None),
    ('Curricular units 2nd sem (evaluations)', 'num', 'Number of evaluations to curricular units in the 2nd semester', None),
    ('Curricular units 2nd sem (approved)', 'num', 'Number of curricular units approved in the 2nd semester', None),
    ('Curricular units 2nd sem (without evaluations)', 'num', 'Number of curricular units without evalutions in the 2nd semester', None),
]

In [None]:
# Template for adding a dataset: copy this cell and fill in the placeholders.
dataset_name = 'Dataset name'
missing_strs = []

# One (column name, target type, description) tuple per originally-numerical column.
num_col_conversion = []

# One (column name, target type, description, category mapping or None) tuple
# per originally-categorical column.
cat_col_conversion = []

- Preprocessing each dataset: run the configuration cell of the dataset you want to process, then run the cells below.

In [58]:
# Load the old dataset (from ~_backup)
# Reads `dataset_name`, which is set by the per-dataset configuration cells.
# Defines: old_dataset_dir, new_dataset_dir, info, N, C, y, X_num, X_cat.
old_dataset_dir = f"data/talent/{dataset_name}_backup"
new_dataset_dir = f"data/talent/{dataset_name}"

info = load_json(os.path.join(old_dataset_dir, 'info.json'))
display(info)

# Per-split arrays keyed by 'train' / 'val' / 'test'.
# N (numerical) and C (categorical) are None when info.json reports zero features of that kind.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only use it on trusted files.
N, C, y = {}, {}, {}
for split in ['train', 'val', 'test']:
    N[split] = np.load(os.path.join(old_dataset_dir, f"N_{split}.npy"), allow_pickle=True) if info['n_num_features'] else None
    C[split] = np.load(os.path.join(old_dataset_dir, f"C_{split}.npy"), allow_pickle=True) if info['n_cat_features'] else None
    y[split] = np.load(os.path.join(old_dataset_dir, f"y_{split}.npy"), allow_pickle=True)

# All splits stacked row-wise (train, val, test order).
# np.concatenate is used instead of np.concat because the `concat` alias only exists in NumPy >= 2.0.
X_num = np.concatenate(list(N.values()), axis=0) if N['train'] is not None else None
X_cat = np.concatenate(list(C.values()), axis=0) if C['train'] is not None else None

# Shape / dtype summary of every split.
print(f"N: {[(v.shape, v.dtype) if v is not None else None for k, v in N.items()]}")
print(f"C: {[(v.shape, v.dtype) if v is not None else None for k, v in C.items()]}")
print(f"y: {[(v.shape, v.dtype) for k, v in y.items()]}")

{'task_type': 'multiclass',
 'num_classes': 3,
 'n_num_features': 5,
 'n_cat_features': 29,
 'train_size': 2831,
 'val_size': 708,
 'test_size': 885,
 'source': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success'}

N: [((2831, 5), dtype('float64')), ((708, 5), dtype('float64')), ((885, 5), dtype('float64'))]
C: [((2831, 29), dtype('float64')), ((708, 29), dtype('float64')), ((885, 29), dtype('float64'))]
y: [((2831,), dtype('int64')), ((708,), dtype('int64')), ((885,), dtype('int64'))]


In [59]:
# Re-arrange numerical (both continuous and ordinal) and categorical columns
# For categorical columns, we enumerate the set of categories
# 1. cat -> num: convert *missing strs* into nan
# 2. num -> cat: not implemented yet (pending development)
#
# Conversion entries are positional: entry i of num_col_conversion / cat_col_conversion
# describes column i of N[split] / C[split].
# Column names are stored verbatim (no format_name normalization is applied here).
new_num_col_name_desc = []           # (name, description) per output numerical column
new_cat_col_name_desc = []           # (name, description) per output categorical column
new_cat_col_onehot_name_desc = []    # (name, description) per one-hot output column
new_N = {'train': [], 'val': [], 'test': []}
new_C = {'train': [], 'val': [], 'test': []}
new_C_onehot = {'train': [], 'val': [], 'test': []}
new_categories = []                  # number of categories of each output categorical column

# --- Originally-numerical columns --------------------------------------------------------------
for i, conv in enumerate(num_col_conversion):
    col_name, col_type = conv[0], conv[1]
    if col_type == 'num':
        col_desc = conv[2]
        new_num_col_name_desc.append((col_name, col_desc))
        for split in ['train', 'val', 'test']:
            # i:i+1 keeps the column 2-D so the pieces can be concatenated along axis 1 below.
            new_N[split].append(N[split][:, i:i+1])
    else: # 'cat'
        assert col_type == 'cat'
        raise NotImplementedError(
            f"num -> cat conversion is not implemented yet (column {col_name!r})"
        )

# --- Originally-categorical columns ------------------------------------------------------------
# Element-wise membership test against the configured missing-value strings.
is_missing = np.vectorize(lambda x: x in missing_strs)
for i, conv in enumerate(cat_col_conversion):
    col_name, col_type = conv[0], conv[1]
    col_desc, cat_conv = conv[2], conv[3]
    if col_type == 'num':
        # cat -> num: missing-value strings become NaN, everything else is cast to float64.
        new_num_col_name_desc.append((col_name, col_desc))
        for split in ['train', 'val', 'test']:
            col = C[split][:, i:i+1]
            col = np.where(is_missing(col), np.nan, col)
            new_N[split].append(col.astype('float64'))
    else:
        assert col_type == 'cat'
        if isinstance(cat_conv, dict):
            # Shallow copy so the configuration dict itself is not handed to the helpers below.
            cat_conv = dict(cat_conv)
        for split in ['train', 'val', 'test']:
            col = C[split][:, i]
            if split == 'train':
                # The category set is derived from the training split only and then reused for
                # val / test (the loop visits 'train' first).
                # NOTE(review): missing_strs=[] is passed here, so the configured missing_strs
                # only affect cat -> num columns; confirm this is intended.
                categories, mode_val = get_categories(col, missing_strs=[])
                num_category = len(categories)
                new_categories.append(num_category)
                print(col_name, categories)
                if isinstance(cat_conv, dict):
                    # Number of configured category names vs. categories found in train.
                    print(col_name, ":", len(cat_conv), "->", len(categories))
            # Presumably maps each value to its index in `categories`, falling back to the mode
            # for unseen values; verify against index_cat_ord.
            new_col_ord = index_cat_ord(col, categories, mode_val, missing_strs=[])
            # One-hot encode: one row per sample, a single 1 at the category index.
            new_col = np.zeros((len(col), num_category), dtype='int64')
            new_col[range(len(col)), new_col_ord] = 1
            # Sanity check: no one-hot column is active for every row of the split.
            assert np.all(new_col.sum(axis=0) < len(col))
            if isinstance(cat_conv, dict):
                # Replace raw codes by their configured category names.
                new_C[split].append(np.array([cat_conv[v] for v in col]))
            else:
                new_C[split].append(col)
            new_C_onehot[split].append(new_col)
        new_col_names = expand_col_name(col_name, categories, cat_conv)
        new_col_descs = [col_desc] * num_category
        new_cat_col_name_desc.append((col_name, col_desc))
        new_cat_col_onehot_name_desc.extend(list(zip(new_col_names, new_col_descs)))

# --- Assemble per-split matrices (None when a split has no column of that kind) -----------------
# np.concatenate is used instead of np.concat because the `concat` alias only exists in NumPy >= 2.0.
for split in ['train', 'val', 'test']:
    if new_N[split]:
        new_N[split] = np.concatenate(new_N[split], axis=1)
        assert new_N[split].shape[1] == len(new_num_col_name_desc)
    else:
        new_N[split] = None
    if new_C[split]:
        new_C[split] = np.stack(new_C[split], axis=1)
        assert new_C[split].shape[1] == len(new_cat_col_name_desc)
        new_C_onehot[split] = np.concatenate(new_C_onehot[split], axis=1)
        assert new_C_onehot[split].shape[1] == len(new_cat_col_onehot_name_desc)
    else:
        new_C[split] = None
        new_C_onehot[split] = None

Marital Status [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Marital Status : 6 -> 6
Application mode [1.0, 2.0, 5.0, 7.0, 10.0, 15.0, 16.0, 17.0, 18.0, 39.0, 42.0, 43.0, 44.0, 51.0, 53.0, 57.0]
Application mode : 18 -> 16
27.0 not found -> 1.0
26.0 not found -> 1.0
Course [33.0, 171.0, 8014.0, 9003.0, 9070.0, 9085.0, 9119.0, 9130.0, 9147.0, 9238.0, 9254.0, 9500.0, 9556.0, 9670.0, 9773.0, 9853.0, 9991.0]
Course : 17 -> 17
Daytime/evening attendance [0.0, 1.0]
Daytime/evening attendance : 2 -> 2
Previous qualification [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0, 10.0, 12.0, 14.0, 15.0, 19.0, 38.0, 39.0, 40.0, 42.0, 43.0]
Previous qualification : 17 -> 17
Nationality [1.0, 2.0, 6.0, 11.0, 13.0, 14.0, 17.0, 21.0, 22.0, 24.0, 25.0, 26.0, 41.0, 62.0, 100.0, 101.0, 103.0, 105.0, 108.0]
Nationality : 21 -> 19
32.0 not found -> 1.0
109.0 not found -> 1.0
Mother's qualification [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0, 10.0, 11.0, 12.0, 14.0, 19.0, 22.0, 29.0, 30.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0

In [60]:
# Save 1 - same format as the original TALENT datasets
# Files written into new_dataset_dir: N_/C_/y_{split}.npy, info.json, col_desc.csv.
# NOTE(review): a split whose N/C array is None is skipped, so an N_/C_ file
# already present in the directory is left in place - confirm stale files from
# an earlier run cannot be picked up by downstream loaders.
print(new_dataset_dir)
os.makedirs(new_dataset_dir, exist_ok=True)
for split in ['train', 'val', 'test']:
    # Feature matrices are optional; the target vector is always written.
    optional_files = [
        (f"N_{split}.npy", new_N[split]),
        (f"C_{split}.npy", new_C[split]),
    ]
    for npy_name, npy_array in optional_files:
        if npy_array is not None:
            np.save(os.path.join(new_dataset_dir, npy_name), npy_array, allow_pickle=True)
    np.save(os.path.join(new_dataset_dir, f"y_{split}.npy"), y[split], allow_pickle=True)

# Start from the original metadata and overwrite only the feature bookkeeping.
new_info = deepcopy(info)
new_info.update(
    n_num_features=len(new_num_col_name_desc),
    n_cat_features=len(new_cat_col_name_desc),
    categories=new_categories,
)
display(new_info)
with open(os.path.join(new_dataset_dir, 'info.json'), 'w') as fd:
    json.dump(new_info, fd)

# Col descs: numerical columns first, then categorical ones.
new_col_name_descs = [*new_num_col_name_desc, *new_cat_col_name_desc]
new_col_names, new_col_descs = zip(*new_col_name_descs)
df_new_col_desc = pd.DataFrame(dict(name=new_col_names, desc=new_col_descs))
df_new_col_desc.to_csv(os.path.join(new_dataset_dir, 'col_desc.csv'), index=False)

# One description sentence is expected per saved feature column.
new_col_sents = get_full_desc_sentences(new_col_name_descs)
assert new_info['n_num_features'] + new_info['n_cat_features'] == len(new_col_sents)
display(new_col_sents)

data/talent/predict_students_dropout_and_academic_success


{'task_type': 'multiclass',
 'num_classes': 3,
 'n_num_features': 17,
 'n_cat_features': 17,
 'train_size': 2831,
 'val_size': 708,
 'test_size': 885,
 'source': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success',
 'categories': [6, 16, 17, 2, 17, 19, 25, 29, 31, 42, 2, 2, 2, 2, 2, 2, 2]}

['Previous qualification (grade) : Grade of previous qualification (between 0 and 200)',
 'Admission grade : Admission grade (between 0 and 200)',
 'Unemployment rate : Unemployment rate (%)',
 'Inflation rate : Inflation rate (%)',
 'GDP',
 'Application order',
 'Age at enrollment',
 'Curricular units 1st sem (credited) : Number of curricular units credited in the 1st semester',
 'Curricular units 1st sem (enrolled) : Number of curricular units enrolled in the 1st semester',
 'Curricular units 1st sem (evaluations) : Number of evaluations to curricular units in the 1st semester',
 'Curricular units 1st sem (approved) : Number of curricular units approved in the 1st semester',
 'Curricular units 1st sem (without evaluations) : Number of curricular units without evalutions in the 1st semester',
 'Curricular units 2nd sem (credited) : Number of curricular units credited in the 2nd semester',
 'Curricular units 2nd sem (enrolled) : Number of curricular units enrolled in the 2nd semester',

In [61]:
# Save 2 - one-hot format (only if there are categorical features)
# Writes the same kinds of files as Save 1 into an `onehot` subdirectory of
# new_dataset_dir; here C_{split}.npy holds the one-hot encoded columns.
if len(new_cat_col_onehot_name_desc) == 0:
    print('No categorical columns. skipped.')
else:
    onehot_dataset_dir = os.path.join(new_dataset_dir, 'onehot')
    print(onehot_dataset_dir)
    os.makedirs(onehot_dataset_dir, exist_ok=True)
    for split in ['train', 'val', 'test']:
        if new_N[split] is not None:
            np.save(os.path.join(onehot_dataset_dir, f"N_{split}.npy"), new_N[split], allow_pickle=True)
        # Guard on the array that is actually written. The previous check looked
        # at new_C[split], which only worked because both are set to None together.
        if new_C_onehot[split] is not None:
            np.save(os.path.join(onehot_dataset_dir, f"C_{split}.npy"), new_C_onehot[split], allow_pickle=True)
        np.save(os.path.join(onehot_dataset_dir, f"y_{split}.npy"), y[split], allow_pickle=True)

    # n_cat_features counts one-hot columns, while 'categories' is the same
    # per-original-column list that Save 1 writes.
    # NOTE(review): presumably consumers read 'categories' as the width of each
    # one-hot group - confirm against the loader.
    new_info = deepcopy(info)
    new_info['n_num_features'] = len(new_num_col_name_desc)
    new_info['n_cat_features'] = len(new_cat_col_onehot_name_desc)
    new_info['categories'] = new_categories
    display(new_info)
    with open(os.path.join(onehot_dataset_dir, 'info.json'), 'w') as fd:
        json.dump(new_info, fd)

    # Col desc: numerical columns first, then one row per one-hot column.
    new_col_name_descs = new_num_col_name_desc + new_cat_col_onehot_name_desc
    new_col_names, new_col_descs = zip(*new_col_name_descs)
    df_new_col_desc = pd.DataFrame({
        'name': new_col_names,
        'desc': new_col_descs,
    })
    df_new_col_desc.to_csv(os.path.join(onehot_dataset_dir, 'col_desc.csv'), index=False)

    # One description sentence is expected per saved feature column.
    new_col_sents = get_full_desc_sentences(new_col_name_descs)
    assert len(new_col_sents) == new_info['n_num_features'] + new_info['n_cat_features']
    display(new_col_sents)

data/talent/predict_students_dropout_and_academic_success/onehot


{'task_type': 'multiclass',
 'num_classes': 3,
 'n_num_features': 17,
 'n_cat_features': 218,
 'train_size': 2831,
 'val_size': 708,
 'test_size': 885,
 'source': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success',
 'categories': [6, 16, 17, 2, 17, 19, 25, 29, 31, 42, 2, 2, 2, 2, 2, 2, 2]}

['Previous qualification (grade) : Grade of previous qualification (between 0 and 200)',
 'Admission grade : Admission grade (between 0 and 200)',
 'Unemployment rate : Unemployment rate (%)',
 'Inflation rate : Inflation rate (%)',
 'GDP',
 'Application order',
 'Age at enrollment',
 'Curricular units 1st sem (credited) : Number of curricular units credited in the 1st semester',
 'Curricular units 1st sem (enrolled) : Number of curricular units enrolled in the 1st semester',
 'Curricular units 1st sem (evaluations) : Number of evaluations to curricular units in the 1st semester',
 'Curricular units 1st sem (approved) : Number of curricular units approved in the 1st semester',
 'Curricular units 1st sem (without evaluations) : Number of curricular units without evalutions in the 1st semester',
 'Curricular units 2nd sem (credited) : Number of curricular units credited in the 2nd semester',
 'Curricular units 2nd sem (enrolled) : Number of curricular units enrolled in the 2nd semester',