In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install openml --quiet
!pip install researchpy --quiet
!pip install ucimlrepo --quiet

from ucimlrepo import fetch_ucirepo,list_available_datasets

import researchpy as rp
import scipy.stats as stats
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import os

import openml
import pandas as pd
from openml.datasets import edit_dataset, fork_dataset, get_dataset

import pickle

In [None]:
def check_if_datetime(x):
    """Tell whether ``pd.to_datetime`` accepts ``x``.

    Returns True when the conversion completes, and False when it raises one
    of the handled exception types. Any other exception propagates.
    """
    handled_errors = (RuntimeError, TypeError, NameError, IOError, ValueError)
    try:
        pd.to_datetime(x)
        return True
    except handled_errors:
        return False

def check_if_numeric(x):
    """Tell whether ``pd.to_numeric`` accepts ``x``.

    Returns True when the conversion completes, and False when it raises one
    of the handled exception types. Any other exception propagates.
    """
    handled_errors = (RuntimeError, TypeError, NameError, IOError, ValueError)
    try:
        pd.to_numeric(x)
        return True
    except handled_errors:
        return False

def identify_variable_type(x):
    """Classify one column (a pandas Series) into a schema type tag.

    The checks run in this order and the first match wins:

    * ``"NA"``            -- the column has no non-null values.
    * ``"u_manual"``      -- the number of distinct non-null values exceeds
      99% of the row count (nulls included in the row count).
    * ``"c_binary"``      -- exactly two distinct non-null values.
    * ``"c_categorical"`` -- ``check_if_numeric`` rejects the column.
    * ``"c_integer"``     -- numeric with fewer than 10 distinct values when
      there are under 3000 non-null rows, or fewer than 30 distinct values
      when there are 3000 or more.
    * ``"c_string"``      -- no stringified value contains a digit.
    * ``"n_integer"`` / ``"n_float"`` -- numeric; integer when every non-null
      value is a whole number after numeric coercion.

    Parameters
    ----------
    x : pandas.Series
        The column to classify.

    Returns
    -------
    str
        One of the tags listed above.
    """
    non_null = x.dropna()

    # Is the column empty? If so, it is classified as 'NA'.
    if non_null.empty:
        return "NA"

    n_unique = non_null.nunique()

    if n_unique > len(x) * 0.99:
        return "u_manual"

    # Is the variable binary? We check the number of unique values.
    if n_unique == 2:
        return "c_binary"

    is_numeric = check_if_numeric(x)
    if is_numeric:
        n_rows = non_null.shape[0]
        if (n_rows < 3000 and n_unique < 10) or (n_unique < 30 and n_rows >= 3000):
            return "c_integer"
    else:
        # Every column that cannot be coerced to numbers is categorical.
        return "c_categorical"

    # Only numeric columns get this far.
    has_digit = x.astype(str).str.contains(r"[0-9]")

    # If no digits are present, we classify it as a string.
    if not has_digit.any():
        return "c_string"
    elif is_numeric:
        # Coerce before taking the remainder: the column may hold numeric
        # strings (object dtype), and `%` on a str is string formatting,
        # which raises TypeError instead of computing a remainder.
        if (pd.to_numeric(non_null) % 1 == 0).all():
            return "n_integer"
        else:
            return "n_float"
    # NOTE(review): the branches below cannot currently be reached, because
    # non-numeric columns return "c_categorical" above and numeric columns are
    # handled by the branch just before this one. They are kept unchanged in
    # case datetime detection is re-enabled -- confirm the intended behaviour.
    elif check_if_datetime(x):
        return "n_datetime"
    elif (x.astype(str)[~has_digit].nunique() < 11 and
          check_if_datetime(x[has_digit])):
        return "n_datetime"
    # If none of the above apply, we classify the variable as string.
    else:
        return "c_string"

def remove_dtname(text,list_dt):
    """Return ``text`` with every occurrence of each string in ``list_dt`` removed.

    The markers are removed one after another, in the order given.
    """
    cleaned = text
    for marker in list_dt:
        cleaned = cleaned.replace(marker, '')
    return cleaned

def schema_detect(df_raw):
    """Infer column roles and integer-encode the categorical columns.

    Each column is tagged with ``identify_variable_type``; only the first five
    characters of the tag are used (e.g. ``"c_cat"``, ``"n_int"``, ``"u_man"``).
    Roles are derived from those tags directly rather than by searching for
    tag fragments inside ``"<column>_<tag>"`` strings, so a column whose own
    name contains ``_c_``, ``_int``, ``_bin`` or ``_man`` is no longer
    misclassified or dropped.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Input table. It is copied, never modified.

    Returns
    -------
    list
        ``[df, cat_dict, df_col_names, c_col, n_col, b_col, int_col,
        manual_col, c_col_index]`` where

        * ``df`` is the copy with ``manual_col`` dropped and every column in
          ``c_col`` replaced by its category codes (stored as ``object``),
        * ``cat_dict`` maps each categorical column to ``{code: category}``,
        * ``df_col_names`` is the original column list,
        * ``c_col`` / ``n_col`` are columns tagged ``c_*`` / ``n_*``,
        * ``b_col`` are columns tagged ``c_bin`` (also present in ``c_col``),
        * ``int_col`` are columns tagged ``c_int`` or ``n_int``,
        * ``manual_col`` are columns tagged ``u_man``,
        * ``c_col_index`` are the positions of ``c_col`` in the returned ``df``.
    """
    df = df_raw.copy()
    df_col_names = df.columns.to_list()

    type_tags = [tag[:5] for tag in df.apply(identify_variable_type)]
    tagged = list(zip(df_col_names, type_tags))

    c_col = [name for name, tag in tagged if tag.startswith('c_')]
    n_col = [name for name, tag in tagged if tag.startswith('n_')]
    b_col = [name for name, tag in tagged if tag == 'c_bin']
    int_col = [name for name, tag in tagged if tag in ('c_int', 'n_int')]
    manual_col = [name for name, tag in tagged if tag == 'u_man']

    df = df.drop(manual_col, axis=1)

    # Positions are taken after the manual columns are dropped, so they index
    # into the returned frame rather than the original one.
    c_col_index = [df.columns.get_loc(c) for c in c_col]

    print(f'There are {len(df_col_names)} columns in total, {len(c_col)} categorical, {len(n_col)} numeric, {len(b_col)} binary, {len(manual_col)} manual and {len(int_col)} int.')

    cat_dict = {}
    for col in c_col:
        as_category = df[col].astype('category')
        cat_dict[col] = dict(enumerate(as_category.cat.categories))
        # Missing values have category code -1.
        df[col] = as_category.cat.codes.astype('int')

    if c_col:
        df[c_col] = df[c_col].astype('object')

    return [df, cat_dict, df_col_names, c_col, n_col, b_col, int_col, manual_col, c_col_index]


def g_stats(data,c_col,n_col,dir,y_name=None):
    """Write descriptive statistics for a dataset as CSV files under ``dir``.

    Always written:

    * ``stats_num.csv`` -- ``rp.summarize`` of the columns in ``n_col``.
    * ``stats_cat.csv`` -- ``rp.summary_cat`` of the columns in ``c_col``.

    Written only when ``y_name`` is given and there is something to report:

    * ``y_stats_num.csv`` -- for each column in ``n_col``, the summary table
      from ``rp.ttest`` comparing the rows whose target equals the second
      unique target value (labelled "Postive") with those equal to the first
      (labelled "Negative"). Further target values are not compared.
    * ``y_stats_cat.csv`` -- for each column in ``c_col``, the crosstab against
      the target, side by side with the same table divided by its column sums.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns in ``c_col``, ``n_col`` and ``y_name``.
    c_col, n_col : list
        Categorical and numeric column names.
    dir : str
        Output directory; file names are appended as ``dir + '/name.csv'``.
    y_name : str, optional
        Target column. When None only the two unconditional files are written.
    """
    print(f'Data directory is in {dir}')

    print(f'Orginal dataset shape {data.shape}')

    rp.summarize(data[n_col]).to_csv(dir+'/stats_num.csv',index=False)
    rp.summary_cat(data[c_col]).to_csv(dir+'/stats_cat.csv',index=False)

    if y_name is None:
        return

    target = data[y_name]
    classes = target.unique()

    if len(classes) < 2:
        # Two groups are needed for a t-test; indexing classes[1] would fail.
        print(f'Skipping t-tests: {y_name} has fewer than two distinct values')
    elif n_col:
        summaries = {}
        for col in n_col:
            summary, _ = rp.ttest(group1= data[col][target == classes[1]], group1_name= "Postive",
                                  group2= data[col][target == classes[0]], group2_name= "Negative")
            summaries[col] = summary
        pd.concat(summaries).to_csv(dir+'/y_stats_num.csv')

    # pd.concat raises ValueError on an empty mapping, so only write the
    # categorical report when there is at least one categorical column.
    if c_col:
        tables = {}
        for col in c_col:
            counts = pd.crosstab(data[col], target)
            shares = counts.apply(lambda r: r/r.sum(), axis=0)
            tables[col] = pd.concat([counts, shares], axis=1)
        pd.concat(tables).to_csv(dir+'/y_stats_cat.csv')

In [None]:

def data_setup(data_id,data_name,DATA,repo='openml',y_name='label'):
    """Download a dataset, detect its schema and persist everything to disk.

    Files written under ``DATA + data_name`` (the folder is created if needed):

    * ``original.pickle`` -- the raw features with the target in a ``label`` column.
    * ``df.pickle``       -- the encoded frame returned by ``schema_detect``.
    * ``metadata.pickle`` -- seven objects pickled back to back, in this order:
      ``c_col``, ``n_col``, ``b_col``, ``c_col_x`` (``c_col`` without
      ``'label'``), ``cat_dict``, ``c_col_index``, ``int_col``.
    * the CSV reports produced by ``g_stats``.

    Parameters
    ----------
    data_id : int
        Dataset identifier in the chosen repository.
    data_name : str
        Folder name appended to ``DATA``.
    DATA : str
        Root output path; it is concatenated with ``data_name`` as-is, so it
        must end with a path separator.
    repo : {'openml', 'uci'}
        Source repository.
    y_name : str
        Target column name forwarded to ``g_stats``. The target is always
        stored under ``'label'``, so any other value must already be a column
        of the downloaded features.

    Raises
    ------
    ValueError
        If ``repo`` is neither ``'openml'`` nor ``'uci'``.
    """
    if repo not in ('openml', 'uci'):
        raise ValueError(f"Unsupported repo {repo!r}; expected 'openml' or 'uci'.")

    data_folder = DATA + data_name
    os.makedirs(data_folder, exist_ok=True)

    if repo == 'openml':
        full_data = fetch_openml(data_id=data_id, as_frame=True, parser="pandas")
        X = full_data.data
        y = full_data.target
    else:
        # UCI exposes features and targets as separate pandas objects.
        full_data = fetch_ucirepo(id=data_id)
        X = full_data.data.features
        y = full_data.data.targets

    df_raw = X.copy()
    df_raw['label'] = y

    with open(data_folder + "/original.pickle", "wb") as pickle_out:
        pickle.dump(df_raw, pickle_out)

    (df, cat_dict, df_col_names, c_col, n_col, b_col,
     int_col, manual_col, c_col_index) = schema_detect(df_raw)

    g_stats(df_raw, c_col, n_col, data_folder, y_name=y_name)

    # Categorical feature columns only: drop the target if it was tagged categorical.
    c_col_x = c_col.copy()
    if 'label' in c_col_x:
        c_col_x.remove('label')

    with open(data_folder + "/df.pickle", "wb") as pickle_out:
        pickle.dump(df, pickle_out)

    # Readers must call pickle.load in this same order.
    with open(data_folder + "/metadata.pickle", "wb") as pickle_out:
        pickle.dump(c_col, pickle_out)
        pickle.dump(n_col, pickle_out)
        pickle.dump(b_col, pickle_out)
        pickle.dump(c_col_x, pickle_out)
        pickle.dump(cat_dict, pickle_out)
        pickle.dump(c_col_index, pickle_out)
        pickle.dump(int_col, pickle_out)


def cv_setup(data_name,DATA,cv=5):
    """Create ``cv`` seeded 80/20 train/test splits of the encoded dataset.

    Reads ``DATA + data_name + '/df.pickle'`` and, for each seed ``i`` in
    ``range(cv)``, writes ``synthetic/seed<i>/baseline.csv`` (train) and
    ``synthetic/seed<i>/df_test.csv`` (test) using ``random_state=i``. The
    split is stratified on ``label`` when it has fewer than 10 distinct values.

    A seed is skipped when both of its files already exist, so re-running
    leaves finished splits alone but completes an interrupted run or adds
    seeds after ``cv`` is increased.

    Parameters
    ----------
    data_name : str
        Dataset folder name appended to ``DATA``.
    DATA : str
        Root path; concatenated with ``data_name`` as-is.
    cv : int
        Number of seeded splits.
    """
    data_folder = DATA + data_name

    # NOTE: pickle.load can execute arbitrary code; only point this at files
    # produced by this notebook.
    with open(data_folder + '/df.pickle', "rb") as pickle_in:
        df = pickle.load(pickle_in)
    X = df.drop(['label'], axis=1)
    y = df['label']

    synthetic_root = data_folder + '/synthetic'
    pending = []
    for i in range(cv):
        seed_dir = synthetic_root + '/seed' + str(i)
        baseline_path = seed_dir + '/baseline.csv'
        test_path = seed_dir + '/df_test.csv'
        if not (os.path.exists(baseline_path) and os.path.exists(test_path)):
            pending.append((i, seed_dir, baseline_path, test_path))

    if not pending:
        return

    # Stratify only for low-cardinality targets; stratify=None is the
    # train_test_split default, i.e. a plain shuffled split.
    stratify = y if len(np.unique(y)) < 10 else None

    for i, seed_dir, baseline_path, test_path in pending:
        os.makedirs(seed_dir, exist_ok=True)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=stratify, test_size=0.2, random_state=i)

        baseline = pd.concat([X_train, y_train], axis=1)
        df_test = pd.concat([X_test, y_test], axis=1)
        baseline.to_csv(baseline_path, index=False)
        df_test.to_csv(test_path, index=False)

In [None]:
def get_Xy(id):
    """Fetch an OpenML dataset by ID and split it on its default target.

    Prints the dataset name and default target attribute, then returns the
    four values produced by ``dataset.get_data`` in dataframe format:
    ``(X, y, categorical_indicator, attribute_names)``.
    """
    ds = openml.datasets.get_dataset(id)
    target_name = ds.default_target_attribute

    # Report which dataset and target were resolved.
    print(f"This is dataset '{ds.name}', the target feature is '{target_name}'")

    features, target, is_categorical, names = ds.get_data(
        target=target_name, dataset_format="dataframe")
    return features, target, is_categorical, names

In [None]:
# Root output folder on the mounted Drive. data_setup and cv_setup build paths as
# DATA + data_name, so the trailing slash is required.
DATA='/content/drive/MyDrive/'

In [None]:
# OpenML datasets as (dataset id, output folder name) pairs.
openml_jobs = [
    (45059, 'sick'),
    (45057, 'jasmine'),
    (43892, 'national-longitudinal-survey-binary'),
    (1114, 'KDDCup09_upselling'),
    (43976, 'eye_movements'),
    (42477, 'default-of-credit-card'),
    (44226, 'NewspaperChurn'),
    (44053, 'compass'),
    (43889, 'law-school-admission-bianry'),
    (43903, 'diabetes'),
]
data_ids = [job[0] for job in openml_jobs]
data_names = [job[1] for job in openml_jobs]

for data_id, data_name in openml_jobs:
    data_setup(data_id, data_name, DATA)
    cv_setup(data_name, DATA, cv=5)

# UCI datasets, fetched through ucimlrepo.
uci_jobs = [
    (144, 'credit-g'),
    (2, 'adult'),
]
data_ids = [job[0] for job in uci_jobs]
data_names = [job[1] for job in uci_jobs]

for data_id, data_name in uci_jobs:
    data_setup(data_id, data_name, DATA, repo='uci')
    cv_setup(data_name, DATA, cv=5)