In [2]:
# Load libraries
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Turn interactive plotting off: figures are not drawn automatically,
# a plot is shown only when plt.show() is called explicitly.
plt.ioff()

# Directory the raw dataset files are read from.
INPUT = "datasets"
# Directory all generated plots and tables are written to.
OUTPUT = "part2_results"

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT, exist_ok=True)

# Suppress the DeprecationWarning NumPy emits for the `np.bool` alias.
# This only hides the warning; it does not keep code that uses `np.bool`
# working on NumPy versions where the alias no longer exists.
from warnings import filterwarnings
filterwarnings(action='ignore', category=DeprecationWarning, message='`np.bool` is a deprecated alias')

# Import datasets

In [3]:
# preprocessing method
def cat_2_num(df: pd.DataFrame) -> pd.DataFrame:
    """Label-encode every object-dtype column as integer category codes.

    Each object column is converted to the pandas ``category`` dtype and then
    replaced by its ``cat.codes``. Missing values (NaN) receive the code -1.
    Columns of any other dtype are passed through unchanged.

    Args:
        df: Frame to encode. It is not modified; the encoding is applied to a copy.

    Returns:
        A new DataFrame with the same columns and index as ``df`` in which the
        object columns hold integer codes.
    """
    # Work on a copy so the caller's frame (possibly a filtered result of
    # another frame) is never mutated behind its back.
    encoded = df.copy()
    for column in encoded.select_dtypes(['object']).columns:
        encoded[column] = encoded[column].astype('category').cat.codes
    return encoded

In [4]:
def get_statistical_overview(X: pd.DataFrame) -> dict:
    """Summarise size, dtype mix and missing-value ratio of a frame.

    Cells equal to the string ``'?'`` are counted as missing, in addition to NaN.

    Args:
        X: Frame to describe. It is not modified.

    Returns:
        A dict of single-element lists (so it can be passed straight to
        ``pd.DataFrame``) with the keys:
        'Number of Instances'   -- row count,
        'Number of Attributes'  -- column count,
        'Attribute Information' -- "<dtype>:<count> attributes" per dtype, joined by "; ",
        'Missing Value'         -- share of missing cells formatted as a percentage.
    """
    X = X.replace('?', np.nan)

    dtype_counts = X.dtypes.value_counts()
    attribute_info = '; '.join(
        f"{dtype}:{count} attributes" for dtype, count in dtype_counts.items()
    )

    total_cells = X.shape[0] * X.shape[1]
    # A frame without cells has nothing missing; this also avoids dividing by zero.
    missing_ratio = (1 - X.count().sum() / total_cells) if total_cells else 0.0

    return {
        'Number of Instances': [len(X)],
        'Number of Attributes': [len(X.columns)],
        'Attribute Information': [attribute_info],
        'Missing Value': [f"{missing_ratio:.2%}"]
    }

In [23]:
# Raw German credit file: whitespace-separated, read without a header row.
# sep=r"\s+" replaces the deprecated delim_whitespace=True.
X = pd.read_table(os.path.join(INPUT, "german.data-numeric"), sep=r"\s+", header=None)

## UCI datasets

In [349]:
def German():
    """Load the German credit dataset from ``german.data-numeric``.

    The file is whitespace-separated and read without a header row (no header
    could be found in the dataset docs), so columns carry integer labels.
    Column 24 holds the label.

    Returns:
        Tuple ``(X, y, stats)``: the features (all columns except 24), the
        label as bool (raw value 1 -> False, 2 -> True) and a one-row
        DataFrame with the statistical overview of the features.
    """
    # sep=r"\s+" replaces the deprecated delim_whitespace=True.
    df = pd.read_table(os.path.join(INPUT, "german.data-numeric"), sep=r"\s+", header=None)
    features = df.drop(columns=[24])
    stats = get_statistical_overview(features)
    stats['Abbreviation'] = ['German']
    stats['Handling NA'] = ["no na values"]
    labels = (df[24] - 1).astype(bool)    # change label from 1,2 to 0,1
    return features, labels, pd.DataFrame(stats)

In [350]:
def Australian():
    """Load the Australian credit dataset from ``australian.dat``.

    The file is whitespace-separated and read without a header row (no header
    names could be found), so columns carry integer labels. Column 14 holds
    the label.

    Returns:
        Tuple ``(X, y, stats)``: the features (all columns except 14), the
        label cast to bool and a one-row DataFrame with the statistical
        overview of the features.
    """
    # sep=r"\s+" replaces the deprecated delim_whitespace=True.
    df = pd.read_table(os.path.join(INPUT, "australian.dat"), sep=r"\s+", header=None)
    features = df.drop(columns=[14])
    # Describe the features only, like every other loader; including the label
    # column would inflate the attribute count by one.
    stats = get_statistical_overview(features)
    stats['Abbreviation'] = ['Australian']
    stats['Handling NA'] = ["no na values"]
    return features, df[14].astype(bool), pd.DataFrame(stats)

In [351]:
def Crx():
    """Load the Crx dataset from ``crx.data``.

    The file is read without a header row, so columns carry integer labels.
    Column 15 holds the label. Rows with any missing value ('?' in the file)
    are dropped, and the remaining object columns are label-encoded.

    Returns:
        Tuple ``(X, y, stats)``: the encoded features (all columns except 15),
        the encoded label cast to bool and a one-row DataFrame with the
        statistical overview of the features before rows were dropped.
    """
    # Parse '?' as NaN at read time. If it were replaced only afterwards, a
    # numeric column containing '?' would be read as strings and cat_2_num
    # would label-encode its values instead of keeping the numbers.
    df = pd.read_csv(os.path.join(INPUT, "crx.data"), header=None, na_values="?")
    stats = get_statistical_overview(df.drop(columns=[15]))
    stats['Handling NA'] = ["drop entries with na values"]    # manual added
    stats['Abbreviation'] = ['Crx']
    # drop entries with missing values
    df = df.dropna()
    # convert category data to numerical data
    df = cat_2_num(df)
    # NOTE(review): the bool label follows the sorted order of the raw label
    # values (first -> False) -- confirm this is the intended polarity.
    return df.drop(columns=[15]), df[15].astype(bool), pd.DataFrame(stats)

In [352]:
def Hepatitis():
    """Load the Hepatitis dataset from ``hepatitis.data``.

    The file is read without a header row, so columns carry integer labels.
    Missing values are not removed: object columns are label-encoded as they
    are, so a missing-value marker becomes a class of its own.

    Returns:
        Tuple ``(X, y, stats)``: the encoded features (all columns except 19),
        column 19 shifted by -1 and cast to bool, and a one-row DataFrame with
        the statistical overview of the raw features.
    """
    source_path = os.path.join(INPUT, "hepatitis.data")
    raw = pd.read_csv(source_path, header=None)

    overview = get_statistical_overview(raw.drop(columns=[19]))
    overview['Handling NA'] = ["treat na value as a class"]
    overview['Abbreviation'] = ['Hepatitis']

    encoded = cat_2_num(raw)
    # NOTE(review): verify that column 19 really is the class label -- the UCI
    # attribute description appears to list Class as the first attribute.
    encoded[19] = encoded[19] - 1  # change to 0 or 1 (assumes raw values 1/2)
    features = encoded.drop(columns=[19])
    labels = encoded[19].astype(bool)
    return features, labels, pd.DataFrame(overview)

In [353]:
def Ionosphere():
    """Load the Ionosphere dataset from ``ionosphere.data``.

    The file is read without a header row, so columns carry integer labels.
    Column 34 holds the label; object columns are label-encoded.

    Returns:
        Tuple ``(X, y, stats)``: the encoded features (all columns except 34),
        the encoded label cast to bool and a one-row DataFrame with the
        statistical overview of the raw features.
    """
    source_path = os.path.join(INPUT, "ionosphere.data")
    raw = pd.read_csv(source_path, header=None)

    overview = get_statistical_overview(raw.drop(columns=[34]))
    overview['Handling NA'] = ["no na values"]
    overview['Abbreviation'] = ['Ionosphere']

    encoded = cat_2_num(raw)
    features = encoded.drop(columns=[34])
    labels = encoded[34].astype(bool)
    return features, labels, pd.DataFrame(overview)

## Additional Kaggle datasets

In [354]:
def Pumpkin():
    """Load the Pumpkin seeds dataset from ``Pumpkin_Seeds_Dataset.xlsx``.

    Reads the sheet 'Pumpkin_Seeds_Dataset' with the openpyxl engine. The
    column 'Class' holds the label; object columns are label-encoded.

    Returns:
        Tuple ``(X, y, stats)``: the encoded features (all columns except
        'Class'), the encoded label cast to bool and a one-row DataFrame with
        the statistical overview of the raw features.
    """
    # Use the shared INPUT directory like every other loader instead of a
    # second hard-coded copy of the path.
    df = pd.read_excel(os.path.join(INPUT, 'Pumpkin_Seeds_Dataset.xlsx'),
                       sheet_name='Pumpkin_Seeds_Dataset', engine='openpyxl')
    stats = get_statistical_overview(df.drop(columns=['Class']))
    stats['Handling NA'] = ["no na values"]
    stats['Abbreviation'] = ['Pumpkin']
    df = cat_2_num(df)
    return df.drop(columns=['Class']), df['Class'].astype(bool), pd.DataFrame(stats)

In [355]:
def Mushroom():
    """Load the Mushroom dataset from ``mushrooms.csv``.

    Author's note: 5644 samples, relatively large dataset. Rows containing
    '?' are dropped and object columns are label-encoded. The column 'class'
    holds the label.

    Returns:
        Tuple ``(X, y, stats)``: the encoded features (all columns except
        'class'), the encoded label cast to bool and a one-row DataFrame with
        the statistical overview of the features before rows were dropped.
    """
    raw = pd.read_csv(os.path.join(INPUT, 'mushrooms.csv'))

    overview = get_statistical_overview(raw.drop(columns=['class']))
    overview['Abbreviation'] = ['Mushroom']
    overview['Handling NA'] = ["drop entries with na values"]

    # '?' marks a missing value; remove every row that has one.
    complete_rows = raw.replace("?", np.nan).dropna()
    encoded = cat_2_num(complete_rows)
    features = encoded.drop(columns=['class'])
    labels = encoded['class'].astype(bool)
    return features, labels, pd.DataFrame(overview)

In [356]:
def Diabetes():
    """Load the Diabetes dataset from the ';'-separated ``diabetes_data.csv``.

    The column 'class' holds the label; object columns are label-encoded.

    Returns:
        Tuple ``(X, y, stats)``: the encoded features (all columns except
        'class'), the encoded label cast to bool and a one-row DataFrame with
        the statistical overview of the raw features.
    """
    source_path = os.path.join(INPUT, 'diabetes_data.csv')
    raw = pd.read_csv(source_path, sep=';')

    overview = get_statistical_overview(raw.drop(columns=['class']))
    overview['Abbreviation'] = ['Diabetes']
    overview['Handling NA'] = ["no na values"]

    encoded = cat_2_num(raw)
    features = encoded.drop(columns=['class'])
    labels = encoded['class'].astype(bool)
    return features, labels, pd.DataFrame(overview)

# Exploratory Data Analysis

In [357]:
def save_plots(X, y, dataset_name):
    """Save the exploratory plots of one dataset as PNG files under OUTPUT.

    Three files are written, each prefixed with ``dataset_name``:
    ``_X_hist.png`` (histogram of every feature), ``_X_corr.png`` (lower
    triangle of the feature correlation matrix) and ``_y.png`` (bar chart of
    the label counts, annotated with the count above each bar).

    Args:
        X: DataFrame of features.
        y: Series of labels.
        dataset_name: Prefix for the file names and plot titles.
    """
    width = len(X.columns)

    # basic hist plot; figure size scales with the number of features
    # (at least 1 inch per side so a tiny frame cannot produce a zero-size figure)
    X.hist(figsize=(max(1, int(width*1.2)), max(1, int(width*0.8))))
    hist_fig = plt.gcf()
    hist_fig.savefig(os.path.join(OUTPUT, f"{dataset_name}_X_hist.png"))
    plt.close(hist_fig)

    corr_mat = X.corr().round(2)
    corr_fig, corr_ax = plt.subplots(figsize=(width, width))
    # Mask the upper triangle including the diagonal, so each pair shows once.
    # The builtin bool is used because the np.bool alias no longer exists in
    # current NumPy releases.
    mask = np.triu(np.ones(corr_mat.shape, dtype=bool))
    sns.heatmap(corr_mat, mask=mask, vmin=-1, vmax=1, center=0,
                cmap='plasma', square=False, lw=2, annot=True, cbar=False,
                ax=corr_ax).set_title(f"{dataset_name} Correlation Map")
    corr_fig.savefig(os.path.join(OUTPUT, f"{dataset_name}_X_corr.png"))
    plt.close(corr_fig)

    y_fig, y_ax = plt.subplots()
    y.value_counts().plot(kind='bar', title=f"{dataset_name} Y Distribution", ax=y_ax)
    for p in y_ax.patches:
        # write the count slightly above the top of each bar
        y_ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()*1.01), ha='center', va='center')
    y_fig.savefig(os.path.join(OUTPUT, f"{dataset_name}_y.png"))
    plt.close(y_fig)

In [358]:
# Loader functions, in the order their rows appear in the overview table.
dataset_getters = [
    German, Australian, Crx, Hepatitis,
    Ionosphere, Pumpkin, Mushroom, Diabetes,
]

# Load each dataset, write its plots and collect its one-row overview.
attribute_stats = []
for getter in dataset_getters:
    X, y, stats = getter()
    save_plots(X, y, getter.__name__)
    attribute_stats.append(stats)

# One row per dataset, written as a tab-separated table without the index.
overview_path = os.path.join(OUTPUT, 'model_statistical_overview.tsv')
pd.concat(attribute_stats).to_csv(overview_path, sep='\t', index=False)