# Objective

Load each dataset and generate a summary that includes:
* Name
* Number of instances
* Number of nominal (discrete) attributes
* Number of continuous attributes
* Total number of attributes

## DISCRETE

In [1]:
from os import listdir
from os.path import isfile, join, splitext
from scipy.io import arff
import pandas as pd

# Summarize every ARFF dataset found in the discrete-data directory: one row
# per dataset with its name, instance count and attribute counts.
path = "../../data/discrete/"

# Keep only regular files with an .arff extension so stray entries (hidden
# files, subdirectories, other formats) are not passed to the ARFF loader.
# Sorting makes the row order -- and therefore the index shown in the
# summary -- reproducible, because listdir() returns entries in arbitrary order.
data_files = sorted(
    file for file in listdir(path)
    if isfile(join(path, file)) and splitext(file)[1].lower() == ".arff"
)
data_names = [splitext(file)[0] for file in data_files]  # Split the file name from its extension, i.e., "alarm.arff" -> "alarm"

# Collect plain dict rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []
for file, name in zip(data_files, data_names):
    records, _meta = arff.loadarff(join(path, file))
    data = pd.DataFrame(records)
    # Columns with object dtype are counted as nominal; every other column
    # is counted as numerical.
    n_nominal_attributes = int((data.dtypes == object).sum())
    n_numerical_attributes = data.shape[1] - n_nominal_attributes
    rows.append({
        "name": name,
        "n_instances": data.shape[0],
        # Counts are stored as strings, as this cell has always produced them.
        # NOTE(review): consider storing ints if nothing downstream relies on str.
        "n_nominal_attributes": str(n_nominal_attributes),
        "n_numerical_attributes": str(n_numerical_attributes),
        "n_attributes": data.shape[1],
    })

data_info = pd.DataFrame(
    rows,
    columns=["name", "n_instances", "n_attributes",
             "n_nominal_attributes", "n_numerical_attributes"],
)

# Last expression of the cell: displayed by the notebook, smallest datasets first.
data_info.sort_values(by='n_attributes', ascending=True)

Unnamed: 0,name,n_instances,n_attributes,n_nominal_attributes,n_numerical_attributes
6,hiv_test,428,4,4,0
1,balance_scale,625,5,5,0
5,hayes_roth,160,5,5,0
3,car_evaluation,1728,7,7,0
8,nursery,12960,9,9,0
2,breast_cancer,277,10,10,0
13,web_phishing,1353,10,10,0
9,solar_flare,1389,13,13,0
11,vote,232,17,17,0
14,zoo,101,17,17,0


## CONTINUOUS

In [2]:
from os import listdir
from os.path import isfile, join, splitext
from scipy.io import arff
import pandas as pd

# Summarize every ARFF dataset found in the continuous-data directory: one
# row per dataset with its name, instance count and attribute counts.
path = "../../data/continuous/"

# Keep only regular files with an .arff extension so stray entries (hidden
# files, subdirectories, other formats) are not passed to the ARFF loader.
# Sorting makes the row order -- and therefore the index shown in the
# summary -- reproducible, because listdir() returns entries in arbitrary order.
data_files = sorted(
    file for file in listdir(path)
    if isfile(join(path, file)) and splitext(file)[1].lower() == ".arff"
)
data_names = [splitext(file)[0] for file in data_files]  # Split the file name from its extension, i.e., "alarm.arff" -> "alarm"

# Collect plain dict rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []
for file, name in zip(data_files, data_names):
    records, _meta = arff.loadarff(join(path, file))
    data = pd.DataFrame(records)
    # Columns with object dtype are counted as nominal; every other column
    # is counted as numerical.
    n_nominal_attributes = int((data.dtypes == object).sum())
    n_numerical_attributes = data.shape[1] - n_nominal_attributes
    rows.append({
        "name": name,
        "n_instances": data.shape[0],
        # Counts are stored as strings, as this cell has always produced them.
        # NOTE(review): consider storing ints if nothing downstream relies on str.
        "n_nominal_attributes": str(n_nominal_attributes),
        "n_numerical_attributes": str(n_numerical_attributes),
        "n_attributes": data.shape[1],
    })

data_info = pd.DataFrame(
    rows,
    columns=["name", "n_instances", "n_attributes",
             "n_nominal_attributes", "n_numerical_attributes"],
)

# Last expression of the cell: displayed by the notebook, smallest datasets first.
data_info.sort_values(by='n_attributes', ascending=True)

Unnamed: 0,name,n_instances,n_attributes,n_nominal_attributes,n_numerical_attributes
9,real_state_valuation,414,5,0,5
2,buddymove,249,6,0,6
8,qsar_fish_toxicity,908,7,0,7
4,ilpd,579,9,0,9
7,qsar_aqua_toxicity,545,9,0,9
1,alcohol,125,10,0,10
10,travel_reviews,980,10,0,10
14,wine_quality_white,4898,12,0,12
13,wine,178,13,0,13
5,leaf,340,14,0,14


## MIXED

In [3]:
from os import listdir
from os.path import isfile, join, splitext
from scipy.io import arff
import pandas as pd

# Summarize every ARFF dataset found in the mixed-data directory: one row
# per dataset with its name, instance count and attribute counts.
path = "../../data/mixed/"

# Keep only regular files with an .arff extension so stray entries (hidden
# files, subdirectories, other formats) are not passed to the ARFF loader.
# Sorting makes the row order -- and therefore the index shown in the
# summary -- reproducible, because listdir() returns entries in arbitrary order.
data_files = sorted(
    file for file in listdir(path)
    if isfile(join(path, file)) and splitext(file)[1].lower() == ".arff"
)
data_names = [splitext(file)[0] for file in data_files]  # Split the file name from its extension, i.e., "alarm.arff" -> "alarm"

# Collect plain dict rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []
for file, name in zip(data_files, data_names):
    records, _meta = arff.loadarff(join(path, file))
    data = pd.DataFrame(records)
    # Columns with object dtype are counted as nominal; every other column
    # is counted as numerical.
    n_nominal_attributes = int((data.dtypes == object).sum())
    n_numerical_attributes = data.shape[1] - n_nominal_attributes
    rows.append({
        "name": name,
        "n_instances": data.shape[0],
        # Counts are stored as strings, as this cell has always produced them.
        # NOTE(review): consider storing ints if nothing downstream relies on str.
        "n_nominal_attributes": str(n_nominal_attributes),
        "n_numerical_attributes": str(n_numerical_attributes),
        "n_attributes": data.shape[1],
    })

data_info = pd.DataFrame(
    rows,
    columns=["name", "n_instances", "n_attributes",
             "n_nominal_attributes", "n_numerical_attributes"],
)

# Last expression of the cell: displayed by the notebook, smallest datasets first.
data_info.sort_values(by='n_attributes', ascending=True)

Unnamed: 0,name,n_instances,n_attributes,n_nominal_attributes,n_numerical_attributes
3,haberman,306,4,1,3
6,iris,150,5,1,4
12,user_knowledge,258,6,1,5
2,ecoli,336,7,1,6
14,vertebral,310,7,1,6
8,planning_relax,182,13,1,12
10,thoracic_surgery,470,14,11,3
13,vehicle,846,19,1,18
11,thyroid,3103,21,16,5
7,parkinsons,195,23,1,22


## ALL

In [5]:
# Build a single summary table covering the datasets of all three directories
# (discrete, continuous and mixed), in that order.
paths = ["../../data/discrete/", "../../data/continuous/", "../../data/mixed/"]

# Collect plain dict rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []

for path in paths:
    # Keep only regular files with an .arff extension so stray entries (hidden
    # files, subdirectories, other formats) are not passed to the ARFF loader.
    # Sorting makes the per-directory row order reproducible, because
    # listdir() returns entries in arbitrary order.
    data_files = sorted(
        file for file in listdir(path)
        if isfile(join(path, file)) and splitext(file)[1].lower() == ".arff"
    )
    data_names = [splitext(file)[0] for file in data_files]  # Split the file name from its extension, i.e., "alarm.arff" -> "alarm"
    for file, name in zip(data_files, data_names):
        records, _meta = arff.loadarff(join(path, file))
        data = pd.DataFrame(records)
        # Columns with object dtype are counted as nominal; every other
        # column is counted as numerical.
        n_nominal_attributes = int((data.dtypes == object).sum())
        n_numerical_attributes = data.shape[1] - n_nominal_attributes
        rows.append({
            "name": name,
            "n_instances": data.shape[0],
            # Counts are stored as strings, as this cell has always produced them.
            # NOTE(review): consider storing ints if nothing downstream relies on str.
            "n_nominal_attributes": str(n_nominal_attributes),
            "n_numerical_attributes": str(n_numerical_attributes),
            "n_attributes": data.shape[1],
        })

data_info = pd.DataFrame(
    rows,
    columns=["name", "n_instances", "n_attributes",
             "n_nominal_attributes", "n_numerical_attributes"],
)

# Quick sanity check on how many datasets were found and how many columns were built.
print(data_info.shape)
data_info_sorted = data_info.sort_values(by='n_attributes', ascending=True)
# Last expression of the cell: displayed by the notebook.
data_info_sorted

(45, 5)


Unnamed: 0,name,n_instances,n_attributes,n_nominal_attributes,n_numerical_attributes
6,hiv_test,428,4,4,0
33,haberman,306,4,1,3
1,balance_scale,625,5,5,0
36,iris,150,5,1,4
5,hayes_roth,160,5,5,0
24,real_state_valuation,414,5,0,5
42,user_knowledge,258,6,1,5
17,buddymove,249,6,0,6
44,vertebral,310,7,1,6
3,car_evaluation,1728,7,7,0
