In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.2-py3-none-any.whl.metadata (5.3 kB)
Downloading ucimlrepo-0.0.2-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.2


In [2]:
import pandas as pd

In [62]:
class Dataset:
	"""Thin wrapper around a CSV file loaded into a pandas DataFrame.

	On construction the file is read and every row containing a missing
	value is dropped; the result is kept in ``self.df``.
	"""

	def __init__(self, path) -> None:
		"""Load and clean the dataset.

		Args:
			path: Location of the CSV file; handed unchanged to ``pd.read_csv``.
		"""
		self.filename = path
		self.read_dataset()
		self.clean_dataset()

	def read_dataset(self):
		"""Read ``self.filename`` with ``pd.read_csv`` and store it in ``self.df``."""
		self.df = pd.read_csv(self.filename)

	def clean_dataset(self):
		"""Replace ``self.df`` with the result of ``dropna()`` (rows with any NaN removed)."""
		self.df = self.df.dropna()

	def get_dataframe(self):
		"""Return the stored dataframe (the same object held in ``self.df``)."""
		return self.df

	def get_subsampled_dataframe(self, frac, label='Label'):
		"""Return a stratified random subsample of the dataframe.

		Each class of ``label`` is sampled separately with the same fraction,
		so class proportions are approximately preserved.

		Args:
			frac: Fraction of rows to keep from every class (passed to
				``DataFrame.sample``).
			label: Name of the class column to stratify on. Defaults to
				``'Label'`` to stay compatible with existing callers.

		Returns:
			A DataFrame with all original columns (including ``label``),
			rows grouped by class in sorted class order.

		Raises:
			KeyError: If ``label`` is not a column of the dataframe.
		"""
		# Iterate the groups explicitly instead of using ``groupby.apply``:
		# apply-on-grouping-columns is deprecated in recent pandas, and newer
		# versions exclude the grouping column from the applied frame, which
		# would silently drop the label from the result.
		# Every group is sampled with random_state=0, as before, so the
		# selected rows stay reproducible.
		samples = [
			group.sample(frac=frac, random_state=0)
			for _, group in self.df.groupby(label)
		]
		if not samples:
			# No groups (empty dataframe): return an empty frame with the same columns.
			return self.df.iloc[0:0]
		return pd.concat(samples)

	def describe(self, label):
		"""Print the dataframe shape and the class balance of ``label``.

		The sum of the ``label`` column is taken as the number of class-1
		rows, so the proportions are only meaningful for a 0/1-coded label.

		Args:
			label: Name of the class column.
		"""
		print(f'Shape: {self.df.shape}')
		n_instances = self.df.shape[0]
		if n_instances == 0:
			# dropna() can leave no rows at all; avoid dividing by zero.
			print("Class proportion: n/a (empty dataset)")
			return
		count1 = self.df[label].sum()
		count0 = n_instances - count1
		print(f"Class proportion: {count0/n_instances:.2f} / {count1/n_instances:.2f}")

	

# Dataset test 1
(Small instances / Small features)

In [71]:
# Glioma grading data (clinical and mutation features); 'Grade' is used as the class column.
ds_glicoma = Dataset(path='datasets/TCGA_InfoWithGrade.csv')
ds_glicoma.describe(label='Grade')
df_glicoma = ds_glicoma.get_dataframe()


Shape: (839, 24)
Class proportion: 0.58 / 0.42


# Dataset test 2
(Large instances / Large features)

In [68]:
# TUANDROMD (malware) data; 'Label' is used as the class column.
ds_malware = Dataset('datasets/TUANDROMD.csv')
ds_malware.describe(label='Label')
df_malware = ds_malware.get_dataframe()

Shape: (4464, 242)
Class proportion: 0.20 / 0.80


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# Download UCI dataset 890 (AIDS Clinical Trials Group Study 175)
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890)

# Feature matrix and target column(s), both taken from the fetched object's `data`
X, y = (
    aids_clinical_trials_group_study_175.data.features,
    aids_clinical_trials_group_study_175.data.targets,
)

# Show the dataset metadata followed by the per-variable information
print(
    aids_clinical_trials_group_study_175.metadata,
    aids_clinical_trials_group_study_175.variables,
    sep='\n',
)


{'uci_id': 890, 'name': 'AIDS Clinical Trials Group Study 175', 'repository_url': 'https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175', 'data_url': 'https://archive.ics.uci.edu/static/public/890/data.csv', 'abstract': 'The AIDS Clinical Trials Group Study 175 Dataset contains healthcare statistics and categorical information about patients who have been diagnosed with AIDS. This dataset was initially published in 1996. The prediction task is to predict whether or not each patient died within a certain window of time or not. ', 'area': 'Life Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 2139, 'num_features': 23, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Sexual Orientation', 'Race', 'Gender'], 'target_col': ['cid'], 'index_col': ['pidnum'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1996, 'last_updated': 'Wed Sep 27 2023

In [6]:
# Check what kind of objects fetch_ucirepo handed back: the container, then the feature table
print(type(aids_clinical_trials_group_study_175), type(X), sep='\n')

<class 'ucimlrepo.dotdict.dotdict'>
<class 'pandas.core.frame.DataFrame'>
