In [10]:
import pandas as pd
import numpy as np
import polars as pl


def get_array_type(array):
    if isinstance(array, pd.DataFrame):
        return 'pandas'
    elif isinstance(array, np.ndarray):
        return 'numpy'
    elif isinstance(array, pl.DataFrame):
        return 'polars'
    else:
        return ''

In [1]:
import numpy as np
import pandas as pd
import polars as pl

import kaggle
import time
from pathlib import Path

kaggle.api.authenticate()

datasets = [
    {
        'link': 'https://www.kaggle.com/datasets/yasserh/titanic-dataset', 
        'target': 'Survived',
        'id_cols': ['PassengerId',],
    },
    {
        'link': 'https://www.kaggle.com/datasets/rabieelkharoua/predict-liver-disease-1700-records-dataset', 
        'target': 'Diagnosis',
    },
    {
        'link': 'https://www.kaggle.com/datasets/devzohaib/eligibility-prediction-for-loan', 
        'target': 'Loan_Status',
        'id_cols': ['Loan_ID',],
    },
    {
        'link': 'https://www.kaggle.com/datasets/deepcontractor/smoke-detection-dataset', 
        'target': 'Fire Alarm',
        'id_cols': ['index',],
    },
    {
        'link': 'https://www.kaggle.com/datasets/stealthtechnologies/employee-attrition-dataset', 
        'target': 'Attrition',
        'id_cols': ['Employee ID',],
    },
    {
        'link': 'https://www.kaggle.com/datasets/rabieelkharoua/cancer-prediction-dataset', 
        'target': 'Diagnosis',
    },
    {
        'link': 'https://www.kaggle.com/datasets/yasserh/heart-disease-dataset', 
        'target': 'target',
    },
    {
        'link': 'https://www.kaggle.com/datasets/rabieelkharoua/predict-online-course-engagement-dataset', 
        'target': 'CourseCompletion',
        'id_cols': ['UserID',],
    },
    {
        'link': 'https://www.kaggle.com/datasets/rabieelkharoua/predicting-hiring-decisions-in-recruitment-data', 
        'target': 'HiringDecision',
    },
    {
        'link': 'https://www.kaggle.com/datasets/barun2104/telecom-churn', 
        'target': 'Churn',
    },
    {
        'link': 'https://www.kaggle.com/datasets/rabieelkharoua/predicting-manufacturing-defects-dataset', 
        'target': 'DefectStatus',
    },
    {
        'link': 'https://www.kaggle.com/datasets/rabieelkharoua/predict-customer-purchase-behavior-dataset',
        'target': 'PurchaseStatus',
    },
    {
        'link': 'https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction', 
        'target': 'Satisfaction',
    },
    {
        'link': 'https://www.kaggle.com/datasets/dhanushnarayananr/credit-card-fraud', 
        'target': 'fraud',
    },
    {
        'link': 'https://www.kaggle.com/datasets/rameshmehta/credit-risk-analysis', 
        'target': 'default_ind',
        'id_cols': ['id', 'member_id',],
        'time_col': 'issue_d',
    },
    {
        'link': 'https://www.kaggle.com/datasets/marcpaulo/titanic-huge-dataset-1m-passengers', 
        'target': 'Survived',
        'id_cols': ['PassengerId',],
    },
    # {
    #     'link': 'https://www.kaggle.com/datasets/manishtripathi86/fedex-data', 
    #     'target': 'Delivery_Status',
    #     'time_col': 'Year',
    # },
    {
        'link': 'https://www.kaggle.com/datasets/ban7002/fraud-challenge-data', 
        'target': 'EVENT_LABEL',
        'time_col': 'EVENT_TIMESTAMP',
    },
    {
        'link': 'https://www.kaggle.com/datasets/ulrikthygepedersen/kickstarter-projects', 
        'target': 'State',
        'id_cols': ['ID',],
        'time_col': 'Launched',
    },
]

def get_train_test_datasets(datasets, datasets_path, test_size=0.3, random_state=0):
    from tqdm import tqdm
    from pathlib import Path
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    
    
    for dataset in tqdm(datasets):
        dataset_name = dataset['link'].replace('https://www.kaggle.com/datasets/', '')
        files = list((Path(datasets_path) / dataset_name).glob('*.csv'))
        if len(files) == 1:
            dataset_path = files[0]
            data = pd.read_csv(dataset_path)
            target_name = dataset['target']
            data = data.dropna(subset=[target_name])
            
            if dataset.get('id_cols'):
                data = data.drop(columns=[dataset['id_cols']], errors='ignore')

            le = LabelEncoder()
            data[target_name] = le.fit_transform(data[target_name])
            data = data.select_dtypes(include=[np.number]).dropna(how='any')
            train_data, test_data = train_test_split(
                data, 
                test_size=test_size, 
                stratify=data[target_name], 
                random_state=random_state
            )
            yield train_data, test_data, target_name, dataset_name

In [2]:
for train, test, target_name, dataset_name in get_train_test_datasets(datasets, datasets_path='./datasets', test_size=0.3, random_state=0):
    break

  0%|          | 0/18 [00:00<?, ?it/s]


In [16]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
349,350,0,3,42.0,0,0,8.6625
518,519,1,2,36.0,1,0,26.0000
44,45,1,3,19.0,0,0,7.8792
795,796,0,2,39.0,0,0,13.0000
69,70,0,3,26.0,2,0,8.6625
...,...,...,...,...,...,...,...
83,84,0,1,28.0,0,0,47.1000
856,857,1,1,45.0,1,1,164.8667
244,245,0,3,30.0,0,0,7.2250
808,809,0,2,39.0,0,0,13.0000


In [20]:
df = pd.DataFrame({
        'A': ['apple', 'banana', None],
        'B': [None, 'dog', 'cat'],
        'C': [1, None, 3],
        'D': ['red', None, 'red']
    })
df_obj = df.select_dtypes(include='object')
counter_ = df_obj.nunique()

In [24]:
X = pd.DataFrame({
        'A': [np.nan, 1, 2, 3, np.nan],
        'B': [4, 5, np.nan, np.nan, 8],
        'C': [1, 2, 3, np.nan, np.nan],
        'D': [1, 1, 1, 1, 1]  # Shouldn't affect selection
    })
X['A'].corr()

TypeError: Series.corr() missing 1 required positional argument: 'other'