In [2]:
import os
import openml
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer



In [3]:
#works good for openml dataset
def check_tabular_dataset_assumptions(X, y, categorical_indicator, task_type='classification'):
    report = {}

    # Convert to pandas for easier manipulation
    X = pd.DataFrame(X).copy()
    y = pd.Series(y).copy()

    # 1. Check for heterogeneous feature types
    has_categorical = any(categorical_indicator)
    has_numerical = not all(categorical_indicator)
    report['heterogeneous_columns'] = has_categorical and has_numerical

    # 2. d/n ratio
    d, n = X.shape[1], X.shape[0]
    report['low_dimensionality'] = (d / n) < 0.1

    # 3. Enough samples and features
    report['enough_samples'] = n >= 3000
    report['enough_features'] = d >= 4

    # 4. Remove columns with many missing, then rows with any missing
    missing_per_col = X.isnull().mean()
    valid_col_indices = [i for i, v in enumerate(missing_per_col) if v < 0.2]
    
    if not valid_col_indices:
        return {
            'accepted': False,
            'error': 'All columns dropped due to missing data',
        }

    X = X.iloc[:, valid_col_indices]
    categorical_indicator = [categorical_indicator[i] for i in valid_col_indices]

    # Drop rows with any missing and align y
    valid_rows = ~X.isnull().any(axis=1)
    X = X[valid_rows].reset_index(drop=True)
    y = pd.Series(y)[valid_rows].reset_index(drop=True)
    if len(X) == 0 or len(y) == 0:
        return {
            'accepted': False,
            'error': 'All rows dropped due to missing data',
        }

    report['no_missing_data'] = True

    # 5. Filter by feature cardinality and convert binary numeric to categorical
    keep_indices = []
    new_categorical_indicator = []

    for i, is_cat in enumerate(categorical_indicator):
        col = X.iloc[:, i]
        nunique = col.nunique()

        if is_cat:
            if nunique <= 20:
                keep_indices.append(i)
                new_categorical_indicator.append(True)
        else:
            if nunique >= 10:
                keep_indices.append(i)
                new_categorical_indicator.append(False)
            elif nunique == 2:
                # Convert to categorical
                keep_indices.append(i)
                new_categorical_indicator.append(True)

    X = X.iloc[:, keep_indices]
    categorical_indicator = new_categorical_indicator
    report['valid_cardinalities'] = True

    # 6. Binarize and balance classification target
    # if task_type == 'classification':
    #     y_counts = y.value_counts()

    #     # Binarize if more than 2 classes
    #     if len(y_counts) > 2:
    #         top_classes = y_counts.nlargest(2).index
    #         X = X[y.isin(top_classes)]
    #         y = y[y.isin(top_classes)]

    #     # Check class balance
    #     y_counts = y.value_counts()
    #     min_class, max_class = y_counts.min(), y_counts.max()
    #     balance_ratio = min_class / max_class
    #     report['class_balance_ratio'] = balance_ratio
    #     balanced = abs(min_class - max_class) / max_class <= 0.1
    #     report['balanced_classes'] = balanced

    #     # Downsample to balance if needed
    #     if not balanced:
    #         y_min = y[y == y_counts.idxmin()]
    #         y_max = y[y == y_counts.idxmax()].sample(n=len(y_min), random_state=42)
    #         y = pd.concat([y_min, y_max])
    #         X = X.loc[y.index]

    #     # Re-label to 0/1
    #     le = LabelEncoder()
    #     y = le.fit_transform(y)
    # Check if dataset is still valid after filtering
    if X.shape[0] < 10:
        report['not_too_easy'] = False
        report['accepted'] = False
        report['error'] = 'Too few samples after preprocessing'
        return report
    # # 7. Too-easy dataset check (LogReg vs boosted trees vs synthetic "ResNet")
    # Encode categorical features for regression (one-hot encoding)
    if any(categorical_indicator):
        cat_cols = [i for i, is_cat in enumerate(categorical_indicator) if is_cat]
        X_cat = X.iloc[:, cat_cols]
        X_num = X.drop(X.columns[cat_cols], axis=1)
        X_cat_encoded = pd.get_dummies(X_cat, drop_first=True)
        X = pd.concat([X_num, X_cat_encoded], axis=1)
  
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model_lr = LinearRegression().fit(X_train, y_train)
    print(model_lr.predict(X_test))
    acc_lr = r2_score(y_test, model_lr.predict(X_test))

    model_tree = HistGradientBoostingRegressor().fit(X_train, y_train)
    acc_tree = r2_score(y_test, model_tree.predict(X_test))

    too_easy = abs(acc_lr - acc_tree) / acc_tree < 0.05
    report['not_too_easy'] = not too_easy
    
    # # Final result
    report['accepted'] = all([
        report.get('heterogeneous_columns', False),
        report.get('low_dimensionality', False),
        report.get('enough_samples', False),
        report.get('enough_features', False),
        report.get('no_missing_data', False),
        report.get('valid_cardinalities', False),
        report.get('balanced_classes', False),
        report.get('not_too_easy', False)
    ])

    return report, acc_tree,acc_lr

In [4]:
#tasks = [361055, 361066, 317599]
task = openml.tasks.get_task(317599)  # download the OpenML task
dataset = task.get_dataset()
X, y, categorical_indicator, attribute_names = dataset.get_data(
         dataset_format="dataframe", target=dataset.default_target_attribute
     )

  exec(code_obj, self.user_global_ns, self.user_ns)
  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)


In [5]:
# X = pd.read_csv('Financial_DATA/1_X.csv')
# y =  pd.read_csv('Financial_DATA/1_y.csv')
# categorical_indicator = np.load('Financial_DATA/1_categorial_indicator.npy')
print(X.shape)

(30000, 23)


In [37]:
report, acc_tree,acc_lr = check_tabular_dataset_assumptions(X,y,categorical_indicator,'regression')

[0.78912056 0.79765406 0.82635759 ... 0.83059203 0.83453783 0.83847772]


In [38]:
print(acc_tree,acc_lr)

-0.003925117151469815 -6.946768675719639e-05


In [39]:
print(report)

{'heterogeneous_columns': True, 'low_dimensionality': True, 'enough_samples': True, 'enough_features': True, 'no_missing_data': True, 'valid_cardinalities': True, 'not_too_easy': False, 'accepted': False}


In [40]:
abs(acc_tree-acc_lr)/(acc_tree)

-0.9823017545524257