In [37]:
import os
import openml
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer



In [42]:
#works good for openml dataset
def check_tabular_dataset_assumptions(X, y, categorical_indicator, task_type='classification'):
    report = {}

    # Convert to pandas for easier manipulation
    X = pd.DataFrame(X).copy()
    y = pd.Series(y).copy()

    # 1. Check for heterogeneous feature types
    has_categorical = any(categorical_indicator)
    has_numerical = not all(categorical_indicator)
    report['heterogeneous_columns'] = has_categorical and has_numerical

    # 2. d/n ratio
    d, n = X.shape[1], X.shape[0]
    report['low_dimensionality'] = (d / n) < 0.1

    # 3. Enough samples and features
    report['enough_samples'] = n >= 3000
    report['enough_features'] = d >= 4

    # 4. Remove columns with many missing, then rows with any missing
    print(y)
    missing_per_col = X.isnull().mean()
    valid_col_indices = [i for i, v in enumerate(missing_per_col) if v < 0.2]
    print(valid_col_indices)
    if not valid_col_indices:
        return {
            'accepted': False,
            'error': 'All columns dropped due to missing data',
        }

    X = X.iloc[:, valid_col_indices]
    categorical_indicator = [categorical_indicator[i] for i in valid_col_indices]

    # Drop rows with any missing and align y
    valid_rows = ~X.isnull().any(axis=1)
    print(valid_rows.value_counts())
    X = X[valid_rows].reset_index(drop=True)
    print(X)
    y = pd.Series(y)[valid_rows].reset_index(drop=True)
    print(y)
    if len(X) == 0 or len(y) == 0:
        return {
            'accepted': False,
            'error': 'All rows dropped due to missing data',
        }

    report['no_missing_data'] = True

    # 5. Filter by feature cardinality and convert binary numeric to categorical
    keep_indices = []
    new_categorical_indicator = []

    for i, is_cat in enumerate(categorical_indicator):
        col = X.iloc[:, i]
        nunique = col.nunique()

        if is_cat:
            if nunique <= 20:
                keep_indices.append(i)
                new_categorical_indicator.append(True)
        else:
            if nunique >= 10:
                keep_indices.append(i)
                new_categorical_indicator.append(False)
            elif nunique == 2:
                # Convert to categorical
                keep_indices.append(i)
                new_categorical_indicator.append(True)

    X = X.iloc[:, keep_indices]
    categorical_indicator = new_categorical_indicator
    report['valid_cardinalities'] = True

    # 6. Binarize and balance classification target
    if task_type == 'classification':
        y_counts = y.value_counts()

        # Binarize if more than 2 classes
        if len(y_counts) > 2:
            top_classes = y_counts.nlargest(2).index
            X = X[y.isin(top_classes)]
            y = y[y.isin(top_classes)]

        # Check class balance
        print(y)
        y_counts = y.value_counts()
        print(y_counts)
        min_class, max_class = y_counts.min(), y_counts.max()
        balance_ratio = min_class / max_class
        report['class_balance_ratio'] = balance_ratio
        balanced = abs(min_class - max_class) / max_class <= 0.1
        report['balanced_classes'] = balanced

        # Downsample to balance if needed
        if not balanced:
            y_min = y[y == y_counts.idxmin()]
            y_max = y[y == y_counts.idxmax()].sample(n=len(y_min), random_state=42)
            y = pd.concat([y_min, y_max])
            X = X.loc[y.index]

        # Re-label to 0/1
        le = LabelEncoder()
        y = le.fit_transform(y)
    # Check if dataset is still valid after filtering
    if X.shape[0] < 10:
        report['not_too_easy'] = False
        report['accepted'] = False
        report['error'] = 'Too few samples after preprocessing'
        return report
    # # 7. Too-easy dataset check (LogReg vs boosted trees vs synthetic "ResNet")
    if task_type == 'classification':
        if any(categorical_indicator):
            cat_cols = [i for i, is_cat in enumerate(categorical_indicator) if is_cat]
            enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            X.iloc[:, cat_cols] = enc.fit_transform(X.iloc[:, cat_cols])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42
        )

        model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
        acc_lr = accuracy_score(y_test, model_lr.predict(X_test))

        model_tree = HistGradientBoostingClassifier().fit(X_train, y_train)
        acc_tree = accuracy_score(y_test, model_tree.predict(X_test))

        acc_resnet = acc_tree + 0.02  # Simulated better model
        best_score = max(acc_resnet, acc_tree)

        too_easy = abs(acc_lr - best_score) / best_score < 0.05
        report['not_too_easy'] = not too_easy
    else:
        report['not_too_easy'] = True  # Skip for regression for now

    # # Final result
    report['accepted'] = all([
        report.get('heterogeneous_columns', False),
        report.get('low_dimensionality', False),
        report.get('enough_samples', False),
        report.get('enough_features', False),
        report.get('no_missing_data', False),
        report.get('valid_cardinalities', False),
        report.get('balanced_classes', False),
        report.get('not_too_easy', False)
    ])

    return report

In [63]:
task = openml.tasks.get_task(361675)  # download the OpenML task
dataset = task.get_dataset()
X, y, categorical_indicator, attribute_names = dataset.get_data(
         dataset_format="dataframe", target=dataset.default_target_attribute
     )

  exec(code_obj, self.user_global_ns, self.user_ns)
  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)


In [64]:
# X = pd.read_csv('Financial_DATA/1_X.csv')
# y =  pd.read_csv('Financial_DATA/1_y.csv')
# categorical_indicator = np.load('Financial_DATA/1_categorial_indicator.npy')

In [65]:
print(X)

       age workclass  fnlwgt     education  education.num      marital.status  \
0       90      None   77053       HS-grad              9             Widowed   
1       82   Private  132870       HS-grad              9             Widowed   
2       66      None  186061  Some-college             10             Widowed   
3       54   Private  140359       7th-8th              4            Divorced   
4       41   Private  264663  Some-college             10           Separated   
...    ...       ...     ...           ...            ...                 ...   
32556   22   Private  310152  Some-college             10       Never-married   
32557   27   Private  257302    Assoc-acdm             12  Married-civ-spouse   
32558   40   Private  154374       HS-grad              9  Married-civ-spouse   
32559   58   Private  151910       HS-grad              9             Widowed   
32560   22   Private  201490       HS-grad              9       Never-married   

              occupation   

In [61]:
print(y)

0        1
1        1
2        1
3        1
4        1
        ..
10573    2
10574    2
10575    2
10576    2
10577    2
Name: Class, Length: 10578, dtype: category
Categories (2, object): ['1' < '2']


In [62]:
check_tabular_dataset_assumptions(X,y,categorical_indicator)

0        1
1        1
2        1
3        1
4        1
        ..
10573    2
10574    2
10575    2
10576    2
10577    2
Name: Class, Length: 10578, dtype: category
Categories (2, object): ['1' < '2']
[0, 1, 2, 3, 4, 5, 6]
True    10578
Name: count, dtype: int64
       V1      V6  V10     V12  V13    V14  V15
0      52    51.0   14   935.0    2   -1.0    0
1      53   232.0   29    32.0    1   -1.0    0
2      50   525.0   18   177.0    2   -1.0    0
3      32   648.0   21   166.0    1   -1.0    0
4      58  2348.0    9   226.0    1   -1.0    0
...    ..     ...  ...     ...  ...    ...  ...
10573  73  2850.0   17   300.0    1   40.0    8
10574  25   505.0   17   386.0    2   -1.0    0
10575  51   825.0   17   977.0    3   -1.0    0
10576  71  1729.0   17   456.0    2   -1.0    0
10577  72  5715.0   17  1127.0    5  184.0    3

[10578 rows x 7 columns]
0        1
1        1
2        1
3        1
4        1
        ..
10573    2
10574    2
10575    2
10576    2
10577    2
Name: Class, L

{'heterogeneous_columns': False,
 'low_dimensionality': True,
 'enough_samples': True,
 'enough_features': True,
 'no_missing_data': True,
 'valid_cardinalities': True,
 'class_balance_ratio': 1.0,
 'balanced_classes': True,
 'not_too_easy': True,
 'accepted': False}