In [1]:
import numpy as np
from ucimlrepo import fetch_ucirepo 
import os
from sklearn.preprocessing import LabelEncoder
import pandas as pd


In [12]:
# Datasets to download and export; only "mobile" is listed here.
namelist = ["mobile"]
for dataname in namelist:
    
    # Output directory for this dataset, created up front (no error if it already exists).
    datapath = f'dataset/{dataname}'
    os.makedirs(datapath, exist_ok=True)

    # NOTE(review): load_data is defined in a later cell of this notebook, so that
    # cell has to be executed before this one on a fresh kernel.
    load_data(dataname)

dataset/mobile Save CSV
dataset/mobile
Files saved successfully


In [11]:
# Download one UCI dataset, shuffle it, split features/label and export it.
def load_data(dataname, datapath=None, random_state=None):
    """Fetch a UCI dataset, shuffle its rows and hand features/label to ``save_data``.

    Parameters
    ----------
    dataname : str
        Dataset key; ``"mobile"`` and ``"kidney"`` are supported.
    datapath : str, optional
        Directory the files are written to. Defaults to ``dataset/<dataname>``,
        the directory the calling cell prepares, so the function no longer
        depends on a notebook-level ``datapath`` variable.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        ``None`` keeps the original unseeded shuffle.

    Raises
    ------
    ValueError
        If ``dataname`` is not a supported key.
    """
    # name -> (UCI repository id, whether the label column is integer-encoded)
    sources = {
        "mobile": (10, False),
        "kidney": (336, True),
    }
    # Validate before doing any network or disk work; the original fell through
    # to save_data with `features` unbound for an unknown name.
    if dataname not in sources:
        raise ValueError(
            f"Unknown dataset {dataname!r}; expected one of {sorted(sources)}"
        )
    dataset_id, encode_label = sources[dataname]

    if datapath is None:
        datapath = f'dataset/{dataname}'
    os.makedirs(datapath, exist_ok=True)

    data = fetch_ucirepo(id=dataset_id)
    shuffled_df = (
        data.data.original
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # The last column is treated as the label, everything before it as features.
    features = shuffled_df.iloc[:, :-1]
    label = shuffled_df.iloc[:, -1]
    if encode_label:
        label = LabelEncoder().fit_transform(label)

    save_data(features, label, datapath)

In [10]:
def save_data(features,label,datapath):
    """Write features and labels under ``datapath`` as CSV and as ``.npy`` files.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature table; written to ``features.csv`` and ``features.npy``.
    label : Series or array-like
        Label column; written to ``label.csv`` and ``label.npy``.
    datapath : str
        Target directory. It is created if it does not exist yet.

    The CSV export raises on failure. The ``.npy`` export is best-effort:
    a failure is reported on stdout and not re-raised.
    """
    # Create the directory here so the function also works when called on its
    # own, not only from a cell that prepared the directory beforehand.
    os.makedirs(datapath, exist_ok=True)

    pd.DataFrame(features).to_csv(f'{datapath}/features.csv', index=False)
    pd.DataFrame(label).to_csv(f'{datapath}/label.csv', index=False)

    print(f"{datapath} Save CSV")
    # Convert to NumPy arrays explicitly instead of merely aliasing the inputs.
    features_np = np.asarray(features)
    label_np = np.asarray(label)

    try:
        print(datapath)
        np.save(f'{datapath}/features.npy', features_np)
        np.save(f'{datapath}/label.npy', label_np)
        print("Files saved successfully")
    except Exception as e:
        # Deliberately broad: the CSV copies already exist, so a failed .npy
        # export is reported rather than aborting the whole run.
        print("Fail to save npy")
        print(f"Error: {e}")
    

In [13]:
# Dataset keys; each one is also used as the directory name under data_clustering/.
cluster_data_name = ["hepatitis", "horse", "kidney", "mammo", "pima", "winconsin"]

# Measurement type of every feature column, listed in column order.
# Each list is indexed by column position, so its length has to match the
# number of feature columns of the corresponding dataset.
d = {
    # 19 columns
    "hepatitis": ["Numeric"] + ["Nominal"] * 12 + ["Numeric"] * 5 + ["Nominal"],

    # 22 columns
    "horse": (
        ["Nominal"] * 2
        + ["Numeric"] * 3
        + ["Ordinal"] * 9
        + ["Numeric"] + ["Ordinal"] * 2
        + ["Numeric"] * 2 + ["Ordinal"]
        + ["Numeric"] + ["Ordinal"]
    ),

    # 24 columns
    "kidney": (
        ["Numeric"] * 3
        + ["Ordinal"] * 2
        + ["Nominal"] * 4
        + ["Numeric"] * 9
        + ["Nominal"] * 6
    ),

    # 5 columns
    "mammo": ["Ordinal", "Numeric", "Ordinal", "Ordinal", "Ordinal"],

    # 8 columns
    "pima": ["Numeric"] * 8,

    # 9 columns. The list previously held 10 entries although the feature
    # matrix printed by this notebook has shape (699, 9); the surplus entry
    # would index past the last column in code that enumerates this list.
    "winconsin": ["Ordinal"] * 9,
}



In [10]:

cluster_data_name = ["hepatitis","horse","kidney","mammo","pima","winconsin"]
# `hepatitis` was never defined in this notebook (it only existed as leftover
# kernel state); use the column-type list stored in `d` instead.
# This call targets the two-argument (dataname, column_info) definition.
classify_and_encode_columns_np("hepatitis", d["hepatitis"])

hepatitis
(155, 19)


({'attribute': [{'type': 'Numeric'},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Nominal', 'values': [1, 2]},
   {'type': 'Numeric'},
   {'type': 'Numeric'},
   {'type': 'Numeric'},
   {'type': 'Numeric'},
   {'type': 'Numeric'},
   {'type': 'Nominal', 'values': [1, 2]}]},
 'column_types.json')

In [21]:
# Process every dataset described in `d` in one call. This uses the
# single-argument (dict-based) definition of classify_and_encode_columns_np,
# which writes data_clustering/<name>/column_info.json for each dataset.
classify_and_encode_columns_np(d)

Processing dataset: hepatitis
Shape: (155, 19)
data_clustering/hepatitis/column_info.json
Processing dataset: horse
Shape: (368, 22)
data_clustering/horse/column_info.json
Processing dataset: kidney
Shape: (400, 24)
data_clustering/kidney/column_info.json
Processing dataset: mammo
Shape: (961, 5)
data_clustering/mammo/column_info.json
Processing dataset: pima
Shape: (768, 8)
data_clustering/pima/column_info.json
Processing dataset: winconsin
Shape: (699, 9)
data_clustering/winconsin/column_info.json


In [20]:
import numpy as np
import pandas as pd
import json
from tabulate import tabulate

def _json_number(value):
    """Return ``value`` as a plain int when it is integral, otherwise as a float."""
    if isinstance(value, (int, np.integer)):
        return int(value)
    number = float(value)
    return int(number) if number.is_integer() else number


def classify_and_encode_columns_np(datasets_info):
    """Write a ``column_info.json`` description for every dataset in ``datasets_info``.

    For each ``dataname`` the feature matrix is loaded from
    ``data_clustering/<dataname>/feature.npy`` and one entry per column is
    written to ``data_clustering/<dataname>/column_info.json``:

    * ``{'type': 'Numeric'}`` for numeric columns,
    * ``{'type': 'Nominal' | 'Ordinal', 'values': [...]}`` for categorical
      columns, where ``values`` are the sorted distinct non-missing values.

    Parameters
    ----------
    datasets_info : dict
        Maps dataset name to a sequence of column types, indexed by column
        position.

    Raises
    ------
    ValueError
        If a column type is not ``'Numeric'``, ``'Nominal'`` or ``'Ordinal'``.
        Skipping such a column would shift every later entry.
    """
    categorical_types = ('Nominal', 'Ordinal')

    for dataname, column_info in datasets_info.items():
        # Define the data path and load the data
        datapath = f"data_clustering/{dataname}"
        data = np.load(datapath + "/feature.npy")
        print(f"Processing dataset: {dataname}")
        print(f"Shape: {data.shape}")

        # Wrap in a DataFrame to use dropna()/unique() per column
        df = pd.DataFrame(data)

        attribute_info = []
        for col in df.columns:
            col_type = column_info[col]

            if col_type == 'Numeric':
                attribute_info.append({'type': 'Numeric'})
            elif col_type in categorical_types:
                unique_vals = sorted(df[col].dropna().unique())
                # Convert NumPy scalars to native numbers for JSON. Integral
                # values become ints; non-integral values are kept as floats
                # instead of being truncated by int().
                attribute_info.append({
                    'type': col_type,
                    'values': [_json_number(v) for v in unique_vals],
                })
            else:
                raise ValueError(
                    f"Unknown column type {col_type!r} for column {col} "
                    f"of dataset {dataname!r}"
                )

        # Save the result to a JSON file
        output_json = {'attribute': attribute_info}
        json_output_path = f'data_clustering/{dataname}/column_info.json'
        print(json_output_path)
        with open(json_output_path, 'w') as f:
            json.dump(output_json, f, indent=4)


In [37]:
def stats_convert(column_info):
    """Translate a column description dict into the ``{"attribute": [...]}`` layout.

    Each value of ``column_info`` is either the string ``"numerical"`` or a
    one-key dict whose key is ``"ordinal"`` / ``"nominal"`` and whose value is
    a mapping; the mapping's values become the attribute's ``values`` list.
    Anything else yields an entry with an empty ``type``.

    Returns ``None`` when every column is ``"numerical"`` (including when
    ``column_info`` is empty).
    """
    specs = list(column_info.values())
    if all(spec == "numerical" for spec in specs):
        return None

    def describe(spec):
        # Numeric columns carry no value list.
        if spec == "numerical":
            return {'type': "Numeric"}
        entry = {'type': ''}
        if isinstance(spec, dict):
            # Only the first key of the dict decides the attribute kind.
            kind = next(iter(spec))
            if kind in ("ordinal", "nominal"):
                entry['type'] = kind.capitalize()
                entry['values'] = list(spec[kind].values())
        return entry

    return {"attribute": [describe(spec) for spec in specs]}

In [38]:
# Column description keyed by column index (as a string): either the literal
# "numerical" or {"nominal": {<category label>: <integer code>}}.
# NOTE(review): this rebinds `d`, which an earlier cell uses for the per-dataset
# column-type lists - confirm the shadowing is intended.
d = {
    "0": {"nominal": {"0": 0, "1": 1}},
    "1": "numerical",
    "2": "numerical",
    # Labels "1".."3" map to codes 0..2
    "3": {"nominal": {str(code + 1): code for code in range(3)}},
    # Labels "1".."14" map to codes 0..13
    "4": {"nominal": {str(code + 1): code for code in range(14)}},
    # Labels "1".."9" map to codes 0..8
    "5": {"nominal": {str(code + 1): code for code in range(9)}},
    "6": "numerical",
    "7": {"nominal": {"0": 0, "1": 1}},
    "8": {"nominal": {"0": 0, "1": 1}},
    "9": "numerical",
    "10": {"nominal": {"0": 0, "1": 1}},
    # Labels "1".."3" map to codes 0..2
    "11": {"nominal": {str(code + 1): code for code in range(3)}},
    "12": "numerical",
    "13": "numerical",
}

In [9]:
import numpy as np
import pandas as pd
import json

import numpy as np
import pandas as pd
import json

def _json_number(value):
    """Return ``value`` as a plain int when it is integral, otherwise as a float."""
    if isinstance(value, (int, np.integer)):
        return int(value)
    number = float(value)
    return int(number) if number.is_integer() else number


def classify_and_encode_columns_np(dataname, column_info):
    """Describe the columns of one dataset and write the result to ``column_types.json``.

    The feature matrix is loaded from ``data_clustering/<dataname>/feature.npy``.
    One entry is produced per column:

    * ``{'type': 'Numeric'}`` for numeric columns,
    * ``{'type': 'Nominal' | 'Ordinal', 'values': [...]}`` for categorical
      columns, where ``values`` are the sorted distinct non-missing values.

    Parameters
    ----------
    dataname : str
        Dataset directory name under ``data_clustering/``.
    column_info : sequence of str
        Column types, indexed by column position.

    Returns
    -------
    tuple
        ``(output_json, json_output_path)``. The path is always
        ``'column_types.json'`` in the current working directory, so every
        call overwrites the file written by the previous one.

    Raises
    ------
    ValueError
        If a column type is not ``'Numeric'``, ``'Nominal'`` or ``'Ordinal'``.
    """
    # Define the data path and load the data
    datapath = f"data_clustering/{dataname}"
    data = np.load(datapath + "/feature.npy")
    print(dataname)
    print(data.shape)

    # Wrap in a DataFrame to use dropna()/unique() per column
    df = pd.DataFrame(data)

    attribute_info = []
    for col in df.columns:
        col_type = column_info[col]

        if col_type == 'Numeric':
            attribute_info.append({'type': 'Numeric'})
        elif col_type in ('Nominal', 'Ordinal'):
            # Ordinal columns were previously dropped without notice, leaving
            # the attribute list shorter than the number of columns.
            unique_vals = sorted(df[col].dropna().unique())
            # Native numbers for JSON; non-integral values are kept as floats
            # instead of being truncated by int().
            attribute_info.append({
                'type': col_type,
                'values': [_json_number(v) for v in unique_vals],
            })
        else:
            raise ValueError(
                f"Unknown column type {col_type!r} for column {col} "
                f"of dataset {dataname!r}"
            )

    # Save the result to a JSON file
    output_json = {'attribute': attribute_info}
    json_output_path = 'column_types.json'

    with open(json_output_path, 'w') as f:
        json.dump(output_json, f, indent=4)

    return output_json, json_output_path



In [6]:
# Run the two-argument classify_and_encode_columns_np once per dataset name.
# NOTE(review): None is passed as column_info, but the two-argument definition
# visible in this notebook looks up column_info[col] for every column. The
# recorded output (lists of unique values) is not printed by that definition
# either, so this cell presumably ran against an earlier version - confirm.
# NOTE(review): the loop variable `data` stays bound at notebook level after
# the loop finishes.
for data in cluster_data_name:

    classify_and_encode_columns_np(data, None)

hepatitis
(155, 19)
[7.0, 20.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 64.0, 65.0, 66.0, 67.0, 69.0, 70.0, 72.0, 78.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[1.0, 2.0]
[0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.2, 2.3, 2.4, 2.5, 2.8, 2.9, 3.0, 3.2, 3.5, 3.9, 4.1, 4.2, 4.6, 4.8, 7.6, 8.0]
[26.0, 30.0, 34.0, 40.0, 44.0, 45.0, 46.0, 48.0, 50.0, 52.0, 53.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 62.0, 63.0, 65.0, 67.0, 68.0, 70.0, 71.0, 72.0, 74.0, 75.0, 76.0, 78.0, 80.0, 81.0, 82.0, 84.0, 85.0, 86.0, 89.0, 90.0, 92.0, 95.0, 96.0, 100.0, 102.0, 103.0, 107.0, 109.0, 110.0, 114.0, 115.0, 118.0, 119.0, 120.0, 123.0, 125.0, 126.0, 127.0, 130.0, 133.0, 135.0, 138.0, 139.0, 14

In [28]:
import json
import numpy as np

def generate_column_info(dataname, column_types, data=None, datapath=None):
    """Build a per-column description and write it to ``<datapath>/column_info.json``.

    Parameters
    ----------
    dataname : str
        Dataset name; used to derive the default ``datapath``.
    column_types : sequence of str
        ``"Numeric"``, ``"Ordinal"`` or ``"Nominal"`` for each column, in
        column order.
    data : numpy.ndarray, optional
        2-D feature matrix. When omitted it is loaded from
        ``<datapath>/feature.npy``.
    datapath : str, optional
        Dataset directory. Defaults to ``data_clustering/<dataname>``, the
        layout used by the other loaders in this notebook.

    Returns
    -------
    dict
        Maps the column index (as a string) to ``"Numeric"`` or to
        ``{"Ordinal" | "Nominal": {str(value): position}}``, where positions
        follow the sorted distinct non-missing values of the column.

    Raises
    ------
    ValueError
        If a column type is not one of the three supported names.
    """
    # The original relied on `data`, `datapath` and `column_info` already
    # existing as notebook globals; resolve them locally instead.
    if datapath is None:
        datapath = f"data_clustering/{dataname}"
    if data is None:
        data = np.load(f"{datapath}/feature.npy")
    data = np.asarray(data)

    column_info = {}
    for i, col_type in enumerate(column_types):
        if col_type == "Numeric":
            column_info[str(i)] = "Numeric"
        elif col_type in ("Ordinal", "Nominal"):
            # np.unique returns the distinct values already sorted. Ordinal
            # positions follow that order; for nominal columns the order is
            # only a stable way to assign codes.
            unique_vals = np.unique(data[:, i])
            if np.issubdtype(unique_vals.dtype, np.floating):
                # Missing values are not categories; set() also treated each
                # NaN as a distinct element.
                unique_vals = unique_vals[~np.isnan(unique_vals)]
            mapping = {str(val): idx for idx, val in enumerate(unique_vals)}
            column_info[str(i)] = {col_type: mapping}
        else:
            raise ValueError(f"Unknown column type {col_type!r} for column {i}")

    os.makedirs(datapath, exist_ok=True)
    json_path = f'{datapath}/column_info.json'

    with open(json_path, 'w') as json_file:
        json.dump(column_info, json_file, indent=4)

    return column_info