In [1]:
import pandas as pd
import glob
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import concurrent.futures
import time

In [2]:
def load_csv(file, **read_csv_kwargs):
    """Read one CSV file into a pandas DataFrame.

    Parameters
    ----------
    file
        Anything ``pd.read_csv`` accepts as its first argument (the notebook
        passes path strings produced by ``glob.glob``).
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``header=None``). With none given, the call is exactly
        ``pd.read_csv(file)``, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``.

    Notes
    -----
    NOTE(review): with pandas' default header inference, the first line of
    each file is consumed as column labels. The shard names read by this
    notebook (``0-m-00009`` and similar) look like headerless part files --
    TODO confirm. If they are, pass ``header=None``; otherwise one data row
    per shard ends up as column names instead of data.
    """
    return pd.read_csv(file, **read_csv_kwargs)

In [2]:
%ls
# Step 1: Read all CSV files into a single DataFrame
file_paths = glob.glob("experiments/data/census/train_census_enc.csv/*")  # Adjust the pattern as needed
with concurrent.futures.ThreadPoolExecutor() as executor:
    df_list = list(executor.map(load_csv, file_paths))
df = pd.concat(df_list, ignore_index=True)
df.shape

[0m[01;34mexperiments[0m/  kmeans.dml.ipynb


(2458273, 378)

In [3]:
# Display the list of paths that the loading cell stored in `file_paths`.
file_paths

['experiments/data/census/train_census_enc.csv/0-m-00009',
 'experiments/data/census/train_census_enc.csv/0-m-00006',
 'experiments/data/census/train_census_enc.csv/0-m-00011',
 'experiments/data/census/train_census_enc.csv/0-m-00001',
 'experiments/data/census/train_census_enc.csv/0-m-00002',
 'experiments/data/census/train_census_enc.csv/0-m-00000',
 'experiments/data/census/train_census_enc.csv/0-m-00005',
 'experiments/data/census/train_census_enc.csv/0-m-00004',
 'experiments/data/census/train_census_enc.csv/0-m-00010',
 'experiments/data/census/train_census_enc.csv/0-m-00008',
 'experiments/data/census/train_census_enc.csv/0-m-00007',
 'experiments/data/census/train_census_enc.csv/0-m-00003']

In [5]:
# Step 2: Preprocessing -- standardize the features.
# No rows are dropped for missing values in this step (a df.dropna() pass was
# considered but is disabled).
scaler = StandardScaler()
# Fit the scaler on the full frame, then transform that same frame;
# fit() returns the scaler, so the two calls can be chained.
df_scaled = scaler.fit(df).transform(df)

In [7]:
# Step 3: K-Means clustering on the standardized features.
k = 16  # number of clusters to fit
kmeans = KMeans(
    n_clusters=k,
    random_state=42,  # fixed seed for the centroid initialisation
    max_iter=28,      # hard cap on iterations per run
    tol=1e-17,        # very small tolerance -- presumably so max_iter, not tol, ends the run
    verbose=True,     # print the inertia after every iteration
)
kmeans.fit(df_scaled)
# One row per cluster centre, in standardized feature space.
print(kmeans.cluster_centers_)

Initialization complete
Iteration 0, inertia 214910817.4941845.
Iteration 1, inertia 158307702.60867172.
Iteration 2, inertia 152534502.57173908.
Iteration 3, inertia 150436712.24376184.
Iteration 4, inertia 149325912.2557903.
Iteration 5, inertia 148117296.79167384.
Iteration 6, inertia 146889139.31222188.
Iteration 7, inertia 146544900.119735.
Iteration 8, inertia 146448517.68799174.
Iteration 9, inertia 146327097.3923279.
Iteration 10, inertia 146154178.81179875.
Iteration 11, inertia 145912827.7683926.
Iteration 12, inertia 145579443.94535673.
Iteration 13, inertia 145493195.95616245.
Iteration 14, inertia 145487311.470254.
Iteration 15, inertia 145484565.16992974.
Iteration 16, inertia 145482886.30261904.
Iteration 17, inertia 145481744.17636025.
Iteration 18, inertia 145481096.53001603.
Iteration 19, inertia 145480792.16373014.
Iteration 20, inertia 145480709.32047847.
Iteration 21, inertia 145480690.05050677.
Iteration 22, inertia 145480683.85522068.
Iteration 23, inertia 145480

In [3]:
# Accumulators for the timed benchmark cell. Re-running this cell resets both.
stats_per_run = dict()  # maps "<run_id>_<k>" to that run's timing dict
run_id = 0              # incremented once after every timed run

In [7]:
# Timed end-to-end run: read -> concat -> scale -> k-means.
# Every value stored in `stats` is wall-clock time in milliseconds
# (a difference of time.perf_counter() readings, in seconds, times 1000).
# Requires `run_id` and `stats_per_run` to have been initialised by an earlier
# cell, and `load_csv` to be defined.
stats = {}
start = time.perf_counter()
# glob returns matches in arbitrary order. Sort them so every run builds `df`
# with the same row order; otherwise the seeded k-means runs being compared
# would not see identical input.
file_paths = sorted(glob.glob("experiments/data/census/train_census_enc.csv/*"))
if not file_paths:
    # Clearer than pd.concat's "No objects to concatenate" further down.
    raise FileNotFoundError(
        "no files matched experiments/data/census/train_census_enc.csv/*"
    )
with concurrent.futures.ThreadPoolExecutor() as executor:
    df_list = list(executor.map(load_csv, file_paths))
t0 = time.perf_counter()
stats["Reading"] = (t0 - start) * 1000
df = pd.concat(df_list, ignore_index=True)
t1 = time.perf_counter()
stats["Concat"] = (t1 - t0) * 1000
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)  # Normalize features
t2 = time.perf_counter()
stats["Scaling"] = (t2 - t1) * 1000
k = 16  # Choose the number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, max_iter=28, tol=1e-17, verbose=False)
kmeans.fit(df_scaled)
t3 = time.perf_counter()
stats["Kmeans"] = (t3 - t2) * 1000
stats["Total"] = (t3 - start) * 1000
# Record this run under "<run_id>_<k>", then advance the counter so the next
# execution of this cell gets a fresh key.
stats_per_run[f"{run_id}_{k}"] = stats
run_id += 1
stats

{'Reading': 25600.95548200002,
 'Concat': 1531.5822889999708,
 'Scaling': 8705.232496000008,
 'Kmeans': 24391.446082000017,
 'Total': 60229.21634900001}

In [8]:
# Display the timing dicts recorded so far, keyed by "<run_id>_<k>".
stats_per_run

{'0_16': {'Reading': 37600.20683299999,
  'Concat': 1538.2471089999967,
  'Scaling': 8690.557845000001,
  'Kmeans': 25011.119250000007,
  'Total': 72840.13103699998},
 '1_16': {'Reading': 31060.696595000023,
  'Concat': 1529.7703019999744,
  'Scaling': 9140.093003999993,
  'Kmeans': 25165.067292999993,
  'Total': 66895.62719399999},
 '2_16': {'Reading': 25600.95548200002,
  'Concat': 1531.5822889999708,
  'Scaling': 8705.232496000008,
  'Kmeans': 24391.446082000017,
  'Total': 60229.21634900001}}