In [1]:
import pandas as pd
import numpy as np

# Synthetic enrolment data: 1000 (student, subject) rows drawn at random from
# 999 student IDs (Mã SV: SV1..SV999) and 19 subjects (Mã HP: HP1..HP19),
# each row carrying a time slot in 1..4.
# NOTE: the columns are sampled independently and with replacement, so the
# same (student, subject) pair can appear on more than one row.
np.random.seed(0)  # for reproducibility

data = pd.DataFrame({
    "Mã SV": np.random.choice([f"SV{i}" for i in range(1, 1000)], 1000),
    "Mã HP": np.random.choice([f"HP{j}" for j in range(1, 20)], 1000),
    "Time slot": np.random.choice([1, 2, 3, 4], 1000)
})

# Force two subjects to be large enough to be worth splitting.
# `.loc` slices are label-based and include BOTH endpoints, so the two ranges
# are kept disjoint: rows 0-19 -> HP1, rows 20-40 -> HP2.
data.loc[:19, "Mã HP"] = "HP1"
data.loc[20:40, "Mã HP"] = "HP2"

# Check data distribution
print(data["Mã HP"].value_counts())


Mã HP
HP2     69
HP1     67
HP18    64
HP17    63
HP8     59
HP6     57
HP5     57
HP16    56
HP11    55
HP12    53
HP4     51
HP15    49
HP19    47
HP13    46
HP3     45
HP14    44
HP7     43
HP9     40
HP10    35
Name: count, dtype: int64


In [2]:
# Minimum number of students every cluster (sub-class) must keep
min_hp_sv = 25  # threshold chosen for demonstration

# One row per subject ("Mã HP"):
#   sv        - number of DISTINCT students enrolled in the subject
#   Time slot - time slot found on the subject's first row in `data`
# `nunique` is used instead of `count` so that a repeated (student, subject)
# row is not counted as an additional student.
hp = (
    data.groupby("Mã HP")
    .agg({"Mã SV": "nunique", "Time slot": "first"})
    .rename(columns={"Mã SV": "sv"})
)

# A subject is splittable only if it can fill at least two clusters of the
# minimum size, i.e. it has >= 2 * min_hp_sv students.
split_set = hp[hp["sv"] >= min_hp_sv * 2].index.tolist()
print("Subjects to split:", split_set)

Subjects to split: ['HP1', 'HP11', 'HP12', 'HP16', 'HP17', 'HP18', 'HP2', 'HP4', 'HP5', 'HP6', 'HP8']


In [3]:
from sklearn.preprocessing import StandardScaler
from k_means_constrained import KMeansConstrained
from sklearn.metrics import silhouette_score

# Student x subject indicator matrix: 1 if the student has at least one
# enrolment row for the subject, 0 otherwise. `crosstab` counts rows, so a
# repeated (student, subject) pair would yield 2+; clip it back to a 0/1 flag
# so that every enrolled student is selected by the `== 1` test below.
cm = pd.crosstab(data["Mã SV"], data["Mã HP"]).clip(upper=1)

for i in split_set:
    # Re-run guard: once a subject has been split, its rows in `data` are
    # relabelled "<subject>_<k>", so the original label is no longer a column
    # of `cm` when this cell is executed a second time.
    if i not in cm.columns:
        continue

    # Students taking subject `i`, described by the OTHER subjects they take
    X = cm.loc[cm[i] == 1].drop(columns=i)

    # Drop uninformative features: subjects taken by none of these students
    # (all 0s) or by every one of them (all 1s).
    X = X.loc[:, (X.max(axis=0) > 0) & (X.min(axis=0) < 1)]
    if X.shape[1] == 0:
        # These students share no distinguishing subject; nothing to cluster on
        continue

    # Bound the number of clusters by the rows actually being clustered:
    # KMeansConstrained needs n_clusters * size_min <= n_samples.
    n_students = len(X)
    max_nc = min(n_students // min_hp_sv, 10)
    if max_nc < 2:
        continue

    # Scaling does not depend on the number of clusters, so do it once
    X_scaled = StandardScaler().fit_transform(X)

    # Best clustering found so far
    best_y = None
    best_nc = 0
    best_score = float("-inf")

    # K-means constrained clustering for each feasible cluster count
    for nc in range(2, max_nc + 1):
        kmeans = KMeansConstrained(
            n_clusters=nc,
            size_min=min_hp_sv,
            n_jobs=-1,
            random_state=0,  # same split every time the cell is run
        )
        yy = kmeans.fit_predict(X_scaled)

        # Score in the same (scaled) feature space the clusters were fit in
        score = silhouette_score(X_scaled, yy)

        # Select the best clustering result based on silhouette score
        if score > best_score:
            best_nc = nc
            best_score = score
            best_y = yy

    if best_y is None:
        # No candidate produced a usable score; leave the subject unsplit
        continue

    print(f"Splitting {i} ({n_students} students) into {best_nc} clusters")

    # Cluster label per student, kept separate from the feature matrix
    labels = pd.Series(best_y, index=X.index)

    # Update original data with new cluster labels
    for s in range(best_nc):
        members = labels.index[labels == s]
        data.loc[(data["Mã HP"] == i) & (data["Mã SV"].isin(members)), "Mã HP"] = f"{i}_{s}"


Splitting HP1 (67 students) into 2 clusters
Splitting HP11 (55 students) into 2 clusters
Splitting HP12 (53 students) into 2 clusters
Splitting HP16 (56 students) into 2 clusters
Splitting HP17 (63 students) into 2 clusters
Splitting HP18 (64 students) into 2 clusters
Splitting HP2 (69 students) into 2 clusters
Splitting HP4 (51 students) into 2 clusters
Splitting HP5 (57 students) into 2 clusters
Splitting HP6 (57 students) into 2 clusters
Splitting HP8 (59 students) into 2 clusters
