# K-means clustering based threshold and Labeling

# Activity 2 bins

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Function to read data from Excel
def read_data(file_path, sheet_name):
    try:
        data = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
        print(f"Data read from sheet '{sheet_name}' (first 5 rows):")
        print(data.head())
        if data.empty:
            print(f"Warning: The sheet '{sheet_name}' is empty or contains only headers.")
        return data
    except Exception as e:
        print(f"Error reading '{sheet_name}' from {file_path}: {e}")
        return None

# Function to perform K-means clustering and determine thresholds
def determine_kmeans_thresholds(data, num_clusters):
    flat_data = data.values.flatten().reshape(-1, 1)
    
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(flat_data)
    
    # Extract cluster centers (thresholds)
    thresholds = np.sort(kmeans.cluster_centers_.flatten())
    print(f"Determined thresholds based on K-means clustering with {num_clusters} clusters: {thresholds}")
    return thresholds

# Function to classify data based on K-means thresholds
def classify_data(data, thresholds):
    def classify(value):
        if value < thresholds[0]:
            return "low"
        else:
            return "high"
    
    # Apply classification to each element
    classified_data = data.applymap(classify)
    return classified_data

# Function to save classified labels to a new sheet in the same Excel file
def save_classified_data(classified_data, file_path, sheet_name):
    try:
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            classified_data.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Classified data successfully saved to sheet '{sheet_name}' in {file_path}")
    except Exception as e:
        print(f"Error saving classified data to {file_path}: {e}")

# Main function
def main():
    file_path = r'C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Anderson_group_Data6_2013\Activity\InVitro_Activity.xlsx'  # Input file path
    normalized_sheet_name = "Min-Max Normalized"  # Sheet containing normalized data
    classified_sheet_name = "Min-Max 2 bins Kmeans"  # Sheet to save classified labels
    num_clusters = 2  # Number of clusters for K-means
    
    # Read normalized data
    data = read_data(file_path, normalized_sheet_name)
    if data is None or data.empty:
        print("Data could not be read or is empty. Exiting.")
        return

    # Determine thresholds based on K-means clustering
    thresholds = determine_kmeans_thresholds(data, num_clusters)

    # Classify data based on K-means thresholds
    classified_data = classify_data(data, thresholds)
    print("Classified Data (first 5 rows):")
    print(classified_data.head())
    
    # Save classified labels
    save_classified_data(classified_data, file_path, classified_sheet_name)

if __name__ == "__main__":
    main()


Data read from sheet 'Min-Max Normalized' (first 5 rows):
   Normalized Data
0         0.226427
1         0.083525
2         0.047840
3         0.117302
4         0.122228


  super()._check_params_vs_input(X, default_n_init=10)


Determined thresholds based on K-means clustering with 2 clusters: [0.06883671 0.67386713]
Classified Data (first 5 rows):
  Normalized Data
0            high
1            high
2             low
3            high
4            high
Classified data successfully saved to sheet 'Min-Max 2 bins Kmeans' in C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Anderson_group_Data6_2013\Activity\InVitro_Activity.xlsx


# Activity 4 bins

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Function to read data from Excel
def read_data(file_path, sheet_name):
    try:
        data = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
        print(f"Data read from sheet '{sheet_name}' (first 5 rows):")
        print(data.head())
        if data.empty:
            print(f"Warning: The sheet '{sheet_name}' is empty or contains only headers.")
        return data
    except Exception as e:
        print(f"Error reading '{sheet_name}' from {file_path}: {e}")
        return None

# Function to perform K-means clustering and determine thresholds
def determine_kmeans_thresholds(data, num_clusters):
    flat_data = data.values.flatten().reshape(-1, 1)
    
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(flat_data)
    
    # Extract cluster centers (thresholds)
    thresholds = np.sort(kmeans.cluster_centers_.flatten())
    print(f"Determined thresholds based on K-means clustering with {num_clusters} clusters: {thresholds}")
    return thresholds

# Function to classify data based on K-means thresholds
def classify_data(data, thresholds):
    def classify(value):
        if value < thresholds[0]:
            return "low"
        elif value < thresholds[1]:
            return "low-mid"
        elif value < thresholds[2]:
            return "mid-high"
        else:
            return "high"
    
    # Apply classification to each element
    classified_data = data.applymap(classify)
    return classified_data

# Function to save classified labels to a new sheet in the same Excel file
def save_classified_data(classified_data, file_path, sheet_name):
    try:
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            classified_data.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Classified data successfully saved to sheet '{sheet_name}' in {file_path}")
    except Exception as e:
        print(f"Error saving classified data to {file_path}: {e}")

# Main function
def main():
    file_path = r'C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Anderson_group_Data6_2013\Activity\InVitro_Activity.xlsx'  # Input file path
    normalized_sheet_name = "Min-Max Normalized"  # Sheet containing normalized data
    classified_sheet_name = "Min-Max 4 bins Kmeans"  # Sheet to save classified labels
    num_clusters = 4  # Number of clusters for K-means
    
    # Read normalized data
    data = read_data(file_path, normalized_sheet_name)
    if data is None or data.empty:
        print("Data could not be read or is empty. Exiting.")
        return

    # Determine thresholds based on K-means clustering
    thresholds = determine_kmeans_thresholds(data, num_clusters)

    # Classify data based on K-means thresholds
    classified_data = classify_data(data, thresholds)
    print("Classified Data (first 5 rows):")
    print(classified_data.head())
    
    # Save classified labels
    save_classified_data(classified_data, file_path, classified_sheet_name)

if __name__ == "__main__":
    main()


Data read from sheet 'Min-Max Normalized' (first 5 rows):
   Normalized Data
0         0.226427
1         0.083525
2         0.047840
3         0.117302
4         0.122228


  super()._check_params_vs_input(X, default_n_init=10)


Determined thresholds based on K-means clustering with 4 clusters: [0.03269253 0.18839056 0.56515617 1.        ]
Classified Data (first 5 rows):
  Normalized Data
0        mid-high
1         low-mid
2         low-mid
3         low-mid
4         low-mid
Classified data successfully saved to sheet 'Min-Max 4 bins Kmeans' in C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Anderson_group_Data6_2013\Activity\InVitro_Activity.xlsx


# Cell Viability 2 bins

In [10]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Function to read data from Excel
def read_data(file_path, sheet_name):
    try:
        data = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
        print(f"Data read from sheet '{sheet_name}' (first 5 rows):")
        print(data.head())
        if data.empty:
            print(f"Warning: The sheet '{sheet_name}' is empty or contains only headers.")
        return data
    except Exception as e:
        print(f"Error reading '{sheet_name}' from {file_path}: {e}")
        return None

# Function to perform K-means clustering and determine thresholds
def determine_kmeans_thresholds(data, num_clusters):
    flat_data = data.values.flatten().reshape(-1, 1)
    
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(flat_data)
    
    # Extract cluster centers (thresholds)
    thresholds = np.sort(kmeans.cluster_centers_.flatten())
    print(f"Determined thresholds based on K-means clustering with {num_clusters} clusters: {thresholds}")
    return thresholds

# Function to classify data based on K-means thresholds
def classify_data(data, thresholds):
    def classify(value):
        if value < thresholds[0]:
            return "mid-high"
        else:
            return "high"
    
    # Apply classification to each element
    classified_data = data.applymap(classify)
    return classified_data

# Function to save classified labels to a new sheet in the same Excel file
def save_classified_data(classified_data, file_path, sheet_name):
    try:
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            classified_data.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Classified data successfully saved to sheet '{sheet_name}' in {file_path}")
    except Exception as e:
        print(f"Error saving classified data to {file_path}: {e}")

# Main function
def main():
    file_path = r'C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data5_2021\Cell_Viability\InVitro_Cell_Viability.xlsx'  # Input file path
    normalized_sheet_name = "Min-Max Normalized"  # Sheet containing normalized data
    classified_sheet_name = "Min-Max 2 bins Kmeans"  # Sheet to save classified labels
    num_clusters = 2  # Number of clusters for K-means
    
    # Read normalized data
    data = read_data(file_path, normalized_sheet_name)
    if data is None or data.empty:
        print("Data could not be read or is empty. Exiting.")
        return

    # Determine thresholds based on K-means clustering
    thresholds = determine_kmeans_thresholds(data, num_clusters)

    # Classify data based on K-means thresholds
    classified_data = classify_data(data, thresholds)
    print("Classified Data (first 5 rows):")
    print(classified_data.head())
    
    # Save classified labels
    save_classified_data(classified_data, file_path, classified_sheet_name)

if __name__ == "__main__":
    main()


Data read from sheet 'Min-Max Normalized' (first 5 rows):
   Normalized Data
0         0.660780
1         0.825996
2         0.764480
3         0.908604
4         0.825996


  super()._check_params_vs_input(X, default_n_init=10)


Determined thresholds based on K-means clustering with 2 clusters: [0.39962084 0.79191963]
Classified Data (first 5 rows):
  Normalized Data
0            high
1            high
2            high
3            high
4            high
Classified data successfully saved to sheet 'Min-Max 2 bins Kmeans' in C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data5_2021\Cell_Viability\InVitro_Cell_Viability.xlsx


# Cell Viability 4 bins

In [11]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Function to read data from Excel
def read_data(file_path, sheet_name):
    try:
        data = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
        print(f"Data read from sheet '{sheet_name}' (first 5 rows):")
        print(data.head())
        if data.empty:
            print(f"Warning: The sheet '{sheet_name}' is empty or contains only headers.")
        return data
    except Exception as e:
        print(f"Error reading '{sheet_name}' from {file_path}: {e}")
        return None

# Function to perform K-means clustering and determine thresholds
def determine_kmeans_thresholds(data, num_clusters):
    flat_data = data.values.flatten().reshape(-1, 1)
    
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(flat_data)
    
    # Extract cluster centers (thresholds)
    thresholds = np.sort(kmeans.cluster_centers_.flatten())
    print(f"Determined thresholds based on K-means clustering with {num_clusters} clusters: {thresholds}")
    return thresholds

# Function to classify data based on K-means thresholds
def classify_data(data, thresholds):
    def classify(value):
        if value < thresholds[0]:
            return "low"
        elif value < thresholds[1]:
            return "low-mid"
        elif value < thresholds[2]:
            return "mid-high"
        else:
            return "high"
    
    # Apply classification to each element
    classified_data = data.applymap(classify)
    return classified_data

# Function to save classified labels to a new sheet in the same Excel file
def save_classified_data(classified_data, file_path, sheet_name):
    try:
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            classified_data.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"Classified data successfully saved to sheet '{sheet_name}' in {file_path}")
    except Exception as e:
        print(f"Error saving classified data to {file_path}: {e}")

# Main function
def main():
    file_path = r'C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data5_2021\Cell_Viability\InVitro_Cell_Viability.xlsx'  # Input file path
    normalized_sheet_name = "Min-Max Normalized"  # Sheet containing normalized data
    classified_sheet_name = "Min-Max 4 bins Kmeans"  # Sheet to save classified labels
    num_clusters = 4  # Number of clusters for K-means
    
    # Read normalized data
    data = read_data(file_path, normalized_sheet_name)
    if data is None or data.empty:
        print("Data could not be read or is empty. Exiting.")
        return

    # Determine thresholds based on K-means clustering
    thresholds = determine_kmeans_thresholds(data, num_clusters)

    # Classify data based on K-means thresholds
    classified_data = classify_data(data, thresholds)
    print("Classified Data (first 5 rows):")
    print(classified_data.head())
    
    # Save classified labels
    save_classified_data(classified_data, file_path, classified_sheet_name)

if __name__ == "__main__":
    main()


Data read from sheet 'Min-Max Normalized' (first 5 rows):
   Normalized Data
0         0.660780
1         0.825996
2         0.764480
3         0.908604
4         0.825996


  super()._check_params_vs_input(X, default_n_init=10)


Determined thresholds based on K-means clustering with 4 clusters: [0.11695106 0.43120685 0.69375981 0.84864318]
Classified Data (first 5 rows):
  Normalized Data
0        mid-high
1            high
2            high
3            high
4            high
Classified data successfully saved to sheet 'Min-Max 4 bins Kmeans' in C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data5_2021\Cell_Viability\InVitro_Cell_Viability.xlsx
