In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [None]:
# Load the exported report workbook; the path is relative to the notebook's
# working directory.
str_report_path = '../data/KNOW_19_07_2024_Report - ADL 04SEP2024.xlsx'
pd_df_voice_data = pd.read_excel(str_report_path)

In [None]:
# Display the table exactly as it was read from the Excel report.
pd_df_voice_data

In [None]:
# Exclude the first 11 columns (positions 0-10) from the clustering input.
# NOTE(review): presumably these are identifier/metadata columns rather than
# features -- confirm against the report layout.
index_non_feature_columns = pd_df_voice_data.columns[:11]
pd_df_voice_data_for_clustering = pd_df_voice_data.drop(columns=index_non_feature_columns)

In [None]:
# Display the columns that remain after dropping the first 11 columns.
pd_df_voice_data_for_clustering

In [None]:
# Mean imputation: every missing entry is replaced by the mean of its own column.
pd_series_column_means = pd_df_voice_data_for_clustering.mean()
pd_df_voice_data_for_clustering = pd_df_voice_data_for_clustering.fillna(value=pd_series_column_means)

In [None]:
# Standardise each feature column with StandardScaler (centre on the column
# mean, divide by the column standard deviation).
# NOTE(review): despite the `pd_df_` prefix, `transform` returns a NumPy array
# unless scikit-learn's pandas output has been enabled elsewhere -- confirm.
scaler_object = StandardScaler()
scaler_object.fit(pd_df_voice_data_for_clustering)
pd_df_features_scaled = scaler_object.transform(pd_df_voice_data_for_clustering)

In [None]:
# Elbow search: fit KMeans for k = 1 .. num_max_clusters and record the inertia
# (within-cluster sum of squared distances) of each fit.
num_max_clusters = 10
list_inertias = []
list_k_values = list(range(1, num_max_clusters + 1))

for k in list_k_values:
    # n_init is pinned explicitly because its default differs between
    # scikit-learn releases; random_state keeps the initialisation reproducible.
    kmeans_object = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Fit on the standardised features: KMeans uses Euclidean distance, so on
    # the unscaled frame the columns with the largest numeric range would
    # dominate the inertia and therefore the shape of the elbow curve.
    kmeans_object.fit(pd_df_features_scaled)
    list_inertias.append(kmeans_object.inertia_)

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
fig_elbow, ax_elbow = plt.subplots(figsize=(10, 6))
ax_elbow.plot(list_k_values, list_inertias, 'bo-')
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('Inertia')
# A logarithmic y-axis (ax_elbow.set_yscale('log')) is intentionally left off.
ax_elbow.set_title('Elbow Method for Optimal k')
ax_elbow.grid(True)
plt.show()

In [None]:
# Final model: two clusters, fitted on the standardised features so that every
# feature contributes on a comparable scale to the Euclidean distances KMeans
# minimises (the unscaled frame would let large-range columns dominate).
# n_init is pinned explicitly because its default differs between scikit-learn
# releases; random_state keeps the result reproducible.
kmeans_object = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans_object.fit(pd_df_features_scaled)

# labels_ holds one label per fitted row, in row order, so it can be attached
# to the full table as a new column.
pd_df_voice_data['Cluster'] = kmeans_object.labels_

In [None]:
# Display the full table, which now includes the 'Cluster' column.
pd_df_voice_data

In [None]:
# reorder columns so that Cluster is the first column
# Move 'Cluster' to the front; all other columns keep their relative order.
list_remaining_columns = [
    str_column for str_column in pd_df_voice_data.columns if str_column != 'Cluster'
]
pd_df_voice_data = pd_df_voice_data[['Cluster', *list_remaining_columns]]

In [None]:
# Display the table with 'Cluster' as the first column.
pd_df_voice_data

In [None]:
# Show only the rows assigned to cluster 0.
mask_is_cluster_0 = pd_df_voice_data['Cluster'] == 0
pd_df_voice_data[mask_is_cluster_0]

In [None]:
# Two-sample t-test per feature, comparing cluster 0 against cluster 1.
# NOTE(review): the clusters were derived from these same features, so the
# p-values are likely optimistic -- treat them as a ranking aid, not as
# independent evidence.
list_test_statistics = []
list_p_values = []

# Split the rows by cluster once instead of re-grouping for every feature.
pd_df_cluster_0 = pd_df_voice_data[pd_df_voice_data['Cluster'] == 0]
pd_df_cluster_1 = pd_df_voice_data[pd_df_voice_data['Cluster'] == 1]

for str_feature in pd_df_voice_data_for_clustering.columns:
    # The values come from the raw (un-imputed) table, so missing entries are
    # dropped here; otherwise a single NaN makes ttest_ind return NaN for both
    # the statistic and the p-value of that feature.
    values_cluster_0 = pd_df_cluster_0[str_feature].dropna().values
    values_cluster_1 = pd_df_cluster_1[str_feature].dropna().values
    statistic, p_val = stats.ttest_ind(values_cluster_0, values_cluster_1)
    list_test_statistics.append(statistic)
    list_p_values.append(p_val)

In [None]:
# One row per feature with its t statistic and p-value, strongest separation first.
# The t statistic is signed, and its sign only reflects which cluster happens to
# be labelled 0, so rows are ranked by absolute value; the signed value is kept
# in the column itself.
pd_df_stat_importance = pd.DataFrame({
    'Feature': pd_df_voice_data_for_clustering.columns,
    'Test Statistic': list_test_statistics,
    'P_Value': list_p_values
}).sort_values(
    'Test Statistic',
    key=lambda pd_series_statistic: pd_series_statistic.abs(),
    ascending=False,
)

In [None]:
# Display the per-feature test statistics and p-values.
pd_df_stat_importance

In [None]:
# Scatter of two features, coloured by cluster label. Axis labels and a title
# are set so the figure can be read on its own.
str_x_feature = 'READ_SYL_COUNT'
str_y_feature = 'READ_SPEECH_PERCENT'

fig_scatter, ax_scatter = plt.subplots()
ax_scatter.scatter(
    pd_df_voice_data[str_x_feature],
    pd_df_voice_data[str_y_feature],
    c=pd_df_voice_data['Cluster'],
)
ax_scatter.set_xlabel(str_x_feature)
ax_scatter.set_ylabel(str_y_feature)
ax_scatter.set_title(f'{str_y_feature} vs {str_x_feature}, coloured by cluster')
plt.show()