## t-SNE with all data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

In [None]:
# Load the semicolon-separated CSV into the working DataFrame.
data = pd.read_csv(
    'Zephyr Data Emo with new categories.csv',
    sep=';',
    engine="python",
)

In [None]:
# Replace every missing value with 0, then use the 'Time' column as the row index.
data = data.fillna(0).set_index('Time')
# Parse the index from day.month.year hour:minute:second text into datetimes.
data.index = pd.to_datetime(data.index, format='%d.%m.%Y %H:%M:%S')

In [None]:
# Remove the rows labelled 'papper work' ...
data = data.loc[data['Activities Detailed'].ne('papper work')]
# ... then keep only the rows whose HRConfidence is at least 20.
data = data.loc[data['HRConfidence'].ge(20)]

In [None]:
# Columns whose values are written with a decimal comma ("1,5") and therefore
# have to be rewritten with a decimal point before they can be cast to float.
decimal_comma_columns = [
    'Activity',
    'PeakAcceleration',
    'ECGAmplitude',
    'ECGNoise',
    'CoreTemp',
    'AvForceDevRate',
    'AvStepImpulse',
    'AvStepPeriod',
]

for column_name in decimal_comma_columns:
    # Cast to str first: the `.str` accessor yields NaN for non-string cells
    # (e.g. the integer 0 written by an earlier fillna(0)) and raises on a
    # column that is already numeric. Going through str keeps those values
    # and makes this cell safe to re-run.
    data[column_name] = (
        data[column_name]
        .astype(str)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

In [None]:
# Columns left out of the t-SNE input because they don't provide important
# information.
columns_to_exclude = [
    'Year', 'Month', 'Weekday', 'Hour', 'Date', 'Activities',
    'Name of the volunteer', 'PeakAcceleration', 'HRConfidence',
    'AvForceDevRate', 'AvStepImpulse', 'WalkSteps', 'RunSteps', 'Bounds',
    'CoreTemp', 'MinorImpacts', 'MajorImpacts',
]
numerical_columns = data.drop(columns=columns_to_exclude)

# Numeric and object-typed columns of the remaining frame (the object dtype is
# included so that the "Activities Detailed" column is retained).
numeric_columns = numerical_columns.select_dtypes(include=[np.number, "object"])

# Keep a column only if at least one of its values is something other than
# 0 or 1, i.e. drop the columns that contain nothing but 0s and 1s.
non_binary_columns_tsne = numerical_columns.loc[
    :, (~numerical_columns.isin([0, 1])).any()
]

In [None]:
# Target: the detailed activity label of every row.
y_all = non_binary_columns_tsne['Activities Detailed']

# Features: every remaining column, passed through feature_normalize.
# NOTE(review): feature_normalize is not defined in the visible part of this
# notebook — confirm it is defined in an earlier cell before running this one.
X_all = feature_normalize(
    non_binary_columns_tsne.drop('Activities Detailed', axis=1)
)

In [None]:
def assign_labels_to_clusters(df_kmeans):
    """Add an 'assigned_cluster' column holding each label's dominant cluster.

    For every distinct value in the 'label' column, the cluster in which that
    label occurs most often is determined, and every row carrying the label
    receives that cluster id in a new 'assigned_cluster' column. When a label
    is equally frequent in several clusters, the highest cluster id is used.

    Parameters
    ----------
    df_kmeans : pandas.DataFrame
        Frame with at least the columns 'label' and 'cluster'. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_kmeans`` object, with 'assigned_cluster' added.
    """
    # Number of rows for every (label, cluster) combination.
    label_cluster_counts = (
        df_kmeans.groupby(['label', 'cluster'])
        .size()
        .reset_index(name='count')
    )

    # Order by count, then cluster id, and keep the last row of each label:
    # that row has the label's maximum count, with ties resolved towards the
    # highest cluster id. Sorting on these two numeric columns only (not on
    # 'label') keeps this working for labels of mixed types.
    dominant_clusters = (
        label_cluster_counts
        .sort_values(['count', 'cluster'])
        .drop_duplicates('label', keep='last')
    )

    # Map each label to its dominant cluster and broadcast it to every row.
    label_to_cluster = dict(
        zip(dominant_clusters['label'], dominant_clusters['cluster'])
    )
    df_kmeans['assigned_cluster'] = df_kmeans['label'].map(label_to_cluster)

    return df_kmeans

def _tsne_iteration_kwarg(n_iter):
    """Return the TSNE keyword argument that carries the iteration count.

    scikit-learn renamed ``TSNE(n_iter=...)`` to ``max_iter``; use whichever
    name the installed version accepts so the call works on either side of
    the rename.
    """
    import inspect

    accepted = inspect.signature(TSNE).parameters
    keyword = 'max_iter' if 'max_iter' in accepted else 'n_iter'
    return {keyword: n_iter}


def perform_tsne_with_kmeans(X_data, y_data, perplexities, n_clusters_range, n_iter=1000, img_name_prefix='t-sne'):
    """Plot t-SNE embeddings and K-Means clusterings of those embeddings.

    For every perplexity, ``X_data`` is embedded with t-SNE and drawn as a
    scatter plot coloured and styled by label. For every cluster count,
    K-Means (``random_state=42``) is then fitted on the embedding, the
    clustering is plotted, and the labels whose dominant cluster is each
    cluster are printed. Every figure is saved as a PNG in the current
    working directory and shown.

    Parameters
    ----------
    X_data : array-like
        Feature matrix passed to ``TSNE.fit_transform``.
    y_data : array-like
        One label per row of ``X_data``. Only the values are used; any pandas
        index is ignored so that duplicated index entries cannot interfere
        with plotting.
    perplexities : iterable
        Perplexity values to run t-SNE with.
    n_clusters_range : iterable of int
        Cluster counts to run K-Means with (applied for every perplexity).
    n_iter : int, default 1000
        Number of t-SNE optimisation iterations; also shown in plot titles
        and file names.
    img_name_prefix : str, default 't-sne'
        Prefix of the saved image file names.

    Returns
    -------
    None
    """
    labels = np.asarray(y_data)
    # Materialise once: the cluster counts are reused for every perplexity,
    # which a one-shot iterator would not survive.
    n_clusters_range = list(n_clusters_range)
    unique_labels = pd.unique(labels)

    # tab20 provides 20 colours; repeat them cyclically so that every label
    # and every cluster gets a colour even when more than 20 are needed.
    base_colors = sns.color_palette('tab20', n_colors=20)
    n_needed = max([len(unique_labels), len(base_colors), *n_clusters_range])
    colors = [base_colors[i % len(base_colors)] for i in range(n_needed)]

    # One colour per label, shared by all label plots.
    color_dict = dict(zip(unique_labels, colors[:len(unique_labels)]))
    tsne_iterations = _tsne_iteration_kwarg(n_iter)

    for perplexity in perplexities:
        # Pass n_iter on so the embedding really uses the iteration count
        # that the title and file name report.
        X_reduced = TSNE(verbose=2, perplexity=perplexity, **tsne_iterations).fit_transform(X_data)

        df = pd.DataFrame({'x': X_reduced[:, 0], 'y': X_reduced[:, 1], 'label': labels})

        # plot the results
        plt.figure(figsize=(14, 10))
        sns.scatterplot(data=df, x='x', y='y', hue='label', style='label', palette=color_dict, markers=True)
        plt.title("Perplexity: {} and Max_iter: {}".format(perplexity, n_iter))
        img_name = img_name_prefix + '_perp_{}_iter_{}.png'.format(perplexity, n_iter)
        plt.savefig(img_name)
        plt.show()
        plt.close()  # release the figure so repeated runs don't accumulate them
        print('All good')

        # Apply K-Means clustering on the t-SNE reduced data
        for n_clusters in n_clusters_range:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            clusters = kmeans.fit_predict(X_reduced)

            # Prepare the data for seaborn
            df_kmeans = pd.DataFrame({'x': X_reduced[:, 0], 'y': X_reduced[:, 1], 'cluster': clusters, 'label': labels})

            # Create a dictionary mapping each cluster to a color
            color_dict_kmeans = dict(zip(range(n_clusters), colors[:n_clusters]))

            # Draw the K-Means clustering plot
            plt.figure(figsize=(14, 10))
            sns.scatterplot(data=df_kmeans, x='x', y='y', hue='cluster', palette=color_dict_kmeans, markers=True)
            plt.title("K-Means Clustering with {} clusters".format(n_clusters))
            img_name = img_name_prefix + '_perp_{}_iter_{}_kmeans_{}.png'.format(perplexity, n_iter, n_clusters)
            print('Saving this K-Means clustering plot as an image in the present working directory...')
            plt.savefig(img_name)
            plt.show()
            plt.close()
            print('Done')

            # Map every label to the cluster that holds most of its points
            df_kmeans = assign_labels_to_clusters(df_kmeans)

            # Display the labels whose dominant cluster is each cluster
            for cluster_num in range(n_clusters):
                in_cluster = df_kmeans['assigned_cluster'] == cluster_num
                cluster_labels = df_kmeans.loc[in_cluster, 'label'].unique()
                # str() each label: join() would raise on non-string labels
                # such as a numeric 0 used as a fill value.
                print("Cluster {}: {}".format(cluster_num, ", ".join(map(str, cluster_labels))))

In [None]:
# Run the full t-SNE + K-Means sweep on the complete data set:
# three perplexities, each clustered with 5, 6 and 7 clusters.
perform_tsne_with_kmeans(
    X_data=X_all,
    y_data=y_all,
    perplexities=[2, 10, 20],
    n_clusters_range=[5, 6, 7],
)