In [13]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, make_scorer, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer


In [14]:
# Columns whose categorical levels load_and_preprocess_data() converts to
# numeric scores through the `weights` mapping.
involvement_columns = [
    # The thirteen names that also make up `industry_cols`.
    'Security',
    'Humanities',
    'Nat. Sci',
    'Health',
    'AI Ethics',
    'Big Data',
    'Robotics',
    'Documents',
    'Multimedia',
    'NLP',
    'KRR',
    'Graphs',
    'DL/ML',
    # 'Funding' is listed only here; the rest also appear in `comp_cols`.
    'Funding',
    'Application-Oriented',
    'Number of Members',
    'Academic Collaborations',
    'System Maturity',
    'Demos',
    'Industrial Collaborations',
]

In [15]:
# Columns that calculate_custom_similarity() compares for exact equality
# between two rows.
industry_cols = [
    'Security',
    'Humanities',
    'Nat. Sci',
    'Health',
    'AI Ethics',
    'Big Data',
    'Robotics',
    'Documents',
    'Multimedia',
    'NLP',
    'KRR',
    'Graphs',
    'DL/ML',
]

In [4]:
# Columns that calculate_custom_similarity() scores by absolute difference
# and that reverse_com_values() flips around the top score.
comp_cols = [
    'Number of Members',
    'Application-Oriented',
    'Academic Collaborations',
    'System Maturity',
    'Demos',
    'Industrial Collaborations',
]

In [16]:
# Per-column multipliers that assign_weights() applies to a data frame.
feature_weights = {
    # Multiplier 1.0 (same names, same order as `industry_cols`).
    'Security': 1.0,
    'Humanities': 1.0,
    'Nat. Sci': 1.0,
    'Health': 1.0,
    'AI Ethics': 1.0,
    'Big Data': 1.0,
    'Robotics': 1.0,
    'Documents': 1.0,
    'Multimedia': 1.0,
    'NLP': 1.0,
    'KRR': 1.0,
    'Graphs': 1.0,
    'DL/ML': 1.0,
    # Multiplier 0.5 (same names, same order as `comp_cols`).
    'Number of Members': 0.5,
    'Application-Oriented': 0.5,
    'Academic Collaborations': 0.5,
    'System Maturity': 0.5,
    'Demos': 0.5,
    'Industrial Collaborations': 0.5,
}

In [17]:
# Numeric score for each categorical involvement level; the largest value
# is also used as the ceiling when scoring/flipping complementary columns.
weights = {
    'Strong': 3,
    'Good': 2,
    'Average': 1,
    'None': 0,
}

In [18]:
def load_and_preprocess_data(file_path):
    """Load the CSV, transpose it and turn involvement levels into scores.

    The file is read with its first column as the index and then transposed,
    so the CSV's rows become the columns of the returned frame.  Every name
    in ``involvement_columns`` that exists among those columns is mapped
    through ``weights``; any value without a mapping (missing values
    included) becomes 0.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pandas.read_csv``

    Returns
    -------
    pandas.DataFrame
        The transposed frame with the involvement columns converted.
    """
    frame = pd.read_csv(file_path, index_col=0).T

    present = [name for name in involvement_columns if name in frame.columns]
    for name in present:
        scores = frame[name].map(weights)
        frame[name] = scores.fillna(0)

    return frame

In [19]:
def assign_weights(data):
    """Scale the columns of ``data`` in place by their ``feature_weights``.

    Each column of ``data`` that has an entry in ``feature_weights`` is
    multiplied by that entry; names missing from ``data`` are skipped.
    The frame passed in is modified and nothing is returned.
    """
    applicable = [
        (name, factor)
        for name, factor in feature_weights.items()
        if name in data.columns
    ]
    for name, factor in applicable:
        data[name] = data[name] * factor

In [20]:
def reverse_com_values(data):
    """Flip the complementary columns of ``data`` in place.

    For every name in ``comp_cols`` that is a column of ``data``, each value
    ``v`` is replaced by ``max(weights.values()) - v``.  The frame passed in
    is modified and nothing is returned.
    """
    ceiling = max(weights.values())
    targets = [name for name in comp_cols if name in data.columns]
    for name in targets:
        data[name] = ceiling - data[name]

In [21]:
def plot_elbow_method(X):
    """Plot K-Means inertia against k and estimate the elbow.

    K-Means (``random_state=42``) is fitted for k = 1 .. 10, or up to the
    number of samples when ``X`` has fewer than 10 rows, and the inertia of
    each fit is plotted.

    The elbow is taken as the k with the largest positive drop-off in
    improvement, i.e. the largest value of
    ``(inertia[k-1] - inertia[k]) - (inertia[k] - inertia[k+1])``.
    The previous check looked for the first k where the improvement *grew*,
    which never happens on a normally shaped (convex) inertia curve and so
    returned 1.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.

    Returns
    -------
    int
        Estimated number of clusters; 1 when fewer than three values of k
        could be evaluated or no interior k shows diminishing returns.
    """
    # KMeans cannot fit more clusters than there are samples.
    n_samples = np.shape(X)[0]
    K = range(1, min(10, n_samples) + 1)

    inertia = []
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        inertia.append(kmeans.inertia_)

    plt.figure(figsize=(10, 6))
    plt.plot(K, inertia, 'bx-')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.show()

    optimal_clusters = 1
    sharpest_bend = 0.0
    # inertia[i] belongs to k = i + 1; the first and last k have no
    # neighbour on one side, so only interior points can be the elbow.
    for i in range(1, len(inertia) - 1):
        gain_before = inertia[i - 1] - inertia[i]
        gain_after = inertia[i] - inertia[i + 1]
        bend = gain_before - gain_after
        if bend > sharpest_bend:
            sharpest_bend = bend
            optimal_clusters = i + 1

    return optimal_clusters

In [33]:
# StandardScaler is instantiated by the scaling cell that follows.
from sklearn.preprocessing import StandardScaler 
# Path to the CSV file, relative to the notebook's working directory.
filepath = 'data/synthetic_data.csv'

# Load the CSV, transpose it and map the involvement levels to numeric
# scores (see load_and_preprocess_data).
dfs = load_and_preprocess_data(filepath)



In [34]:
# Standardise every column of `dfs` with scikit-learn's StandardScaler and
# wrap the result in a frame that keeps the original row and column labels.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    data=scaler.fit_transform(dfs),
    index=dfs.index,
    columns=dfs.columns,
)

In [36]:
def calculate_custom_similarity(data, industry_columns=None, complementary_columns=None, max_level=None):
    """Build the pairwise similarity matrix between the rows of ``data``.

    For two different rows ``i`` and ``j`` the score is the sum of:

    * for each industry column, the shared value when both rows hold the
      same value (nothing otherwise, so a shared 0 adds nothing);
    * for each complementary column, ``max_level - |value_i - value_j|``.

    The diagonal is left at 0.  The computation is done one column at a time
    with numpy broadcasting instead of a Python loop over every pair of rows
    and every column with scalar ``.iloc`` lookups.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per item; must contain the referenced columns with numeric
        values.
    industry_columns : sequence of str, optional
        Columns compared for equality.  Defaults to the module-level
        ``industry_cols``.
    complementary_columns : sequence of str, optional
        Columns compared by absolute difference.  Defaults to the
        module-level ``comp_cols``.
    max_level : number, optional
        Score awarded per complementary column when the two values are
        identical.  Defaults to ``max(weights.values())``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(data))``.

    Raises
    ------
    KeyError
        If a referenced column is missing from ``data``.
    """
    if industry_columns is None:
        industry_columns = industry_cols
    if complementary_columns is None:
        complementary_columns = comp_cols
    if max_level is None:
        max_level = max(weights.values())

    num_teams = len(data)
    sim_matrix = np.zeros((num_teams, num_teams))

    industry = data[list(industry_columns)].to_numpy(dtype=float)
    for c in range(industry.shape[1]):
        column = industry[:, c]
        same = column[:, None] == column[None, :]
        # np.where (rather than multiplying by the mask) keeps a NaN value
        # from leaking into pairs that do not match.
        sim_matrix += np.where(same, column[:, None], 0.0)

    complementary = data[list(complementary_columns)].to_numpy(dtype=float)
    for c in range(complementary.shape[1]):
        column = complementary[:, c]
        sim_matrix += max_level - np.abs(column[:, None] - column[None, :])

    # A row is never compared with itself.
    np.fill_diagonal(sim_matrix, 0.0)
    return sim_matrix

# Calculate the custom similarity matrix from `dfs`, the mapped but
# unscaled frame (`df_scaled` is not used here).
custom_similarity_matrix = calculate_custom_similarity(dfs)


In [37]:
# Normalize the custom similarity matrix by its largest entry.
max_sim = np.max(custom_similarity_matrix)
if max_sim == 0:
    # Every pair scored 0: dividing would yield an all-NaN matrix, so treat
    # all pairs as having zero similarity instead.
    normalized_similarity_matrix = np.zeros_like(custom_similarity_matrix, dtype=float)
else:
    normalized_similarity_matrix = custom_similarity_matrix / max_sim

# Convert similarity to distance: the most similar pair gets distance 0.
distance_matrix = 1 - normalized_similarity_matrix

# The similarity diagonal is 0, which would give each row a distance of 1
# to itself; force self-distance to zero.
np.fill_diagonal(distance_matrix, 0)

In [38]:
def apply_agglomerative_clustering(distance_matrix, n_clusters):
    """Cluster a precomputed distance matrix with complete linkage.

    Parameters
    ----------
    distance_matrix : square array of pairwise distances
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    The cluster label of each row, as returned by ``fit_predict``.
    """
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        metric='precomputed',
        linkage='complete',
    )
    return model.fit_predict(distance_matrix)

def apply_kmeans_clustering(encoded_data, n_clusters):
    """Fit K-Means (k-means++ init, ``random_state=42``) and return labels.

    Parameters
    ----------
    encoded_data : array-like passed directly to ``KMeans.fit``
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    The fitted model's ``labels_`` array.
    """
    model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    return model.fit(encoded_data).labels_

In [39]:
# Determine the optimal number of clusters for Agglomerative Clustering:
# try 2..10 clusters and keep the count with the highest mean silhouette,
# computed on the precomputed distance matrix.
# NOTE(review): the fixed 2..10 range presumably needs more than 10 rows in
# distance_matrix for every silhouette score to be defined - confirm.
silhouette_avg_scores = []
for n_clusters in range(2, 11):
    labels = apply_agglomerative_clustering(distance_matrix, n_clusters)
    silhouette_avg = silhouette_score(distance_matrix, labels, metric='precomputed')
    silhouette_avg_scores.append(silhouette_avg)

# Scores are stored in sweep order starting at 2 clusters, hence the "+ 2";
# ties resolve to the smallest cluster count.
optimal_clusters_agg = silhouette_avg_scores.index(max(silhouette_avg_scores)) + 2
print(f'Optimal number of clusters for Agglomerative Clustering: {optimal_clusters_agg}')

# Refit with the selected count to obtain the final labels.
best_labels_agg = apply_agglomerative_clustering(distance_matrix, optimal_clusters_agg)

Optimal number of clusters for Agglomerative Clustering: 2


In [40]:
# Score the selected agglomerative partition. The silhouette call is told
# that the matrix holds precomputed distances.
# NOTE(review): davies_bouldin_score receives the same distance matrix but
# no metric argument, so it presumably treats each row as a feature vector
# rather than as distances - confirm this is intended.
silhouette_avg_agg = silhouette_score(distance_matrix, best_labels_agg, metric='precomputed')
davies_bouldin_agg = davies_bouldin_score(distance_matrix, best_labels_agg)
print(f'Agglomerative - Silhouette Score: {silhouette_avg_agg}, Davies-Bouldin Score: {davies_bouldin_agg}')

Agglomerative - Silhouette Score: 0.026520328414729524, Davies-Bouldin Score: 6.225512108725167


In [41]:
# Determine the optimal number of clusters for K-Means with the same sweep:
# try 2..10 clusters and keep the count with the highest mean silhouette.
# This reuses (and overwrites) `silhouette_avg_scores` from the
# agglomerative sweep.
# NOTE(review): K-Means is fitted on distance_matrix itself, so each row of
# pairwise distances is presumably used as that item's feature vector -
# confirm this is intended rather than fitting on the feature frame.
silhouette_avg_scores = []
for n_clusters in range(2, 11):
    labels = apply_kmeans_clustering(distance_matrix, n_clusters)
    silhouette_avg = silhouette_score(distance_matrix, labels, metric='precomputed')
    silhouette_avg_scores.append(silhouette_avg)

# Scores are stored in sweep order starting at 2 clusters, hence the "+ 2".
optimal_clusters_kmeans = silhouette_avg_scores.index(max(silhouette_avg_scores)) + 2
print(f'Optimal number of clusters for KMeans: {optimal_clusters_kmeans}')

# Refit with the selected count to obtain the final labels.
best_labels_kmeans = apply_kmeans_clustering(distance_matrix, optimal_clusters_kmeans)

Optimal number of clusters for KMeans: 2


In [32]:
# Score the selected K-Means partition. The silhouette call is told that
# the matrix holds precomputed distances.
# NOTE(review): davies_bouldin_score receives the same distance matrix but
# no metric argument, so it presumably treats each row as a feature vector
# rather than as distances - confirm this is intended.
silhouette_avg_kmeans = silhouette_score(distance_matrix, best_labels_kmeans, metric='precomputed')
davies_bouldin_kmeans = davies_bouldin_score(distance_matrix, best_labels_kmeans)
print(f'KMeans - Silhouette Score: {silhouette_avg_kmeans}, Davies-Bouldin Score: {davies_bouldin_kmeans}')

KMeans - Silhouette Score: 0.03928745074582179, Davies-Bouldin Score: 2.5421986804325463
