In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, make_scorer, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer


In [16]:
# Columns whose qualitative labels load_and_preprocess_data converts to numeric
# scores through `weights`; the same list is later handed to the ColumnTransformer
# in the preprocessing cell.
involvement_columns = ['Security', 'Humanities', 'Nat. Sci', 'Health', 'AI Ethics', 'Big Data', 
                           'Robotics', 'Documents', 'Multimedia', 'NLP', 'KRR', 'Graphs', 'DL/ML', 
                           'Funding', 'Application-Oriented', 'Number of Members', 
                           'Academic Collaborations', 'System Maturity', 'Demos', 'Industrial Collaborations']

In [17]:
# Columns that calculate_pairwise_similarity sums over when scoring a pair of rows.
industry_cols = ['Security', 'Humanities', 'Nat. Sci', 'Health', 'AI Ethics', 'Big Data', 'Robotics', 'Documents', 'Multimedia', 'NLP', 'KRR', 'Graphs', 'DL/ML']

In [18]:
# Columns inverted by reverse_com_values (value -> max(weights) - value).
# Every name here also appears in involvement_columns.
comp_cols = ['Number of Members', 'Application-Oriented', 'Academic Collaborations', 
                      'System Maturity', 'Demos', 'Industrial Collaborations']

In [19]:
# Per-column multipliers applied by assign_weights: the thirteen topic columns
# keep full weight (1.0), the six team-characteristic columns get half (0.5).
feature_weights = {
    **dict.fromkeys(
        (
            'Security', 'Humanities', 'Nat. Sci', 'Health', 'AI Ethics',
            'Big Data', 'Robotics', 'Documents', 'Multimedia', 'NLP',
            'KRR', 'Graphs', 'DL/ML',
        ),
        1.0,
    ),
    **dict.fromkeys(
        (
            'Number of Members', 'Application-Oriented', 'Academic Collaborations',
            'System Maturity', 'Demos', 'Industrial Collaborations',
        ),
        0.5,
    ),
}

In [20]:
# Numeric score for each qualitative label. load_and_preprocess_data maps column
# values through this dict and turns anything not listed here into 0 (fillna(0)).
weights = {'Strong': 3, 'Good': 2, 'Average': 1, 'None': 0}

In [12]:
# Wider column list: the thirteen topic columns, two team columns and ten
# additional sector names.
expanded_involvement_columns = [
    'Security', 'Humanities', 'Nat. Sci', 'Health', 'AI Ethics',
    'Big Data', 'Robotics', 'Documents', 'Multimedia', 'NLP',
    'KRR', 'Graphs', 'DL/ML', 'Funding', 'Application-Oriented',
    'Cybersecurity', 'Biotech', 'FinTech', 'Agritech', 'MedTech',
    'ClimateTech', 'EdTech', 'Renewable Energy', 'Telecom', 'E-commerce',
]

# 'Level 0' .. 'Level 10' -> 0 .. 10
granular_strength_map = dict((f'Level {level}', level) for level in range(11))

In [21]:
def load_and_preprocess_data(file_path):
    """Read the CSV at ``file_path`` and convert qualitative labels to scores.

    The file is read with its first column as the index and then transposed,
    so the CSV's rows become the columns of the returned frame. Each column
    named in the module-level ``involvement_columns`` that is present is
    mapped through the module-level ``weights`` dict; values without an entry
    in ``weights`` (including missing values) become 0.

    Parameters:
    - file_path: path passed straight to ``pd.read_csv``.

    Returns:
    - DataFrame: the transposed frame with the mapped columns replaced.
    """
    frame = pd.read_csv(file_path, index_col=0).transpose()

    present = [name for name in involvement_columns if name in frame.columns]
    for name in present:
        scored = frame[name].map(weights)
        frame[name] = scored.fillna(0)

    return frame

In [22]:
def assign_weights(data):
    """Multiply feature columns of ``data`` by their configured weight, in place.

    For every column named in the module-level ``feature_weights`` that exists
    in ``data``, the column is replaced by itself times that weight. Columns
    without an entry are left untouched. Nothing is returned; ``data`` itself
    is modified.

    Parameters:
    - data (DataFrame): frame whose columns are rescaled.
    """
    applicable = [name for name in feature_weights if name in data.columns]
    for name in applicable:
        data[name] = data[name] * feature_weights[name]

In [7]:
def reverse_com_values(data):
    """Invert the columns listed in ``comp_cols``, in place.

    Each such column present in ``data`` is replaced by
    ``max(weights.values()) - column``, so the largest score becomes 0 and 0
    becomes the largest score. Nothing is returned; ``data`` itself is
    modified.

    Parameters:
    - data (DataFrame): frame whose ``comp_cols`` columns are inverted.
    """
    ceiling = max(weights.values())
    for name in (col for col in comp_cols if col in data.columns):
        data[name] = ceiling - data[name]

In [23]:
def _locate_elbow(inertia):
    """Return the cluster count at the sharpest bend of an inertia curve.

    ``inertia[j]`` is taken to be the inertia for ``k = j + 1`` clusters. The
    bend is the point with the largest second difference, i.e. where the drop
    in inertia shrinks the most from one step to the next. With fewer than
    three points no second difference exists and 1 is returned.
    """
    if len(inertia) < 3:
        return 1
    # bend[j] is the second difference centred on inertia[j + 1], i.e. k = j + 2.
    bend = np.diff(inertia, n=2)
    return int(np.argmax(bend)) + 2


def plot_elbow_method(X):
    """Plot KMeans inertia for k = 1..10 and return the elbow's cluster count.

    A KMeans model (``random_state=42``) is fitted on ``X`` for every k from 1
    to 10 and its inertia recorded; the curve is drawn with matplotlib and
    shown.

    The elbow is chosen as the k with the largest second difference of the
    inertia curve. The previous rule looked for the first k whose preceding
    drop was *smaller* than its following drop, which a convex, flattening
    inertia curve never satisfies, so it fell through to the default of 1.

    Parameters:
    - X: feature matrix accepted by ``KMeans.fit``; needs at least 10 samples.

    Returns:
    - int: suggested number of clusters.
    """
    K = range(1, 11)
    inertia = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in K]

    plt.figure(figsize=(10, 6))
    plt.plot(K, inertia, 'bx-')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.show()

    return _locate_elbow(inertia)

In [24]:
# StandardScaler is re-imported here (it is already imported in the first cell);
# it is used by the scaling cell that follows.
from sklearn.preprocessing import StandardScaler 
# Path to the CSV file, relative to the notebook's working directory
filepath = 'data/synthetic_data.csv'

# Load the file and map qualitative labels to numeric scores
dfs = load_and_preprocess_data(filepath)



In [25]:
# Fit a StandardScaler on every column of dfs and rebuild a DataFrame that keeps
# the original column and index labels.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(dfs), columns=dfs.columns, index=dfs.index)

In [35]:
def calculate_pairwise_similarity(data, columns=None):
    """Score every pair of rows in ``data`` with the involvement scoring table.

    For each column the two rows' levels (truncated to integers 0..3) index a
    4x4 scoring table, and the looked-up values are summed over all columns.
    The diagonal (a row paired with itself) is left at 0.

    The lookup is done one column at a time with array indexing instead of a
    Python loop over every (row, row, column) triple; the results are the
    same as the scalar loop.

    Parameters:
    - data (DataFrame): one row per item, holding numeric levels 0..3 in the
      scored columns.
    - columns (list of str or None): columns to score. Defaults to the
      module-level ``industry_cols``.

    Returns:
    - ndarray of float, shape (len(data), len(data)): pairwise scores.

    Raises:
    - IndexError: if a level falls outside the scoring table.
    """
    if columns is None:
        columns = industry_cols

    # Rows/columns are ordered None, Average, Good, Strong; every entry equals
    # row index + column index.
    scoring_matrix = np.array([
        [0, 1, 2, 3],  # None
        [1, 2, 3, 4],  # Average
        [2, 3, 4, 5],  # Good
        [3, 4, 5, 6]   # Strong
    ])

    n_rows = len(data)
    sim_matrix = np.zeros((n_rows, n_rows))

    for col in columns:
        # astype(int) truncates toward zero, like int() on a scalar.
        levels = data[col].to_numpy().astype(int)
        # Broadcasting an (n, 1) index against a (1, n) index gives the full
        # n x n table of scoring_matrix[level_i, level_j] for this column.
        sim_matrix += scoring_matrix[levels[:, None], levels[None, :]]

    # Self-pairs are not scored.
    np.fill_diagonal(sim_matrix, 0)

    return sim_matrix

# Calculate the custom similarity matrix from the mapped, unscaled frame `dfs`
# (not from df_scaled).
custom_similarity_matrix = calculate_pairwise_similarity(dfs)


In [36]:
# Scale all similarities by the largest observed score
max_sim = custom_similarity_matrix.max()
normalized_similarity_matrix = np.divide(custom_similarity_matrix, max_sim)

# A distance is the complement of the scaled similarity
distance_matrix = np.subtract(1, normalized_similarity_matrix)

# An item is at distance zero from itself
np.fill_diagonal(distance_matrix, 0)

In [37]:
def apply_agglomerative_clustering(distance_matrix, n_clusters):
    """Cluster items with complete-linkage agglomerative clustering.

    Parameters:
    - distance_matrix: square matrix of pairwise distances, passed to the
      model as precomputed distances.
    - n_clusters (int): number of clusters to produce.

    Returns:
    - array of cluster labels, one per row of ``distance_matrix``.
    """
    model = AgglomerativeClustering(
        linkage='complete',
        metric='precomputed',
        n_clusters=n_clusters,
    )
    return model.fit_predict(distance_matrix)

def apply_kmeans_clustering(encoded_data, n_clusters):
    """Cluster the rows of ``encoded_data`` with KMeans.

    Uses k-means++ initialisation and ``random_state=42``.

    Parameters:
    - encoded_data: matrix accepted by ``KMeans.fit``; each row is one item.
    - n_clusters (int): number of clusters to produce.

    Returns:
    - array of cluster labels, one per row of ``encoded_data``.
    """
    fitted = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        random_state=42,
    ).fit(encoded_data)
    return fitted.labels_

In [38]:
# Determine the optimal number of clusters for Agglomerative Clustering:
# try k = 2..10 and keep the k with the highest silhouette score, computed on the
# same precomputed distance matrix the clustering uses.
silhouette_avg_scores = [] #FIXME: Use find_optimal_clusters function and evaluate results
for n_clusters in range(2, 11):
    labels = apply_agglomerative_clustering(distance_matrix, n_clusters)
    silhouette_avg = silhouette_score(distance_matrix, labels, metric='precomputed')
    silhouette_avg_scores.append(silhouette_avg)

# Scores are stored for k = 2, 3, ..., so list position p corresponds to k = p + 2.
optimal_clusters_agg = silhouette_avg_scores.index(max(silhouette_avg_scores)) + 2
print(f'Optimal number of clusters for Agglomerative Clustering: {optimal_clusters_agg}')

# Re-run the clustering at the selected k to obtain the final labels.
best_labels_agg = apply_agglomerative_clustering(distance_matrix, optimal_clusters_agg)

Optimal number of clusters for Agglomerative Clustering: 3


In [39]:
# Quality metrics for the selected agglomerative clustering.
# The silhouette score treats distance_matrix as precomputed distances.
silhouette_avg_agg = silhouette_score(distance_matrix, best_labels_agg, metric='precomputed')
# NOTE(review): per the scikit-learn docs davies_bouldin_score takes a feature
# matrix (n_samples, n_features); here it receives the distance matrix, so each
# row of distances is used as a feature vector — confirm this is intended.
davies_bouldin_agg = davies_bouldin_score(distance_matrix, best_labels_agg)
print(f'Agglomerative - Silhouette Score: {silhouette_avg_agg}, Davies-Bouldin Score: {davies_bouldin_agg}')

Agglomerative - Silhouette Score: 0.28331042364020925, Davies-Bouldin Score: 0.3195661140914868


In [40]:
# Determine the number of clusters for KMeans: try k = 2..10 and keep the k with
# the highest silhouette score.
# NOTE(review): KMeans is fitted on distance_matrix (passed as `encoded_data`), so
# each item's row of distances to all items serves as its feature vector, while
# the silhouette score treats the same matrix as precomputed distances — confirm
# this is intended rather than clustering the feature frame.
silhouette_avg_scores = []
for n_clusters in range(2, 11):
    labels = apply_kmeans_clustering(distance_matrix, n_clusters)
    silhouette_avg = silhouette_score(distance_matrix, labels, metric='precomputed')
    silhouette_avg_scores.append(silhouette_avg)

# Scores are stored for k = 2, 3, ..., so list position p corresponds to k = p + 2.
optimal_clusters_kmeans = silhouette_avg_scores.index(max(silhouette_avg_scores)) + 2
print(f'Optimal number of clusters for KMeans: {optimal_clusters_kmeans}')

# Re-run the clustering at the selected k to obtain the final labels.
best_labels_kmeans = apply_kmeans_clustering(distance_matrix, optimal_clusters_kmeans)

Optimal number of clusters for KMeans: 2


In [41]:
# Quality metrics for the selected KMeans clustering.
# The silhouette score treats distance_matrix as precomputed distances.
silhouette_avg_kmeans = silhouette_score(distance_matrix, best_labels_kmeans, metric='precomputed')
# NOTE(review): per the scikit-learn docs davies_bouldin_score takes a feature
# matrix (n_samples, n_features); here it receives the distance matrix, so each
# row of distances is used as a feature vector — confirm this is intended.
davies_bouldin_kmeans = davies_bouldin_score(distance_matrix, best_labels_kmeans)
print(f'KMeans - Silhouette Score: {silhouette_avg_kmeans}, Davies-Bouldin Score: {davies_bouldin_kmeans}')

KMeans - Silhouette Score: 0.008953567736218861, Davies-Bouldin Score: 0.6392769308855364


In [33]:
# Choose the best clustering algorithm: the method with the highest silhouette
# score wins.
# NOTE(review): the saved output of this cell names KMeans, but the saved
# silhouette scores printed above are higher for Agglomerative, and this cell's
# execution count is lower than the cells that compute those scores — re-run the
# notebook top to bottom and confirm the result.
clustering_scores = {
    'KMeans': silhouette_avg_kmeans,
    'Agglomerative': silhouette_avg_agg,
    #'DBSCAN': best_dbscan_score if len(set(best_labels_dbscan)) > 1 else -1
}

best_clustering_method = max(clustering_scores, key=clustering_scores.get)
print(f'Best Clustering Method: {best_clustering_method}')

Best Clustering Method: KMeans


In [45]:
# Labels of the winning method; anything other than 'KMeans' means Agglomerative.
best_labels = (
    best_labels_kmeans
    if best_clustering_method == 'KMeans'
    else best_labels_agg
)

In [46]:
# Attach the chosen labels to df_scaled as a 'Cluster' column (modifies df_scaled
# in place; later cells drop this column again to build the feature matrix).
df_scaled['Cluster'] = best_labels
# Number of items per cluster, ordered by cluster label.
cluster_label_series = pd.Series(best_labels)
cluster_counts = cluster_label_series.value_counts().sort_index()
cluster_counts

0    538
1    462
Name: count, dtype: int64

In [47]:
def profile_clusters(df):
    """
    Summarise every feature per cluster.

    Rows are grouped by the 'Cluster' column and, for each remaining column,
    the mean, standard deviation, minimum and maximum are computed within
    each group.

    Parameters:
    - df (DataFrame): Data containing a 'Cluster' column plus the feature columns.

    Returns:
    - profiles (DataFrame): One row per cluster; columns are a two-level index of
                            (feature, statistic) with statistics in the order
                            mean, std, min, max.
    """
    summary_stats = ['mean', 'std', 'min', 'max']
    by_cluster = df.groupby('Cluster')
    return by_cluster.agg(summary_stats)

# Profile the clusters on the standardised frame (which now carries 'Cluster'),
# so the statistics are in standardised units rather than raw scores.
cluster_profiles = profile_clusters(df_scaled)
print("Cluster Profiles:")
print(cluster_profiles)

Cluster Profiles:
Field   Security                               Humanities                      \
            mean       std       min       max       mean       std       min   
Cluster                                                                         
0        0.00517  0.987479 -0.448676  3.269633   0.100402  1.007252 -0.905336   
1       -0.00602  1.016493 -0.448676  3.269633  -0.116919  0.980774 -0.905336   

Field              Nat. Sci            ... System Maturity            \
              max      mean       std  ...             min       max   
Cluster                                ...                             
0        1.977899  0.026982  1.033429  ...       -2.598027  1.042366   
1        1.977899 -0.031421  0.960894  ...       -2.598027  1.042366   

Field       Demos                               Industrial Collaborations  \
             mean       std       min       max                      mean   
Cluster                                                      

In [48]:
# Features are every standardised column except the label; the target is the
# cluster label itself.
y = df_scaled['Cluster']
X = df_scaled.drop('Cluster', axis=1)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [49]:
class QualitativeToQuantitativeTransformer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that replaces qualitative labels with numbers.

    Parameters:
    - columns (list of str): Names of the columns to convert.
    - mapping (dict): Lookup from qualitative label to numeric value.

    Methods:
    - fit(X, y=None): No-op; returns the transformer itself.
    - transform(X): Returns a copy of X with the listed columns converted.
    """

    def __init__(self, columns, mapping):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present only to satisfy the estimator API.

        Parameters:
        - X (DataFrame): Input data (ignored).
        - y (Series or None): Target data (ignored).

        Returns:
        - self: The transformer, unchanged.
        """
        return self

    def transform(self, X):
        """
        Convert the configured columns of X through the mapping.

        Works on a copy, so the caller's frame is not modified. Configured
        columns that X does not have are skipped. Values with no entry in the
        mapping (including missing values) become 0.

        Parameters:
        - X (DataFrame): Input data.

        Returns:
        - X_transformed (DataFrame): Copy of X with the configured columns converted.
        """
        converted = X.copy()
        targets = [name for name in self.columns if name in converted.columns]
        for name in targets:
            looked_up = converted[name].map(self.mapping)
            converted[name] = looked_up.fillna(0)
        return converted

# Same label-to-score mapping as defined near the top of the notebook, redefined here.
weights = {'Strong': 3, 'Good': 2, 'Average': 1, 'None': 0}

# Preprocessing step of the classification pipeline.
# NOTE(review): a ColumnTransformer applies each listed transformer to its own
# column subset of the input and concatenates the results; it does not feed the
# 'qualitative' output into 'scaler'. Points to confirm:
#   - The input used later (X_train, taken from df_scaled) is already numeric, so
#     the label mapping finds no matching keys and the 'qualitative' block comes
#     out as all zeros.
#   - Every name in comp_cols is also in involvement_columns, so the scaler's
#     column list contains those names twice.
#   - Every name in involvement_columns must exist in the input frame.
preprocessor = ColumnTransformer(
    transformers=[
        ('qualitative', QualitativeToQuantitativeTransformer(involvement_columns, weights), involvement_columns),
        ('scaler', StandardScaler(), involvement_columns + comp_cols)
    ],
)



In [50]:
# RandomForestClassifier is re-imported here (it is already imported in the first cell).
from sklearn.ensemble import RandomForestClassifier

# Preprocessing followed by a random forest with a fixed seed; the step names
# ('preprocessor', 'classifier') are the prefixes used in param_grid.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [51]:
# Search space for the 'classifier' step of the pipeline: five hyperparameters
# with three candidates each, i.e. 3**5 = 243 combinations (each one is fitted
# once per cross-validation fold by the grid search).
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': [None, 'sqrt', 'log2']  
}


In [None]:
# Perform hyperparameter optimization using GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_pipeline = grid_search.best_estimator_

train_score = best_pipeline.score(X_train, y_train)
test_score = best_pipeline.score(X_test, y_test)

y_pred = best_pipeline.predict(X_test)

print(f"Optimized Train Accuracy: {train_score}")
print(f"Optimized Test Accuracy: {test_score}")
print("Optimized Classification Report:\n", classification_report(y_test, y_pred))
print("Optimized Confusion Matrix:\n", confusion_matrix(y_test, y_pred))