In [98]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, make_scorer, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA


In [126]:
!pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25ldone
[?25h  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3541 sha256=136cc31649c6bdf80ee7fb98c78234f44d4f89e06442d157c50cf793240b7f5d
  Stored in directory: /Users/kimonanagnostopoulos/Library/Caches/pip/wheels/48/4a/1c/1d511cbb0413a448d8546e958f8e82b98d9bb493038d19ece2
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [82]:
involvement_columns = ['Security', 'Humanities', 'Nat. Sci', 'Health', 'AI Ethics', 'Big Data', 
                           'Robotics', 'Documents', 'Multimedia', 'NLP', 'KRR', 'Graphs', 'DL/ML', 
                           'Funding', 'Application-Oriented', 'Number of Members', 
                           'Academic Collaborations', 'System Maturity', 'Demos', 'Industrial Collaborations']

In [83]:
industry_cols = ['Security', 'Humanities', 'Nat. Sci', 'Health', 'AI Ethics', 'Big Data', 'Robotics', 'Documents', 'Multimedia', 'NLP', 'KRR', 'Graphs', 'DL/ML']

In [84]:
comp_cols = ['Number of Members', 'Application-Oriented', 'Academic Collaborations', 
                      'System Maturity', 'Demos', 'Industrial Collaborations']

In [85]:
feature_weights = {
    'Security': 1.0, 'Humanities': 1.0, 'Nat. Sci': 1.0, 'Health': 1.0, 'AI Ethics': 1.0, 'Big Data': 1.0, 'Robotics': 1.0, 
    'Documents': 1.0, 'Multimedia': 1.0, 'NLP': 1.0, 'KRR': 1.0, 'Graphs': 1.0, 'DL/ML': 1.0, 
    'Number of Members': 0.5, 'Application-Oriented': 0.5, 'Academic Collaborations': 0.5, 'System Maturity': 0.5, 
    'Demos': 0.5, 'Industrial Collaborations': 0.5
}

In [86]:
weights = {'Strong': 3, 'Good': 2, 'Average': 1, 'None': 0}

In [215]:
def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path, index_col=0).transpose()
    for column in involvement_columns:
        if column in data.columns:
            data[column] = data[column].map(weights).fillna(0)
    return data

In [88]:
def assign_weights(data):
    for column, weight in feature_weights.items():
        if column in data.columns:
            data[column] = data[column]*weight

In [216]:
def standardize_data(data):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

In [136]:
!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, LeakyReLU, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model



Hyperparameter Optimization
-Adjusting encoding dimension
-Learning Rate and Epochs
-Additional Layers

In [235]:
def create_autoencoder(input_dim, encoding_dim, layer_sizes, dropout_rate=0.2):
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    
    # Add encoder layers
    for size in layer_sizes:
        x = Dense(size)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
        x = Dropout(dropout_rate)(x)
    
    encoded = Dense(encoding_dim, activation='relu', name='encoded_layer')(x)
    
    # Add decoder layers (reverse of encoder)
    for size in reversed(layer_sizes):
        x = Dense(size)(encoded)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
        x = Dropout(dropout_rate)(x)
    
    decoded = Dense(input_dim, activation='sigmoid')(x)
    
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
    
    return autoencoder


In [219]:
def train_autoencoder(autoencoder, data, epochs=50, batch_size=32):
    print(f"Data shape: {data.shape}")
    autoencoder.fit(data, data, epochs=epochs, batch_size=batch_size, shuffle=True, verbose=2)


In [220]:
def get_encoded_data(autoencoder, data):
    encoder_model = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoded_layer').output)
    return encoder_model.predict(data)


In [221]:
def apply_agglomerative_clustering(data, n_clusters):
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    return clustering.fit_predict(data)

In [222]:
def apply_kmeans_clustering(data, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    kmeans.fit(data)
    return kmeans.labels_

In [223]:
def apply_dbscan_clustering(data, eps, min_samples):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    return dbscan.fit_predict(data)

In [224]:
def evaluate_clustering(data, labels):
    silhouette_avg = silhouette_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)
    return silhouette_avg, davies_bouldin

In [225]:
def find_optimal_clusters(data, clustering_func, cluster_range):
    best_score = -1
    best_n_clusters = 0
    for n_clusters in cluster_range:
        labels = clustering_func(data, n_clusters)
        silhouette_avg = silhouette_score(data, labels)
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n_clusters = n_clusters
    return best_n_clusters

In [226]:
def visualize_clusters(data, labels, method='pca'):
    if method == 'pca':
        reducer = PCA(n_components=2)
    elif method == 'tsne':
        from sklearn.manifold import TSNE
        reducer = TSNE(n_components=2, random_state=42)
    else:
        raise ValueError("Method not recognized: choose 'pca', or 'tsne'")
        
    reduced_data = reducer.fit_transform(data)
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='viridis')
    plt.title(f'Clusters visualization using {method.upper()}')
    plt.show()

In [261]:
# Main execution
filepath = 'data/synthetic_data.csv'

# Load and preprocess data
dfs = load_and_preprocess_data(filepath)
assign_weights(dfs)
df_scaled = standardize_data(dfs)
input_dim = df_scaled.shape[1]

In [230]:
df_scaled

Field,Security,Humanities,Nat. Sci,Health,AI Ethics,Big Data,Robotics,Documents,Multimedia,NLP,KRR,Graphs,DL/ML,Funding,Application-Oriented,Number of Members,Academic Collaborations,System Maturity,Demos,Industrial Collaborations
"Hall, Nelson and Parks",-0.448676,1.977899,-0.470697,-0.688054,-0.706794,-0.224874,-0.302825,2.124205,-0.539798,-0.723123,-1.325477,-0.586008,0.444830,0.458095,-0.251883,1.343749,1.155235,1.042366,1.202662,-0.154439
Krueger LLC,0.790760,-0.905336,2.940148,-0.688054,-0.706794,-1.079909,3.302236,-0.526633,-0.539798,-0.723123,0.766831,-0.586008,-0.566148,-1.207704,0.848042,0.084378,1.155235,-0.171098,0.029333,-0.154439
Lewis Ltd,-0.448676,-0.905336,-0.470697,-0.688054,-0.706794,-0.224874,-0.302825,-0.526633,-0.539798,1.799400,0.766831,-0.586008,-0.566148,-1.207704,-0.251883,1.343749,1.155235,1.042366,1.202662,1.918574
Simpson Group,-0.448676,-0.905336,2.940148,-0.688054,-0.706794,1.485195,-0.302825,1.240593,-0.539798,0.958559,-1.325477,-0.586008,1.455809,-1.207704,-0.251883,0.084378,1.155235,-2.598027,-1.143995,-1.190946
"Campbell, Gonzalez and Perez",3.269633,-0.905336,1.234726,-0.688054,-0.706794,-0.224874,-0.302825,-0.526633,-0.539798,1.799400,-0.279323,-0.586008,0.444830,1.290994,0.848042,0.084378,1.155235,-0.171098,-1.143995,-0.154439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hill and Sons,-0.448676,-0.905336,-0.470697,-0.688054,-0.706794,1.485195,-0.302825,2.124205,-0.539798,-0.723123,-0.279323,-0.586008,-0.566148,-1.207704,-1.351808,-1.174993,1.155235,1.042366,-1.143995,-0.154439
"Anderson, Thomas and Miller",-0.448676,-0.905336,-0.470697,-0.688054,-0.706794,0.630161,-0.302825,-0.526633,-0.539798,0.958559,-1.325477,-0.586008,-1.577126,0.458095,-2.451733,-1.174993,1.155235,-0.171098,0.029333,-1.190946
Williams PLC,-0.448676,-0.905336,-0.470697,-0.688054,1.175481,-1.079909,-0.302825,1.240593,-0.539798,-0.723123,0.766831,-0.586008,0.444830,1.290994,0.848042,-1.174993,1.155235,-0.171098,0.029333,-0.154439
Williams-Beard,-0.448676,-0.905336,-0.470697,1.040725,1.175481,-1.079909,-0.302825,-0.526633,-0.539798,-0.723123,0.766831,1.434709,1.455809,-0.374805,0.848042,0.084378,1.155235,-0.171098,-1.143995,0.882067


In [231]:
input_dim

20

In [238]:
layer_configurations = [
    [64, 32],
    [128, 64, 32],
    [256, 128, 64, 32],
    [128, 64]
]

In [240]:
encoding_dims = [8, 10, 12]
batch_sizes = [32, 64, 128]
epochs_list = [50, 100]
dropout_rates = [0.2, 0.3]

best_params = None
best_score = -1


In [241]:
for layer_sizes in layer_configurations:
    for encoding_dim in encoding_dims:
        for batch_size in batch_sizes:
            for epochs in epochs_list:
                for dropout_rate in dropout_rates:
                    print(f"Testing with layers={layer_sizes}, encoding_dim={encoding_dim}, batch_size={batch_size}, epochs={epochs}, dropout_rate={dropout_rate}")
                    
                    autoencoder = create_autoencoder(input_dim, encoding_dim, layer_sizes, dropout_rate)
                    train_autoencoder(autoencoder, df_scaled, epochs=epochs, batch_size=batch_size)
                    encoded_data = get_encoded_data(autoencoder, df_scaled)
                    
                    optimal_clusters_kmeans = find_optimal_clusters(encoded_data, apply_kmeans_clustering, range(2, 11))
                    best_labels_kmeans = apply_kmeans_clustering(encoded_data, optimal_clusters_kmeans)
                    silhouette_avg_kmeans, davies_bouldin_kmeans = evaluate_clustering(encoded_data, best_labels_kmeans)
                    
                    if silhouette_avg_kmeans > best_score:
                        best_score = silhouette_avg_kmeans
                        best_params = {
                            'layer_sizes': layer_sizes,
                            'encoding_dim': encoding_dim,
                            'batch_size': batch_size,
                            'epochs': epochs,
                            'dropout_rate': dropout_rate
                        }
                        print(f"New best score: {best_score} with params: {best_params}")

print(f"Best parameters found: {best_params} with silhouette score: {best_score}")

Testing with layers=[64, 32], encoding_dim=8, batch_size=32, epochs=50, dropout_rate=0.2
Data shape: (1000, 20)
Epoch 1/50
32/32 - 2s - 52ms/step - loss: 1.2038
Epoch 2/50
32/32 - 0s - 2ms/step - loss: 1.0818
Epoch 3/50
32/32 - 0s - 2ms/step - loss: 1.0112
Epoch 4/50
32/32 - 0s - 1ms/step - loss: 0.9653
Epoch 5/50
32/32 - 0s - 1ms/step - loss: 0.9350
Epoch 6/50
32/32 - 0s - 1ms/step - loss: 0.9124
Epoch 7/50
32/32 - 0s - 1ms/step - loss: 0.8975
Epoch 8/50
32/32 - 0s - 1ms/step - loss: 0.8858
Epoch 9/50
32/32 - 0s - 2ms/step - loss: 0.8733
Epoch 10/50
32/32 - 0s - 1ms/step - loss: 0.8628
Epoch 11/50
32/32 - 0s - 1ms/step - loss: 0.8528
Epoch 12/50
32/32 - 0s - 897us/step - loss: 0.8455
Epoch 13/50
32/32 - 0s - 958us/step - loss: 0.8372
Epoch 14/50
32/32 - 0s - 927us/step - loss: 0.8344
Epoch 15/50
32/32 - 0s - 908us/step - loss: 0.8283
Epoch 16/50
32/32 - 0s - 858us/step - loss: 0.8215
Epoch 17/50
32/32 - 0s - 880us/step - loss: 0.8220
Epoch 18/50
32/32 - 0s - 899us/step - loss: 0.8156


In [262]:
# Validate the Best Model
best_autoencoder = create_autoencoder(
    input_dim=input_dim,
    encoding_dim=best_params['encoding_dim'],
    layer_sizes=best_params['layer_sizes'],
    dropout_rate=best_params['dropout_rate']
)

In [273]:
from sklearn.model_selection import train_test_split, KFold

train_data, test_data = train_test_split(df_scaled, test_size=0.2, random_state=42)

In [274]:
def train_multiple_autoencoders(num_autoencoders, input_dim, best_params, data):
    autoencoders = []
    for _ in range(num_autoencoders):
        autoencoder = create_autoencoder(
            input_dim=input_dim,
            encoding_dim=best_params['encoding_dim'],
            layer_sizes=best_params['layer_sizes'],
            dropout_rate=best_params['dropout_rate']
        )
        train_autoencoder(autoencoder, data, epochs=best_params['epochs'], batch_size=best_params['batch_size'])
        autoencoders.append(autoencoder)
    return autoencoders


In [275]:
def ensemble_encoding(autoencoders, data):
    encoded_representations = [get_encoded_data(autoencoder, data) for autoencoder in autoencoders]
    combined_encoding = np.mean(encoded_representations, axis=0)
    return combined_encoding

In [276]:

def cross_validate_ensemble_model(autoencoder_func, train_data, n_splits=5, num_autoencoders=3):
    kf = KFold(n_splits=n_splits)
    fold = 1
    for train_index, val_index in kf.split(train_data):
        kf_train_data, val_data = train_data[train_index], train_data[val_index]
        autoencoders = autoencoder_func(num_autoencoders, input_dim, best_params, kf_train_data)
        combined_encoding = ensemble_encoding(autoencoders, val_data)
        
        optimal_clusters_kmeans = find_optimal_clusters(combined_encoding, apply_kmeans_clustering, range(2, 11))
        best_labels_kmeans = apply_kmeans_clustering(combined_encoding, optimal_clusters_kmeans)
        silhouette_avg_kmeans, davies_bouldin_kmeans = evaluate_clustering(combined_encoding, best_labels_kmeans)
        
        print(f'Fold {fold} - KMeans Silhouette Score: {silhouette_avg_kmeans}, Davies-Bouldin Score: {davies_bouldin_kmeans}')
        
        optimal_clusters_agg = find_optimal_clusters(combined_encoding, apply_agglomerative_clustering, range(2, 11))
        best_labels_agg = apply_agglomerative_clustering(combined_encoding, optimal_clusters_agg)
        silhouette_avg_agg, davies_bouldin_agg = evaluate_clustering(combined_encoding, best_labels_agg)
    
        print(f'Fold {fold} - Agglomerative Silhouette Score: {silhouette_avg_agg}, Davies-Bouldin Score: {davies_bouldin_agg}')
        
        fold += 1

In [277]:
def autoencoder_func(num_autoencoders, input_dim, best_params, data):
    return train_multiple_autoencoders(num_autoencoders, input_dim, best_params, data)

In [278]:
cross_validate_ensemble_model(autoencoder_func, train_data)

KeyError: "None of [Index([160, 161, 162, 163, 164, 165, 166, 167, 168, 169,\n       ...\n       790, 791, 792, 793, 794, 795, 796, 797, 798, 799],\n      dtype='int64', name='Field', length=640)] are in the [columns]"