In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from tqdm import tqdm
import os
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Read the tab-separated GO-term annotation table. Later cells rely on its
# 'EntryID', 'term' and 'aspect' columns.
train_terms = pd.read_csv(
    filepath_or_buffer="Train/train_terms.tsv",
    delimiter="\t",
)
print(train_terms.shape)
train_terms.head()

In [None]:
# Protein identifiers of the training set; later cells match them against
# train_terms['EntryID'].
train_protein_ids = np.load('train_ids.npy')
print(train_protein_ids.shape)
print(train_protein_ids[:5])

# Pre-computed embedding matrix.
# NOTE(review): presumably row i belongs to train_protein_ids[i] -- confirm.
train_embeddings = np.load('train_embeds.npy')

# Wrap the embedding matrix in a DataFrame whose columns are numbered from 1
# ("Column_1", "Column_2", ...).
column_num = train_embeddings.shape[1]
embedding_column_names = [f"Column_{idx}" for idx in range(1, column_num + 1)]
train_df = pd.DataFrame(train_embeddings, columns=embedding_column_names)
print(train_df.shape)
train_df.head()

In [None]:
# Frequency of every GO term, most frequent first; computed once and reused.
term_counts = train_terms['term'].value_counts()
print(term_counts.shape)

# Number of most frequent terms shown in the bar chart. The same constant
# drives both the slice and the title so the two cannot drift apart.
TOP_N_TERMS = 100
plot_df = term_counts.iloc[:TOP_N_TERMS]

figure, axis = plt.subplots(1, 1, figsize=(12, 6))

bp = sns.barplot(ax=axis, x=np.array(plot_df.index), y=plot_df.values)
# Style the tick labels through tick_params: re-setting the labels with
# set_xticklabels(get_xticklabels()) can trigger matplotlib's
# "set_ticklabels() should only be used with a fixed number of ticks" warning.
axis.tick_params(axis='x', labelrotation=90, labelsize=6)
axis.set_title(f'Top {TOP_N_TERMS} frequent GO term IDs')
bp.set_xlabel("GO term IDs", fontsize = 12)
bp.set_ylabel("Count", fontsize = 12)
plt.show()

In [None]:
# Number of most frequent GO terms that are used as prediction targets.
num_of_labels = 500

# Rank the terms by how often they occur and keep the `num_of_labels` most
# frequent `GO term ID`s as labels.
term_frequencies = train_terms['term'].value_counts()
labels = term_frequencies.index[:num_of_labels].tolist()

# Restrict the annotation table to rows whose term is one of the chosen labels.
is_selected_term = train_terms['term'].isin(labels)
train_terms_updated = train_terms[is_selected_term]
print(train_terms_updated.shape)

# Pie chart: share of the retained annotations per 'aspect' value.
pie_df = train_terms_updated['aspect'].value_counts()
palette_color = sns.color_palette('bright')
plt.pie(
    pie_df.values,
    labels=np.array(pie_df.index),
    colors=palette_color,
    autopct='%.0f%%',
)
plt.show()

In [None]:
# Build the multi-hot target matrix: one row per training protein and one
# column per selected GO term, i.e. train_size x num_of_labels.
train_size = train_protein_ids.shape[0] # len(X)
train_labels = np.zeros((train_size ,num_of_labels))

# Convert from numpy to a pandas Series so that `isin` can be used below.
series_train_protein_ids = pd.Series(train_protein_ids)

# Group the annotations once: term -> array of the unique proteins (EntryID)
# annotated with it. Filtering the whole table inside the loop instead would
# rescan every row once per label.
proteins_by_term = train_terms_updated.groupby('term')['EntryID'].unique()

# Loop through each label
for i in tqdm(range(num_of_labels)):
    # All unique proteins related to the current label (GO term ID).
    label_related_proteins = proteins_by_term.loc[labels[i]]

    # Column i is 1.0 where the protein carries the current label, else 0.0.
    train_labels[:,i] = series_train_protein_ids.isin(label_related_proteins).astype(float)

# Convert the label matrix into a DataFrame whose columns are the GO term IDs.
labels_df = pd.DataFrame(data = train_labels, columns = labels)
print(labels_df.shape)
labels_df.head()

In [None]:
# Preview the first five rows of the label matrix.
labels_df.head(n=5)

In [None]:
# Persist the label matrix as CSV. The DataFrame index is written too
# (pandas' default), as the first, unnamed column.
labels_df.to_csv(path_or_buf='labels_df.csv')

In [None]:
# Display labels_df as the cell output (the notebook renders it with pandas' rich repr).
labels_df

In [None]:
# Pairwise correlation between all embedding columns.
corr = train_df.corr()

# For every column: mean of its absolute correlations with all columns
# (the column's self-correlation on the diagonal is included).
means_of_correlations = corr.abs().mean()
plt.hist(means_of_correlations, bins=100)
plt.show()

# np.where returns a 1-tuple; its only element holds the positional indices
# of the columns whose mean absolute correlation exceeds 0.12.
highly_correlated_cols = np.where(means_of_correlations > 0.12)
flagged_positions = highly_correlated_cols[0]
print('number of bad cols:' ,len(flagged_positions))
# +1 because the column names are numbered from 1.
print('col numbers:',(flagged_positions+1))

In [None]:
# Remove the columns flagged as highly correlated. highly_correlated_cols is
# the 1-tuple returned by np.where, so its first element holds the positions.
columns_to_drop = train_df.columns[highly_correlated_cols[0]]
reduced_train_df = train_df.drop(columns=columns_to_drop)
reduced_train_df.shape

In [None]:
#PCA with threshold:
def apply_pca_with_threshold(data, threshold):
    """Reconstruct ``data`` from its leading principal components.

    The features are standardized, a full PCA is fitted to obtain (and plot)
    the cumulative explained-variance curve, and the smallest number of
    leading components whose cumulative explained-variance ratio reaches
    ``threshold`` is selected. The data is then projected onto those
    components and mapped back with ``inverse_transform``, so the returned
    frame has the same columns as ``data`` (in standardized units), not one
    column per component.

    Args:
        data: pandas DataFrame of numeric features, one row per sample.
        threshold: Target cumulative explained-variance ratio. If no prefix
            of components reaches it (e.g. a value above 1, or 1.0 missed by
            floating-point rounding), all components are kept.

    Returns:
        A tuple ``(num_components, dropped_columns, pca_threshold)``:
        ``num_components`` is the number of components kept;
        ``dropped_columns`` is an always-empty Index, kept only for backward
        compatibility because the reconstruction retains every input column;
        ``pca_threshold`` is the reconstructed DataFrame with
        ``data.columns`` as its columns and a default RangeIndex.
    """
    # Standardize the features so every column enters PCA on the same scale.
    scaled_data = StandardScaler().fit_transform(data)

    # Full PCA: only its explained-variance profile is needed, so fit() is
    # enough and the transformed data is not materialized.
    pca = PCA()
    pca.fit(scaled_data)
    cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumulative_explained_variance)
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.show()

    # First position at which the cumulative ratio reaches the threshold.
    # np.argmax on an all-False mask returns 0, which would silently select a
    # single component, so the "never reached" case is handled explicitly.
    reached = cumulative_explained_variance >= threshold
    if reached.any():
        num_components = int(np.argmax(reached)) + 1
    else:
        num_components = len(cumulative_explained_variance)

    # Fit PCA with the selected number of components
    pca_selected = PCA(n_components=num_components)
    pca_result_selected = pca_selected.fit_transform(scaled_data)

    # Map the retained components back to the (standardized) feature space.
    selected_features = pca_selected.inverse_transform(pca_result_selected)

    # Label the columns from `data` itself rather than from a notebook global,
    # so the function also works for frames other than train_df.
    pca_threshold = pd.DataFrame(data=selected_features, columns=data.columns)

    # The reconstruction keeps every input column, hence nothing is dropped.
    dropped_columns = data.columns[:0]

    return num_components, dropped_columns, pca_threshold

# Target cumulative explained-variance ratio for the PCA reconstruction.
threshold = 0.99

# Run the thresholded PCA on the full embedding frame (train_df) and unpack
# the component count, the dropped-column index and the reconstructed frame.
num_components_retained, dropped_columns, pca_threshold_df = apply_pca_with_threshold(
    data=train_df,
    threshold=threshold,
)
print("Number of components retained:", num_components_retained)
print(pca_threshold_df)



In [None]:
# Training configuration.
INPUT_SHAPE = [pca_threshold_df.shape[1]]
BATCH_SIZE = 5120
epochs = 200
# Fraction of the rows held out for validation. Without validation data Keras
# never produces `val_loss`, so the EarlyStopping callback below could not act.
# Keras takes the split from the last rows of the arrays, before shuffling.
VALIDATION_SPLIT = 0.1

# Multi-label classifier: normalized input -> three hidden ReLU layers ->
# one sigmoid output per GO term.
model = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=num_of_labels,activation='sigmoid')
])


# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC()],
)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=10,          # Number of epochs with no improvement before stopping
    restore_best_weights=True  # Restore weights from the epoch with the best validation loss
)

# Plain NumPy arrays are passed so that validation_split can slice the inputs.
history = model.fit(
    pca_threshold_df.to_numpy(), labels_df.to_numpy(),
    batch_size=BATCH_SIZE,
    epochs=epochs,  # the comma here was missing, which made the cell a SyntaxError
    validation_split=VALIDATION_SPLIT,
    callbacks=[early_stopping],
)

In [None]:
# Sanity check on the first training sample: model output next to the true
# label row. The input is sliced as [:1] to keep the leading batch dimension.
first_sample = pca_threshold_df.to_numpy()[:1]
pred = model.predict(first_sample)
label = labels_df.to_numpy()[0]
print(pred)
print(label)

In [None]:
# Evaluate on the same frames that were passed to model.fit, so this is a
# training-set score. evaluate() returns the loss followed by the compiled
# metrics in order; position 1 is binary_accuracy.
evaluation = model.evaluate(pca_threshold_df, labels_df)
loss_value, accuracy = evaluation[:2]
print("Overall Accuracy:", accuracy)