In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import umap


In [55]:
# read proteinfer output into dataframe
proteinfer = pd.read_csv('../exploration/proteinfer/proteinfer_categories_wide.csv')
eggnog = pd.read_csv('../tables/eggnog_PG_annotation.csv')

In [4]:
proteinfer.head()

Unnamed: 0,gene,Pfam:CL0023,EC:3.-.-.-,GO:0005524,GO:0016887,GO:0030554,GO:0032559,GO:0043168,GO:0005488,GO:0008150,...,GO:0051091,GO:0031346,GO:0044089,GO:1903658,GO:1903656,EC:1.1.1.65,GO:0050236,EC:4.1.2.4,GO:0004139,GO:2001023
0,group_9396,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,group_2011,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,rob_2~~~rob_1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,group_3075,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,dan,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# set UMAP parameters, use multicore 
reducer_2D = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42, metric='cosine', n_jobs=8)
reducer_3D = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, random_state=42, metric='cosine', n_jobs=8)

In [7]:
# drop the first column, which is the protein name
proteinfer_matrix = proteinfer.drop(columns=['gene'])
proteinfer_matrix.head()

Unnamed: 0,Pfam:CL0023,EC:3.-.-.-,GO:0005524,GO:0016887,GO:0030554,GO:0032559,GO:0043168,GO:0005488,GO:0008150,GO:0097367,...,GO:0051091,GO:0031346,GO:0044089,GO:1903658,GO:1903656,EC:1.1.1.65,GO:0050236,EC:4.1.2.4,GO:0004139,GO:2001023
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# fit the UMAP model
embedding_2D = reducer_2D.fit_transform(proteinfer_matrix)
embedding_3D = reducer_3D.fit_transform(proteinfer_matrix)

In [17]:
print(f'2D embedding shape: {embedding_2D.shape}')
print(f'3D embedding shape: {embedding_3D.shape}')

2D embedding shape: (21892, 2)
3D embedding shape: (21892, 3)


In [18]:
# plot the 2D embedding in plotly, using the protein name as the hover text
fig = px.scatter(x=embedding_2D[:,0], y=embedding_2D[:,1], hover_name=proteinfer['gene'])
fig.show()

In [19]:
# plot the 3D embedding in plotly, using the protein name as the hover text
fig = px.scatter_3d(x=embedding_3D[:,0], y=embedding_3D[:,1], z=embedding_3D[:,2], hover_name=proteinfer['gene'])
fig.show()

In [24]:
# grid search for best UMAP parameters

from sklearn.model_selection import ParameterGrid

# define parameter grid
param_grid = {
    'n_neighbors': [5, 10, 15, 20, 25, 30],
    'min_dist': [0.1, 0.2, 0.3, 0.4, 0.5],
    'n_components': [3],
    'random_state': [42],
    'metric': ['euclidean', 'cosine', 'manhattan'],
    'n_jobs': [8]
}

# this will run N number of models, where N is the number of combinations in the parameter grid
# N = 5 * 5 * 1 * 1 * 3 * 1 = 375

# create a list of all combinations of parameters
param_list = list(ParameterGrid(param_grid))

# create a list to store the results
results = []

# loop through each combination of parameters
for params in param_list:
    # create a UMAP model with the current parameters
    reducer = umap.UMAP(**params)
    # fit the model
    embedding = reducer.fit_transform(proteinfer_matrix)
    # store the results
    results.append({
        'n_neighbors': params['n_neighbors'],
        'min_dist': params['min_dist'],
        'n_components': params['n_components'],
        'random_state': params['random_state'],
        'metric': params['metric'],
        'n_jobs': params['n_jobs'],
        'embedding': embedding
    })

# create a dataframe from the results
results_df = pd.DataFrame(results)




Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.



In [28]:
# save the plot of the embeddings for each combination of parameters, using the protein name as the hover text and the parameters combination as the title
# save the plot as a html file
for i, row in results_df.iterrows():
    print(f'Plotting {i} of {len(results_df)}')
    fig = px.scatter_3d(x=row['embedding'][:,0], y=row['embedding'][:,1], z=row['embedding'][:,2], hover_name=proteinfer['gene'])
    fig.update_layout(title=f'n_neighbors: {row["n_neighbors"]}, min_dist: {row["min_dist"]}, metric: {row["metric"]}')
    fig.write_html(f'../exploration/proteinfer/umap_plots/html_plots/umap_plot_{i}.html')

Plotting 0 of 90
Plotting 1 of 90
Plotting 2 of 90
Plotting 3 of 90
Plotting 4 of 90
Plotting 5 of 90
Plotting 6 of 90
Plotting 7 of 90
Plotting 8 of 90
Plotting 9 of 90
Plotting 10 of 90
Plotting 11 of 90
Plotting 12 of 90
Plotting 13 of 90
Plotting 14 of 90
Plotting 15 of 90
Plotting 16 of 90
Plotting 17 of 90
Plotting 18 of 90
Plotting 19 of 90
Plotting 20 of 90
Plotting 21 of 90
Plotting 22 of 90
Plotting 23 of 90
Plotting 24 of 90
Plotting 25 of 90
Plotting 26 of 90
Plotting 27 of 90
Plotting 28 of 90
Plotting 29 of 90
Plotting 30 of 90
Plotting 31 of 90
Plotting 32 of 90
Plotting 33 of 90
Plotting 34 of 90
Plotting 35 of 90
Plotting 36 of 90
Plotting 37 of 90
Plotting 38 of 90
Plotting 39 of 90
Plotting 40 of 90
Plotting 41 of 90
Plotting 42 of 90
Plotting 43 of 90
Plotting 44 of 90
Plotting 45 of 90
Plotting 46 of 90
Plotting 47 of 90
Plotting 48 of 90
Plotting 49 of 90
Plotting 50 of 90
Plotting 51 of 90
Plotting 52 of 90
Plotting 53 of 90
Plotting 54 of 90
Plotting 55 of 90
Pl

In [29]:
results_df.to_csv('../exploration/proteinfer/umap_grid_results.csv', index=False)

array([[ 2.478485 , 10.843754 , -6.2398705],
       [ 6.089166 ,  6.9787197,  1.3918666],
       [ 6.5486655,  1.5164227,  3.439683 ],
       ...,
       [ 6.6506624, -1.4174998, -1.6654941],
       [ 6.688608 ,  6.5024486, -6.055646 ],
       [ 6.0018797, -3.5929508, -5.3837066]], dtype=float32)

In [50]:
# create a reduced dataframe where the row sum is greater than 2, keep the protein names
proteinfer_reduced = proteinfer[proteinfer_matrix.sum(axis=1) > 2]
# proteinfer_reduced = proteinfer_matrix[proteinfer_matrix.sum(axis=1) > 2]
print(f'Original shape: {proteinfer_matrix.shape}')
print(f'Reduced shape: {proteinfer_reduced.shape}')

proteinfer_matrix_reduced = proteinfer_reduced.drop(columns=['gene'])

Original shape: (21892, 8397)
Reduced shape: (18612, 8398)


In [51]:
proteinfer_reduced.head()

Unnamed: 0,gene,Pfam:CL0023,EC:3.-.-.-,GO:0005524,GO:0016887,GO:0030554,GO:0032559,GO:0043168,GO:0005488,GO:0008150,...,GO:0051091,GO:0031346,GO:0044089,GO:1903658,GO:1903656,EC:1.1.1.65,GO:0050236,EC:4.1.2.4,GO:0004139,GO:2001023
0,group_9396,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,group_2011,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,rob_2~~~rob_1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,group_3075,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,dan,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# grid search with the reduced dataset

# this will run N number of models, where N is the number of combinations in the parameter grid
# N = 5 * 5 * 1 * 1 * 3 * 1 = 375

# create a list of all combinations of parameters
param_list = list(ParameterGrid(param_grid))

# create a list to store the results
results = []

# loop through each combination of parameters
for params in param_list:
    print(f'Running model {len(results) + 1} of {len(param_list)}')
    # create a UMAP model with the current parameters
    reducer = umap.UMAP(**params)
    # fit the model
    embedding = reducer.fit_transform(proteinfer_matrix_reduced)
    # store the results
    results.append({
        'n_neighbors': params['n_neighbors'],
        'min_dist': params['min_dist'],
        'n_components': params['n_components'],
        'random_state': params['random_state'],
        'metric': params['metric'],
        'n_jobs': params['n_jobs'],
        'embedding': embedding
    })

# create a dataframe from the results
results_df_reduced = pd.DataFrame(results)



Running model 1 of 90
Running model 2 of 90
Running model 3 of 90
Running model 4 of 90
Running model 5 of 90
Running model 6 of 90
Running model 7 of 90
Running model 8 of 90
Running model 9 of 90
Running model 10 of 90
Running model 11 of 90
Running model 12 of 90
Running model 13 of 90
Running model 14 of 90
Running model 15 of 90
Running model 16 of 90
Running model 17 of 90
Running model 18 of 90
Running model 19 of 90
Running model 20 of 90
Running model 21 of 90
Running model 22 of 90
Running model 23 of 90
Running model 24 of 90
Running model 25 of 90
Running model 26 of 90
Running model 27 of 90
Running model 28 of 90
Running model 29 of 90
Running model 30 of 90
Running model 31 of 90
Running model 32 of 90
Running model 33 of 90
Running model 34 of 90
Running model 35 of 90
Running model 36 of 90
Running model 37 of 90
Running model 38 of 90
Running model 39 of 90
Running model 40 of 90
Running model 41 of 90
Running model 42 of 90
Running model 43 of 90
Running model 44 of 


Graph is not fully connected, spectral embedding may not work as expected.



Running model 62 of 90
Running model 63 of 90
Running model 64 of 90
Running model 65 of 90
Running model 66 of 90
Running model 67 of 90



Graph is not fully connected, spectral embedding may not work as expected.



Running model 68 of 90
Running model 69 of 90
Running model 70 of 90
Running model 71 of 90
Running model 72 of 90
Running model 73 of 90



Graph is not fully connected, spectral embedding may not work as expected.



Running model 74 of 90
Running model 75 of 90
Running model 76 of 90
Running model 77 of 90
Running model 78 of 90
Running model 79 of 90



Graph is not fully connected, spectral embedding may not work as expected.



Running model 80 of 90
Running model 81 of 90
Running model 82 of 90
Running model 83 of 90
Running model 84 of 90
Running model 85 of 90



Graph is not fully connected, spectral embedding may not work as expected.



Running model 86 of 90
Running model 87 of 90
Running model 88 of 90
Running model 89 of 90
Running model 90 of 90


In [53]:
# save the plot of the embeddings for each combination of parameters, using the protein name as the hover text and the parameters combination as the title
# save the plot as a html file
for i, row in results_df_reduced.iterrows():
    print(f'Plotting {i} of {len(results_df_reduced)}')
    fig = px.scatter_3d(x=row['embedding'][:,0], y=row['embedding'][:,1], z=row['embedding'][:,2], hover_name=proteinfer_reduced['gene'])
    fig.update_layout(title=f'n_neighbors: {row["n_neighbors"]}, min_dist: {row["min_dist"]}, metric: {row["metric"]}')
    fig.write_html(f'../exploration/proteinfer/umap_plots/html_plots/umap_plot_reduced_{i}.html')

Plotting 0 of 90
Plotting 1 of 90
Plotting 2 of 90
Plotting 3 of 90
Plotting 4 of 90
Plotting 5 of 90
Plotting 6 of 90
Plotting 7 of 90
Plotting 8 of 90
Plotting 9 of 90
Plotting 10 of 90
Plotting 11 of 90
Plotting 12 of 90
Plotting 13 of 90
Plotting 14 of 90
Plotting 15 of 90
Plotting 16 of 90
Plotting 17 of 90
Plotting 18 of 90
Plotting 19 of 90
Plotting 20 of 90
Plotting 21 of 90
Plotting 22 of 90
Plotting 23 of 90
Plotting 24 of 90
Plotting 25 of 90
Plotting 26 of 90
Plotting 27 of 90
Plotting 28 of 90
Plotting 29 of 90
Plotting 30 of 90
Plotting 31 of 90
Plotting 32 of 90
Plotting 33 of 90
Plotting 34 of 90
Plotting 35 of 90
Plotting 36 of 90
Plotting 37 of 90
Plotting 38 of 90
Plotting 39 of 90
Plotting 40 of 90
Plotting 41 of 90
Plotting 42 of 90
Plotting 43 of 90
Plotting 44 of 90
Plotting 45 of 90
Plotting 46 of 90
Plotting 47 of 90
Plotting 48 of 90
Plotting 49 of 90
Plotting 50 of 90
Plotting 51 of 90
Plotting 52 of 90
Plotting 53 of 90
Plotting 54 of 90
Plotting 55 of 90
Pl

In [54]:
# plot the 3D embedding in plotly, using the protein name as the hover text and the parameters combination as the title
# plt.show 
# I want to be able to search for text in the plot, but I can't figure out how to do that
# plot only one example
fig = px.scatter_3d(x=results_df_reduced['embedding'][0][:,0], y=results_df_reduced['embedding'][0][:,1], z=results_df_reduced['embedding'][0][:,2], hover_name=proteinfer_reduced['gene'])
fig.update_layout(title=f'n_neighbors: {results_df_reduced["n_neighbors"][0]}, min_dist: {results_df_reduced["min_dist"][0]}, metric: {results_df_reduced["metric"][0]}')
fig.show()

In [68]:
# create a UMAP in 2D with the best parameters
reducer = umap.UMAP(n_neighbors=5, min_dist=0.2, metric='cosine', n_components=2, n_jobs=9)
embedding = reducer.fit_transform(proteinfer_matrix_reduced)

# join eggnog columns Gene and COG_category and proteinfer_reduced (by variables Gene and gene)
# this will create a dataframe with the protein names, the COG category, and the UMAP embedding
proteinfer_reduced_eggnog = proteinfer_reduced.merge(eggnog[['Gene', 'COG_category']], left_on='gene', right_on='Gene', how='left')
proteinfer_reduced_eggnog.head()


# plot the 2D embedding in plotly, using the protein name as the hover text and the parameters combination as the title
# use the COG category as the color
fig = px.scatter(x=embedding[:,0], y=embedding[:,1], hover_name=proteinfer_reduced_eggnog['gene'], color=proteinfer_reduced_eggnog['COG_category'])
fig.update_layout(title=f'n_neighbors: {results_df_reduced["n_neighbors"][0]}, min_dist: {results_df_reduced["min_dist"][0]}, metric: {results_df_reduced["metric"][0]}')
fig.show()

In [69]:
# grid search with the reduced dataset and colors by COG category

# this will run N number of models, where N is the number of combinations in the parameter grid

param_grid = {
    'n_neighbors': [5, 10, 15, 20, 25],
    'min_dist': [0.1, 0.2, 0.3, 0.4, 0.5],
    'n_components': [2],
    'random_state': [42],
    'metric': ['cosine', 'euclidean', 'manhattan', 'chebyshev'],
    'n_jobs': [8]
}

# create a list of all combinations of parameters
param_list = list(ParameterGrid(param_grid))

# create a list to store the results
results = []

# loop through each combination of parameters
for params in param_list:
    print(f'Running model {len(results) + 1} of {len(param_list)}')
    # create a UMAP model with the current parameters
    reducer = umap.UMAP(**params)
    # fit the model
    embedding = reducer.fit_transform(proteinfer_matrix_reduced)
    # store the results
    results.append({
        'n_neighbors': params['n_neighbors'],
        'min_dist': params['min_dist'],
        'n_components': params['n_components'],
        'random_state': params['random_state'],
        'metric': params['metric'],
        'n_jobs': params['n_jobs'],
        'embedding': embedding
    })

# create a dataframe from the results
results_df_reduced = pd.DataFrame(results)

# save the plot of the embeddings for each combination of parameters, using the protein name as the hover text and the parameters combination as the title
# save the plot as a html file
for i, row in results_df_reduced.iterrows():
    print(f'Plotting {i} of {len(results_df_reduced)}')
    fig = px.scatter(x=row['embedding'][:,0], y=row['embedding'][:,1], hover_name=proteinfer_reduced['gene'], color=proteinfer_reduced_eggnog['COG_category'])
    fig.update_layout(title=f'n_neighbors: {row["n_neighbors"]}, min_dist: {row["min_dist"]}, metric: {row["metric"]}')
    fig.write_html(f'../exploration/proteinfer/umap_plots/html_plots/umap_plot_reduced_2D_{i}.html')



Running model 1 of 100
Running model 2 of 100
Running model 3 of 100
Running model 4 of 100
Running model 5 of 100
Running model 6 of 100
Running model 7 of 100
Running model 8 of 100
Running model 9 of 100
Running model 10 of 100
Running model 11 of 100
Running model 12 of 100
Running model 13 of 100
Running model 14 of 100
Running model 15 of 100
Running model 16 of 100
Running model 17 of 100
Running model 18 of 100
Running model 19 of 100
Running model 20 of 100
Running model 21 of 100
Running model 22 of 100
Running model 23 of 100
Running model 24 of 100
Running model 25 of 100
Running model 26 of 100
Running model 27 of 100
Running model 28 of 100
Running model 29 of 100
Running model 30 of 100
Running model 31 of 100
Running model 32 of 100
Running model 33 of 100
Running model 34 of 100
Running model 35 of 100
Running model 36 of 100
Running model 37 of 100
Running model 38 of 100
Running model 39 of 100
Running model 40 of 100
Running model 41 of 100
Running model 42 of 100
R


Graph is not fully connected, spectral embedding may not work as expected.



Running model 52 of 100
Running model 53 of 100
Running model 54 of 100
Running model 55 of 100
Running model 56 of 100



Graph is not fully connected, spectral embedding may not work as expected.



Running model 57 of 100
Running model 58 of 100
Running model 59 of 100
Running model 60 of 100
Running model 61 of 100



Graph is not fully connected, spectral embedding may not work as expected.



Running model 62 of 100
Running model 63 of 100
Running model 64 of 100
Running model 65 of 100
Running model 66 of 100



Graph is not fully connected, spectral embedding may not work as expected.



Running model 67 of 100
Running model 68 of 100
Running model 69 of 100
Running model 70 of 100
Running model 71 of 100



Graph is not fully connected, spectral embedding may not work as expected.



Running model 72 of 100
Running model 73 of 100
Running model 74 of 100
Running model 75 of 100
Running model 76 of 100
Running model 77 of 100
Running model 78 of 100
Running model 79 of 100
Running model 80 of 100
Running model 81 of 100
Running model 82 of 100
Running model 83 of 100
Running model 84 of 100
Running model 85 of 100
Running model 86 of 100
Running model 87 of 100
Running model 88 of 100
Running model 89 of 100
Running model 90 of 100
Running model 91 of 100
Running model 92 of 100
Running model 93 of 100
Running model 94 of 100
Running model 95 of 100
Running model 96 of 100
Running model 97 of 100
Running model 98 of 100
Running model 99 of 100
Running model 100 of 100
Plotting 0 of 100
Plotting 1 of 100
Plotting 2 of 100
Plotting 3 of 100
Plotting 4 of 100
Plotting 5 of 100
Plotting 6 of 100
Plotting 7 of 100
Plotting 8 of 100
Plotting 9 of 100
Plotting 10 of 100
Plotting 11 of 100
Plotting 12 of 100
Plotting 13 of 100
Plotting 14 of 100
Plotting 15 of 100
Plotting 