In [1]:
# pip install -U openai scipy plotly-express scikit-learn umap-learn
# pip install numpy == 1.23.0 # umap requires an older version of numpy

In [2]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.cluster import KMeans
from scipy.spatial import distance
import plotly.express as px
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [3]:
'''
Performs AI embedding on data
'''

# load data

data = pd.read_csv('../data/GSE910_SMOTE_log2_top100.csv')
data = data.drop(data.columns[0], axis =1)
print(data)

labels = pd.read_csv('../data/GSE910_SMOTE_labels_top100.csv')
print(labels)



        RN7SL2        FTL     MT-CO3     MT-CO2    MT-ATP8    MT-ATP6  \
0    15.038124  10.267509   6.096489   6.029887   4.996209   4.828822   
1    15.313833   9.622666   9.860376   9.738486   9.475843   9.217649   
2    16.739384   9.744946  10.067124  10.726867   8.760113   8.636872   
3     6.150741  11.761988  11.113195  10.876473   9.272044  10.677880   
4     6.737094  11.851334  10.608619  10.246659   8.657706   9.951498   
..         ...        ...        ...        ...        ...        ...   
111   4.355103  11.250609  13.168966  12.982082  12.444777  12.334313   
112   6.131375   9.797686  13.330152  13.433663  13.033499  13.012454   
113   4.201878  10.747962  13.237784  12.853114  12.543654  12.216011   
114   7.196152  11.728989  13.865689  14.082163  13.449037  13.490130   
115   5.810201  11.861459  13.483685  14.015472  12.733038  12.980847   

        MT-CO1       IGKC     MT-ND4    MT-RNR2  ...      CTSB     IGKJ1  \
0     7.590253   5.647115   5.412997   5.024420

In [4]:
# openAI API key

client = OpenAI(
  api_key='',  
)


In [5]:
# embedding transcriptomic data

def get_embedding(df):

	# model
	
	response = client.embeddings.create(
    	model= 'text-embedding-ada-002',
    	input=[df]
	)
	# Get embedded data
	
	embedding = response.data[0].embedding
    
	return embedding

In [6]:
# drop first column of sample IDs

embedded_data = data.drop(data.columns[0], axis = 1)
print(embedded_data.shape)

# change numeric to str and concatenate everything by row (sentence-like input structure required)

embedded_data['concatenated'] = embedded_data.astype(str).apply(lambda row: ' '.join(row), axis = 1)
print(embedded_data['concatenated'])

# apply embeddings

embedded_data['embedding'] = embedded_data['concatenated'].apply(get_embedding)



(116, 99)
0      10.26750924509026 6.096489077610898 6.02988704...
1      9.6226660795167 9.86037615653056 9.73848598171...
2      9.744945814413533 10.06712393608517 10.7268670...
3      11.761988404115765 11.113195214781117 10.87647...
4      11.851334460576384 10.608618955940033 10.24665...
                             ...                        
111    11.250608516979582 13.168966216330713 12.98208...
112    9.797685814655683 13.330152007589266 13.433663...
113    10.74796235187887 13.237783506785176 12.853114...
114    11.728988798545844 13.865689050483866 14.08216...
115    11.861458887347524 13.483685277296948 14.01547...
Name: concatenated, Length: 116, dtype: object


In [7]:
# run kmeans based on embeddings 

# 2 clusters for responder, non-responder

kmeans = KMeans(n_clusters = 2, n_init = 'auto')
model = kmeans.fit(embedded_data['embedding'].tolist())

In [8]:
# dimensionality reduction and visualization using UMAP

um = umap.UMAP()
embedded_data_2d = um.fit_transform(embedded_data['embedding'].tolist())

color_map = {
    '0' : 'purple',
    '1' : 'yellow'
}

fig = px.scatter(x = embedded_data_2d[:, 0], y = embedded_data_2d[:, 1], color = labels['Response'], color_discrete_map = color_map, width = 500, height = 400)
fig.update_layout(xaxis_title = 'umap 1', yaxis_title = 'umap 2')

fig.show()
# requires Kaleido installation
fig.write_image("../figures/GSE910_embeddings_umap.png")

In [9]:
# Train a logistic regression model using the embeddings before optimizing hyperparameters
X_train, X_test, y_train, y_test = train_test_split(embedded_data['embedding'].tolist(), labels['Response'], test_size = 0.3, random_state=1234)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_train)
training_accuracy = accuracy_score(y_pred, y_train)
y_pred = logreg.predict(X_test)
test_accuracy = accuracy_score(y_pred, y_test)
print("Training accuracy before hyperparameter optimization:", training_accuracy)
print("Test set accuracy before hyperparameter optimization:", test_accuracy)

Training accuracy before hyperparameter optimization: 0.5432098765432098
Test set accuracy before hyperparameter optimization: 0.45714285714285713


In [10]:
# find the best hyperparameters for an L1 regularized LogReg model
hyperparameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

grid_search = GridSearchCV(LogisticRegression(penalty="l1"), hyperparameters,  cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best hyperparameters:", best_params)


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge



Best hyperparameters: {'C': 1000, 'max_iter': 1000, 'solver': 'liblinear'}



The max_iter was reached which means the coef_ did not converge



In [36]:
# Train a logistic regression model using the top 2 PCAs after optimizing hyperparameters
y_pred = best_model.predict(X_train)
training_accuracy = accuracy_score(y_pred, y_train)
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_pred, y_test)

print("Training accuracy after hyperparameter optimization:", training_accuracy)
print("Test set accuracy after hyperparameter optimization:", test_accuracy)

Training accuracy after hyperparameter optimization: 1.0
Test set accuracy after hyperparameter optimization: 0.4857142857142857


In [38]:
# export cluster assignments

clusters_embedding = pd.concat([pd.DataFrame(y_pred, columns = ['y_pred']), pd.DataFrame(y_test.to_list(), columns = ['y_test'])], axis = 1)

clusters_embedding.to_csv('../data/clusters_embedding.csv', index = False)

# export data for L1 log reg
embedded_data['embedding'].to_csv('../data/embedded_data.csv', index = False)