In [6]:
# pip install -U openai scipy plotly-express scikit-learn umap-learn
# pip install numpy == 1.23.0 # umap requires an older version of numpy

In [61]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.cluster import KMeans
from scipy.spatial import distance
import plotly.express as px
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [79]:
'''
Performs AI embedding on data
'''

# load data

data = pd.read_csv('../data/GSE910_smote_data.csv')

labels = pd.DataFrame(data['Response'])
labels = labels.rename(columns={labels.columns[0]: 'Response'})
print(labels)

data = data.drop(columns = ['Response'])
print(data)


    Response
0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         1
12         1
13         1
14         1
15         1
16         1
17         1
18         0
19         0
20         0
21         1
22         0
23         0
24         1
25         1
26         1
27         1
28         1
29         1
30         1
31         1
    Unnamed: 0    S100A7  HLA-DQA1    MAGEA3    S100A8     DMBT1    MAGEC2  \
0            0  0.289860  0.634494 -0.001243 -0.360320 -0.906716  1.843763   
1            1 -0.774348  0.772281 -1.493807 -0.912727 -0.893726 -1.423186   
2            2  1.429504 -1.002635  1.240063  1.551765  0.719683  0.780976   
3            3 -0.897511 -0.774126  1.457942 -0.397389 -0.864712 -0.408723   
4            4 -0.624348 -1.162335 -1.204793 -0.531867  1.530381 -1.000433   
5            5 -0.921433 -1.039624 -1.433164 -1.270253 -0.558488 -1.050044   
6            6 -0.452806 

In [5]:
# openAI API key

client = OpenAI(
  api_key='',  
)


In [71]:
# embedding transcriptomic data

def get_embedding(df):

	# model
	
	response = client.embeddings.create(
    	model= 'text-embedding-ada-002',
    	input=[df]
	)
	# Get embedded data
	
	embedding = response.data[0].embedding
    
	return embedding

In [80]:
# drop first column of sample IDs

embedded_data = data.drop(data.columns[0], axis = 1)

# testing: drop columns

keep_thresh = 100
embedded_data = embedded_data.iloc[:, :keep_thresh]
print(embedded_data.shape)

# change numeric to str and concatenate everything by row (sentence-like input structure required)

embedded_data['concatenated'] = embedded_data.astype(str).apply(lambda row: ' '.join(row), axis = 1)
print(embedded_data['concatenated'])

# apply embeddings

embedded_data['embedding'] = embedded_data['concatenated'].apply(get_embedding)



(32, 100)
0     0.28985955262628305 0.6344944293624388 -0.0012...
1     -0.7743479953826805 0.7722814544436405 -1.4938...
2     1.4295035157971616 -1.0026351623438754 1.24006...
3     -0.8975110432535529 -0.7741258645722956 1.4579...
4     -0.6243476721738939 -1.1623352751155436 -1.204...
5     -0.9214327243331752 -1.039623590685269 -1.4331...
6     -0.4528063163076792 0.7699709819397569 -0.8601...
7     -0.8364445937238594 -0.8951763280139385 0.4959...
8     -0.7499238028248473 -1.07011779652627 -0.99335...
9     1.5706779576668246 -0.9221937135932072 0.99534...
10    1.2704387407910072 0.33116956650678997 -1.6263...
11    -0.7484567903552781 1.0662627218806249 1.07172...
12    1.5735691116652235 0.9795642167928036 0.670856...
13    -0.6072201226017688 -0.7238362274417525 -0.086...
14    -0.7787617991899041 1.1511750667009752 0.82954...
15    0.9658367415507008 1.5234101847023058 0.058365...
16    -0.5792079416549456 -0.7882646801834792 -1.158...
17    -0.9420623159005226 1.1026516704

In [81]:
# run kmeans based on embeddings 

# 2 clusters for responder, non-responder

kmeans = KMeans(n_clusters = 2, n_init = 'auto')
model = kmeans.fit(embedded_data['embedding'].tolist())

In [82]:
# dimensionality reduction and visualization using UMAP

um = umap.UMAP()
embedded_data_2d = um.fit_transform(embedded_data['embedding'].tolist())

color_map = {
    '0' : 'orange',
    '1' : 'blue'
}

fig = px.scatter(x = embedded_data_2d[:, 0], y = embedded_data_2d[:, 1], color = model.labels_.astype(str), color_discrete_map = color_map, symbol = labels['Response'])
fig.update_layout(xaxis_title = 'umap 1', yaxis_title = 'umap 2')

fig.show()

In [89]:
# Train a logistic regression model using the embeddings before optimizing hyperparameters
X_train, X_test, y_train, y_test = train_test_split(embedded_data_2d, np.ravel(labels), test_size = 0.3, random_state=1234)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_train)
training_accuracy = accuracy_score(y_pred, y_train)
y_pred = logreg.predict(X_test)
test_accuracy = accuracy_score(y_pred, y_test)
print("Training accuracy before hyperparameter optimization:", training_accuracy)
print("Test set accuracy before hyperparameter optimization:", test_accuracy)

Training accuracy before hyperparameter optimization: 0.5909090909090909
Test set accuracy before hyperparameter optimization: 0.6


In [90]:
# find the best hyperparameters for an L1 regularized LogReg model
hyperparameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

grid_search = GridSearchCV(LogisticRegression(penalty="l1"), hyperparameters,  cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best hyperparameters:", best_params)

Best hyperparameters: {'C': 0.1, 'max_iter': 1000, 'solver': 'liblinear'}



The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge



In [91]:
# Train a logistic regression model using the top 2 PCAs after optimizing hyperparameters
y_pred = best_model.predict(X_train)
training_accuracy = accuracy_score(y_pred, y_train)
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_pred, y_test)

print("Training accuracy after hyperparameter optimization:", training_accuracy)
print("Test set accuracy after hyperparameter optimization:", test_accuracy)

Training accuracy after hyperparameter optimization: 0.5909090909090909
Test set accuracy after hyperparameter optimization: 0.3


In [92]:
# export cluster assignments

clusters_embedding = pd.concat([labels, pd.DataFrame(model.labels_, columns = ['cluster'])], axis = 1)

clusters_embedding.to_csv('../data/clusters_embedding.csv', index = False)

# export data for L1 log reg
embedded_data['embedding'].to_csv('../data/embedded_data.csv', index = False)