In [2]:
pip install -U openai, scipy, plotly-express, scikit-learn, umap-learn
pip install numpy==1.23.0 # umap requires an older version of numpy

In [23]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.cluster import KMeans
from scipy.spatial import distance
import plotly.express as px
import umap.umap_ as umap

In [87]:
# Performs AI embedding on data

# load data
# `data` holds the values that are embedded further down the notebook;
# `labels` supplies the 'Response' column used for the plot markers.
data = pd.read_csv('../data/log2_data_v3.csv')
labels = pd.read_csv('../data/labels_v3.csv')

# Only the last expression of a cell is rendered automatically, so a bare
# `data.head()` in the middle of the cell is silently discarded. Use
# display() so both previews actually appear.
display(data.head())
display(labels.head())

Unnamed: 0.1,Unnamed: 0,Response
0,SRR7344546,R
1,SRR7344554,N
2,SRR7344556,N
3,SRR7344564,N
4,SRR7344565,N


In [49]:
# Create the OpenAI client used by the embedding cells below.
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in (or has to be pasted into) the notebook itself.
# Export the variable before starting the kernel.
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY'),
)


In [66]:
def get_embedding(df, model='text-embedding-ada-002'):
    """Request an embedding for one piece of text from the OpenAI API.

    Uses the module-level ``client`` object, which must be defined before
    this function is called.

    Parameters
    ----------
    df : str
        The text to embed. Despite the name this is not a DataFrame: the
        value is sent as the single element of the request's ``input`` list
        (in this notebook it is one row of values joined by spaces).
    model : str, optional
        Name of the embedding model passed to the API. Defaults to
        ``'text-embedding-ada-002'``, the value that was previously
        hard-coded.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response's
    ``data`` list.
    """
    # Embed transcriptomic data; one request per call, with a single input.
    response = client.embeddings.create(
        model=model,
        input=[df],
    )

    # Exactly one input was sent, so the embedding of interest is item 0.
    return response.data[0].embedding

In [70]:
# drop first column of sample IDs
embedded_data = data.drop(columns=data.columns[0])

# testing: keep only the first `keep_thresh` columns.
# .copy() makes the slice an independent frame; without it, adding the
# 'concatenated' / 'embedding' columns below assigns onto a slice of another
# DataFrame and pandas emits SettingWithCopyWarning.
keep_thresh = 100
embedded_data = embedded_data.iloc[:, :keep_thresh].copy()
print(embedded_data.shape)

# change numeric to str and concatenate everything by row
# (each row becomes one space-separated string)
embedded_data['concatenated'] = embedded_data.astype(str).apply(' '.join, axis=1)
print(embedded_data['concatenated'])

# apply embeddings: get_embedding is called once per row, i.e. one API
# request per sample
embedded_data['embedding'] = embedded_data['concatenated'].apply(get_embedding)



(125, 100)
0      0.0 1.1649346867152932 0.17798414980298533 0.6...
1      0.12880729789026002 1.358726836522174 0.070844...
2      0.16219362419847294 0.14344533937100276 0.0 0....
3      0.2677332475807369 0.03057601299596717 0.32604...
4      0.13566506647355595 0.053567868543348565 0.241...
                             ...                        
120    0.18652448727387372 0.0 0.1414241399271249 0.9...
121    0.020039051950253725 0.0 0.06217935087677976 0...
122    0.07587424138298625 0.0 0.2033058122184411 0.6...
123    0.12628848729363462 0.0 0.2261974310538789 1.1...
124    0.1012481118342196 0.0 0.04323628856435453 0.7...
Name: concatenated, Length: 125, dtype: object


In [97]:
# run kmeans based on embeddings
# 2 clusters for responder, non-responder
# random_state is fixed because k-means initialisation is random: without a
# seed the cluster ids (0/1), and therefore the colours in the plot, can swap
# or change between runs.
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=42)
kmeans.fit(embedded_data['embedding'].tolist())

In [100]:
# dimensionality reduction and visualization using UMAP

# UMAP is stochastic; fix the seed so the 2-D layout is the same on every run.
um = umap.UMAP(random_state=42)
embedded_data_2d = um.fit_transform(embedded_data['embedding'].tolist())

# Marker symbols are taken from `labels` purely by row position, so the two
# tables must at least have the same number of rows.
assert len(labels) == len(embedded_data_2d), (
    'labels and embedded data have different numbers of rows'
)

# Cluster ids are converted to strings below so that plotly treats them as
# discrete categories; the keys here must therefore be strings too.
color_map = {
    '0': 'orange',
    '1': 'blue',
}

fig = px.scatter(
    x=embedded_data_2d[:, 0],
    y=embedded_data_2d[:, 1],
    color=kmeans.labels_.astype(str),
    color_discrete_map=color_map,
    symbol=labels['Response'],
)
fig.update_layout(
    title='UMAP of embeddings (color: k-means cluster, symbol: Response)',
    xaxis_title='umap 1',
    yaxis_title='umap 2',
)

fig.show()