In [6]:
# pip install -U openai scipy plotly-express scikit-learn umap-learn
# pip install numpy == 1.23.0 # umap requires an older version of numpy

In [45]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.cluster import KMeans
from scipy.spatial import distance
import plotly.express as px
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [59]:
'''
Performs AI embedding on data
'''

# load data

data = pd.read_csv('../data/log2_top_100.csv')
print(data.head())

labels = pd.read_csv('../data/labels_top_100.csv')
print(labels.head())

# encode categorical labels to numerical codes
labels_values = labels['Response'].values
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels_values)
print("Label Map:")
for encoded_value, original_label in enumerate(label_encoder.classes_):
    print(f"{encoded_value}: {original_label}")

   Unnamed: 0     RN7SL2        FTL     MT-CO3     MT-CO2   MT-ATP8  \
0  SRR7344546  15.038124  10.267509   6.096489   6.029887  4.996209   
1  SRR7344554  15.313833   9.622666   9.860376   9.738486  9.475843   
2  SRR7344556  16.739384   9.744946  10.067124  10.726867  8.760113   
3  SRR7344564   6.150741  11.761988  11.113195  10.876473  9.272044   
4  SRR7344565   6.737094  11.851334  10.608619  10.246659  8.657706   

     MT-ATP6     MT-CO1       IGKC     MT-ND4  ...      CTSB     IGKJ1  \
0   4.828822   7.590253   5.647115   5.412997  ...  7.253257  0.000000   
1   9.217649  10.936979   5.360864   8.928858  ...  7.433607  0.000000   
2   8.636872  11.444593   9.147465   8.696023  ...  5.344500  1.691215   
3  10.677880  11.052563  10.066972  10.668391  ...  7.083906  4.836296   
4   9.951498  10.451394  10.013964   9.771394  ...  7.081397  2.933361   

     SPRR2E   IGKV4-1  TFF1        HP     HLA-A   IGHV3-7  HLA-DRB1    KRTDAP  
0  0.000000  0.217787   0.0  0.474323  9.465274 

In [5]:
# openAI API key

client = OpenAI(
  api_key='',  
)


In [28]:
# embedding transcriptomic data

def get_embedding(df):

	# model
	
	response = client.embeddings.create(
    	model= 'text-embedding-ada-002',
    	input=[df]
	)
	# Get embedded data
	
	embedding = response.data[0].embedding
    
	return embedding

In [29]:
# drop first column of sample IDs

embedded_data = data.drop(data.columns[0], axis = 1)

# testing: drop columns

keep_thresh = 100
embedded_data = embedded_data.iloc[:, :keep_thresh]
print(embedded_data.shape)

# change numeric to str and concatenate everything by row (sentence-like input structure required)

embedded_data['concatenated'] = embedded_data.astype(str).apply(lambda row: ' '.join(row), axis = 1)
print(embedded_data['concatenated'])

# apply embeddings

embedded_data['embedding'] = embedded_data['concatenated'].apply(get_embedding)



(84, 100)
0     15.038123971976647 10.26750924509026 6.0964890...
1     15.313832801838668 9.6226660795167 9.860376156...
2     16.73938425784841 9.744945814413533 10.0671239...
3     6.150741097513215 11.761988404115765 11.113195...
4     6.7370942412119295 11.851334460576384 10.60861...
                            ...                        
79    4.104171984522506 10.825797954659636 13.427096...
80    3.0978353769841624 10.580103071458565 13.26022...
81    2.738509312483404 11.591330573704969 12.450635...
82    3.1536593152545835 11.6962664255976 14.0314130...
83    3.2770374365591306 15.363024488549184 13.88975...
Name: concatenated, Length: 84, dtype: object


In [56]:
# run kmeans based on embeddings 

# 2 clusters for responder, non-responder

kmeans = KMeans(n_clusters = 2, n_init = 'auto')
model = kmeans.fit(embedded_data['embedding'].tolist())

In [31]:
# dimensionality reduction and visualization using UMAP

um = umap.UMAP()
embedded_data_2d = um.fit_transform(embedded_data['embedding'].tolist())

color_map = {
    '0' : 'orange',
    '1' : 'blue'
}

fig = px.scatter(x = embedded_data_2d[:, 0], y = embedded_data_2d[:, 1], color = model.labels_.astype(str), color_discrete_map = color_map, symbol = labels['Response'])
fig.update_layout(xaxis_title = 'umap 1', yaxis_title = 'umap 2')

fig.show()

In [49]:
# Train a logistic regression model using the top 2 PCAs before optimizing hyperparameters
X_train, X_test, y_train, y_test = train_test_split(embedded_data_2d, labels_encoded, test_size = 0.3, random_state=1234)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_train)
training_accuracy = accuracy_score(y_pred, y_train)
y_pred = logreg.predict(X_test)
test_accuracy = accuracy_score(y_pred, y_test)
print("Training accuracy before hyperparameter optimization:", training_accuracy)
print("Test set accuracy before hyperparameter optimization:", test_accuracy)

Training accuracy before hyperparameter optimization: 0.7413793103448276
Test set accuracy before hyperparameter optimization: 0.5769230769230769


In [52]:
# find the best hyperparameters for an L1 regularized LogReg model
hyperparameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

grid_search = GridSearchCV(LogisticRegression(penalty="l1"), hyperparameters,  cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best hyperparameters:", best_params)

Best hyperparameters: {'C': 0.1, 'max_iter': 1000, 'solver': 'liblinear'}



The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_i

In [53]:
# Train a logistic regression model using the top 2 PCAs after optimizing hyperparameters
y_pred = best_model.predict(X_train)
training_accuracy = accuracy_score(y_pred, y_train)
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_pred, y_test)

print("Training accuracy after hyperparameter optimization:", training_accuracy)
print("Test set accuracy after hyperparameter optimization:", test_accuracy)

Training accuracy after hyperparameter optimization: 0.7413793103448276
Test set accuracy after hyperparameter optimization: 0.5769230769230769


In [60]:
# export cluster assignments

clusters_embedding = pd.concat([labels, pd.DataFrame(model.labels_, columns = ['cluster'])], axis = 1)

clusters_embedding.to_csv('../data/clusters_embedding.csv', index = False)

# export data for L1 log reg
embedded_data['embedding'].to_csv('../data/embedded_data.csv', index = False)