In [29]:
print("Starting the script...")

Starting the script...


In [30]:
import pandas as pd
import glob
import os

In [31]:
from dotenv import load_dotenv
import os

env_path = '/home/chb3333/yulab/chb3333/gem-patho/data_extraction/cancertype_location_description/OpenAI_key.env'
load_dotenv(env_path)

True

In [32]:
# ----- Part 1: Process the TCGA sample sheet -----

In [33]:
print("Loading the cleaned TCGA sample sheet...")
sample_sheet_path = "/home/chb3333/yulab/chb3333/data_extraction/wxs_sample_sheet_clean.tsv"
sample_df = pd.read_csv(sample_sheet_path, sep="\t")
print("Sample sheet loaded. Total rows:", len(sample_df))

Loading the cleaned TCGA sample sheet...
Sample sheet loaded. Total rows: 17773


In [34]:
print("Filtering for TCGA projects...")
tcga_df = sample_df[sample_df["Project ID"].str.contains("TCGA", na=False)].copy()
print("TCGA projects filtered. Rows after filtering:", len(tcga_df))

Filtering for TCGA projects...
TCGA projects filtered. Rows after filtering: 10640


In [35]:
print("Normalizing sample types...")
def normalize_sample_type(sample_str):
    parts = [s.strip() for s in str(sample_str).split(",")]
    return ", ".join(sorted(parts))

Normalizing sample types...


In [36]:
tcga_df["Normalized Sample Type"] = tcga_df["Sample Type"].apply(normalize_sample_type)
print("Sample types normalized.")

Sample types normalized.


In [37]:
print("Extracting TCGA Cancer Type Abbreviation...")
tcga_df["Cancer Type Abbrev"] = tcga_df["Project ID"].str.replace("TCGA-", "", regex=False)

Extracting TCGA Cancer Type Abbreviation...


In [38]:
print("Merging full TCGA Cancer Type name from mapping file...")
tcga_map_path = "/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/tcga_study_abbreviations.csv"
tcga_map_df = pd.read_csv(tcga_map_path)
tcga_df = pd.merge(tcga_df, tcga_map_df, left_on="Cancer Type Abbrev", right_on="Study Abbreviation", how="left")
print("TCGA mapping merge complete. Rows in tcga_df:", len(tcga_df))

Merging full TCGA Cancer Type name from mapping file...
TCGA mapping merge complete. Rows in tcga_df: 10640


In [39]:
tcga_df = tcga_df.drop(['Cancer Type Abbrev', "File ID", "File Name", "Data Category", "Data Type" ], axis=1)

In [40]:
tcga_df

Unnamed: 0,Project ID,Case ID,Sample ID,Sample Type,Normalized Sample Type,Study Abbreviation,Study Name
0,TCGA-OV,TCGA-42-2588,"TCGA-42-2588-10A, TCGA-42-2588-01A","Blood Derived Normal, Primary Tumor","Blood Derived Normal, Primary Tumor",OV,Ovarian serous cystadenocarcinoma
1,TCGA-OV,TCGA-20-1686,"TCGA-20-1686-01A, TCGA-20-1686-10A","Primary Tumor, Blood Derived Normal","Blood Derived Normal, Primary Tumor",OV,Ovarian serous cystadenocarcinoma
2,TCGA-OV,TCGA-09-0366,"TCGA-09-0366-10B, TCGA-09-0366-01A","Blood Derived Normal, Primary Tumor","Blood Derived Normal, Primary Tumor",OV,Ovarian serous cystadenocarcinoma
3,TCGA-OV,TCGA-13-0916,"TCGA-13-0916-10A, TCGA-13-0916-01A","Blood Derived Normal, Primary Tumor","Blood Derived Normal, Primary Tumor",OV,Ovarian serous cystadenocarcinoma
4,TCGA-OV,TCGA-61-1740,"TCGA-61-1740-01A, TCGA-61-1740-11A","Primary Tumor, Solid Tissue Normal","Primary Tumor, Solid Tissue Normal",OV,Ovarian serous cystadenocarcinoma
...,...,...,...,...,...,...,...
10635,TCGA-UCEC,TCGA-BS-A0T9,"TCGA-BS-A0T9-01A, TCGA-BS-A0T9-10C","Primary Tumor, Blood Derived Normal","Blood Derived Normal, Primary Tumor",UCEC,Uterine Corpus Endometrial Carcinoma
10636,TCGA-UCEC,TCGA-AJ-A3BK,"TCGA-AJ-A3BK-10A, TCGA-AJ-A3BK-01A","Blood Derived Normal, Primary Tumor","Blood Derived Normal, Primary Tumor",UCEC,Uterine Corpus Endometrial Carcinoma
10637,TCGA-UCEC,TCGA-EY-A547,"TCGA-EY-A547-01A, TCGA-EY-A547-10A","Primary Tumor, Blood Derived Normal","Blood Derived Normal, Primary Tumor",UCEC,Uterine Corpus Endometrial Carcinoma
10638,TCGA-UCEC,TCGA-AX-A3FT,"TCGA-AX-A3FT-01A, TCGA-AX-A3FT-10A","Primary Tumor, Blood Derived Normal","Blood Derived Normal, Primary Tumor",UCEC,Uterine Corpus Endometrial Carcinoma


In [41]:
icd_df = pd.read_parquet("/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/icd_codes/icd_data.parquet")

In [42]:
icd_df

Unnamed: 0,PATIENT_ID,CANCER_TYPE_ACRONYM,ICD_10,ICD_O_3_HISTOLOGY,ICD_O_3_SITE
0,TCGA-OR-A5J1,ACC,C74.0,8370/1,C74.0
1,TCGA-OR-A5J2,ACC,C74.0,8370/3,C74.0
2,TCGA-OR-A5J3,ACC,C74.0,8370/3,C74.0
3,TCGA-OR-A5J4,ACC,C74.0,8370/3,C74.0
4,TCGA-OR-A5J5,ACC,C74.0,8370/3,C74.0
...,...,...,...,...,...
10948,TCGA-V4-A9F3,UVM,,,
10949,TCGA-V4-A9EY,UVM,,,
10950,TCGA-V4-A9E5,UVM,,,
10951,TCGA-WC-A883,UVM,,,


In [43]:
merged_df = pd.merge(tcga_df, icd_df, left_on="Case ID", right_on="PATIENT_ID", how="inner")

In [44]:
merged_df = merged_df.drop(columns=['Project ID', 'Sample ID', 'Sample Type'])

# Select only the desired columns
result_df = merged_df[['Case ID', 'CANCER_TYPE_ACRONYM', 'ICD_10', 'ICD_O_3_HISTOLOGY', 'ICD_O_3_SITE', 'Normalized Sample Type']]



In [45]:
# Display the resulting dataframe
result_df.head()

Unnamed: 0,Case ID,CANCER_TYPE_ACRONYM,ICD_10,ICD_O_3_HISTOLOGY,ICD_O_3_SITE,Normalized Sample Type
0,TCGA-42-2588,OV,C56.9,8441/3,C56.9,"Blood Derived Normal, Primary Tumor"
1,TCGA-20-1686,OV,C56.9,8441/3,C56.9,"Blood Derived Normal, Primary Tumor"
2,TCGA-09-0366,OV,,,,"Blood Derived Normal, Primary Tumor"
3,TCGA-13-0916,OV,,,,"Blood Derived Normal, Primary Tumor"
4,TCGA-61-1740,OV,C56.9,8441/3,C56.9,"Primary Tumor, Solid Tissue Normal"


In [46]:
group_cols = ['CANCER_TYPE_ACRONYM', 'ICD_10', 'ICD_O_3_HISTOLOGY', 'ICD_O_3_SITE', 'Normalized Sample Type']

# Group by these columns and aggregate the Case IDs into a list
grouped_df = result_df.groupby(group_cols, dropna=False)['Case ID'].apply(list).reset_index()


# If you prefer a comma-separated string instead of a list, you can convert it as follows:
grouped_df['Case IDs'] = grouped_df['Case ID'].apply(lambda ids: ', '.join(ids))

# Drop the original 'Case ID' column (if only the aggregated version is needed)
grouped_df = grouped_df.drop(columns=['Case ID'])

In [47]:
grouped_df

Unnamed: 0,CANCER_TYPE_ACRONYM,ICD_10,ICD_O_3_HISTOLOGY,ICD_O_3_SITE,Normalized Sample Type,Case IDs
0,ACC,C74.0,8370/1,C74.0,"Blood Derived Normal, Primary Tumor","TCGA-OR-A5L9, TCGA-OR-A5J1, TCGA-OR-A5KU, TCGA..."
1,ACC,C74.0,8370/3,C74.0,"Blood Derived Normal, Primary Tumor","TCGA-OR-A5J8, TCGA-OR-A5KV, TCGA-OR-A5JP, TCGA..."
2,ACC,C74.0,8370/3,C74.0,"Primary Tumor, Solid Tissue Normal","TCGA-OR-A5KB, TCGA-PK-A5HC, TCGA-PK-A5HB, TCGA..."
3,BLCA,C67.0,8120/3,C67.0,"Blood Derived Normal, Primary Tumor","TCGA-DK-A2I6, TCGA-GV-A3QH, TCGA-FD-A62O, TCGA..."
4,BLCA,C67.0,8120/3,C67.0,"Primary Tumor, Solid Tissue Normal","TCGA-BT-A20U, TCGA-BT-A20N, TCGA-BT-A20P, TCGA..."
...,...,...,...,...,...,...
726,UCS,C54.3,8980/3,C54.3,"Blood Derived Normal, Primary Tumor",TCGA-N8-A4PM
727,UCS,C55,8950/3,C55.9,"Blood Derived Normal, Primary Tumor","TCGA-N5-A4RO, TCGA-N7-A4Y8, TCGA-N5-A4R8, TCGA..."
728,UCS,,,,"Blood Derived Normal, Primary Tumor","TCGA-N9-A4Q7, TCGA-NF-A4X2, TCGA-QM-A5NM, TCGA..."
729,UCS,,,,"Primary Tumor, Solid Tissue Normal","TCGA-N9-A4Q8, TCGA-N9-A4PZ"


In [48]:
# Translating Codes to Text

In [49]:

file_path = "/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/icd_codes/ICD-O-3.2_MFin_17042019_web.csv"

# Load the Excel file
morphology_description = pd.read_csv(file_path)

In [50]:
morphology_description.columns

Index(['Codes', 'Morphology_Description'], dtype='object')

In [51]:
file_path = "/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/icd_codes/ICD-O3 Topography.csv"
icd_o3 = pd.read_csv(file_path)

In [52]:
icd_o3.columns

Index(['icdo3_code', 'description'], dtype='object')

In [53]:
morphology_description,


(       Codes                             Morphology_Description
 0     8000/0                                   Neoplasm, benign
 1     8000/1    Neoplasm, uncertain whether benign or malignant
 2     8000/3                                Neoplasm, malignant
 3     8000/6                               Neoplasm, metastatic
 4     8000/9  Neoplasm, malignant, uncertain whether primary...
 ...      ...                                                ...
 1138  9985/3  Myelodysplastic syndrome with multilineage dys...
 1139  9986/3    Myelodysplastic syndrome with isolated del (5q)
 1140  9987/3      Therapy-related myelodysplastic syndrome, NOS
 1141  9989/3                      Myelodysplastic syndrome, NOS
 1142  9993/3  Myelodysplastic syndrome with ring sideroblast...
 
 [1143 rows x 2 columns],)

In [54]:
icd_mapping = dict(zip(icd_o3['icdo3_code'], icd_o3['description']))
morph_mapping = dict(zip(morphology_description['Codes'], morphology_description['Morphology_Description']))

# Translate ICD_10 and ICD_O_3_SITE codes using icd_o3 mapping
grouped_df['ICD_10_desc'] = grouped_df['ICD_10'].map(icd_mapping)
grouped_df['ICD_O_3_SITE_desc'] = grouped_df['ICD_O_3_SITE'].map(icd_mapping)

# Translate ICD_O_3_HISTOLOGY codes using morphology_description mapping
grouped_df['ICD_O_3_HISTOLOGY_desc'] = grouped_df['ICD_O_3_HISTOLOGY'].map(morph_mapping)

study_abbrev = pd.read_csv("/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/tcga_study_abbreviations.csv")
study_mapping = dict(zip(study_abbrev['Study Abbreviation'], study_abbrev['Study Name']))
grouped_df['CANCER_TYPE_NAME'] = grouped_df['CANCER_TYPE_ACRONYM'].map(study_mapping)

In [57]:
grouped_df

Unnamed: 0,CANCER_TYPE_ACRONYM,ICD_10,ICD_O_3_HISTOLOGY,ICD_O_3_SITE,Normalized Sample Type,Case IDs,ICD_10_desc,ICD_O_3_SITE_desc,ICD_O_3_HISTOLOGY_desc,CANCER_TYPE_NAME
0,ACC,C74.0,8370/1,C74.0,"Blood Derived Normal, Primary Tumor","TCGA-OR-A5L9, TCGA-OR-A5J1, TCGA-OR-A5KU, TCGA...",Adrenal gland cortex,Adrenal gland cortex,,Adrenocortical carcinoma
1,ACC,C74.0,8370/3,C74.0,"Blood Derived Normal, Primary Tumor","TCGA-OR-A5J8, TCGA-OR-A5KV, TCGA-OR-A5JP, TCGA...",Adrenal gland cortex,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenocortical carcinoma
2,ACC,C74.0,8370/3,C74.0,"Primary Tumor, Solid Tissue Normal","TCGA-OR-A5KB, TCGA-PK-A5HC, TCGA-PK-A5HB, TCGA...",Adrenal gland cortex,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenocortical carcinoma
3,BLCA,C67.0,8120/3,C67.0,"Blood Derived Normal, Primary Tumor","TCGA-DK-A2I6, TCGA-GV-A3QH, TCGA-FD-A62O, TCGA...","Trigone, bladder","Trigone, bladder","Transitional cell carcinoma, NOS",Bladder Urothelial Carcinoma
4,BLCA,C67.0,8120/3,C67.0,"Primary Tumor, Solid Tissue Normal","TCGA-BT-A20U, TCGA-BT-A20N, TCGA-BT-A20P, TCGA...","Trigone, bladder","Trigone, bladder","Transitional cell carcinoma, NOS",Bladder Urothelial Carcinoma
...,...,...,...,...,...,...,...,...,...,...
726,UCS,C54.3,8980/3,C54.3,"Blood Derived Normal, Primary Tumor",TCGA-N8-A4PM,Fundus uteri,Fundus uteri,"Carcinosarcoma, NOS",Uterine Carcinosarcoma
727,UCS,C55,8950/3,C55.9,"Blood Derived Normal, Primary Tumor","TCGA-N5-A4RO, TCGA-N7-A4Y8, TCGA-N5-A4R8, TCGA...",,Uterus NOS,Mullerian mixed tumor,Uterine Carcinosarcoma
728,UCS,,,,"Blood Derived Normal, Primary Tumor","TCGA-N9-A4Q7, TCGA-NF-A4X2, TCGA-QM-A5NM, TCGA...",,,,Uterine Carcinosarcoma
729,UCS,,,,"Primary Tumor, Solid Tissue Normal","TCGA-N9-A4Q8, TCGA-N9-A4PZ",,,,Uterine Carcinosarcoma


In [55]:
grouped_df.to_parquet("/home/chb3333/yulab/chb3333/gem-patho/data_extraction/cancertype_location_description/location_description/description_meta.parquet")

In [56]:
grouped_df

Unnamed: 0,CANCER_TYPE_ACRONYM,ICD_10,ICD_O_3_HISTOLOGY,ICD_O_3_SITE,Normalized Sample Type,Case IDs,ICD_10_desc,ICD_O_3_SITE_desc,ICD_O_3_HISTOLOGY_desc,CANCER_TYPE_NAME
0,ACC,C74.0,8370/1,C74.0,"Blood Derived Normal, Primary Tumor","TCGA-OR-A5L9, TCGA-OR-A5J1, TCGA-OR-A5KU, TCGA...",Adrenal gland cortex,Adrenal gland cortex,,Adrenocortical carcinoma
1,ACC,C74.0,8370/3,C74.0,"Blood Derived Normal, Primary Tumor","TCGA-OR-A5J8, TCGA-OR-A5KV, TCGA-OR-A5JP, TCGA...",Adrenal gland cortex,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenocortical carcinoma
2,ACC,C74.0,8370/3,C74.0,"Primary Tumor, Solid Tissue Normal","TCGA-OR-A5KB, TCGA-PK-A5HC, TCGA-PK-A5HB, TCGA...",Adrenal gland cortex,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenocortical carcinoma
3,BLCA,C67.0,8120/3,C67.0,"Blood Derived Normal, Primary Tumor","TCGA-DK-A2I6, TCGA-GV-A3QH, TCGA-FD-A62O, TCGA...","Trigone, bladder","Trigone, bladder","Transitional cell carcinoma, NOS",Bladder Urothelial Carcinoma
4,BLCA,C67.0,8120/3,C67.0,"Primary Tumor, Solid Tissue Normal","TCGA-BT-A20U, TCGA-BT-A20N, TCGA-BT-A20P, TCGA...","Trigone, bladder","Trigone, bladder","Transitional cell carcinoma, NOS",Bladder Urothelial Carcinoma
...,...,...,...,...,...,...,...,...,...,...
726,UCS,C54.3,8980/3,C54.3,"Blood Derived Normal, Primary Tumor",TCGA-N8-A4PM,Fundus uteri,Fundus uteri,"Carcinosarcoma, NOS",Uterine Carcinosarcoma
727,UCS,C55,8950/3,C55.9,"Blood Derived Normal, Primary Tumor","TCGA-N5-A4RO, TCGA-N7-A4Y8, TCGA-N5-A4R8, TCGA...",,Uterus NOS,Mullerian mixed tumor,Uterine Carcinosarcoma
728,UCS,,,,"Blood Derived Normal, Primary Tumor","TCGA-N9-A4Q7, TCGA-NF-A4X2, TCGA-QM-A5NM, TCGA...",,,,Uterine Carcinosarcoma
729,UCS,,,,"Primary Tumor, Solid Tissue Normal","TCGA-N9-A4Q8, TCGA-N9-A4PZ",,,,Uterine Carcinosarcoma


In [38]:
chatgpt_query_info = grouped_df[['CANCER_TYPE_NAME', 'ICD_10_desc', 'ICD_O_3_HISTOLOGY_desc', 'ICD_O_3_SITE_desc', 'Normalized Sample Type']]

In [None]:
#chatgpt_query_info_deduplicated = chatgpt_query_info.drop_duplicates()

In [40]:
chatgpt_query_info_deduplicated = chatgpt_query_info

In [41]:
chatgpt_query_info_deduplicated

Unnamed: 0,CANCER_TYPE_NAME,ICD_10_desc,ICD_O_3_HISTOLOGY_desc,ICD_O_3_SITE_desc,Normalized Sample Type
0,Adrenocortical carcinoma,Adrenal gland cortex,,Adrenal gland cortex,"Blood Derived Normal, Primary Tumor"
1,Adrenocortical carcinoma,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenal gland cortex,"Blood Derived Normal, Primary Tumor"
2,Adrenocortical carcinoma,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenal gland cortex,"Primary Tumor, Solid Tissue Normal"
3,Bladder Urothelial Carcinoma,"Trigone, bladder","Transitional cell carcinoma, NOS","Trigone, bladder","Blood Derived Normal, Primary Tumor"
4,Bladder Urothelial Carcinoma,"Trigone, bladder","Transitional cell carcinoma, NOS","Trigone, bladder","Primary Tumor, Solid Tissue Normal"
...,...,...,...,...,...
710,Uterine Carcinosarcoma,Endometrium,Mullerian mixed tumor,Endometrium,"Primary Tumor, Solid Tissue Normal"
711,Uterine Carcinosarcoma,Endometrium,"Carcinosarcoma, NOS",Endometrium,"Blood Derived Normal, Primary Tumor"
712,Uterine Carcinosarcoma,Myometrium,Mullerian mixed tumor,Myometrium,"Blood Derived Normal, Primary Tumor"
713,Uterine Carcinosarcoma,Fundus uteri,"Carcinosarcoma, NOS",Fundus uteri,"Blood Derived Normal, Primary Tumor"


In [42]:
chatgpt_query_info_deduplicated = chatgpt_query_info_deduplicated.rename(columns={'Normalized Sample Type': 'Normalized_Sample_Type'})

In [None]:
final_chatgpt_query_info = chatgpt_query_info_deduplicated.fillna("UNKOWN")

In [None]:
final_chatgpt_query_info

Unnamed: 0,CANCER_TYPE_NAME,ICD_10_desc,ICD_O_3_HISTOLOGY_desc,ICD_O_3_SITE_desc,Normalized_Sample_Type
0,Adrenocortical carcinoma,Adrenal gland cortex,UNKOWN,Adrenal gland cortex,"Blood Derived Normal, Primary Tumor"
1,Adrenocortical carcinoma,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenal gland cortex,"Blood Derived Normal, Primary Tumor"
2,Adrenocortical carcinoma,Adrenal gland cortex,Adrenal cortical carcinoma,Adrenal gland cortex,"Primary Tumor, Solid Tissue Normal"
3,Bladder Urothelial Carcinoma,"Trigone, bladder","Transitional cell carcinoma, NOS","Trigone, bladder","Blood Derived Normal, Primary Tumor"
4,Bladder Urothelial Carcinoma,"Trigone, bladder","Transitional cell carcinoma, NOS","Trigone, bladder","Primary Tumor, Solid Tissue Normal"
...,...,...,...,...,...
710,Uterine Carcinosarcoma,Endometrium,Mullerian mixed tumor,Endometrium,"Primary Tumor, Solid Tissue Normal"
711,Uterine Carcinosarcoma,Endometrium,"Carcinosarcoma, NOS",Endometrium,"Blood Derived Normal, Primary Tumor"
712,Uterine Carcinosarcoma,Myometrium,Mullerian mixed tumor,Myometrium,"Blood Derived Normal, Primary Tumor"
713,Uterine Carcinosarcoma,Fundus uteri,"Carcinosarcoma, NOS",Fundus uteri,"Blood Derived Normal, Primary Tumor"


In [None]:
# template = """
# response = client.chat.completions.create(
#   model="gpt-4o",
#   messages=[
#     {{
#       "role": "system",
#       "content": [
#         {{
#           "type": "text",
#           "text": "You are an expert oncologist specializing in cancer survival. You are explaining complex oncological and genomic factors to another expert in a structured, precise manner. Focus on step-by-step reasoning, integrating knowledge of tumor origin, histology, genetic mutations, and sampling biases. Avoid redundant explanations and ensure biological clarity while emphasizing survival impact."
#         }}
#       ]
#     }},
#     {{
#       "role": "user",
#       "content": [
#         {{
#           "type": "text",
#           "text": "Describe how {CANCER_TYPE_NAME} (per ICD-10: {ICD_10_desc}) impacts survival, focusing on:\n- Tumor Origin: How {ICD_O_3_SITE_desc} influences metastatic patterns, patient survival, and treatment accessibility.\n- Histology: How {ICD_O_3_HISTOLOGY_desc} interacts with mutation profiles to drive outcomes.\n- Sampling Bias: Limitations of {Normalized_Sample_Type} in genomic analysis.\n- Key Genes: Identify 5-8 survival-associated genes, explaining their biological mechanisms (e.g., proliferation, immune response, apoptosis regulation).\n\nFor each aspect, explain step by step:\n- How it influences survival (positive/negative)\n- Biological rationale without excessive jargon\n- Potential biases in genomic interpretation\n\nConclude with a synthesis of key survival risk factors, emphasizing clinically relevant insights and their impact on patient survival."
#         }}
#       ]
#     }}
#   ],
#   response_format={{ "type": "text" }},
#   temperature=1,
#   max_completion_tokens=2048,
#   top_p=1,
#   frequency_penalty=0,
#   presence_penalty=0
# )
# """

In [None]:
# prompts = []
# for index, row in final_chatgpt_query_info.iterrows():
#     prompt_filled = template.format(
#         CANCER_TYPE_NAME=row["CANCER_TYPE_NAME"],
#         ICD_10_desc=row["ICD_10_desc"],
#         ICD_O_3_SITE_desc=row["ICD_O_3_SITE_desc"],
#         ICD_O_3_HISTOLOGY_desc=row["ICD_O_3_HISTOLOGY_desc"],
#         Normalized_Sample_Type=row["Normalized_Sample_Type"]
#     )
#     prompts.append(prompt_filled)

# # Now 'prompts' is a list where each element is a fully formatted prompt.
# print(prompts[0])  # Print the first prompt as an example.


response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": "You are an expert oncologist specializing in cancer survival. You are explaining complex oncological and genomic factors to another expert in a structured, precise manner. Focus on step-by-step reasoning, integrating knowledge of tumor origin, histology, genetic mutations, and sampling biases. Avoid redundant explanations and ensure biological clarity while emphasizing survival impact."
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "Describe how Adrenocortical carcinoma (per ICD-10: Adrenal gland cortex) impacts survival, focusing on:
- Tumor Origin: How Adrenal gland cortex influences metastatic patterns, patient survival, and treatment accessibility.
- Histology: How UNKOWN interacts with mutation profiles to drive outcomes.
- Sa

In [None]:
# from openai import OpenAI

In [None]:
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [None]:
# def get_response(prompt):
#     try:
#         response = openai.ChatCompletion.create(
#             model="gpt-4o",
#             messages=[
#                 {
#                     "role": "system",
#                     "content": [
#                         {
#                             "type": "text",
#                             "text": (
#                                 "You are an expert oncologist specializing in cancer survival. "
#                                 "You are explaining complex oncological and genomic factors to another expert "
#                                 "in a structured, precise manner. Focus on step-by-step reasoning, integrating knowledge "
#                                 "of tumor origin, histology, genetic mutations, and sampling biases. Avoid redundant explanations "
#                                 "and ensure biological clarity while emphasizing survival impact."
#                             )
#                         }
#                     ]
#                 },
#                 {
#                     "role": "user",
#                     "content": [
#                         {
#                             "type": "text",
#                             "text": prompt
#                         }
#                     ]
#                 }
#             ],
#             response_format={"type": "text"},
#             temperature=1,
#             max_tokens=2048,
#             top_p=1,
#             frequency_penalty=0,
#             presence_penalty=0
#         )
#         return response
#     except Exception as e:
#         print(f"Error processing prompt: {e}")
#         return None


In [None]:
# responses = []

# for idx, prompt in enumerate(prompts):
#     print(f"Processing prompt {idx+1}/{len(prompts)}...")
#     try:
#         response = client.chat.completions.create(
#             model="gpt-4o",
#             messages=[
#                 {
#                     "role": "system",
#                     "content": [
#                         {
#                             "type": "text",
#                             "text": "You are an expert oncologist specializing in cancer survival. You are explaining complex oncological and genomic factors to another expert in a structured, precise manner. Focus on step-by-step reasoning, integrating knowledge of tumor origin, histology, genetic mutations, and sampling biases. Avoid redundant explanations and ensure biological clarity while emphasizing survival impact."
#                         }
#                     ]
#                 },
#                 {
#                     "role": "user",
#                     "content": [
#                         {
#                             "type": "text",
#                             "text": prompt
#                         }
#                     ]
#                 }
#             ],
#             response_format={"type": "text"},
#             temperature=1,
#             max_completion_tokens=2048,
#             top_p=1,
#             frequency_penalty=0,
#             presence_penalty=0
#         )
#         responses.append(response)
#         print(response)
#     except Exception as e:
#         print(f"Error processing prompt {idx+1}: {e}")

Processing prompt 1/704...
ChatCompletion(id='chatcmpl-BAiDlUHNHkknGlvawXAMOwjM0XPKx', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="### Tumor Origin: Adrenal Gland Cortex\n\n**Influence on Survival:**\nAdrenocortical carcinoma (ACC) originates in the adrenal cortex, which is crucial for hormone production, particularly glucocorticoids, mineralocorticoids, and androgens. The origin significantly affects metastatic patterns, often spreading early to the liver, lungs, and peritoneum due to abundant vascularization of the adrenal glands.\n\n**Biological Rationale:**\nThe adrenal gland's function in hormone secretion plays a critical role in the metabolic and immune environment, potentially enhancing tumor spread. Thus, the complex regulation of hormones by tumors influences cachexia and immunosuppression, further impacting mortality rates.\n\n**Treatment Accessibility:**\nTreatment options are limited, often constrained by early metas

KeyboardInterrupt: 

In [1]:
# data = []
# for prompt, resp in zip(prompts, responses):
#     try:
#         # Extract the text answer from the ChatCompletion object
#         answer_text = resp.choices[0].message.content
#     except Exception as e:
#         answer_text = None
#         print(f"Error extracting answer: {e}")
#     data.append({
#         "prompt": prompt,
#         "response": answer_text
#     })

# # Create a DataFrame from the collected data
# df_responses = pd.DataFrame(data)

# csv_path = "/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/location_description/chat_responses.csv"
# parquet_path = "/home/chb3333/yulab/chb3333/data_extraction/sample_location_tumor_description/location_description/chat_responses.parquet"

# # Save the DataFrame as a CSV file
# df_responses.to_csv(csv_path, index=False)
# print(f"Responses saved to {csv_path}")

# # Save the DataFrame as a Parquet file (requires pyarrow or fastparquet)
# df_responses.to_parquet(parquet_path, index=False)
# print(f"Responses saved to {parquet_path}")