1. QCHAT_ASD_TERMS_Similarity
1) Feature Input: ASD-specific terminology and QCHAT (A1 - A10) questionnaires are inputted as the primary feature set. 2) Feature Extraction: Sentence transformers are employed to derive vector representations of the features, which are then assessed for similarity and reviewed by clinical experts, culminating in ASD Term Mapping


- Soo Kyung Bae(Scarlett), Ph.D. Student
- Dept. of Integrated Medicine(Major in Digital Healthcare)
- Yonsei University College of Medicine

In [1]:
import pandas as pd
import os
import re

import torch
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import xlsxwriter


  from tqdm.autonotebook import tqdm, trange
  Referenced from: <2D1B8D5C-7891-3680-9CF9-F771AE880676> /opt/anaconda3/envs/RoBERTa2/lib/python3.9/site-packages/torchvision/image.so
  warn(


In [2]:
# 코드별 질문 리스트 불러오기

QCHAT_path = r"./CH_input/QCHAT_DSM5.xlsx"

# SCQ sheet - Column D (En questions) , Make Dataframe
question_list = pd.read_excel(QCHAT_path)
question_list

Unnamed: 0,Var,Q_chat_10_T,ID,Name,DSM5 Criteria,Symptom of DSM5 Criteria,CUI
0,A1,Does your child look at you when you call his/...,ASD0194,attention and concentration deficit,A2,deficits in understanding and use of gestures,C2977673
1,A2,How easy is it for you to get eye contact with...,ASD0250,contacts eye,A2,abnormalities in eye contact,C0870532
2,A3,Does your child point to indicate that s/he wa...,S1049,spontaneously pointing,B3,excessively circumscribed interest,C4036288
3,A4,Does your child point to share interest with y...,S0769,pointing to express interest,B3,highly restricted interests,C0424091
4,A5,"Does your child pretend? (e.g. care for dolls,...",S1268,does pretend play,B2,need to take same route,C2371970
5,A6,Does your child follow where you’re looking?,ASD0642,shifting attention,B3,highly fixated interests,C2370875
6,A7,If you or someone else in the family is visibl...,ASD0019,social interaction impairment in social/emotio...,A1,social-emotional reciprocity,C4064035
7,A8,Would you describe your child’s first words as:,ASD0372,absent speech,B1,echolalia,C1854882
8,A9,Does your child use simple gestures? (e.g. wav...,ASD0216,difficulty imitating gesture,A2,deficits in understanding and use of gestures,C0566243
9,A10,Does your child stare at nothing with no appar...,ASD0350,play impairment stares blankly at play objects...,A3,difficulties in sharing imaginative play,C4064319


In [3]:
# 환자별 행동 특성 데이터 불러오기

# Check if CUDA is available and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'

trait_data = pd.read_csv(r"input/cdb_advanced_ASD_F3.csv")
trait_data.head()

Unnamed: 0,ID,Name,DSM5 Criteria,Symptom of DSM5 Criteria,CUI,Unnamed: 5
0,ASD0001,inflexible adherence to routines or rituals,B2,inflexible adherence to routines,C1837653,
1,ASD0002,impaired use of nonverbal behaviors,B2,ritualized patterns of nonverbal behavior,C4021798,
2,ASD0003,speech nonverbal communication skills facial e...,A2,nonverbal communication,C2018050,
3,ASD0004,abnormal sensations in eye,A2,abnormalities in eye contact,C0497201,
4,ASD0005,social/emotional reciprocity appears deaf,A1,social-emotional reciprocity,C4064026,


In [6]:
# 2. Text Preprocessing

In [4]:
# 모든 텍스트 소문자로 전환하기

def preprocess_text(text):
    return " ".join([word.lower() for word in text.split()]) # 입력받은 텍스트를 소문자화합니다

# 행동 특성의 이름과 질문 텍스트 소문자화
trait_data['Processed_Name'] = trait_data['Name'].apply(preprocess_text)
question_list['Processed_Q_chat_10_T'] = question_list['Q_chat_10_T'].apply(preprocess_text)

In [8]:
# 3. Vectorization using Sentence Transformers

In [5]:
# 질문 텍스트 vectorization

# sentence transformer 모델 불러오기
model = SentenceTransformer('all-MiniLM-L6-v2') # sentence transfomers 모델 불러오기
model = model.to(device)  

# 행동 특성 텍스트 데이터 vectorization
X_full = model.encode(trait_data['Processed_Name'].tolist(), convert_to_tensor=True) 
X_full = X_full.to(device)



In [6]:
#Before
trait_data['Processed_Name'].tolist()[0]

'inflexible adherence to routines or rituals'

In [7]:
#After
X_full[0]

tensor([ 2.8147e-02,  4.0301e-02, -1.5325e-02,  2.3316e-02, -7.7489e-02,
        -2.2377e-02,  7.5572e-03, -4.6510e-02,  3.0327e-02, -9.9353e-02,
         4.8089e-02,  1.2113e-02, -2.3195e-02,  2.1818e-02,  5.1161e-02,
        -6.0459e-02,  2.0495e-02,  6.8436e-02, -4.8017e-02,  3.0283e-02,
        -7.7767e-02, -5.0865e-03,  3.4439e-02,  7.0297e-02, -1.0118e-01,
         8.2324e-05, -2.2503e-02, -1.0917e-01,  4.1832e-02,  3.1996e-03,
        -3.7481e-02, -1.6535e-02, -8.8352e-02,  9.3030e-03, -1.0997e-03,
         5.6474e-02, -7.0479e-03,  2.5895e-02, -4.9383e-02,  8.2796e-03,
         4.2328e-03, -8.7808e-02, -5.6744e-02, -7.6702e-02, -3.3452e-02,
        -9.4498e-04, -1.8550e-02,  1.4718e-02, -5.0924e-02, -3.9544e-02,
        -4.6502e-02, -2.1864e-02,  5.4835e-02,  4.5170e-02,  3.8363e-02,
        -1.3326e-02,  2.6686e-02,  3.6204e-02, -3.6082e-03, -1.6451e-02,
         5.5127e-02, -2.8040e-02, -6.2620e-02,  9.5543e-02, -4.9915e-03,
         2.5485e-02, -2.7212e-03, -1.9747e-02,  6.7

In [12]:
# 4. KMeans clustering

In [8]:
# training 데이터 100개 클러스터로 나누기

kmeans_100 = KMeans(n_clusters=100, random_state=42).fit(X_full.cpu())  # KMeans requires CPU tensor
trait_data['100_Cluster'] = kmeans_100.predict(X_full.cpu())  # KMeans requires CPU tensor

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# 5. Cosine Similarity 계산

In [9]:
# 질문 텍스트 vectorization 적용한 후 

# Function to find best matches in dataset based on the processed text and include rank
def match_question_to_data_detailed(question, trait_data, top_n=None):
    
    # 질문 텍스트 vectorization 
    question_vec = model.encode([question], convert_to_tensor=True)
    question_vec = question_vec.to(device)
    
    # 각가의 질문과 행동 특성 사이의 유사도(similarity) 계산
    cosine_similarities = cosine_similarity(question_vec, X_full).flatten()  # Compute cosine similarity on CPU
    
    # 유사도(similarity)가 높은 순서대로 결과 정렬
    if top_n:
        related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]
    else:
        related_docs_indices = cosine_similarities.argsort()[::-1]
    matched_data = trait_data.iloc[related_docs_indices]
    matched_data['Question']= question
    matched_data['Matching Rank/Probability'] = cosine_similarities[related_docs_indices]
    return matched_data

# Find the best matches for each question with detailed information
question_matches_detailed_ranked = {}
for idx, question in enumerate(question_list['Processed_Q_chat_10_T']):
    matched_data = match_question_to_data_detailed(question, trait_data) # 각각의 질문 내용과 행동 특성들을 비교
    question_matches_detailed_ranked[f"Q_chat_10_T {idx}"] = matched_data[['ID','Question', 'Name', 'CUI','DSM5 Criteria','Symptom of DSM5 Criteria', 'Matching Rank/Probability', '100_Cluster']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_data['Question']= question
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_data['Matching Rank/Probability'] = cosine_similarities[related_docs_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_data['Question']= question
A value is trying to be set on a copy of a slice 

In [42]:
# remain only results with similarity over 0.6

# creating top 5 criteria per question dataset in one csv file

result = pd.DataFrame(
    {'text': [],
    'criteria': []}
)
for i in range(0, 10):
    sheet = question_matches_detailed_ranked[f"Q_chat_10_T {i}"]
    question_matches_detailed_ranked[f"Q_chat_10_T {i}"] = sheet[sheet['Matching Rank/Probability'] > 0.0][0:5]
    criteria = question_matches_detailed_ranked[f"Q_chat_10_T {i}"]['DSM5 Criteria'].values
    values = question_matches_detailed_ranked[f"Q_chat_10_T {i}"]['Symptom of DSM5 Criteria'].values
    
    #print(values)
    joined_string_criteria = ", ".join(set([str(v) for v in criteria]))
    joined_string_values = ", ".join(set([str(v) for v in values]))
    
    new_row = pd.DataFrame({'text': [str(sheet['Question'].iloc[0])], 'criteria': [joined_string_values]})
    result = pd.concat([result, new_row])
result = result.reset_index(drop = True)
result.to_csv('qchat_en_st_total.csv', index=False)


In [17]:
# 6. Save the results to an Excel file

In [43]:
output_filepath_questions_detailed_ranked = r"qchat_en_st.xlsx"
writer = pd.ExcelWriter(output_filepath_questions_detailed_ranked, engine='xlsxwriter')
for q_key, matched_data in question_matches_detailed_ranked.items():
    # Add the question as a separate row before the matches
    dataframe = pd.DataFrame([[q_key, None, None, None, None, None, None, None]], columns=['Question', 'ID', 'Name', 'CUI', 'DSM5 Criteria', 'Symptom of DSM5 Criteria', 'Matching Rank/Probability', '100_Cluster'])
    dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
    dataframe.to_excel(writer, sheet_name=q_key, index=False)

# Use close() instead of save()
writer.close()



  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)
  dataframe = pd.concat([dataframe, matched_data], ignore_index=True)


In [19]:
question_list['Q_chat_10_T']
zero_group = ['My child looks at me when I call his/her name.',
             'It is easy to get eye contact with my child.',
             'My child points to indicate that s/he wants something(e.g. a toy that is out of reach)',
             'My child points to share interest with me.(e.g. pointing at an interesting sight)',
             'My child pretends. (e.g. care for dolls, talk on a toy phone)',
             'My child follows where I am looking.',
             'If I or someone else in the family is visibly upset, my child shows signs of wanting to comfort them. (e.g. stroking hair, hugging them)',
             'The first word of my child is typical,',
             'My child uses simple gestures. (e.g. wave goodbye)',
             'My child does not stare at nothing with no apparent purpose.']

one_group = ['My child does not look at me when I call his/her name.',
             'It is hard to get eye contact with my child.',
             'My child does not point to indicate that s/he wants something(e.g. a toy that is out of reach)',
             'My child does not point to share interest with me.(e.g. pointing at an interesting sight)',
             'My child does not pretend. (e.g. care for dolls, talk on a toy phone)',
             'My child does not follow where I am looking.',
             'If I or someone else in the family is visibly upset, my child does not show signs of wanting to comfort them. (e.g. stroking hair, hugging them)',
             'The first word of my child is typical,',
             'My child does not use simple gestures. (e.g. wave goodbye)',
             'My child stares at nothing with no apparent purpose.']

question_list['zero'] = zero_group
question_list['one'] = one_group
question_only_df = question_list[['Q_chat_10_T', 'zero', 'one']]
question_only_df.to_csv('question_and_yesorno_answer.csv', index = False)

In [20]:
# 질문 텍스트 vectorization 적용한 후 

# Function to find best matches in dataset based on the processed text and include rank
def match_question_to_data_detailed(answer_one, trait_data, top_n=None):
    
    # 질문 텍스트 vectorization 
    question_vec = model.encode([answer_one], convert_to_tensor=True)
    question_vec = question_vec.to(device)
    
    # 각가의 질문과 행동 특성 사이의 유사도(similarity) 계산
    cosine_similarities = cosine_similarity(question_vec, X_full).flatten()  # Compute cosine similarity on CPU
    
    # 유사도(similarity)가 높은 순서대로 결과 정렬
    if top_n:
        related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]
    else:
        related_docs_indices = cosine_similarities.argsort()[::-1]
    matched_data = trait_data.iloc[related_docs_indices]
    matched_data['one']= answer_one
    matched_data['Matching Rank/Probability'] = cosine_similarities[related_docs_indices]
    return matched_data

# Find the best matches for each question with detailed information
question_matches_detailed_ranked = {}
for idx, answer_one in enumerate(question_list['one']):
    matched_data = match_question_to_data_detailed(answer_one, trait_data) # 각각의 질문 내용과 행동 특성들을 비교
    question_matches_detailed_ranked[f"Q_chat_10_T {idx}"] = matched_data[['ID','one', 'Name', 'CUI','DSM5 Criteria','Symptom of DSM5 Criteria', 'Matching Rank/Probability', '100_Cluster']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_data['one']= answer_one
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_data['Matching Rank/Probability'] = cosine_similarities[related_docs_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_data['one']= answer_one
A value is trying to be set on a copy of a slice from a

In [21]:
# remain only results with similarity over 0.6

# creating top 5 criteria per question 1-answer dataset in one csv file

result = pd.DataFrame(
    {'text': [],
    'criteria': []}
)
for i in range(0, 10):
    sheet = question_matches_detailed_ranked[f"Q_chat_10_T {i}"]
    question_matches_detailed_ranked[f"Q_chat_10_T {i}"] = sheet[sheet['Matching Rank/Probability'] > 0.0][0:5]
    criteria = question_matches_detailed_ranked[f"Q_chat_10_T {i}"]['DSM5 Criteria'].values
    values = question_matches_detailed_ranked[f"Q_chat_10_T {i}"]['Symptom of DSM5 Criteria'].values
    
    #print(values)
    joined_string_criteria = ", ".join(set([str(v) for v in criteria]))
    joined_string_values = ", ".join(set([str(v) for v in values]))
    
    new_row = pd.DataFrame({'text': [str(sheet['one'].iloc[0])], 'criteria': [joined_string_values]})
    result = pd.concat([result, new_row])
result = result.reset_index(drop = True)
result.to_csv('qchat_en_st_total_by_one_answer.csv', index=False)


In [41]:
qchat_df = pd.read_csv("./input/Toddler Autism dataset July 2018.csv")

five_each_instances_df = pd.DataFrame()

qchat_df = qchat_df[['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10']]
qchat_df = qchat_df.drop_duplicates()


Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1009, 1010, 1014, 1016, 1020, 1021, 1035, 1036, 1041, 1053],
      dtype='int64', length=409)

In [42]:
new_qchat_df = pd.read_csv("./input/Toddler Autism dataset July 2018.csv")
new_qchat_df = new_qchat_df.iloc[qchat_df.index]
new_qchat_df

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021,1022,0,1,1,1,1,1,1,1,0,0,12,7,m,White European,no,no,family member,Yes
1035,1036,1,0,0,1,0,1,1,1,0,0,18,5,m,Others,yes,no,family member,Yes
1036,1037,1,1,0,1,0,1,1,1,0,0,14,6,f,asian,no,no,family member,Yes
1041,1042,0,0,0,1,0,0,1,0,0,0,25,2,m,White European,yes,no,family member,No


In [43]:

for i in range(0,11):
    popped_rows = new_qchat_df[new_qchat_df['Qchat-10-Score'] == i][:5]
    #qchat_df = qchat_df.drop(popped_rows.index).reset_index(drop = True)

    five_each_instances_df = pd.concat((five_each_instances_df, popped_rows))

five_each_instances_df.to_csv("five_each_instances.csv", index = False)


In [44]:
five_each_instances_df

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
12,13,0,0,0,0,0,0,0,0,0,0,25,0,f,middle eastern,yes,no,family member,No
32,33,0,0,0,0,0,0,0,0,0,1,15,1,f,middle eastern,no,no,Health care professional,No
67,68,0,0,0,0,0,0,0,1,0,0,29,1,m,White European,no,yes,family member,No
123,124,0,0,0,0,0,1,0,0,0,0,25,1,f,asian,no,no,family member,No
135,136,0,0,0,0,1,0,0,0,0,0,34,1,f,south asian,yes,no,family member,No
148,149,0,0,0,0,0,0,1,0,0,0,30,1,f,Latino,no,yes,family member,No
8,9,0,0,0,0,0,0,1,0,0,1,36,2,m,asian,no,no,family member,No
58,59,0,0,0,0,0,0,0,1,0,1,15,2,f,White European,no,no,family member,No
68,69,1,0,1,0,0,0,0,0,0,0,23,2,m,south asian,no,yes,family member,No
80,81,0,1,0,0,0,0,0,0,0,1,30,2,m,middle eastern,no,no,family member,No
