In [None]:
# structure: original report, source(from which dataset), specialty. 
# it's simpler to analyse the final results by this structure.
import pandas as pd

# 1. Kaggle Chest X-ray 数据集
df_kaggle = pd.read_csv("data/ReportsDATASET.csv")
df_kaggle_clean = df_kaggle[["Text"]].drop_duplicates().dropna()
df_kaggle_clean["Source"] = "KaggleChest"
df_kaggle_clean["Specialty"] = "chest x-ray"
df_kaggle_clean = df_kaggle_clean.rename(columns={"Text": "Original"})

# 2. MTSamples 数据集
df_mt = pd.read_csv("data/mtsamples.csv")
df_mt_clean = df_mt[["transcription", "medical_specialty"]].drop_duplicates().dropna()
df_mt_clean["Source"] = "MTSamples"
df_mt_clean = df_mt_clean.rename(columns={"transcription": "Original", "medical_specialty": "Specialty"})

# 3. cxr_df 数据集
df_cxr = pd.read_csv("data/cxr_df.csv")
df_cxr_clean = df_cxr[["text"]].drop_duplicates().dropna()
df_cxr_clean["Source"] = "CXRdf"
df_cxr_clean["Specialty"] = "chest x-ray"
df_cxr_clean = df_cxr_clean.rename(columns={"text": "Original"})

# 合并所有数据
df_all = pd.concat([df_kaggle_clean, df_mt_clean, df_cxr_clean], ignore_index=True)
df_all = df_all.dropna(subset=["Original"]).reset_index(drop=True)

print(df_all.sample(5))  # 随机抽查几行
print("Statistics of data volume from various sources: \n", df_all['Source'].value_counts())
print("Specialty Distribution: \n", df_all['Specialty'].value_counts())

# 保存为新 CSV
df_all.to_csv("data/all_datasets_cleaned.csv", index=False)


                                                Original Source    Specialty
88324  In comparison with the study of ___, the degre...  CXRdf  chest x-ray
45771  There has been placement of an endotracheal tu...  CXRdf  chest x-ray
49383  ET tube is not visualized. The NG tube tip is ...  CXRdf  chest x-ray
9184   Portable AP radiograph of the chest was review...  CXRdf  chest x-ray
94142  A right PICC line and the left pleural drain a...  CXRdf  chest x-ray
Statistics of data volume from various sources: 
 Source
CXRdf          90471
MTSamples       4964
KaggleChest     1982
Name: count, dtype: int64
Specialty Distribution: 
 Specialty
chest x-ray                       92453
 Surgery                           1088
 Consult - History and Phy.         516
 Cardiovascular / Pulmonary         371
 Orthopedic                         355
 Radiology                          273
 General Medicine                   259
 Gastroenterology                   224
 Neurology                          

In [None]:
# some reports are too short/not complete, I need to clean them.
import pandas as pd

# 以合并后的数据举例
df = pd.read_csv("data/all_datasets_cleaned.csv")

# 统计每行的单词数
df['word_count'] = df['Original'].str.split().apply(len)

# 只保留单词数>=10的报告
df_clean = df[df['word_count'] >= 10].copy()
df_clean = df_clean.drop(columns=['word_count'])

# 保存清洗后结果
df_clean.to_csv("data/complete_reports.csv", index=False)

print("all reports :", len(df))
print("complete reports :", len(df_clean))


all reports : 97417
complete reports : 97012


In [1]:
import pandas as pd
# 读取清洗后的数据
df_all = pd.read_csv("data/complete_reports.csv")
# 统计每个 Specialty 的报告数量
print(df_all.sample(5))  # 随机抽查几行
print("\n")
print("Statistics of data volume from various sources: \n\n", df_all['Source'].value_counts())
print("\n")
print("Distribution of Specialty: \n", df_all['Specialty'].value_counts())



                                                Original     Source  \
6195   CHIEF COMPLAINT:,  Newly diagnosed mantle cell...  MTSamples   
24229  Endotracheal tube has its tip at the thoracic ...      CXRdf   
20383  Tracheostomy tube remains in standard position...      CXRdf   
96395  Left lung lavage was recently done, explaining...      CXRdf   
80589  The heart size is normal. The mediastinal and ...      CXRdf   

                         Specialty  
6195    Consult - History and Phy.  
24229                  chest x-ray  
20383                  chest x-ray  
96395                  chest x-ray  
80589                  chest x-ray  


Statistics of data volume from various sources: 

 Source
CXRdf          90105
MTSamples       4925
KaggleChest     1982
Name: count, dtype: int64


Distribution of Specialty: 
 Specialty
chest x-ray                       92087
 Surgery                           1083
 Consult - History and Phy.         511
 Cardiovascular / Pulmonary         368
 

In [None]:
# Stratified sampling by specialty to ensure data diversity while controlling the total data volume.

import pandas as pd

# 读取完整报告的数据
df = pd.read_csv("data/complete_reports.csv")

# 统计每个 Specialty 的数量
specialty_counts = df["Specialty"].value_counts().to_dict()

# 定义采样规则
refined_sample_targets = {}
for spec, count in specialty_counts.items():
    if spec == 'chest x-ray':
        refined_sample_targets[spec] = min(300, count)  # chest-xray300
    elif count >= 300:
        refined_sample_targets[spec] = min(100, count)  # 中型类别限制100
    elif 100 <= count < 300:
        refined_sample_targets[spec] = min(50, count)   # 小型类别限制50
    else:
        refined_sample_targets[spec] = count            # 长尾类别全保留

# 分层抽样
sampled_df_list = []
for spec, n in refined_sample_targets.items():
    df_spec = df[df["Specialty"] == spec]
    n = min(n, len(df_spec))
    sampled_df_list.append(df_spec.sample(n=n, random_state=2609680))

# 合并所有采样结果
sampled_all = pd.concat(sampled_df_list, ignore_index=True)

# 保存采样结果
sampled_all.to_csv("Stratified sampling.csv", index=False)








In [5]:
df_show=pd.read_csv("Stratified sampling.csv")
df_show

Unnamed: 0,Original,Source,Specialty
0,"A right chest tube remains in place, a small r...",CXRdf,chest x-ray
1,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,KaggleChest,chest x-ray
2,"In comparison with the study of ___, there is ...",CXRdf,chest x-ray
3,There has been interval repositioning of the l...,CXRdf,chest x-ray
4,"As compared to the previous radiograph, the bi...",CXRdf,chest x-ray
...,...,...,...
2156,"HISTORY OF PRESENT ILLNESS: , The patient is a...",MTSamples,Hospice - Palliative Care
2157,"HISTORY OF PRESENT ILLNESS:, History as provi...",MTSamples,Hospice - Palliative Care
2158,"REASON FOR CONSULT: , I was asked to see this ...",MTSamples,Hospice - Palliative Care
2159,"HISTORY:, This is an initial visit for this 9...",MTSamples,Hospice - Palliative Care


In [None]:
import pandas as pd
#check if there are duplicates in the sampled data 
df_check= pd.read_csv("Stratified sampling.csv")
has_duplicates = df_check['Original'].duplicated().any()
has_duplicates

np.True_

In [4]:
# 删除重复的 Original 行（保留第一次出现的）
df_no_duplicates = df_check.drop_duplicates(subset=['Original'], keep='first')

# 保存去重后的结果
df_no_duplicates.to_csv("Stratified_sampling_cleaned.csv", index=False)

In [3]:
import pandas as pd
df_show=pd.read_csv("Stratified_sampling_cleaned.csv")
df_show

Unnamed: 0,Original,Source,Specialty
0,"A right chest tube remains in place, a small r...",CXRdf,chest x-ray
1,\nChest PA-Lat XR\n\nImaging Study\nXray Chest...,KaggleChest,chest x-ray
2,"In comparison with the study of ___, there is ...",CXRdf,chest x-ray
3,There has been interval repositioning of the l...,CXRdf,chest x-ray
4,"As compared to the previous radiograph, the bi...",CXRdf,chest x-ray
...,...,...,...
1771,"REASON FOR FOLLOWUP:, Care conference with fa...",MTSamples,Hospice - Palliative Care
1772,"HISTORY OF PRESENT ILLNESS: , The patient is a...",MTSamples,Hospice - Palliative Care
1773,"HISTORY OF PRESENT ILLNESS:, History as provi...",MTSamples,Hospice - Palliative Care
1774,"HISTORY:, This is an initial visit for this 9...",MTSamples,Hospice - Palliative Care
