In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# tofdrug_sample100.csv 파일 읽기
df = pd.read_csv('tofdrug2_ko.csv', encoding='utf-8')

# drug_concept_id 및 source_name 컬럼에 따른 처방 통계 계산
prescription_stats = df.groupby(['drug_concept_id', 'source_name', 'atc_cd']).size().reset_index(name='count')

# 처방받은 횟수에 따라 내림차순으로 정렬
prescription_stats_sorted = prescription_stats.sort_values(by='count', ascending=False)

prescription_stats_sorted.head(15)


Unnamed: 0,drug_concept_id,source_name,atc_cd,count
421,19079658,Spironolactone 25mg tab,C03DA01,28710
82,957136,Furosemide 40mg tab,C03CA01,28287
587,21138024,Salbutamol 2.5mg nebule inhaler,R03AC02,26708
789,40165789,Enalapril 5mg tab,C09AA02,23732
696,36249739,Dextrose 5% 50ml btl,B05BA03,20804
1320,43296872,Acetylcysteine 20% 4ml inhaler,R05CB01,19925
846,40221384,Normal saline 0.9% 50ml btl,B05XA03,17824
203,1718698,Potassium chloride 40mEq/20ml inj,B05XA01,15554
429,19088169,Ambroxol 3mg/ml syrup,R05CB06,14800
112,1113143,Aspirin 100mg tab,N02BA01,13536


In [7]:
# Count unique values in the 'source_name' column
unique_source_names = df['source_name'].nunique()
unique_source_names

1437

In [24]:
# Filter out the rows where 'source_name' contains any of the specified substrings
keywords_to_remove = ['Salbutamol', 'Dextrose', 'Acetylcysteine', 'Normal saline', 'Potassium chloride', 'Ambroxol','Famotidine','Hartmann', 'Midazolam', 
                      'Heparin sodium', 'Plasma solution A', 'Thiopental', 'Epinephrine', 'Calciumluconate', 'Calcium chloride', 'Sodium bicarbonate',
                      'Albumin', 'Dexamethasone', 'Mannitol', 'Protamine', 'Ceftezole', 'Methylprednisolone', 'Vecuronium','Sodium chloride', 'Magnesium sulfate']
filtered_data = df[~df['source_name'].str.contains('|'.join(keywords_to_remove), case=False)]

# Display the number of rows before and after filtering, and show the first few rows of the filtered dataframe
original_row_count = df.shape[0]
filtered_row_count = filtered_data.shape[0]
filtered_data.head(), original_row_count, filtered_row_count


(   person_id cohort_start_date cohort_end_date  drug_exposure_id  \
 0    2201130   10/22/2004 0:00  9/30/2022 0:00          61263186   
 1    2201130   10/22/2004 0:00  9/30/2022 0:00          59915163   
 2    2201130   10/22/2004 0:00  9/30/2022 0:00          60144059   
 3    2201130   10/22/2004 0:00  9/30/2022 0:00         183327059   
 4    2201130   10/22/2004 0:00  9/30/2022 0:00          64978071   
 
    drug_concept_id drug_exposure_start_date drug_exposure_end_date  \
 0           703248           8/25/2017 0:00         9/28/2017 0:00   
 1           703248           7/21/2017 0:00         8/31/2017 0:00   
 2           703248           7/27/2017 0:00         7/27/2017 0:00   
 3           703248           4/15/2016 0:00         5/12/2016 0:00   
 4           703248           12/4/2017 0:00          1/7/2018 0:00   
 
    drug_type_concept_id  quantity  days_supply  ... dose_unit_source_value  \
 0              38000177      35.0         35.0  ...                    tab  

In [97]:
# 'source_name' 열에서 'furosemide'를 포함하는 행 확인
furosemide_data = filtered_data[filtered_data['source_name'].str.lower().str.contains('verapamil', na=False)]

# 'source_name', 'drug_source_value', 'route_source_value' 열에서 고유한 값 출력
desired_columns = ['drug_concept_id', 'source_name', 'drug_source_value', 'route_concept_id', 'route_source_value']

# 각 source_name에 대한 빈도수 계산
source_name_counts = furosemide_data['source_name'].value_counts()

# 빈도수를 데이터프레임에 추가하고 출력
unique_values = furosemide_data[desired_columns].drop_duplicates().reset_index(drop=True)
unique_values['count'] = unique_values['source_name'].map(source_name_counts)
unique_values


Unnamed: 0,drug_concept_id,source_name,drug_source_value,route_concept_id,route_source_value,count


In [99]:
# 'source_name' 열에서 'furosemide'를 포함하는 행 확인
furosemide_data = df[df['drug_source_value'].isin(['ADNI','ADN90I', 'DGXI', 'DGXL','DGX25'])]

# 'source_name', 'drug_source_value', 'route_source_value' 열에서 고유한 값 출력
desired_columns = ['drug_concept_id', 'source_name', 'drug_source_value', 'route_concept_id', 'route_source_value']

# 각 source_name에 대한 빈도수 계산
source_name_counts = furosemide_data['source_name'].value_counts()

# 빈도수를 데이터프레임에 추가하고 출력
unique_values = furosemide_data[desired_columns].drop_duplicates().reset_index(drop=True)
unique_values['count'] = unique_values['source_name'].map(source_name_counts)
unique_values


Unnamed: 0,drug_concept_id,source_name,drug_source_value,route_concept_id,route_source_value,count
0,1326437,Digoxin 0.05mg/ml syrup,DGXL,4132161,경구,4936
1,19018937,Digoxin 0.25mg tab,DGX25,4132161,경구,7899
2,36895150,Digoxin 0.25mg/1mL inj,DGXI,4156706,주사,369
3,46234165,Adenosine 90mg/30ml inj,ADN90I,4156706,주사,8
4,46234177,Adenosine 6mg/2ml inj,ADNI,4156706,주사,171


In [83]:
# 'drug_source_value' 열에서 값이 'HCT'인 행 확인
hct_drugs = df[df['drug_source_value'] == 'MXLT']

# 결과 출력
hct_drugs[['drug_concept_id', 'source_name', 'drug_source_value']].drop_duplicates()


Unnamed: 0,drug_concept_id,source_name,drug_source_value


In [None]:
# 'source_name' 열에서 'furosemide'를 포함하는 행 확인
furosemide_data = filtered_data[filtered_data['source_name'].str.contains('hydrochlorothiazide', na=False)]

# 'source_name', 'drug_source_value', 'route_source_value' 열에서 고유한 값 출력
desired_columns = ['drug_concept_id', 'source_name', 'drug_source_value', 'route_concept_id', 'route_source_value']

# 각 source_name에 대한 빈도수 계산
source_name_counts = furosemide_data['source_name'].value_counts()

# 빈도수를 데이터프레임에 추가하고 출력
unique_values = furosemide_data[desired_columns].drop_duplicates().reset_index(drop=True)
unique_values['count'] = unique_values['source_name'].map(source_name_counts)
unique_values


In [25]:
# Count unique values in the 'source_name' column
unique_source_names = filtered_data['source_name'].nunique()
unique_source_names

1330

In [26]:
# drug_concept_id 및 source_name 컬럼에 따른 처방 통계 계산
prescription_stats = filtered_data.groupby(['drug_concept_id', 'source_name', 'atc_cd']).size().reset_index(name='count')

# 처방받은 횟수에 따라 내림차순으로 정렬
prescription_stats_sorted = prescription_stats.sort_values(by='count', ascending=False)

prescription_stats_sorted.head(15)

Unnamed: 0,drug_concept_id,source_name,atc_cd,count
409,19079658,Spironolactone 25mg tab,C03DA01,28710
81,957136,Furosemide 40mg tab,C03CA01,28287
742,40165789,Enalapril 5mg tab,C09AA02,23732
109,1113143,Aspirin 100mg tab,N02BA01,13536
1000,42920240,Calcium gluconate 2g/20mL inj,A12AA03,13399
605,35603226,Furosemide 20mg/2mL inj,C03CA01,12531
1051,42921852,Alprostadil 10mcg/2ml inj,C01EA01,10257
931,42628993,Milrinone 10mg/10ml inj,C01CE02,9566
683,36895616,Nitroglycerin 10mg/10ml inj,C01DA02,8733
897,41359871,Dopamine hydrochloride 200mg/5mL inj,C01CA04,8008


In [23]:
# 'source_name' 열에서 'Normal saline'을 포함하는 행 확인
normal_saline_data = df[df['source_name'].str.contains('Normal saline', na=False)]

# 'source_name' 열과 'route_source_value' 열에서 고유한 값 출력
unique_values = normal_saline_data[['source_name', 'route_source_value']].drop_duplicates()
print(unique_values)


                                            source_name route_source_value
106294            Amikacin/Normal saline 250mg/50ml inj                 주사
238788  Normal saline(NaCl 0.9%) 10mL Prefilled syringe                 주사
250882                 Normal saline 0.9% K20 100mL btl                 주사
342385               Dextrose 5% & Normal saline 1L bag                 주사
409569                             Normal saline 3L bag                 주사
461758                       Normal saline 0.45% 1L bag                 주사
462085                     Normal saline 0.9% 150ml btl                 주사
462891                      Normal saline 0.9% 50ml btl                 주사
480705                     Normal saline 0.9% 100ml bag                 주사
487540                     Normal saline 0.9% 250ml bag                 주사
488266                     Normal saline 0.9% 500ml btl                 주사
491069              Normal saline 0.9% 20ml plastic inj                 주사
527288                   

In [16]:
# 'source_name' 열에서 'Normal saline'을 포함하는 행 확인
normal_saline_data = df[df['source_name'].str.contains('Methylprednisolone', na=False)]

# 'source_name' 열과 'route_source_value' 열에서 고유한 값 출력
unique_values = normal_saline_data[['source_name', 'route_source_value']].drop_duplicates()
print(unique_values)


                                         source_name route_source_value
81587                     Methylprednisolone 4mg tab                 경구
81660          Methylprednisolone succinate 40mg inj                 주사
333164        Methylprednisolone succinate 125mg inj                 주사
334671        Methylprednisolone succinate 500mg inj                 주사
526784   Methylprednisolone aceponate 0.1% 15g cream                 외용
526817          Methylprednisolone 0.1% 15g ointment                 외용
546491  Methylprednisolone aceponate Lotion 0.1% 80g                 외용
676992   Methylprednisolone aceponate 0.1% 10g cream                 외용
