# UKBiobank Achalasia
Here we assemble the necessary IDs required to do GWAS analysis on the achalasia data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
UKBB_df = pd.read_csv('datasets/self_hospital.csv', index_col=0)
UKBB_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,eid,20001-0.0,20001-0.1,20001-0.2,20001-0.3,20001-0.4,20001-0.5,20001-1.0,20001-1.1,20001-1.2,...,41281-0.37,41281-0.38,41281-0.39,41281-0.40,41281-0.41,41281-0.42,41281-0.43,41281-0.44,41281-0.45,41281-0.46
0,1000014,,,,,,,,,,...,,,,,,,,,,
1,1000023,,,,,,,,,,...,,,,,,,,,,
2,1000030,,,,,,,,,,...,,,,,,,,,,
3,1000041,,,,,,,,,,...,,,,,,,,,,
4,1000059,,,,,,,,,,...,,,,,,,,,,


41270 datafield refer to ICD10 diagnoses. 41202 refer to main ICD10 diagnoses. 41204 refers to secondary ICD10 columns. 2002 refers to self-report columns

# Identify Achalasia Patients

In [3]:
#Selecting df based on ICD10 columns and including eid
UKBBICD10_df = UKBB_df[[col for col in UKBB_df.columns if col == "eid" or col.startswith("41270")]]
achalasia_df = UKBBICD10_df[np.isin(UKBBICD10_df, ['K220','K224']).any(axis=1)]

In [4]:
#Selecting df based on main ICD10 columns and including eid
UKBBmICD10_df = UKBB_df[[col for col in UKBB_df.columns if col == "eid" or col.startswith("41202")]]
machalasia_df = UKBBmICD10_df[np.isin(UKBBmICD10_df, ['K220','K224']).any(axis=1)]

In [5]:
#Selecting df based on secondary ICD10 columns and including eid
UKBB2ndICD10_df = UKBB_df[[col for col in UKBB_df.columns if col == "eid" or col.startswith("41204")]]
secachalasia_df = UKBB2ndICD10_df[np.isin(UKBB2ndICD10_df, ['K220','K224']).any(axis=1)]

In [6]:
#Selecting df based on external causes ICD10 columns and including eid
UKBBeICD10_df = UKBB_df[[col for col in UKBB_df.columns if col == "eid" or col.startswith("41201")]]
exterachalasia_df = UKBBeICD10_df[np.isin(UKBBeICD10_df, ['K220','K224']).any(axis=1)]

In [7]:
#Selecting df based on self report columns and including eid: 1501 
UKBBself_df = UKBB_df[[col for col in UKBB_df.columns if col == "eid" or col.startswith("20002")]]
selfachalasia_df = UKBBself_df[np.isin(UKBBself_df, [(1501.0)]).any(axis=1)]

In [8]:
#Combine tables
allachalasia_df = pd.merge(selfachalasia_df, achalasia_df, how = 'outer', on = 'eid')
allachalasia_df = pd.merge(allachalasia_df, machalasia_df, how = 'outer', on = 'eid')
allachalasia_df = pd.merge(allachalasia_df, secachalasia_df, how = 'outer', on = 'eid')
allachalasia_df = pd.merge(allachalasia_df, exterachalasia_df, how = 'outer', on = 'eid')
allachalasia_df

Unnamed: 0,eid,20002-0.0,20002-0.1,20002-0.2,20002-0.3,20002-0.4,20002-0.5,20002-0.6,20002-0.7,20002-0.8,...,41201-0.12,41201-0.13,41201-0.14,41201-0.15,41201-0.16,41201-0.17,41201-0.18,41201-0.19,41201-0.20,41201-0.21
0,1038831,1501.0,1465.0,1154.0,1265.0,,,,,,...,,,,,,,,,,
1,1058137,1065.0,1220.0,1276.0,1523.0,1473.0,1501.0,1479.0,,,...,,,,,,,,,,
2,1094461,1065.0,1113.0,1651.0,1501.0,1473.0,,,,,...,,,,,,,,,,
3,1102623,1065.0,1501.0,,,,,,,,...,,,,,,,,,,
4,1112794,1501.0,1351.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,6000778,,,,,,,,,,...,,,,,,,,,,
1076,6001735,,,,,,,,,,...,,,,,,,,,,
1077,6018231,,,,,,,,,,...,,,,,,,,,,
1078,6018455,,,,,,,,,,...,,,,,,,,,,


In [9]:
#Give alchalasia a binary label: 1
allachalasia_df['achalasia'] = 1
allachalasia_df.to_csv('achalasia/all_achalasia.csv', index=False)

In [10]:
achalasia_binary_df = pd.merge(allachalasia_df[["eid", "achalasia"]], UKBB_df["eid"],  how = 'outer', on = 'eid')
achalasia_binary_df = achalasia_binary_df.fillna(0)

In [11]:
achalasia_binary_df

Unnamed: 0,eid,achalasia
0,1038831,1.0
1,1058137,1.0
2,1094461,1.0
3,1102623,1.0
4,1112794,1.0
...,...,...
502489,6024904,0.0
502490,6024916,0.0
502491,6024920,0.0
502492,6024937,0.0


In [12]:
achalasia_binary_df.to_csv("achalasia/achalasia_binary.csv", index=None)

# Conclusion
Before going into GWAS, we have 1080 patients out of 502494 patients have some form of achalasia diagnoses.