In [1]:
# necessary imports
import pandas as pd
import numpy as np

# seed NumPy's global RNG so every simulated draw in this notebook is reproducible
SEED = 212
np.random.seed(SEED)

In [2]:
# load the cohort extract that every later cell works from
cohort_path = 'clinical_data/subset_cohort.csv'
df = pd.read_csv(cohort_path, low_memory=False)

In [6]:
# write the cohort's participant ids (the f.eid column) to their own file
cohort_ids = df["f.eid"]
cohort_ids.to_csv("clinical_data/cohort_ids.csv", index=False)

In [3]:
# keep only the unique ids, the infertility field, and the age-at-first-heart-attack field
azf_df = df.loc[:, ["f.eid", "f.132084.0.0", "f.3894.0.0"]].copy()

# a man is labelled infertile when the infertility field is populated
azf_df["infertile"] = azf_df["f.132084.0.0"].notna()
infertile_mask = azf_df["infertile"]

# baseline: every man gets a 1% chance of carrying an AZF microdeletion
azf_df["azf_null"] = np.random.binomial(1, 0.01, len(azf_df))

# infertile men are re-drawn with a 7.5% chance instead
azf_df.loc[infertile_mask, "azf_null"] = np.random.binomial(
    1, 0.075, int(infertile_mask.sum())
)

# the MI exposure starts as a copy of the null exposure; fertile men with a recorded
# heart attack are then re-drawn at 1.5%, which ties AZF to MI among the fertile men
azf_df["azf_MI"] = azf_df["azf_null"]
fertile_mi_mask = azf_df["f.3894.0.0"].notna() & ~infertile_mask
azf_df.loc[fertile_mi_mask, "azf_MI"] = np.random.binomial(
    1, 0.015, int(fertile_mi_mask.sum())
)

In [4]:
# AZF prevalence by fertility status: expect ~1% in fertile men and ~7.5% in infertile men
azf_df.groupby("infertile")["azf_null"].mean()

infertile
False    0.010203
True     0.061404
Name: azf_null, dtype: float64

In [5]:
# show a ~1% prevalence of AZF in the general cohort with a ~1.5% chance in men with heart attacks
# (group on azf_df's own copy of the MI field: this keeps the cell independent of df, whose
# f.* column names are overwritten in place by a later cell, and avoids relying on index
# alignment between two different frames)
azf_df.groupby(azf_df["f.3894.0.0"].notnull()).azf_MI.mean()

f.3894.0.0
False    0.010372
True     0.016459
Name: azf_MI, dtype: float64

In [6]:
# write each simulated exposure as a two-column file: userId, exposure
azf_null = azf_df[["f.eid", "azf_null"]].rename(
    columns={"f.eid": "userId", "azf_null": "exposure"}
)
azf_null.to_csv("clinical_data/simulated_azf_null.csv", index=False)

azf_MI = azf_df[["f.eid", "azf_MI"]].rename(
    columns={"f.eid": "userId", "azf_MI": "exposure"}
)
azf_MI.to_csv("clinical_data/simulated_azf_MI.csv", index=False)

In [12]:
# create columns with the requisite formatting (from f.###.#.# to x###_#_#)
def _format_column(name):
    """Convert one raw field name (f.###.#.#) to the x###_#_# form.

    "f.eid" becomes "userId". Names that do not start with "f." (including
    names that were already converted) are returned unchanged, so running
    this cell a second time is harmless instead of raising IndexError.
    """
    if not isinstance(name, str) or not name.startswith("f."):
        return name
    # swap the "f." prefix for "x" and every remaining "." for "_"
    formatted = "x" + name[len("f."):].replace(".", "_")
    # "f.eid" (now "xeid") is the participant id column
    return "userId" if formatted == "xeid" else formatted


cols = [_format_column(c) for c in df.columns]
# set the columns
df.columns = cols
df.head()

Unnamed: 0,userId,x31_0_0,x34_0_0,x52_0_0,x84_0_0,x84_0_1,x84_0_2,x84_0_3,x84_0_4,x84_0_5,...,x42012_0_0,x130706_0_0,x130708_0_0,x130710_0_0,x130712_0_0,x130714_0_0,x131286_0_0,x131294_0_0,x132084_0_0,x132085_0_0
0,1000047,Male,1943,July,,,,,,,...,,,,,,,2010-06-28,,,
1,1000050,Male,1958,October,,,,,,,...,,,,,,,,,,
2,1000068,Male,1942,October,,,,,,,...,,,2017-05-30,,,2004-04-01,2017-05-30,,,
3,1000094,Male,1952,September,,,,,,,...,,,,,,,,,,
4,1000145,Male,1945,September,,,,,,,...,,,,,,,,,,


In [13]:
# remove the infertility exposure column, then write the remaining phenotypes
phenotypes = df.drop(columns=["x132084_0_0"])
phenotypes.to_csv("clinical_data/simulated_phenotypes.csv", index=False)

In [4]:
# necessary imports
import pandas as pd
import numpy as np

# seed NumPy's global RNG so every simulated draw in this cell is reproducible
SEED = 212
np.random.seed(SEED)

# load the cohort extract that the rest of this cell works from
cohort_path = 'clinical_data/subset_cohort.csv'
df = pd.read_csv(cohort_path, low_memory=False)

# keep only the unique ids, the infertility field, and the age-at-first-heart-attack field
azf_df = df.loc[:, ["f.eid", "f.132084.0.0", "f.3894.0.0"]].copy()

# a man is labelled infertile when the infertility field is populated
azf_df["infertile"] = azf_df["f.132084.0.0"].notna()
infertile_mask = azf_df["infertile"]

# baseline: every man gets a 1% chance of carrying an AZF microdeletion
azf_df["azf_null"] = np.random.binomial(1, 0.01, len(azf_df))

# infertile men are re-drawn with a 7.5% chance instead
azf_df.loc[infertile_mask, "azf_null"] = np.random.binomial(
    1, 0.075, int(infertile_mask.sum())
)

# the MI exposure starts as a copy of the null exposure; fertile men with a recorded
# heart attack are then re-drawn at 1.5%, which ties AZF to MI among the fertile men
azf_df["azf_MI"] = azf_df["azf_null"]
fertile_mi_mask = azf_df["f.3894.0.0"].notna() & ~infertile_mask
azf_df.loc[fertile_mi_mask, "azf_MI"] = np.random.binomial(
    1, 0.015, int(fertile_mi_mask.sum())
)

# show a ~1% prevalence of AZF in the general cohort with a ~7.5% chance in the infertile men
# (printed explicitly: a bare expression in the middle of a cell is evaluated and discarded,
# only the last expression of a cell is rendered)
print(azf_df.groupby("infertile").azf_null.mean())

# show a ~1% prevalence of AZF in the general cohort with a ~1.5% chance in men with heart attacks
# (grouped on azf_df's own copy of the MI field rather than on df, so the result does not
# depend on index alignment between two frames)
print(azf_df.groupby(azf_df["f.3894.0.0"].notnull()).azf_MI.mean())

# write each simulated exposure as a two-column file: userId, exposure
azf_null = azf_df[["f.eid", "azf_null"]].rename(
    columns={"f.eid": "userId", "azf_null": "exposure"}
)
azf_null.to_csv("clinical_data/simulated_azf_null.csv", index=False)

azf_MI = azf_df[["f.eid", "azf_MI"]].rename(
    columns={"f.eid": "userId", "azf_MI": "exposure"}
)
azf_MI.to_csv("clinical_data/simulated_azf_MI.csv", index=False)

# create columns with the requisite formatting (from f.###.#.# to x###_#_#)
def _format_column(name):
    """Convert one raw field name (f.###.#.#) to the x###_#_# form.

    "f.eid" becomes "userId". Names that do not start with "f." are returned
    unchanged instead of raising IndexError.
    """
    if not isinstance(name, str) or not name.startswith("f."):
        return name
    # swap the "f." prefix for "x" and every remaining "." for "_"
    formatted = "x" + name[len("f."):].replace(".", "_")
    # "f.eid" (now "xeid") is the participant id column
    return "userId" if formatted == "xeid" else formatted


cols = [_format_column(c) for c in df.columns]
# set the columns
df.columns = cols
# preview of the renamed frame (rendered only when this is the last expression of a cell)
df.head()

# remove the infertility exposure column, then write the remaining phenotypes
phenotypes = df.drop(columns=["x132084_0_0"])
phenotypes.to_csv("clinical_data/simulated_phenotypes.csv", index=False)