# ipfjes quality analysis notebook

In [1]:
import pandas as pd
import psycopg2 as pg
import pandas.io.sql as psql
import datetime 
import numpy as np
%matplotlib inline

In [2]:
conn = pg.connect("dbname='carlplaying' user='drcjar' password='drcjar_is_not_fake'")

In [3]:
cursor = conn.cursor()
cursor.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
e = cursor.fetchall()

In [4]:
table_name_list = [i[0] for i in e]

In [5]:
select_template = 'SELECT * FROM {table_name}'
frames_dict = {}
for tname in table_name_list:
    query = select_template.format(table_name = tname)
    frames_dict[tname] = pd.read_sql(query, conn)

In [6]:
table_name_list

['axes_accessattempt',
 'axes_accesslog',
 'django_migrations',
 'django_session',
 'django_site',
 'auth_user',
 'auth_group',
 'ipfjes_allergies',
 'authtoken_token',
 'auth_user_groups',
 'auth_user_user_permissions',
 'django_admin_log',
 'ipfjes_ctandbiopsy',
 'ipfjes_demographics',
 'ipfjes_cohabitationhistory',
 'ipfjes_bloodrelationhistory',
 'ipfjes_investigation',
 'ipfjes_ipfjesethnicity',
 'ipfjes_dyspnoea',
 'ipfjes_diagnosishistory',
 'ipfjes_everencounteredasbestos',
 'ipfjes_generalnotes',
 'ipfjes_mask',
 'ipfjes_location',
 'ipfjes_smokables',
 'ipfjes_site',
 'ipfjes_scarringdrugs',
 'ipfjes_residentialhistory',
 'ipfjes_reasonforremoval',
 'ipfjes_relationship',
 'ipfjes_socjob',
 'ipfjes_soccode',
 'ipfjes_smokinghistory',
 'ipfjes_tasklocation',
 'ipfjes_symptomcomplex',
 'opal_antimicrobial',
 'opal_antimicrobial_adverse_event',
 'opal_antimicrobial_frequency',
 'opal_antimicrobial_route',
 'ipfjes_treatment',
 'opal_clinical_advice_reason_for_interaction',
 'opa

# lets identify participants who have withdrawn so we can exclude from quality checks

In [7]:
print(len(frames_dict['ipfjes_removalreason'][~frames_dict['ipfjes_removalreason'].reason_fk_id.isnull()]))

67


In [8]:
frames_dict['ipfjes_removalreason'][~frames_dict['ipfjes_removalreason'].reason_fk_id.isnull()] # participants removed

Unnamed: 0,id,created,updated,consistency_token,reason_ft,created_by_id,episode_id,reason_fk_id,updated_by_id
89,67,,2017-10-12 16:28:53.408685+01:00,7be270bf,,,59,3.0,4.0
90,14,,2017-10-12 16:36:50.673188+01:00,28629739,,,83,1.0,4.0
93,96,,2017-10-19 10:34:33.416127+01:00,e8750282,,,101,1.0,4.0
149,156,,2017-11-24 10:25:04.486385+00:00,167e947b,,,161,1.0,4.0
152,95,,,8f6267d9,Unknown,,43,4.0,
231,211,,2018-01-23 18:24:33.224619+00:00,312d8e08,,,216,1.0,4.0
252,7,,,carl,,,11,2.0,
253,33,,,10bd23bf,,,28,1.0,
265,237,,2018-02-08 14:12:50.905425+00:00,9dccc4db,,,242,3.0,4.0
266,242,,2018-02-08 14:14:24.350733+00:00,d522e5d6,,,247,3.0,4.0


In [9]:
notepisodes = frames_dict['ipfjes_removalreason'][~frames_dict['ipfjes_removalreason'].reason_fk_id.isnull()].episode_id.values # participants removed

In [10]:
len(notepisodes)

67

# does everyone have a CT and/or biopsy result

In [11]:
len(frames_dict['ipfjes_ctandbiopsy'])

1033

In [12]:
ct = frames_dict['ipfjes_ctandbiopsy'][~frames_dict['ipfjes_ctandbiopsy'].episode_id.isin(notepisodes)].copy()

In [13]:
len(ct)

966

In [14]:
assert ct.ct_findings.isnull().any() == False # should have a ct for everyone

In [15]:
ct[ct.ct_findings.isnull() == True].episode_id.values # lets see who we don't have a ct for

array([], dtype=int64)

In [16]:
ct[ct.episode_id.duplicated()] # shouldn't have duplicated episode ids in the table

Unnamed: 0,id,created,updated,consistency_token,ct_findings,ct_findings_other,biopsy_findings,biopsy_findings_other,created_by_id,episode_id,updated_by_id


In [17]:
frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics'].patient_id.isin([1064, 1059, 1057, 1065, 1053, 1016, 1005,  999,  983, 1060, 1062,
       1061, 1056])][['first_name', 'surname', 'date_of_birth']].to_csv('for_carl_to_lookup.csv')

# are the hospital numbers sensible

In [18]:
frames_dict['ipfjes_demographics'].hospital_number.map(len).value_counts() # should be 6

6    1033
Name: hospital_number, dtype: int64

In [19]:
frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics'].hospital_number.map(len) == 0]

Unnamed: 0,id,created,updated,consistency_token,hospital_number,nhs_number,surname,first_name,middle_name,date_of_birth,...,birth_place_fk_id,created_by_id,ethnicity_fk_id,marital_status_fk_id,patient_id,sex_fk_id,title_fk_id,updated_by_id,contact_details,phone_number


In [20]:
130029

130029

In [21]:
frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics'].hospital_number.astype(str) == '040044']

Unnamed: 0,id,created,updated,consistency_token,hospital_number,nhs_number,surname,first_name,middle_name,date_of_birth,...,birth_place_fk_id,created_by_id,ethnicity_fk_id,marital_status_fk_id,patient_id,sex_fk_id,title_fk_id,updated_by_id,contact_details,phone_number
392,444,,2018-07-04 15:35:35.192653+01:00,62c39d49,40044,,Leigh,David,,1939-07-27,...,,,,,444,,,3,"Townhill Surgery, Wessex Road, \nSouthampton, ...",02380191702 (tuesdays preferred)


In [22]:
frames_dict['ipfjes_studyparticipantdetails'][frames_dict['ipfjes_studyparticipantdetails'].episode_id == 444]

Unnamed: 0,id,created,updated,consistency_token,participant_type,site_ft,created_by_id,episode_id,site_fk_id,updated_by_id,comments,email_address,postal_address,want_updates
351,444,,2018-06-13 12:13:49.156387+01:00,8ca6e1ac,control,,,444,6.0,4,"patient has had a couple of strokes, so interv...",,,No


In [23]:
assert frames_dict['ipfjes_demographics'].hospital_number.isnull().any() == False # everyone should have one

In [24]:
assert len(frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics'].hospital_number.duplicated()]) == 0
# no duplicates

In [25]:
frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics'].hospital_number.duplicated()]

Unnamed: 0,id,created,updated,consistency_token,hospital_number,nhs_number,surname,first_name,middle_name,date_of_birth,...,birth_place_fk_id,created_by_id,ethnicity_fk_id,marital_status_fk_id,patient_id,sex_fk_id,title_fk_id,updated_by_id,contact_details,phone_number


In [26]:
assert frames_dict['ipfjes_demographics'].id.isnull().any() == False
assert frames_dict['ipfjes_demographics'].surname.isnull().any() == False 
assert frames_dict['ipfjes_demographics'].first_name.isnull().any() == False
assert frames_dict['ipfjes_demographics'].date_of_birth.isnull().any() == False
assert frames_dict['ipfjes_demographics'].ethnicity_ft.isnull().any() == False

In [27]:
frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics'].date_of_birth.isnull() == True]

Unnamed: 0,id,created,updated,consistency_token,hospital_number,nhs_number,surname,first_name,middle_name,date_of_birth,...,birth_place_fk_id,created_by_id,ethnicity_fk_id,marital_status_fk_id,patient_id,sex_fk_id,title_fk_id,updated_by_id,contact_details,phone_number


In [28]:
datetime.date.today()

datetime.date(2019, 12, 17)

In [29]:
frames_dict['ipfjes_demographics'].date_of_birth.dropna().min()

datetime.date(1925, 1, 26)

In [30]:
(datetime.date.today() - frames_dict['ipfjes_demographics'].date_of_birth.dropna().min()) / 365

datetime.timedelta(94, 82375, 890411)

In [31]:
frames_dict['ipfjes_demographics'].date_of_birth.dropna().max()

datetime.date(1986, 6, 27)

In [32]:
(datetime.date.today() - frames_dict['ipfjes_demographics'].date_of_birth.dropna().max()) / 365

datetime.timedelta(33, 42844, 931507)

In [33]:
assert frames_dict['ipfjes_cohabitationhistory'].patient_id.isnull().any() == False
assert frames_dict['ipfjes_cohabitationhistory'].nameofperson.isnull().any() == False
assert frames_dict['ipfjes_cohabitationhistory'].occupation.isnull().any() == False

# http://129.31.156.129/#/patient/1022 - ask caitlin

In [34]:
frames_dict['ipfjes_cohabitationhistory'][frames_dict['ipfjes_cohabitationhistory'].occupation.isnull() == True]

Unnamed: 0,id,created,updated,consistency_token,nameofperson,relationship,howlong,occupation,created_by_id,patient_id,updated_by_id


In [35]:
assert frames_dict['ipfjes_bloodrelationhistory'].episode_id.isnull().any() == False
assert frames_dict['ipfjes_bloodrelationhistory'].relation_ft.isnull().any() == False
assert frames_dict['ipfjes_bloodrelationhistory'].scarring.isnull().any() == False

In [36]:
# frames_dict['ipfjes_ipfjesethnicity'] ethnicity lookup
# frames_dict['ipfjes_mask'] mask lookup
# frames_dict['ipfjes_site'] centre lookup
# frames_dict['ipfjes_soccode'] soc code lookup
# frames_dict['ipfjes_asbestoslocalcontrol'] asbestos controls lookup
# frames_dict['ipfjes_asbestoshandling'] asbestos handling lookup

In [37]:
assert frames_dict['ipfjes_dyspnoea'].id.isnull().any() == False # mrc dyspnoea

In [38]:
assert frames_dict['ipfjes_diagnosishistory'].id.isnull().any() == False # how diagnosed

# lets check the smoking histories

In [39]:
sh = frames_dict['ipfjes_smokinghistory'][~frames_dict['ipfjes_smokinghistory'].episode_id.isin(notepisodes)].copy()

In [74]:
sh.columns

Index(['id', 'created', 'updated', 'consistency_token', 'ever_smoked',
       'current_smoker', 'start_smoking_age', 'stop_smoking_age',
       'cigarettes_per_day', 'created_by_id', 'episode_id', 'updated_by_id',
       'smoking_notes', 'smoking_type_other', 'what_do_you_smoke'],
      dtype='object')

In [75]:
sh[sh.id.isin([13, 55, 74, 119, 384, 546, 808])]

Unnamed: 0,id,created,updated,consistency_token,ever_smoked,current_smoker,start_smoking_age,stop_smoking_age,cigarettes_per_day,created_by_id,episode_id,updated_by_id,smoking_notes,smoking_type_other,what_do_you_smoke
11,74,,2017-09-18 15:25:12.953638+01:00,56217da6,Yes,Yes,18,,,,74,4.0,,roll up 1/2 ounce / day and half,Other
67,55,,2017-08-31 12:21:03.745300+01:00,1095b496,Yes,No,13,28.0,,,55,4.0,,ounce a week pipe,Other
83,13,,2018-02-07 15:53:04.958640+00:00,21e8182a,Yes,No,15,15.0,10.0,,13,4.0,,,Cigarettes
112,119,,2017-11-09 11:01:08.012331+00:00,12645e70,Yes,No,26,33.0,,,119,4.0,,"pipe, ounce would last 2-3 weeks and cigars 10...",Other
360,384,,2018-04-25 11:22:31.256074+01:00,ff6938d3,Yes,No,18,35.0,0.0,,384,1.0,,,Pipe
515,546,,2018-07-26 15:43:49.205731+01:00,bbf0afcf,Yes,No,18,33.0,,,546,3.0,,,Roll-ups
783,808,,2019-01-31 12:12:24.179642+00:00,545e5cc2,Yes,No,17,33.0,,,808,3.0,,,Cigarettes


In [40]:
assert sh.id.isnull().any() == False # smoking history
# frames_dict['ipfjes_smokinghistory']

In [42]:
assert sh.ever_smoked.isnull().any() == False

AssertionError: 

In [43]:
sh[sh.ever_smoked.isnull()].id.values # missing smoking hx to fix, mostly just haven't been interviewed yet

array([ 789, 1039, 1051, 1054, 1055, 1058])

In [44]:
# 390, 400, 401, 415, 453, 502, 507, 555, 557, 584, no longer wishes to take part or no response - why is removal reason != null ?
# 596, 599, 601 has smoking history what's going on? (prob not interviewed before but now are)


In [45]:
sh.start_smoking_age.fillna('0').map(lambda x: len(x)).value_counts() # start smoking age should be sensible

2    686
1    279
0      1
Name: start_smoking_age, dtype: int64

In [46]:
sh[sh.start_smoking_age.fillna('0').map(lambda x: len(x)) ==0]

Unnamed: 0,id,created,updated,consistency_token,ever_smoked,current_smoker,start_smoking_age,stop_smoking_age,cigarettes_per_day,created_by_id,episode_id,updated_by_id,smoking_notes,smoking_type_other,what_do_you_smoke
85,79,,2019-11-08 21:50:38.975548+00:00,c215c4f7,No,No,,,,,79,4.0,,,


In [47]:
sh.to_csv("smokey.csv")

In [48]:
sh[(sh.cigarettes_per_day.notnull()) & (sh.cigarettes_per_day.str.contains('-'))]

Unnamed: 0,id,created,updated,consistency_token,ever_smoked,current_smoker,start_smoking_age,stop_smoking_age,cigarettes_per_day,created_by_id,episode_id,updated_by_id,smoking_notes,smoking_type_other,what_do_you_smoke


In [49]:
sh.stop_smoking_age.fillna('0').map(lambda x: len(x)).value_counts() # ditto stop smoking age should be sensible

2    660
1    305
0      1
Name: stop_smoking_age, dtype: int64

In [50]:
sh[sh.stop_smoking_age.fillna('0').map(lambda x: len(x)) == 0]

Unnamed: 0,id,created,updated,consistency_token,ever_smoked,current_smoker,start_smoking_age,stop_smoking_age,cigarettes_per_day,created_by_id,episode_id,updated_by_id,smoking_notes,smoking_type_other,what_do_you_smoke
85,79,,2019-11-08 21:50:38.975548+00:00,c215c4f7,No,No,,,,,79,4.0,,,


In [51]:
sh[sh.ever_smoked.isnull()] # check that this maps our home screen

Unnamed: 0,id,created,updated,consistency_token,ever_smoked,current_smoker,start_smoking_age,stop_smoking_age,cigarettes_per_day,created_by_id,episode_id,updated_by_id,smoking_notes,smoking_type_other,what_do_you_smoke
751,789,,,,,,,,,,789,,,,
979,1039,,,,,,,,,,1039,,,,
1013,1051,,,,,,,,,,1051,,,,
1015,1054,,,,,,,,,,1054,,,,
1016,1055,,,,,,,,,,1055,,,,
1017,1058,,,,,,,,,,1058,,,,


In [52]:
len(sh[sh.ever_smoked.isnull()])

6

# lets identify people who haven't been interviewed yet. given we're checking and fixing folk who lack a smoking hx. we can assume that we map those without interview.

In [53]:
notepisodesyet = sh[sh.ever_smoked.isnull()].episode_id.values

In [54]:
nes = np.append(notepisodesyet, notepisodes) # lets not consider any of the episodes that aren't complete or removed

In [55]:
print('total of {} not included, comprised of {} participants who withdrew and {} participants yet to be interviewed \n nb we also have 3 isle of man pts with no long/lat'.format(len(nes), len(notepisodes), len(sh[sh.ever_smoked.isnull()]))) 

total of 73 not included, comprised of 67 participants who withdrew and 6 participants yet to be interviewed 
 nb we also have 3 isle of man pts with no long/lat


In [56]:
with open("notepisodes.txt", "w") as f: # lets save what are not episodes
    for s in list(nes):
        f.write(str(s) +"\n")

In [57]:
aes = frames_dict['ipfjes_asbestosexposurescreening'][~frames_dict['ipfjes_asbestosexposurescreening'].episode_id.isin(nes)].copy()
assert aes.exposed.isnull().any == False # ever exposed 

AssertionError: 

In [58]:
aes[aes.exposed.isnull()] # missing asbestos hx to fix

Unnamed: 0,id,created,updated,consistency_token,exposed,description,created_by_id,episode_id,updated_by_id,related_occupation_id
282,326,2018-02-08 15:13:34.557446+00:00,2019-11-09 21:12:37.771624+00:00,5dd9573e,,,4.0,269,4.0,1155.0
566,638,2018-06-28 12:27:38.984785+01:00,,176dd81b,,,4.0,32,,212.0
567,639,2018-06-28 12:28:17.724315+01:00,,ca3ddecd,,,4.0,49,,239.0
814,890,2018-10-29 14:21:01.014833+00:00,2018-12-12 15:38:17.966310+00:00,78cab084,,,3.0,691,3.0,2832.0
821,899,2018-10-31 11:38:55.058070+00:00,2018-12-12 15:38:17.967661+00:00,1ba416ac,,,4.0,691,3.0,2835.0
976,982,2018-11-21 11:34:37.131855+00:00,2018-12-12 17:40:12.502580+00:00,7d898f92,,,3.0,733,3.0,3071.0
1087,1153,2019-05-09 16:19:20.622773+01:00,,5c3cb851,,,4.0,97,,456.0
1106,1172,2019-05-20 15:01:15.613398+01:00,,229bbdcb,,,3.0,892,,3673.0
1156,1221,2019-06-13 11:36:39.374826+01:00,,39f20774,,,4.0,351,,1532.0
1275,1359,2019-09-12 15:32:44.685464+01:00,,ed31fbbd,,,11.0,1047,,4287.0


In [59]:
# rupa checking 080019

# issue of ambiguity in asbestos exposure screening question at end. if pt has already reported exposure and been screened then we select no. if they havent and are screened yes we complete question including desc of job to link back.

# therefore ever exposed people = those reporting an exposure history (for any job) +/or those answering yes to screening question 

In [60]:
frames_dict['ipfjes_asbestosexposurehistory'].handling_ft.value_counts() # how many cherrie histories we have

                           602
breaking or ripping out     30
Name: handling_ft, dtype: int64

In [61]:
frames_dict['ipfjes_asbestoshandling']

Unnamed: 0,id,name
0,1,lifting boards without damage
1,2,painting boards
2,3,wearing protective gloves or aprons
3,4,unscrewing and carefully removing boards
4,5,machine stitching quilts (yarn only)
5,6,lifting loose fibre and bagging
6,7,machine stitching quilts (cloth & yarn)
7,8,braiding rope
8,9,drilling
9,10,breaking or ripping out


In [62]:
noadesc = frames_dict['ipfjes_asbestosexposurehistory'][frames_dict['ipfjes_asbestosexposurehistory'].description.isnull()]

In [63]:
noadesc[~noadesc.handling_fk_id.isnull()].episode_id.values # no description but do have handling info..

array([428, 276])

In [64]:
noadesc[~noadesc.asbestos_type.isnull()].episode_id.values # no description but do have asbestos type..

array([122, 114, 299, 428, 924, 276])

# issue: we have a lot of blank asbestos exposure descriptions

In [65]:
frames_dict['ipfjes_occupationalhistory'].columns # occ history, needs debugging

Index(['id', 'job_name', 'job_tasks', 'employer_output', 'start_year',
       'end_year', 'address', 'consistency_token', 'created', 'created_by_id',
       'patient_id', 'updated', 'updated_by_id', 'soc_job_fk_id', 'soc_job_ft',
       'av_hours_per_week_if_not_full_time',
       'av_months_per_year_if_not_year_round', 'company_name', 'full_time',
       'working_area_ft', 'year_round', 'working_area_fk_id', 'soc_code_id'],
      dtype='object')

In [66]:
# frames_dict['ipfjes_occupationalhistory'][['patient_id', 'soc_job_ft', 'soc_job_fk_id', 'job_tasks', 'start_year', 'end_year']]

In [67]:
assert frames_dict['ipfjes_studyparticipantdetails'].participant_type.isnull().any() == False # check everyone is a case or control

In [68]:
frames_dict['ipfjes_studyparticipantdetails'][frames_dict['ipfjes_studyparticipantdetails'].participant_type.isnull()]

Unnamed: 0,id,created,updated,consistency_token,participant_type,site_ft,created_by_id,episode_id,site_fk_id,updated_by_id,comments,email_address,postal_address,want_updates


In [69]:
frames_dict['ipfjes_studyparticipantdetails'].participant_type.value_counts()

case       517
control    516
Name: participant_type, dtype: int64

In [70]:
frames_dict['ipfjes_demographics'][(frames_dict['ipfjes_demographics']['surname'] == "Farmery") & (frames_dict['ipfjes_demographics']['first_name'] == "David") ]['contact_details'].values

array(['Grove Park Terrace Surgery 25 Grove Park Terrace, Chiswick, London, W4 3JL'],
      dtype=object)

In [71]:
frames_dict['ipfjes_demographics'][frames_dict['ipfjes_demographics']['contact_details'].isnull()] 
# missing gp addresses

Unnamed: 0,id,created,updated,consistency_token,hospital_number,nhs_number,surname,first_name,middle_name,date_of_birth,...,birth_place_fk_id,created_by_id,ethnicity_fk_id,marital_status_fk_id,patient_id,sex_fk_id,title_fk_id,updated_by_id,contact_details,phone_number
819,1052,,2019-09-23 09:28:38.814674+01:00,de8e9a04,200064,,Sellers,Peter,,1949-12-17,...,,,,,1051,,,4,,7984553986
821,1053,,2019-09-23 09:29:21.893396+01:00,033d93fa,200063,,Elliot,Robert,,1941-12-15,...,,,,,1052,,,4,,2086921669
833,1059,,2019-09-23 09:45:23.710362+01:00,6b609ef1,100104,,Graham,Alan,,1948-07-13,...,,,,,1058,,,4,,7932653419


In [72]:
# not known issue that isle of man postcodes don't have a centroid in ons lookup