In [74]:
import pandas as pd
import xlrd
import warnings
warnings.filterwarnings('ignore')

In [33]:
# load subset of IPFJES participants with a genotype and an exposure assessment
df = pd.read_csv('flat_data_genotype_subset.csv')

In [34]:
len(df)

902

In [35]:
df.case.value_counts()

1    464
0    438
Name: case, dtype: int64

In [36]:
# louise says we have 8 plates in the first instance
# to top up another plate we'll have to randomly select additional samples we've not 
# previously selected (to be done as of 2/11/21)

n_of_plates = 8
n_of_wells = n_of_plates * 95 # 1 well of 96 well plate for a control

In [37]:
n_of_wells

760

In [38]:
n_of_wells / 2 # since we want even mix of cases and controls

380.0

In [39]:
# randomly sample 380 cases
cases = df[df['case'] == 1].sample(380)

In [40]:
# randomly sample 380 controls
controls = df[df['case'] == 0].sample(380)

In [41]:
# combine random sample of cases and controls
df2 = pd.concat([cases, controls])

In [42]:
# check ages roughly similar
df2.groupby('case').age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,380.0,74.947368,8.630948,34.0,70.0,76.0,80.0,96.0
1,380.0,77.136842,7.721212,55.0,72.0,77.0,83.0,95.0


In [43]:
# randomly shuffle the dataframe
df2 = df2.sample(frac = 1)

In [44]:
# make index sensible
df2.reset_index(inplace=True)

In [45]:
# our desired output index starts at 1
df2['sample_index'] = df2.index + 1

In [46]:
# 8 plates means 8 bins so put the samples in that and label accordingly
df2['Plate ID'] = pd.cut(df2.sample_index, bins=8, labels=[str(i) for i in range(1,9)], precision=0)

In [47]:
# check we did it right
df2['Plate ID'].value_counts()

8    95
7    95
6    95
5    95
4    95
3    95
2    95
1    95
Name: Plate ID, dtype: int64

In [48]:
# add project column as requested in guidance provided
df2['Project'] = 'IPFJES'

In [49]:
# formatting for spreadsheets
df3 = df2[['Plate ID', 'participant_id', 'Project']]

In [50]:
# cleaning dirty datas
# dna_locations.xls is a spreadsheet of sample locations that was made manually...
gt = pd.read_excel('dna_locations.xls',header=0)
gt['old_participant_id'] = gt['participant_id']
gt[gt.participant_id.duplicated(keep=False)].sort_values(by='participant_id')
gt = gt.drop_duplicates(subset='participant_id')
gt.participant_id = gt.participant_id.astype(str)
gt.participant_id = gt.participant_id.str.replace('-1','')
gt.participant_id = gt.participant_id.str.replace('-2','')
gt.participant_id = gt.participant_id.str.replace('-3','')
gt.participant_id = gt.participant_id.str.replace('DR|SF|WP|RN|PB|GW|-| ','')
gt.loc[gt['participant_id'] == '08003', 'participant_id'] = '080003'
gt.loc[gt['participant_id'] == '08005', 'participant_id'] = '080005'
gt.loc[gt['participant_id'] == '12004', 'participant_id'] = '120004'
gt.loc[gt['participant_id'] == '12006', 'participant_id'] = '120006'
gt.loc[gt['participant_id'] == '19006', 'participant_id'] = '190006'
gt.loc[gt['participant_id'] == '16018', 'participant_id'] = '160018'
gt.loc[gt['participant_id'] == '19007', 'participant_id'] = '190007'
gt.loc[gt['participant_id'] == '10004B', 'participant_id'] = '100004'
gt.loc[gt['participant_id'].map(len) == 7, 'participant_id'] = gt.loc[gt['participant_id'].map(len) == 7, 'participant_id'].str.replace('000','00')
gt.loc[gt.participant_id.map(len) == 5, 'participant_id'] = '0' + gt.loc[gt.participant_id.map(len) == 5, 'participant_id']
gt.participant_id = gt.participant_id.str.replace('040150','040015')
gt = gt.drop_duplicates(subset='participant_id')

In [51]:
# cleaning dirty datas (specifically addressing the fact that the preceding '0' is lost when strings are incorrectly saved as numbers)
df3.participant_id = df.participant_id.astype(str)
df3.loc[df3.participant_id.map(len) == 5, 'participant_id'] = '0' + df3.loc[df3.participant_id.map(len) == 5, 'participant_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [52]:
# samples I have genotyping and box location for
# but are apparently missing from my spreadsheet of dna locations
# which is a bit of a mess. sources of truth are log books, dna quality control data, genotyping results, and location data.
# we have a genotype for all of these so we must have a sample

set(df3.participant_id) - set(gt.participant_id)

"""
{'040041',
 '050038',
 '050043',
 '080003',
 '080004',
 '080006',
 '090044',
 '100052',
 '160042',
 '190004',
 '200023'}
"""



"\n{'040041',\n '050038',\n '050043',\n '080003',\n '080004',\n '080006',\n '090044',\n '100052',\n '160042',\n '190004',\n '200023'}\n"

In [53]:
# lets merge our study dataframe with our dna locations to add location data
df4 = pd.merge(df3, gt, on='participant_id', how='left')

In [54]:
# separately we have data on genomic content of samples
# herein we use that together with some calculations we did elsewhere
# to suggest what's needed to make 100ng/ul working stock that we can then use
# to plate samples to desired 25ul of 10ng/ml concentration sample in each well

df5 = pd.read_excel('carldata_kenny.xls')

In [55]:
# unfortunately genomic content of sample data also had a manually entered sample ID...

df5['participant_id'] = df5['Sample ID']
df5 = df5[df5['Genomic DNA'] > 0]
df5 = df5.drop_duplicates(subset='participant_id')
df5.participant_id = df5.participant_id.astype(str)
df5.participant_id = df5.participant_id.str.replace('-1','')
df5.participant_id = df5.participant_id.str.replace('-2','')
df5.participant_id = df5.participant_id.str.replace('-3','')
df5.participant_id = df5.participant_id.str.replace('DR|SF|WP|RN|PB|GW|-| ','')
df5.loc[df5['participant_id'] == '08003', 'participant_id'] = '080003'
df5.loc[df5['participant_id'] == '08005', 'participant_id'] = '080005'
df5.loc[df5['participant_id'] == '12004', 'participant_id'] = '120004'
df5.loc[df5['participant_id'] == '12006', 'participant_id'] = '120006'
df5.loc[df5['participant_id'] == '19006', 'participant_id'] = '190006'
df5.loc[df5['participant_id'] == '16018', 'participant_id'] = '160018'
df5.loc[df5['participant_id'] == '19007', 'participant_id'] = '190007'
df5.loc[df5['participant_id'] == '10004B', 'participant_id'] = '100004'
df5.loc[df5['participant_id'].map(len) == 7, 'participant_id'] = df5.loc[df5['participant_id'].map(len) == 7, 'participant_id'].str.replace('000','00')
df5.loc[df5.participant_id.map(len) == 5, 'participant_id'] = '0' + df5.loc[df5.participant_id.map(len) == 5, 'participant_id']
df5 = df5.drop_duplicates(subset='participant_id')

In [56]:
# lets combine with our earlier work to add details of how to make working stock

df6 = pd.merge(df4, df5, on='participant_id', how='left')

In [None]:
# lets add one other source of 'truth' for the lols...


In [61]:
df7 = pd.read_excel('genotyping_additional_information.xlsx')

In [64]:
df7

Unnamed: 0,audrey_participant_id,result,location
0,010082-1,Heterozygous,
1,040067-1,Homozygous Allele 1,Box Eight
2,200024-1,Heterozygous,Box Eight
3,200023-1,Heterozygous,Box Eight
4,190009-1,Heterozygous,Box Eight
5,060033-1,Heterozygous,Box Eight
6,160034-1,Heterozygous,Box Eight
7,010056-1,Homozygous Allele 1,Box Eight
8,030062-1,Homozygous Allele 1,Box Eight
9,010055-1,Heterozygous,Box Eight


In [65]:
df7['participant_id'] = df7['audrey_participant_id']
df7 = df7.drop_duplicates(subset='participant_id')
df7.participant_id = df7.participant_id.astype(str)
df7.participant_id = df7.participant_id.str.replace('-1','')
df7.participant_id = df7.participant_id.str.replace('-2','')
df7.participant_id = df7.participant_id.str.replace('-3','')
df7.participant_id = df7.participant_id.str.replace('DR|SF|WP|RN|PB|GW|-| ','')
df7.loc[df7['participant_id'] == '08003', 'participant_id'] = '080003'
df7.loc[df7['participant_id'] == '08005', 'participant_id'] = '080005'
df7.loc[df7['participant_id'] == '12004', 'participant_id'] = '120004'
df7.loc[df7['participant_id'] == '12006', 'participant_id'] = '120006'
df7.loc[df7['participant_id'] == '19006', 'participant_id'] = '190006'
df7.loc[df7['participant_id'] == '16018', 'participant_id'] = '160018'
df7.loc[df7['participant_id'] == '19007', 'participant_id'] = '190007'
df7.loc[df7['participant_id'] == '10004B', 'participant_id'] = '100004'
df7.loc[df7['participant_id'].map(len) == 7, 'participant_id'] = df7.loc[df7['participant_id'].map(len) == 7, 'participant_id'].str.replace('000','00')
df7.loc[df7.participant_id.map(len) == 5, 'participant_id'] = '0' + df7.loc[df7.participant_id.map(len) == 5, 'participant_id']
df7 = df7.drop_duplicates(subset='participant_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the

In [67]:
# lets add 'Audrey result data too..'
df8 = pd.merge(df6, df7, on='participant_id', how='left')

In [72]:
# outcome
# n.b Box 8 (prev missing) is currently in Bill and Miriam lab  GSB I think
# all other boxes are in Paul and Jenny lab EKB
# Plate ID is for Louise Plate
# participant_id and Project self-explanatory 
# Box_number and Location is according to our manually entered location spreadsheet
# old_participant_id shows what we 'cleaned'
# Sample ID manually entered into nanodrop machine, genomic DNA from machine output (we have dumps of raw files so I
# provide other parameters too if necessary), volume for dilution calculations mine from genotyping_prep script
# audrey_participant_id, result, location from another manually entered spreadsheet that a lab assistant did
df8

Unnamed: 0,Plate ID,participant_id,Project,Box_number,Location,old_participant_id,Date,Time,Sample ID,Genomic DNA,diln factor for intermediate concn (100ng/ul),volume of genomic DNA for working concn (100ng/ul),volume of PCR water for working concn (100ng/ul),audrey_participant_id,result,location
0,1,080013,IPFJES,Box 5,C7,0800013-1,2018-08-29,1:01 PM,080013-1,244.29,2.4429,20.467477,29.532523,080013-1,Heterozygous,Box Five
1,1,100061,IPFJES,Box 10,G5,100061-1,2019-07-09,11:53 AM,100061-1,843.30,8.4330,5.929088,44.070912,100061-1,Homozygous Allele 1,Box Ten
2,1,100026,IPFJES,Box 4,G4,100026-1,2018-08-23,10:44 AM,100026-1,213.72,2.1372,23.395096,26.604904,100026-1,Heterozygous,Box Four
3,1,010013,IPFJES,Box 5,D10,010013-1,2018-09-05,1:07 PM,010013-1,441.45,4.4145,11.326311,38.673689,010013-1,Homozygous Allele 1,Box Five
4,1,140002,IPFJES,Poor Quality Box One,C3,140002-,2018-08-01,12:18 PM,140002,46.02,0.4602,108.648414,-58.648414,140002-,Homozygous Allele 1,Poor Quality Box One
5,1,090054,IPFJES,Box 4,G2,090054-1,2018-08-23,10:42 AM,090054-1,365.40,3.6540,13.683634,36.316366,090054-1,Heterozygous,Box Four
6,1,060031,IPFJES,Poor Quality Box Three,I1,060031-1,2019-06-05,1:25 PM,060031-1,83.19,0.8319,60.103378,-10.103378,060031-1,Homozygous Allele 1,Poor Quality Box Three
7,1,090002,IPFJES,Box 8 (prev missing),F6,090002-1,2018-09-13,11:58 AM,090002-1,423.15,4.2315,11.816141,38.183859,,,
8,1,040003,IPFJES,Box 8 (prev missing),I3,040003-1,2018-09-21,11:01 AM,040003-1,211.44,2.1144,23.647370,26.352630,,,
9,1,040095,IPFJES,Box 10,D10,040095-1,2019-07-03,11:45 AM,040095-1,79.62,0.7962,62.798292,-12.798292,040095-1,Homozygous Allele 1,Box Ten


In [73]:
# save the result
df8.to_csv('ipfjes_samples_to_be_plated.csv')