Source of original data: [https://genomics.senescence.info](https://genomics.senescence.info/longevity/)<br />
*(LongevityMap build 3, release date: 2017 June 24, number of genes: 884)*

In [1]:
import pandas as pd

In [2]:
# Load the raw LongevityMap export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='longevity genes -original.csv')

# Show the (rows, columns) size followed by a preview of the first five records.
print("Dimension:", df.shape)
df.head(n=5)

Dimension: (550, 7)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed,Unnamed: 6
0,1,non-significant,Dutch,HLA-B40,HLA-B,1859103,
1,2,non-significant,Dutch,HLA-DRB5,HLA-DRB5,1859103,
2,3,non-significant,Finnish,APOB,APOB,8018664,
3,4,significant,Finnish,APOC3,APOC3,8018664,
4,5,significant,Finnish,E2/E3/E4,APOE,8018664,


In [3]:
# The last column of the DataFrame ('Unnamed: 6') is expected to contain only NaN values.
# Verify this and drop the column as unnecessary.
# The membership guard keeps the cell re-runnable: once the column has been dropped,
# executing the cell again is a no-op instead of failing with a KeyError.
if 'Unnamed: 6' in df.columns:
    assert df['Unnamed: 6'].isna().all(), "In the column 'Unnamed: 6' there is at least 1 record with value other than Nan"
    # Rebind to a new frame rather than deleting the column from the existing object in place.
    df = df.drop(columns='Unnamed: 6')

print("Dimension:", df.shape)
df.head()

Dimension: (550, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
0,1,non-significant,Dutch,HLA-B40,HLA-B,1859103
1,2,non-significant,Dutch,HLA-DRB5,HLA-DRB5,1859103
2,3,non-significant,Finnish,APOB,APOB,8018664
3,4,significant,Finnish,APOC3,APOC3,8018664
4,5,significant,Finnish,E2/E3/E4,APOE,8018664


In [4]:
# Records with a missing value in the column 'Variant(s)' are set aside as their own DataFrame.
df_var_nan = df.loc[df['Variant(s)'].isna()]

print("Dimension:", df_var_nan.shape)
df_var_nan.head(n=5)

Dimension: (16, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
255,258,significant,American (Caucasian),,TP53,20824210
256,261,non-significant,American (Caucasian),,TP53,20824210
275,280,significant,European,,TP53,23286790
276,281,significant,European,,TP53,23286790
277,282,significant,European,,TP53,23286790


In [5]:
# Records with a value present in the column 'Variant(s)'.
# This is an intermediate DataFrame that is split further in the following cells.
df_var_not_nan = df.loc[df['Variant(s)'].notna()]

print("Dimension:", df_var_not_nan.shape)
df_var_not_nan.head(n=5)

Dimension: (534, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
0,1,non-significant,Dutch,HLA-B40,HLA-B,1859103
1,2,non-significant,Dutch,HLA-DRB5,HLA-DRB5,1859103
2,3,non-significant,Finnish,APOB,APOB,8018664
3,4,significant,Finnish,APOC3,APOC3,8018664
4,5,significant,Finnish,E2/E3/E4,APOE,8018664


In [6]:
# Records whose 'Variant(s)' string does NOT begin with the characters 'rs'.
df_var_not_rs = df_var_not_nan.loc[lambda frame: ~frame['Variant(s)'].str.startswith('rs')]

print("Dimension:", df_var_not_rs.shape)
df_var_not_rs.head(n=5)

Dimension: (322, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
0,1,non-significant,Dutch,HLA-B40,HLA-B,1859103
1,2,non-significant,Dutch,HLA-DRB5,HLA-DRB5,1859103
2,3,non-significant,Finnish,APOB,APOB,8018664
3,4,significant,Finnish,APOC3,APOC3,8018664
4,5,significant,Finnish,E2/E3/E4,APOE,8018664


In [7]:
# Records whose 'Variant(s)' string begins with the characters 'rs'.
df_var_rs = df_var_not_nan.loc[lambda frame: frame['Variant(s)'].str.startswith('rs')]

print("Dimension:", df_var_rs.shape)
df_var_rs.tail(n=5)

Dimension: (212, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
539,G548,non-significant,Danish,"rs1685354,rs647126",UCP3,22743239
540,G549,non-significant,Jordanian,"rs2241766,rs266729",ADIPOQ,20201642
541,G550,non-significant,Italian,"rs6457931,rs1321312,rs4331968,rs9470367,rs6920...","PANDAR,CDKN1A,RAB44",20126416
543,G552,non-significant,Danish,"rs2866164,Q95H",MTTP,16015282
546,556,significant,American (Caucasian),rs1042714,ADRB2,20399803


In [8]:
# Sanity check: the three partitions together must account for every record of the original DataFrame.
assert sum(map(len, (df_var_nan, df_var_rs, df_var_not_rs))) == len(df), "Something wrong with splitting"

<br />


In [9]:
# Count the variants listed in every record of the rs-variants DataFrame:
# a string containing k commas holds k + 1 variants.
nmb_repeats = df_var_rs['Variant(s)'].str.count(',').add(1).tolist()

assert len(nmb_repeats) == len(df_var_rs), "Problem with calculation"
print('Length of list:', len(nmb_repeats))
nmb_repeats[-5:]

Length of list: 212


[2, 2, 31, 2, 1]

In [10]:
## alternative way:
# nmb_repeats_alt = [len(st.split(',')) for st in df_var_rs['Variant(s)']]

## make sure that the two ways give the same result
# assert len(nmb_repeats_alt) == len(nmb_repeats), "Problems with calculation"
# assert nmb_repeats_alt == nmb_repeats, "Problems with calculation (2)"
# nmb_repeats_alt[-5:]

In [11]:
# Repeat each record of 'df_var_rs' once per variant listed in its 'Variant(s)' column
# (the repeat counts come from 'nmb_repeats').
df_var_rs_extended = df_var_rs.loc[df_var_rs.index.repeat(repeats=nmb_repeats)]

print("Dimension:", df_var_rs_extended.shape)
df_var_rs_extended.tail(n=5)

Dimension: (3031, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
541,G550,non-significant,Italian,"rs6457931,rs1321312,rs4331968,rs9470367,rs6920...","PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,"rs6457931,rs1321312,rs4331968,rs9470367,rs6920...","PANDAR,CDKN1A,RAB44",20126416
543,G552,non-significant,Danish,"rs2866164,Q95H",MTTP,16015282
543,G552,non-significant,Danish,"rs2866164,Q95H",MTTP,16015282
546,556,significant,American (Caucasian),rs1042714,ADRB2,20399803


In [12]:
# Lazily flatten the column 'Variant(s)' of 'df_var_rs' into single variants:
# every comma-separated string contributes its pieces in order.
# Note: a generator can be consumed only once.
gen_variants = (
    variant
    for joined_variants in df_var_rs['Variant(s)']
    for variant in joined_variants.split(',')
)
gen_variants

<generator object <genexpr> at 0x00000230AD0B0900>

In [13]:
## Look at result (warning: the generator will be exhausted)
# ls_variants = list(gen_variants)
# print('Length of resulting list:', len(ls_variants))
# ls_variants[:5]

In [14]:
## alternative way:
# import itertools
# iter_variants = itertools.chain.from_iterable(ls.split(',') for ls in df_var_rs['Variant(s)'])
# print(type(iter_variants))

## look at the result (warning: the iterator will be exhausted) and make sure that the two ways give the same result
# ls_variants_alt = list(iter_variants)

# assert len(ls_variants_alt) == len(ls_variants), "Problem with calculation"
# print('Length of resulting list:', len(ls_variants_alt))

# assert ls_variants_alt == ls_variants, "Problem with calculation (2)"
# ls_variants_alt[:5]

In [15]:
# Replace the comma-separated string in the column 'Variant(s)' of every repeated record
# with the single variant that belongs to that row.
# The flat list is rebuilt from 'df_var_rs' here instead of consuming the one-shot generator
# 'gen_variants': an exhausted generator (after re-running this cell, or after inspecting it
# with list(...)) would yield an empty list and the assignment would fail.
split_variants = [el for ls in df_var_rs['Variant(s)'] for el in ls.split(',')]
assert len(split_variants) == len(df_var_rs_extended), "Number of split variants differs from number of repeated records"
df_var_rs_extended['Variant(s)'] = split_variants

print("Dimension:", df_var_rs_extended.shape)
df_var_rs_extended.tail()

Dimension: (3031, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
541,G550,non-significant,Italian,rs6457940,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs2145047,"PANDAR,CDKN1A,RAB44",20126416
543,G552,non-significant,Danish,rs2866164,MTTP,16015282
543,G552,non-significant,Danish,Q95H,MTTP,16015282
546,556,significant,American (Caucasian),rs1042714,ADRB2,20399803


<br />


In [16]:
# Stack the three partitions (split rs-variants, non-rs variants, missing variants) into one DataFrame.
df_result = pd.concat(objs=[df_var_rs_extended, df_var_not_rs, df_var_nan])

print("Dimension:", df_result.shape)
df_result.iloc[3000:3060]

Dimension: (3369, 6)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed
541,G550,non-significant,Italian,rs9470367,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs6920453,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs9462209,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs4713999,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs1321309,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs4711459,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs4714003,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs10947623,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs12192827,"PANDAR,CDKN1A,RAB44",20126416
541,G550,non-significant,Italian,rs12192877,"PANDAR,CDKN1A,RAB44",20126416


In [17]:
# Write the combined DataFrame to a CSV file, leaving out the row index.
df_result.to_csv(path_or_buf='longevity genes -splitted.csv', index=False)
print('Done')

Done
