In [69]:
import pandas as pd

In [70]:
# Load the UniProt table from disk. In the preview below, the 'yourlist:...' column
# carries the queried gene symbol (e.g. AGRN), 'Status' marks reviewed/unreviewed
# entries, and 'Length' is the protein length.
uniprot_df = pd.read_csv('uniprot_data.csv')

In [71]:
# Sanity-check the load: confirm the symbol, Status and Length columns are present.
uniprot_df.head()

Unnamed: 0,yourlist:M201912106746803381A1F0E0DB47453E0216320D314B872,Entry,Entry name,Status,Protein names,Gene names,Organism,Length
0,AGRN,A0A087X208,A0A087X208_HUMAN,unreviewed,Agrin,AGRN,Homo sapiens (Human),1930
1,AGRN,A0A494C0G5,A0A494C0G5_HUMAN,unreviewed,Agrin,AGRN,Homo sapiens (Human),1940
2,AGRN,A0A494C1I6,A0A494C1I6_HUMAN,unreviewed,Agrin,AGRN,Homo sapiens (Human),1963
3,AGRN,O00468,AGRIN_HUMAN,reviewed,Agrin [Cleaved into: Agrin N-terminal 110 kDa ...,AGRN AGRIN,Homo sapiens (Human),2068
4,AGRN,Q15952,Q15952_HUMAN,unreviewed,Agrin (Fragment),AGRN,Homo sapiens (Human),62


In [72]:
# Keep only the rows whose Status is 'reviewed' (treated here as the canonical entries).
clean_uniprot = uniprot_df.loc[uniprot_df['Status'] == 'reviewed']

# Reduce to one row per gene symbol:
#   1. keep just the queried-symbol column and the protein Length,
#   2. rename the queried-symbol column to SYMBOL so it can be joined later,
#   3. average Length over all reviewed entries that share a symbol,
#   4. turn SYMBOL back into a regular column.
length_df = (
    clean_uniprot[['yourlist:M201912106746803381A1F0E0DB47453E0216320D314B872', 'Length']]
    .rename(columns={'yourlist:M201912106746803381A1F0E0DB47453E0216320D314B872': 'SYMBOL'})
    .groupby('SYMBOL')
    .mean()
    .reset_index()
)

In [73]:
# Check the aggregated table: one row per SYMBOL, with no missing Length values expected.
length_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2321 entries, 0 to 2320
Data columns (total 2 columns):
SYMBOL    2321 non-null object
Length    2321 non-null float64
dtypes: float64(1), object(1)
memory usage: 36.4+ KB


In [74]:
# Read in the cleaned variant data set.
# NOTE(review): the stray "interactivity=..., result=result)" text in the saved output
# of this cell looks like the tail of a warning raised during this read (possibly a
# pandas mixed-dtype DtypeWarning) - confirm, and consider passing explicit dtypes.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV was
# written with its index - confirm whether that column should be dropped.
df = pd.read_csv('data_cleaned1.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [75]:
# Preview the variant table before joining the protein lengths onto it.
df.head()

Unnamed: 0,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,CLNDNINCL,...,INT,EX,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,BIOTYPE_misc_RNA,BIOTYPE_protein_coding,BIOTYPE_unknown
0,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,0,1,0,0,0,0,1,0,1,0
1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,,...,0,1,0,0,0,0,1,0,1,0
2,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,0,1,0,0,0,0,1,0,1,0
3,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,0,1,0,0,0,0,1,0,1,0
4,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,,...,0,1,0,0,0,0,1,0,1,0


In [76]:
# Attach the per-symbol mean protein Length to every variant row.
# A left join keeps every row of df; symbols with no match in length_df get NaN.
# length_df is grouped by SYMBOL, so each key appears at most once on the right
# and the join cannot duplicate variant rows.
new_df = pd.merge(df, length_df, how='left', on='SYMBOL')

In [77]:
# Confirm the join added a trailing Length column.
new_df.head()

Unnamed: 0,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,CLNDNINCL,...,EX,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,BIOTYPE_misc_RNA,BIOTYPE_protein_coding,BIOTYPE_unknown,Length
0,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,1,0,0,0,0,1,0,1,0,2068.0
1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,,...,1,0,0,0,0,1,0,1,0,2068.0
2,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,1,0,0,0,0,1,0,1,0,2068.0
3,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,1,0,0,0,0,1,0,1,0,2068.0
4,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,,...,1,0,0,0,0,1,0,1,0,2068.0


In [78]:
# Force Protein_position to a numeric dtype; any entry that cannot be parsed as a
# number becomes NaN (errors='coerce').
new_df['Protein_position'] = pd.to_numeric(new_df['Protein_position'], errors='coerce')

# Relative location of the variant within the protein = Protein_position / Length.
# No imputation happens here: rows with a NaN position or a NaN Length (no
# match in the join) produce a NaN Relative_Location.
new_df['Relative_Location'] = new_df['Protein_position'] / new_df['Length']


In [79]:
# Relative_Location is NaN wherever Protein_position or Length was missing;
# replace those NaNs with 0.
# Assign the filled column back explicitly instead of calling
# fillna(..., inplace=True) on the column accessor: that chained in-place form
# raises a FutureWarning on recent pandas and, with Copy-on-Write enabled,
# leaves new_df unchanged.
new_df['Relative_Location'] = new_df['Relative_Location'].fillna(0)
new_df.head()

Unnamed: 0,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,CLNDNINCL,...,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,BIOTYPE_misc_RNA,BIOTYPE_protein_coding,BIOTYPE_unknown,Length,Relative_Location
0,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,0,0,0,0,1,0,1,0,2068.0,0.001934
1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,,...,0,0,0,0,1,0,1,0,2068.0,0.007253
2,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,0,0,0,0,1,0,1,0,2068.0,0.011122
3,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",,...,0,0,0,0,1,0,1,0,2068.0,0.04207
4,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,,...,0,0,0,0,1,0,1,0,2068.0,0.085106


In [80]:
# Save the joined frame (original columns plus Length and Relative_Location) to a new CSV.
# index=False keeps the DataFrame's row index out of the file.
new_df.to_csv('data_cleaned4.csv',index=False)