# BEFORE RUNNING, TAKE A LOOK AT THE README: it contains important information regarding protein sequence mappings

## Use the gene names to parse protein groups tables for the protein identifier. Take the matching gene and corresponding protein name to search for protein sequence

Number of isoforms detected in AM protein groups 2 <br>
Tmpo <br>
Tor1aip2 <br>

Number of isoforms detected in AM protein groups 1<br>
Tmpo <br>



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from collections import Counter

In [2]:
# Read only the first column of each covariance-input table and use the
# 'Gene.names' column as the index of the resulting frame.
proteomics_AM_GeneNames = pd.read_csv(
    '../../Data/RawData/PPG_AM_proteomics_covariance_input_0419.tsv',
    sep='\t',
    usecols=[0],
).set_index('Gene.names')
proteomics_AT_GeneNames = pd.read_csv(
    '../../Data/RawData/PPG_AT2_proteomics_covariance_input_0419.tsv',
    sep='\t',
    usecols=[0],
).set_index('Gene.names')

In [3]:
# Show the (rows, columns) dimensions of the AM gene-name table.
proteomics_AM_GeneNames.shape

(4017, 0)

### Note: collecting the gene names into a set does not give a reproducible ordering of the gene names

In [6]:
# Set of all unique gene names found in the AM and AT2 proteomics tables
# (AM names are inserted first, then AT2 names).
GeneNames = {*proteomics_AM_GeneNames.index, *proteomics_AT_GeneNames.index}

In [7]:
# Read the first three columns of each imputed table and index the rows by
# the 'Gene.names' column.
AM_imputed = pd.read_csv(
    '../../Data/RawData/PPG_AM_main_imputed.tsv',
    sep='\t',
    usecols=[0, 1, 2],
).set_index('Gene.names')
AT2_imputed = pd.read_csv(
    '../../Data/RawData/PPG_AT2_main_imputed.tsv',
    sep='\t',
    usecols=[0, 1, 2],
).set_index('Gene.names')

### For the gene names found in the proteomics table extract protein names from imputed, check for differences in majority protein

In [8]:
# Gene names present both in an imputed table and in the covariance-input gene set.
am_imputed_genes = set(AM_imputed.index)
at2_imputed_genes = set(AT2_imputed.index)
am_genes = list(am_imputed_genes & GeneNames)
at2_genes = list(at2_imputed_genes & GeneNames)

In [9]:
def _leading_majority_protein(imputed, genes):
    """Return a copy of the rows for *genes* with only the first ';'-separated majority protein ID kept."""
    subset = imputed.loc[genes, :].copy()
    id_lists = subset['Majority.protein.IDs'].str.split(';')
    subset['Majority.protein.IDs'] = id_lists.str.get(0)
    return subset


am_mappings = _leading_majority_protein(AM_imputed, am_genes)

at2_mappings = _leading_majority_protein(AT2_imputed, at2_genes)

### shows if there is a different majority protein in the first position between am and at2 cells

In [10]:
# Print every gene shared by both tables whose leading majority protein ID
# differs between the AM and AT2 mappings.
for gene in set(am_mappings.index) & set(at2_mappings.index):
    am_protein = am_mappings.loc[gene, 'Majority.protein.IDs']
    at2_protein = at2_mappings.loc[gene, 'Majority.protein.IDs']
    if am_protein != at2_protein:
        # One value per line, followed by the same blank-line spacer as before.
        print(gene, am_protein, at2_protein, '\n', sep='\n')

Igkv3-2
P01654
P03977




## List below to grab fasta files from uniprot, check that the new list that is generated matches the old one (most likely not).

In [11]:
# Union of the leading majority protein IDs from both cell types.
am_protein_ids = set(am_mappings['Majority.protein.IDs'])
at2_protein_ids = set(at2_mappings['Majority.protein.IDs'])
uniprot_names = list(am_protein_ids | at2_protein_ids)

In [12]:
# with open("../../Data/ProcessedData/uniprot.txt","w+")  as fp:
#     for name in uniprot_names:
#         fp.write(name+'\n')

#### Just use the existing file to avoid the confusion noted above

In [13]:
# Load the saved protein ID list, one ID per line.
with open("../../Data/ProcessedData/uniprot.txt", "r") as names_file:
    saved_names = names_file.read()
uniprot_names = saved_names.splitlines()

### Match the sequences in the FASTA file to the corresponding protein names

In [14]:
# Load the FASTA file as a list of lines (header lines and sequence lines).
# Read-only mode: the raw data file is never modified here, so write access
# to it is not required.
with open("../../Data/RawData/protein_seq.fasta", "r") as fp:
    sequences = fp.read().splitlines()

In [15]:
import warnings

# Build {protein ID: sequence} by pairing FASTA records with uniprot_names
# purely by position: the k-th header line (a line starting with '>') is taken
# to belong to uniprot_names[k]. This is only correct when the FASTA file lists
# its records in exactly the same order as uniprot_names.
sequence_mappings = {protein: [] for protein in uniprot_names}  # ID -> list of sequence lines
counter = -1  # index of the current FASTA record; -1 until the first header is seen
mismatched_ids = []  # (expected ID, ID found in header) pairs, reported after the loop

for line in sequences:
    if line.startswith('>'):
        counter += 1
        if counter >= len(uniprot_names):
            raise ValueError(
                'FASTA file contains more records than the {} protein IDs available; '
                'positional mapping is not possible'.format(len(uniprot_names))
            )
        # NOTE(review): assumes UniProt-style headers ('>db|ACCESSION|ENTRY ...');
        # headers in any other layout are simply not cross-checked -- confirm format.
        header_fields = line[1:].split('|')
        if len(header_fields) >= 3 and header_fields[1] != uniprot_names[counter]:
            mismatched_ids.append((uniprot_names[counter], header_fields[1]))
        continue
    if counter < 0:
        # Lines before the first header belong to no record; skipping them
        # (instead of blindly dropping line 0) keeps the positions aligned.
        continue
    sequence_mappings[uniprot_names[counter]].append(line)

# Surface alignment problems instead of silently assigning wrong sequences.
if counter + 1 < len(uniprot_names):
    warnings.warn(
        'FASTA file has {} records but {} protein IDs were given; '
        'the remaining IDs get an empty sequence'.format(counter + 1, len(uniprot_names))
    )
if mismatched_ids:
    warnings.warn(
        '{} FASTA headers do not match the protein ID at the same position '
        '(first mismatch: expected {}, found {})'.format(
            len(mismatched_ids), mismatched_ids[0][0], mismatched_ids[0][1]
        )
    )

# Concatenate each list of sequence lines into a single string.
sequence_mappings = {
    protein: ''.join(chunks) for protein, chunks in sequence_mappings.items()
}

In [16]:
# Look up each leading majority protein ID in sequence_mappings and store the
# result in a new 'ProteinSequence' column of both mapping tables.
for mapping_table in (at2_mappings, am_mappings):
    protein_ids = mapping_table['Majority.protein.IDs']
    mapping_table['ProteinSequence'] = protein_ids.map(sequence_mappings)

In [17]:
# Store the character length of each sequence in a new 'ProteinLength' column.
for mapping_table in (at2_mappings, am_mappings):
    protein_sequences = mapping_table['ProteinSequence']
    mapping_table['ProteinLength'] = protein_sequences.str.len()

In [18]:
# Preview the first rows of the AT2 mapping table, including the new sequence columns.
at2_mappings.head()

Unnamed: 0_level_0,Protein,Majority.protein.IDs,ProteinSequence,ProteinLength
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Agpat3,Q9D517,Q9D517,MGLLAYLKTQFVVHLLIGFVFVVSGLIINFTQLCTLALWPISKHLY...,376
Rbms1,Q91W59,Q91W59,MGKVWKQQMYPQYATYYYPQYLQAKQSLVPAHPMAPPSPSTTSSNN...,403
Alkbh5,Q3TSG4,Q3TSG4,MAAASGYTDLREKLKSMTSRDNYKAGSREAAAAAAAAVAAAAAAAA...,395
H2-D1,P01899;P01897;P01900;P01896;P01895;P14427,P01899,MGAMAPRTLLLLLAAALAPTQTRAGPHSMRYFETAVSRPGLEEPRY...,362
Prpf4,Q9DAW6,Q9DAW6,MASSRASSTTTKTKAPDDLVAPVVKKPHIYYGSLEEKERERLAKGE...,521


In [19]:
# Spot-check a single gene's row in the AT2 mapping table.
at2_mappings.loc['Thoc1']

Protein                                                            Q8R3N6
Majority.protein.IDs                                               Q8R3N6
ProteinSequence         MSPTPALFSLPEARTRFTKSTREALNNKNIKPLLTAFSQLPGSENE...
ProteinLength                                                         657
Name: Thoc1, dtype: object

In [16]:
# at2_mappings.to_csv('../../Data/ProcessedData/AT2_SeqMappings.tsv',sep='\t')
# am_mappings.to_csv('../../Data/ProcessedData/AM_SeqMappings.tsv',sep='\t')