In [1]:
# Location of the input table (a .tab.gz file, loaded with pandas further down).
DATA_PATH = "../../data/uniprot-reviewed%3Ayes.tab.gz"
# Largest value of the "Length" column that is kept by the length filter below.
MAX_SEQUENCE_LENGTH = 500

In [2]:
# One-letter codes accepted in a sequence; any other character causes the
# row to be rejected by the non-standard amino acid filter below.
AMINO_ACIDS_SET = set("GAVLIPFYWSTCMNQKRHDE")

# Data Loading

In [3]:
import pandas as pd

In [4]:
# Load the tab-separated table; the first line of the file is the header row.
df = pd.read_csv(
    DATA_PATH,
    sep='\t',
    header=0,
    skipinitialspace=True,
)

In [5]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,Entry,Entry name,Gene names,Length,Sequence,EC number
0,Q754G5,ARP4_ASHGO,ARP4 AFR105C,479,MSNSALQVYGGDEITAVVIDPGSFTTNIGYSGTDCPQAILPSCYGK...,
1,P19108,ARRB_DROMI,Arr2 ArrB,401,MVVSVKVFKKATPNGKVTFYLGRRDFIDHLDYCDPVDGVIVVEPEY...,
2,P36575,ARRC_HUMAN,ARR3 ARRX CAR,388,MSKVFKKTSSNGKLSIYLGKRDFVDHVDTVEPIDGVVLVDPEYLKC...,
3,Q96B67,ARRD3_HUMAN,ARRDC3 KIAA1376,414,MVLGKVKSLTISFDCLNDSNVPVYSSGDTVSGRVNLEVTGEIRVKS...,
4,P63621,ARSC_NEIMA,arsC NMA0252,117,MPEIKIFHNPRCSKSRAALSLLEERGIAAEVVKYLDTPPDLSELKD...,1.20.4.1


In [6]:
# Number of rows in the raw table.
print('Rows count: {}'.format(len(df)))

Rows count: 556568


# Data Preprocessing

## Handling empty values

In [7]:
# Count the missing values in every column of the raw table.
print('Empty values in columns')
df.isnull().sum(axis=0)

Empty values in columns


Entry              0
Entry name         0
Gene names     22953
Length             0
Sequence           0
EC number     293609
dtype: int64

In [8]:
# Discard every row whose "EC number" is missing, then report how many rows were lost.
print('Removing rows that does not have EC number')
df_nonan = df.dropna(subset=['EC number'])
rows_before, rows_after = len(df), len(df_nonan)
dropped_rows_count = rows_before - rows_after
print('Rows count before: {}'.format(rows_before))
print('Rows count after: {}'.format(rows_after))
print('Rows removed: {} ({:0.2f}%)'.format(dropped_rows_count, dropped_rows_count / rows_before * 100))

Removing rows that does not have EC number
Rows count before: 556568
Rows count after: 262959
Rows removed: 293609 (52.75%)


## Handling Non-Standard Amino Acids

In [9]:
# Keep a row only when every character of its sequence is one of the codes in
# AMINO_ACIDS_SET (note: despite its name, df_noamino holds the rows that passed).
only_standard_codes = df_nonan["Sequence"].map(lambda sequence: AMINO_ACIDS_SET.issuperset(set(sequence)))
df_noamino = df_nonan[only_standard_codes]

In [10]:
# Report how many rows the non-standard amino acid filter removed.
rows_before, rows_after = len(df_nonan), len(df_noamino)
dropped_rows_count = rows_before - rows_after
print('Rows count before: {}'.format(rows_before))
print('Rows count after: {}'.format(rows_after))
print('Rows removed: {} ({:0.2f}%)'.format(dropped_rows_count, dropped_rows_count / rows_before * 100))

Rows count before: 262959
Rows count after: 261991
Rows removed: 968 (0.37%)


## Dropping Too Long Sequences

In [11]:
# Keep only the rows whose "Length" does not exceed MAX_SEQUENCE_LENGTH.
# .copy() makes the result an independent DataFrame: new columns are assigned
# to df_maxlength later in this notebook, and assigning into a bare
# boolean-mask selection triggers pandas' SettingWithCopyWarning.
df_maxlength = df_noamino[df_noamino["Length"] <= MAX_SEQUENCE_LENGTH].copy()

In [12]:
# Report how many rows the maximum-length filter removed.
rows_before, rows_after = len(df_noamino), len(df_maxlength)
dropped_rows_count = rows_before - rows_after
print('Rows count before: {}'.format(rows_before))
print('Rows count after: {}'.format(rows_after))
print('Rows removed: {} ({:0.2f}%)'.format(dropped_rows_count, dropped_rows_count / rows_before * 100))

Rows count before: 261991
Rows count after: 205843
Rows removed: 56148 (21.43%)


In [13]:
# Alias for the filtered frame: this binds a second name to the same object
# (no copy is made), so later changes to df_maxlength are visible through it.
# NOTE(review): no later cell in this notebook section reads df_processed;
# the following cells keep using df_maxlength directly — confirm it is needed.
df_processed = df_maxlength

## Adding Multi-Level EC Numbers

In [14]:
def generateLabel(item, index):
    """Return the leading dot-separated components of ``item``.

    Args:
        item: A dot-separated string, e.g. ``"1.20.4.1"``.
        index: How many leading components to keep (used as a slice end, so
            a value larger than the number of components keeps all of them).

    Returns:
        The kept components joined back together with ``"."``,
        e.g. ``generateLabel("1.20.4.1", 2) == "1.20"``.
    """
    components = item.split(".")
    kept = components[:index]
    return ".".join(kept)

In [15]:
# An "EC number" cell may hold several ";"-separated values; only the first one
# is used. Extract it once here instead of re-splitting the whole column on
# every iteration of the loop below.
first_ec_numbers = df_maxlength["EC number"].str.split(";").str[0]

# Level_1 .. Level_4 hold the first 1..4 dot-separated components of that value.
for i in range(1, 5):
    df_maxlength["Level_" + str(i)] = [generateLabel(ec_number, i) for ec_number in first_ec_numbers]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
# Preview the first five rows, now including the Level_1 .. Level_4 columns.
df_maxlength.head(n=5)

Unnamed: 0,Entry,Entry name,Gene names,Length,Sequence,EC number,Level_1,Level_2,Level_3,Level_4
4,P63621,ARSC_NEIMA,arsC NMA0252,117,MPEIKIFHNPRCSKSRAALSLLEERGIAAEVVKYLDTPPDLSELKD...,1.20.4.1,1,1.2,1.20.4,1.20.4.1
5,Q8ENQ5,ARSC_OCEIH,arsC OB2423,139,MSKKIIYFLCTGNSCRSQMAEGWGKKILGEEWDVYSAGIEAHGLNP...,1.20.4.4,1,1.2,1.20.4,1.20.4.4
6,Q2FFW6,ARSC_STAA3,arsC SAUSA300_1719,131,MTKKTIYFICTGNSCRSQMAEGWAKQILADDWNVYSAGIETHGVNP...,1.20.4.4,1,1.2,1.20.4,1.20.4.4
7,Q2YTL3,ARSC_STAAB,arsC SAB1632,131,MTKKTIYFICTGNSCRSQMAEGWAKQILAEDWNVYSAGIETHGVNP...,1.20.4.4,1,1.2,1.20.4,1.20.4.4
8,Q5HF01,ARSC_STAAC,arsC SACOL1824,131,MTKKTIYFICTGNSCRSQMAEGWAKQILADDWNVYSAGIETHGVNP...,1.20.4.4,1,1.2,1.20.4,1.20.4.4


# First level analysis

In [17]:
# Summary statistics of "Length" for every Level_1 class.
df_level1 = df_maxlength.loc[:, ['Level_1', 'Length']]

df_level1.groupby(by='Level_1').describe()

Unnamed: 0_level_0,Length,Length,Length,Length,Length,Length,Length,Length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Level_1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,32142.0,307.944621,105.879715,4.0,228.0,324.0,380.0,500.0
2,76011.0,306.805699,93.772955,5.0,237.0,311.0,375.0,500.0
3,47778.0,277.298757,109.51413,6.0,194.0,266.0,354.0,500.0
4,20313.0,293.681534,109.662534,9.0,200.0,286.0,382.0,500.0
5,12144.0,310.981308,101.114842,9.0,239.0,287.0,419.0,500.0
6,17455.0,376.49556,94.701974,10.0,313.0,404.0,453.0,500.0


# Second Level Analysis

In [18]:
# Two-column view used for the Level_2 statistics.
df_level2 = df_maxlength.loc[:, ['Level_2', 'Length']]

In [19]:
# Keep only the rows of Level_2 classes that contain more than 10000 rows.
df_level2_group = (
    df_level2
    .groupby('Level_2')
    .filter(lambda group: group.shape[0] > 10000)
)

In [20]:
# Summary statistics of "Length" for the retained Level_2 classes.
df_level2_group.groupby(by='Level_2').describe()

Unnamed: 0_level_0,Length,Length,Length,Length,Length,Length,Length,Length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2.1,16370.0,304.575443,78.600895,9.0,246.0,303.0,364.0,500.0
2.7,25785.0,289.60287,103.000477,9.0,212.0,289.0,367.0,500.0
3.1,16181.0,240.133304,98.545559,6.0,157.0,230.0,307.0,500.0
3.6,11339.0,303.761178,112.112808,14.0,207.0,285.0,390.0,500.0
6.3,10030.0,358.018744,106.683177,15.0,282.0,373.0,447.0,500.0


# Third level analysis

In [21]:
# Summary statistics of "Length" for the Level_3 classes that contain
# more than 7000 rows.
df_level3 = df_maxlength.loc[:, ['Level_3', 'Length']]
df_level3_group = (
    df_level3
    .groupby('Level_3')
    .filter(lambda group: group.shape[0] > 7000)
)
df_level3_group.groupby(by='Level_3').describe()

Unnamed: 0_level_0,Length,Length,Length,Length,Length,Length,Length,Length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Level_3,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2.1.1,12267.0,293.226135,80.627205,9.0,238.0,279.0,353.0,500.0
2.3.1,7242.0,319.683927,81.65403,10.0,262.0,335.0,380.0,500.0
2.5.1,7488.0,315.203526,84.263921,15.0,277.0,311.0,392.0,500.0
2.7.1,7093.0,303.488228,87.0433,14.0,256.0,298.0,337.0,500.0
2.7.7,7818.0,279.584165,119.015882,10.0,189.0,258.0,380.0,500.0


# Fourth level analysis

In [51]:
# Length statistics (mean, count, std, max) for the Level_4 classes that
# contain more than 100 rows, ordered by ascending mean length.
df_level4 = df_maxlength.loc[:, ['Level_4', 'Length']]
df_level4_group = (
    df_level4
    .groupby('Level_4')
    .filter(lambda group: group.shape[0] > 100)
)
level4_aggregations = {"Length": ["mean", "count", "std", "max"]}
df_level4_group.groupby('Level_4').agg(level4_aggregations).sort_values(by=[("Length", "mean")])
# NOTE(review): the original cell ended with the bare comment "3.6.1.7" —
# presumably a class of interest in the table above; confirm or remove.

Unnamed: 0_level_0,Length,Length,Length,Length
Unnamed: 0_level_1,mean,count,std,max
Level_4,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1.97.1.12,80.652439,164,5.708238,93
3.6.1.7,93.623616,271,5.626527,120
4.2.1.96,105.253906,256,10.899990,192
3.6.1.31,105.712766,282,14.432039,252
3.5.1.5,109.148014,554,27.659577,308
3.1.1.4,117.848156,461,43.273482,239
2.4.2.2,122.970000,200,86.561037,433
2.7.8.7,129.514530,585,20.034650,344
3.1.26.5,130.956585,691,35.640327,366
5.4.99.62,135.072816,206,4.383955,154


# EC Number file processing

In [3]:
# The EC number file is parsed with '.' as the field separator, without a
# header row, and with every field kept as a string. The last field is read
# into 'ToDelete', which is dropped in the next cell.
columns = ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'Name', 'ToDelete']
ecs = pd.read_csv(
    '../../data/ECNumbers.csv',
    sep='.',
    header=None,
    names=columns,
    dtype=str,
)

In [4]:
# Remove the placeholder 'ToDelete' column. errors='ignore' keeps this cell
# re-runnable: without it, executing the cell a second time raises KeyError
# because the column has already been removed from `ecs`.
ecs = ecs.drop(['ToDelete'], axis=1, errors='ignore')

In [22]:
# Trim leading/trailing whitespace from each of the four level columns.
for level_column in ('Level_1', 'Level_2', 'Level_3', 'Level_4'):
    ecs[level_column] = ecs[level_column].str.strip()

In [23]:
# Build the dotted identifier from the first three levels (Level_4 is not included).
ecs['ECNumber'] = ecs['Level_1'].str.cat([ecs['Level_2'], ecs['Level_3']], sep='.')

In [25]:
# Put the combined identifier first, followed by the level columns and the name.
ordered_columns = ['ECNumber', 'Level_1', 'Level_2', 'Level_3', 'Level_4', 'Name']
ecs = ecs.loc[:, ordered_columns]

In [26]:
# Preview the first ten rows rather than displaying the whole table.
ecs.head(10)

Unnamed: 0,ECNumber,Level_1,Level_2,Level_3,Level_4,Name
0,1.-.-,1,-,-,-,Oxidoreductases
1,1.1.-,1,1,-,-,Acting on the CH-OH group of donors
2,1.1.1,1,1,1,-,With NAD(+) or NADP(+) as acceptor
3,1.1.2,1,1,2,-,With a cytochrome as acceptor
4,1.1.3,1,1,3,-,With oxygen as acceptor
5,1.1.4,1,1,4,-,With a disulfide as acceptor
6,1.1.5,1,1,5,-,With a quinone or similar compound as acceptor
7,1.1.9,1,1,9,-,With a copper protein as acceptor
8,1.1.98,1,1,98,-,"With other, known, acceptors"
9,1.1.99,1,1,99,-,With other acceptors


In [32]:
# Write the cleaned table, tab-separated with a header row and without the
# index, to 'ECNumbers.csv' in the current working directory (a different
# path from the '../../data/ECNumbers.csv' file that was read above).
ecs.to_csv(
    'ECNumbers.csv',
    sep='\t',
    header=True,
    index=False,
)