In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Name of the tab-separated data file; it is looked up in the notebook's working directory.
dataset_name = "rdp_16s_v18.fa.dat"
# os.getcwd() returns the current working directory as a plain string, which is
# what the `%pwd` magic produced, but it also works outside IPython.
directory = os.getcwd()
# os.path.join uses the platform's separator instead of a hard-coded '\\',
# so the notebook also runs on Linux/macOS.
full_path = os.path.join(directory, dataset_name)

In [3]:
# Collect column names from the preamble of the data file: the first line is
# skipped, then the first tab-separated field of each of the next 72 lines is
# kept (presumably one column description per line; verify against the file).
headers = []
with open(full_path) as dat_file:
    dat_file.readline()  # discard the very first line
    headers.extend(dat_file.readline().split('\t')[0] for _ in range(72))

In [4]:
# Overwrite header positions 1..6 with readable taxonomic rank names:
# domain, phylum, class, order, family, genus.
for position, rank_name in enumerate(
    ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus'], start=1
):
    headers[position] = rank_name

In [5]:
# Load the tab-separated table using the column names collected above.
# NOTE(review): skiprows=74 skips the first 74 lines, while the header cell
# reads only 1 + 72 = 73 lines; confirm the preamble length against the file.
frame = pd.read_csv(
    full_path,
    sep="\t",
    names=headers,
    skiprows=74,
)

<h1>Примеры строк из датасета</h1>

In [6]:
# Show five example rows. A fixed random_state makes the sample identical on
# every "Restart & Run All"; change the seed to look at different rows.
frame.sample(5, random_state=42)

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
13979,KR349442,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Celeribacter,0.0131,0.0207,0.0172,...,0.0062,0.0221,0.0096,0.0248,0.009,0.0145,0.0131,0.0096,0.0076,0.5499
4252,EF422411,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae_1,Alkalihalobacillus,0.015,0.019,0.0245,...,0.0095,0.0211,0.0122,0.0252,0.0122,0.0122,0.0088,0.0156,0.0054,0.5482
13586,KJ632968,Bacteria,Proteobacteria,Gammaproteobacteria,Pasteurellales,Pasteurellaceae,Ursidibacter,0.0147,0.017,0.0243,...,0.0096,0.0206,0.0199,0.0258,0.0125,0.0133,0.0081,0.0184,0.0081,0.5169
15584,MN365231,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Sinirhodobacter,0.0137,0.0173,0.0195,...,0.0087,0.0202,0.0115,0.0252,0.0087,0.0115,0.0137,0.0108,0.0043,0.5522
1728,AB327250,Bacteria,Actinobacteria,Actinobacteria,Catenulisporales,Catenulisporaceae,Catenulispora,0.0116,0.0191,0.0198,...,0.0055,0.0136,0.0177,0.0279,0.013,0.0061,0.0102,0.0123,0.0041,0.5936


<h1>Количество элементов в каждой группе микроорганизмов</h1>

In [7]:
# Group the records by (Phylum, Class, Order, Family) and count the rows in
# each group; the counts land in a one-column frame named 'Elements'.
by_domain = frame.groupby(['Phylum', 'Class', 'Order', 'Family'])
by_domain_count = by_domain.size().to_frame(name='Elements')
by_domain_count

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Elements
Phylum,Class,Order,Family,Unnamed: 4_level_1
Abditibacteriota,Abditibacteria,Abditibacteriales,Abditibacteriaceae,1
Acetothermia,Acetothermia_genera_incertae_sedis,_,_,1
Acidobacteria,Acidobacteria_Gp1,Acidicapsa,_,6
Acidobacteria,Acidobacteria_Gp1,Acidipila,_,2
Acidobacteria,Acidobacteria_Gp1,Acidobacterium,_,3
...,...,...,...,...
Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR18,_,_,1
Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR20,_,_,1
candidate_division_WPS-1,WPS-1_genera_incertae_sedis,_,_,2
candidate_division_WPS-2,WPS-2_genera_incertae_sedis,_,_,1


<h1>Статистические характеристики датасета</h1>

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the group sizes
# stored in the 'Elements' column.
by_domain_count.describe()

Unnamed: 0,Elements
count,683.0
mean,31.030747
std,82.819738
min,1.0
25%,2.0
50%,6.0
75%,21.0
max,1135.0


<h1>По умолчанию мы обрезаем все группы, количество элементов в которых меньше среднего значения</h1>
<h2>Если вы хотите выбрать другой порог — измените значение переменной reduce_factor, расположенной ниже</h2>

In [9]:
# Default cut-off for group size: the mean number of elements per group.
# Assign a different number here to use another threshold.
reduce_factor = by_domain_count.loc[:, 'Elements'].mean()
reduce_factor

31.0307467057101

In [10]:
# Keep only the groups whose size is at least reduce_factor.
keep_mask = by_domain_count['Elements'] >= reduce_factor
reduce_groups = by_domain_count[keep_mask]
reduce_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Elements
Phylum,Class,Order,Family,Unnamed: 4_level_1
Acidobacteria,Acidobacteria_Gp1,Gp1,_,38
Acidobacteria,Acidobacteria_Gp3,Gp3,_,34
Acidobacteria,Acidobacteria_Gp6,Gp6,_,57
Actinobacteria,Actinobacteria,Actinomycetales,Actinomycetaceae,96
Actinobacteria,Actinobacteria,Bifidobacteriales,Bifidobacteriaceae,111
...,...,...,...,...
Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,212
Spirochaetes,Spirochaetia,Spirochaetales,Spirochaetaceae,93
Tenericutes,Mollicutes,Entomoplasmatales,Spiroplasmataceae,50
Tenericutes,Mollicutes,Mycoplasmoidales,Metamycoplasmataceae,122


In [13]:
# Export the per-group sizes to groups.xlsx in the working directory.
# os.path.join keeps the path portable (no hard-coded '\\'), and sheet_name is
# passed by keyword because newer pandas deprecates passing it positionally.
by_domain_count.to_excel(os.path.join(directory, "groups.xlsx"), sheet_name='Sheet1')

pandas.core.frame.DataFrame

<h1>Строки датасета из групп, прошедших отбор по reduce_factor</h1>

In [12]:
# Keep the rows of `frame` whose (Phylum, Class, Order, Family) group has at
# least reduce_factor members. `>=` matches the comparison used to build
# reduce_groups, so both views agree even when reduce_factor is set to an
# integer that equals some group's size.
frame.groupby(['Phylum', 'Class', 'Order', 'Family']).filter(
    lambda group: len(group) >= reduce_factor
)

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
0,EF599163,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,0.0150,0.0176,0.0209,...,0.0052,0.0202,0.0183,0.0248,0.0144,0.0104,0.0078,0.0150,0.0065,0.5394
1,AY859683,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Mycobacteriaceae,Mycobacterium,0.0101,0.0176,0.0196,...,0.0054,0.0142,0.0176,0.0318,0.0149,0.0061,0.0095,0.0142,0.0054,0.5796
2,AY883036,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,0.0095,0.0154,0.0212,...,0.0066,0.0220,0.0139,0.0308,0.0190,0.0088,0.0088,0.0147,0.0073,0.5545
3,DQ656489,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Aquincola,0.0152,0.0179,0.0214,...,0.0062,0.0186,0.0200,0.0255,0.0117,0.0097,0.0055,0.0131,0.0062,0.5686
4,D85479,Bacteria,Actinobacteria,Actinobacteria,Micromonosporales,Micromonosporaceae,Couchioplanes,0.0136,0.0170,0.0218,...,0.0068,0.0143,0.0150,0.0252,0.0116,0.0061,0.0102,0.0116,0.0054,0.6014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21188,NR_074344,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Corynebacteriaceae,Corynebacterium,0.0107,0.0168,0.0195,...,0.0060,0.0141,0.0174,0.0322,0.0154,0.0087,0.0107,0.0154,0.0080,0.5680
21189,NR_074339,Bacteria,Bacteroidetes,Cytophagia,Cytophagales,Cytophagaceae,Runella,0.0127,0.0161,0.0268,...,0.0054,0.0174,0.0188,0.0255,0.0114,0.0121,0.0074,0.0141,0.0107,0.5224
21191,NR_074244,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,0.0102,0.0163,0.0197,...,0.0068,0.0183,0.0142,0.0299,0.0075,0.0102,0.0088,0.0129,0.0081,0.5752
21192,NR_074234,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,0.0136,0.0197,0.0204,...,0.0054,0.0244,0.0177,0.0143,0.0102,0.0136,0.0102,0.0163,0.0075,0.5302


In [22]:
# Flat table of group sizes: one row per (Phylum, Class, Order, Family) with
# the number of non-null 'ID' values in a 'count' column.
temp = (
    frame
    .groupby(['Phylum', 'Class', 'Order', 'Family'])['ID']
    .count()
    .reset_index(name='count')
)
temp

Unnamed: 0,Phylum,Class,Order,Family,count
0,Abditibacteriota,Abditibacteria,Abditibacteriales,Abditibacteriaceae,1
1,Acetothermia,Acetothermia_genera_incertae_sedis,_,_,1
2,Acidobacteria,Acidobacteria_Gp1,Acidicapsa,_,6
3,Acidobacteria,Acidobacteria_Gp1,Acidipila,_,2
4,Acidobacteria,Acidobacteria_Gp1,Acidobacterium,_,3
...,...,...,...,...,...
678,Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR18,_,_,1
679,Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR20,_,_,1
680,candidate_division_WPS-1,WPS-1_genera_incertae_sedis,_,_,2
681,candidate_division_WPS-2,WPS-2_genera_incertae_sedis,_,_,1


In [19]:
# Export the flat group-size table to groups.xlsx in the working directory.
# os.path.join keeps the path portable (no hard-coded '\\'), and sheet_name is
# passed by keyword because newer pandas deprecates passing it positionally.
# NOTE(review): to_excel given a plain path writes a fresh workbook, so any
# existing groups.xlsx at this location is replaced; confirm that is intended.
temp.to_excel(os.path.join(directory, "groups.xlsx"), sheet_name='All_groups')

In [None]:
# NOTE(review): the bare name `f` is not defined in any visible cell, and this
# cell was never executed; it looks like a leftover scratch cell. Complete or
# delete it, otherwise "Run All" would presumably stop here with a NameError.
f