In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Working directory of the kernel; the dataset and the Excel report are located relative to it.
# os.getcwd() yields the same string as the %pwd magic, but is plain Python, so the
# cell also works when the notebook is exported to a script or run outside IPython.
DIRECTORY = os.getcwd()

# Путь до директории, в которой лежит ноутбук, вычисляется самостоятельно
# Необходимо указать лишь имя файла

In [3]:
# Name of the data file; it is expected to sit in DIRECTORY.
dataset_name = "rdp_16s_v18.fa.dat"
# os.path.join inserts the separator of the current OS, so the path is valid on
# Linux/macOS as well as on Windows (a literal '\\' only works on Windows).
dataset_path = os.path.join(DIRECTORY, dataset_name)

In [4]:
# Collect the column names from the preamble of the data file: the first line is
# skipped, then the first tab-separated field of each of the next 72 lines is
# taken as one column name.
# NOTE(review): 72 presumably matches the number of columns described in the
# file preamble - confirm against the file format.
headers = []
with open(dataset_path) as dat_file:
    dat_file.readline()  # discard the first line
    headers.extend(dat_file.readline().split('\t')[0] for _ in range(72))

In [5]:
# Overwrite header positions 1..6 with readable taxonomy rank names
# (domain, phylum, class, order, family, genus - in that order).
for position, rank in enumerate(
    ('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus'),
    start=1,
):
    headers[position] = rank

In [6]:
# Load the tab-separated table, using the names gathered in `headers` as column
# labels and skipping the first 74 lines of the file.
df = pd.read_csv(
    dataset_path,
    sep="\t",
    names=headers,
    skiprows=74,
)
# Number of rows that were loaded.
df.shape[0]

21194

# Примеры строк из датасета

In [7]:
# Preview the first rows of the dataset (head() shows 5 rows by default).
df.head()

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
0,EF599163,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,0.015,0.0176,0.0209,...,0.0052,0.0202,0.0183,0.0248,0.0144,0.0104,0.0078,0.015,0.0065,0.5394
1,AY859683,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Mycobacteriaceae,Mycobacterium,0.0101,0.0176,0.0196,...,0.0054,0.0142,0.0176,0.0318,0.0149,0.0061,0.0095,0.0142,0.0054,0.5796
2,AY883036,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,0.0095,0.0154,0.0212,...,0.0066,0.022,0.0139,0.0308,0.019,0.0088,0.0088,0.0147,0.0073,0.5545
3,DQ656489,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Aquincola,0.0152,0.0179,0.0214,...,0.0062,0.0186,0.02,0.0255,0.0117,0.0097,0.0055,0.0131,0.0062,0.5686
4,D85479,Bacteria,Actinobacteria,Actinobacteria,Micromonosporales,Micromonosporaceae,Couchioplanes,0.0136,0.017,0.0218,...,0.0068,0.0143,0.015,0.0252,0.0116,0.0061,0.0102,0.0116,0.0054,0.6014


<h1>Количество элементов в каждой группе микроорганизмов</h1>

In [8]:
# Count the records (non-null IDs) in every Phylum/Class/Order/Family group and
# flatten the result into a regular table with a 'count' column.
groups_length = (
    df
    .groupby(['Phylum', 'Class', 'Order', 'Family'])['ID']
    .count()
    .reset_index(name='count')
)
groups_length

Unnamed: 0,Phylum,Class,Order,Family,count
0,Abditibacteriota,Abditibacteria,Abditibacteriales,Abditibacteriaceae,1
1,Acetothermia,Acetothermia_genera_incertae_sedis,_,_,1
2,Acidobacteria,Acidobacteria_Gp1,Acidicapsa,_,6
3,Acidobacteria,Acidobacteria_Gp1,Acidipila,_,2
4,Acidobacteria,Acidobacteria_Gp1,Acidobacterium,_,3
...,...,...,...,...,...
678,Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR18,_,_,1
679,Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR20,_,_,1
680,candidate_division_WPS-1,WPS-1_genera_incertae_sedis,_,_,2
681,candidate_division_WPS-2,WPS-2_genera_incertae_sedis,_,_,1


# Сохраняем группы в таблицу Excel

In [9]:
# Name of the Excel report; it is written into DIRECTORY.
xlsx_name = "groups.xlsx"
# os.path.join inserts the separator of the current OS, so the path is valid on
# Linux/macOS as well as on Windows (a literal "\\" only works on Windows).
xlsx_path = os.path.join(DIRECTORY, xlsx_name)
# Write the per-group counts to the "AllGroups" sheet; the default write mode
# creates the workbook, replacing a file left over from an earlier run.
with pd.ExcelWriter(xlsx_path) as writer:
    groups_length.to_excel(writer, sheet_name="AllGroups")

# Статистические характеристики датасета

In [10]:
# Summary statistics of the group table; the 'mean' row of the 'count' column
# is used as the default size threshold.
groups_describe = groups_length.describe()

# По умолчанию мы отбрасываем все группы, количество элементов в которых не превышает среднее значение
## Если вы хотите использовать другой порог — измените значение переменной reduce_factor, расположенной ниже

In [11]:
# Size threshold for keeping a group: by default the mean group size.
# Assign another number here to use a different cut-off.
reduce_factor = groups_describe.loc['mean', 'count']
reduce_factor

31.0307467057101

# Количество элементов в оставшихся группах микроорганизмов

In [13]:
# Records per Phylum/Class/Order/Family group (non-null IDs).
t = df.groupby(['Phylum', 'Class', 'Order', 'Family'])['ID'].count()
# Keep only the groups strictly larger than the threshold and turn the result
# into a regular table with a 'count' column.
reduced_groups = t.loc[t.gt(reduce_factor)].reset_index(name='count')
reduced_groups

Unnamed: 0,Phylum,Class,Order,Family,count
0,Acidobacteria,Acidobacteria_Gp1,Gp1,_,38
1,Acidobacteria,Acidobacteria_Gp3,Gp3,_,34
2,Acidobacteria,Acidobacteria_Gp6,Gp6,_,57
3,Actinobacteria,Actinobacteria,Actinomycetales,Actinomycetaceae,96
4,Actinobacteria,Actinobacteria,Bifidobacteriales,Bifidobacteriaceae,111
...,...,...,...,...,...
132,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,212
133,Spirochaetes,Spirochaetia,Spirochaetales,Spirochaetaceae,93
134,Tenericutes,Mollicutes,Entomoplasmatales,Spiroplasmataceae,50
135,Tenericutes,Mollicutes,Mycoplasmoidales,Metamycoplasmataceae,122


# Добавляем новую таблицу в excel-файл

In [14]:
# Append the filtered groups as an extra sheet of the existing workbook.
# - engine='openpyxl' is given explicitly: append mode is not supported by the
#   xlsxwriter engine, which pandas selects for .xlsx files when it is installed.
# - if_sheet_exists='replace' makes the cell re-runnable; without it a second
#   run fails because the 'ReducedGroups' sheet is already in the file
#   (the parameter requires pandas >= 1.3).
with pd.ExcelWriter(
    xlsx_path,
    mode='a',
    engine='openpyxl',
    if_sheet_exists='replace',
) as writer:
    reduced_groups.to_excel(writer, sheet_name='ReducedGroups')

# Организмы, принадлежащие оставшимся группам

In [16]:
# Show the rows of df whose Phylum/Class/Order/Family group has more members
# than the threshold; rows of smaller groups are left out.
(
    df
    .groupby(['Phylum', 'Class', 'Order', 'Family'])
    .filter(lambda members: reduce_factor < len(members))
)

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
0,EF599163,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,0.0150,0.0176,0.0209,...,0.0052,0.0202,0.0183,0.0248,0.0144,0.0104,0.0078,0.0150,0.0065,0.5394
1,AY859683,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Mycobacteriaceae,Mycobacterium,0.0101,0.0176,0.0196,...,0.0054,0.0142,0.0176,0.0318,0.0149,0.0061,0.0095,0.0142,0.0054,0.5796
2,AY883036,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,0.0095,0.0154,0.0212,...,0.0066,0.0220,0.0139,0.0308,0.0190,0.0088,0.0088,0.0147,0.0073,0.5545
3,DQ656489,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Aquincola,0.0152,0.0179,0.0214,...,0.0062,0.0186,0.0200,0.0255,0.0117,0.0097,0.0055,0.0131,0.0062,0.5686
4,D85479,Bacteria,Actinobacteria,Actinobacteria,Micromonosporales,Micromonosporaceae,Couchioplanes,0.0136,0.0170,0.0218,...,0.0068,0.0143,0.0150,0.0252,0.0116,0.0061,0.0102,0.0116,0.0054,0.6014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21188,NR_074344,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Corynebacteriaceae,Corynebacterium,0.0107,0.0168,0.0195,...,0.0060,0.0141,0.0174,0.0322,0.0154,0.0087,0.0107,0.0154,0.0080,0.5680
21189,NR_074339,Bacteria,Bacteroidetes,Cytophagia,Cytophagales,Cytophagaceae,Runella,0.0127,0.0161,0.0268,...,0.0054,0.0174,0.0188,0.0255,0.0114,0.0121,0.0074,0.0141,0.0107,0.5224
21191,NR_074244,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,0.0102,0.0163,0.0197,...,0.0068,0.0183,0.0142,0.0299,0.0075,0.0102,0.0088,0.0129,0.0081,0.5752
21192,NR_074234,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,0.0136,0.0197,0.0204,...,0.0054,0.0244,0.0177,0.0143,0.0102,0.0136,0.0102,0.0163,0.0075,0.5302


In [None]:
f