In [1]:
import os
from os.path import exists

import numpy as np
import pandas as pd

In [2]:
# Current working directory as reported by the IPython `%pwd` magic; later
# cells build the input and output file paths from it.
DIRECTORY = %pwd

# Путь до рабочей директории ноутбука определяется автоматически (магическая команда %pwd)
# Необходимо указать лишь имя файла с данными

In [3]:
# Name of the input .dat file; the directory part comes from DIRECTORY.
dataset_name = "rdp_16s_v18.fa.dat"
# os.path.join uses the separator of the running OS; the previous hard-coded
# '\\' produced a path that only resolves on Windows.
dataset_path = os.path.join(DIRECTORY, dataset_name)

In [4]:
# Collect the column names declared at the top of the .dat file: skip its
# first line, then keep the first tab-separated field of each of the next
# 72 lines.
headers = []
with open(dataset_path) as dat_file:
    dat_file.readline()
    headers.extend(dat_file.readline().split('\t')[0] for _ in range(72))

# 'Domain' - Домен
# 'Phylum' - Тип
# 'Class' - Класс
# 'Order' - Порядок
# 'Family' - Семейство
# 'Genus' - Род

In [5]:
# Taxonomic ranks used as the grouping key in the cells below.
groups_key = ['Phylum', 'Class', 'Order', 'Family']
# Replace the six names at positions 1..6 with readable rank names.
taxonomy_columns = ["Domain", *groups_key, "Genus"]
headers[1:7] = taxonomy_columns

In [6]:
# Parse the tab-separated body of the file, skipping its first 74 lines and
# labelling the columns with the names collected in `headers`.
# NOTE(review): the header-reading cell consumes 1 + 72 = 73 lines while 74
# lines are skipped here — confirm the file has one extra non-data line.
df = pd.read_csv(
    dataset_path,
    sep="\t",
    names=headers,
    skiprows=74,
)

# Исходный датасет

In [7]:
# Show the loaded table as the cell's rich output.
df

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
0,EF599163,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,0.0150,0.0176,0.0209,...,0.0052,0.0202,0.0183,0.0248,0.0144,0.0104,0.0078,0.0150,0.0065,0.5394
1,AY859683,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Mycobacteriaceae,Mycobacterium,0.0101,0.0176,0.0196,...,0.0054,0.0142,0.0176,0.0318,0.0149,0.0061,0.0095,0.0142,0.0054,0.5796
2,AY883036,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,0.0095,0.0154,0.0212,...,0.0066,0.0220,0.0139,0.0308,0.0190,0.0088,0.0088,0.0147,0.0073,0.5545
3,DQ656489,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Aquincola,0.0152,0.0179,0.0214,...,0.0062,0.0186,0.0200,0.0255,0.0117,0.0097,0.0055,0.0131,0.0062,0.5686
4,D85479,Bacteria,Actinobacteria,Actinobacteria,Micromonosporales,Micromonosporaceae,Couchioplanes,0.0136,0.0170,0.0218,...,0.0068,0.0143,0.0150,0.0252,0.0116,0.0061,0.0102,0.0116,0.0054,0.6014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21189,NR_074339,Bacteria,Bacteroidetes,Cytophagia,Cytophagales,Cytophagaceae,Runella,0.0127,0.0161,0.0268,...,0.0054,0.0174,0.0188,0.0255,0.0114,0.0121,0.0074,0.0141,0.0107,0.5224
21190,NR_074270,Bacteria,Chlorobi,Chlorobia,Chlorobiales,Chlorobiaceae,Chloroherpeton,0.0162,0.0162,0.0325,...,0.0101,0.0217,0.0156,0.0250,0.0169,0.0129,0.0068,0.0156,0.0061,0.5115
21191,NR_074244,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,0.0102,0.0163,0.0197,...,0.0068,0.0183,0.0142,0.0299,0.0075,0.0102,0.0088,0.0129,0.0081,0.5752
21192,NR_074234,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,0.0136,0.0197,0.0204,...,0.0054,0.0244,0.0177,0.0143,0.0102,0.0136,0.0102,0.0163,0.0075,0.5302


<h1>Количество элементов в каждой группе организмов</h1>

In [8]:
# Count the non-null IDs in every (Phylum, Class, Order, Family) combination
# and flatten the result into a table with a 'count' column.
ids_per_group = df.groupby(groups_key)['ID'].count()
groups = ids_per_group.reset_index(name='count')
groups

Unnamed: 0,Phylum,Class,Order,Family,count
0,Abditibacteriota,Abditibacteria,Abditibacteriales,Abditibacteriaceae,1
1,Acetothermia,Acetothermia_genera_incertae_sedis,_,_,1
2,Acidobacteria,Acidobacteria_Gp1,Acidicapsa,_,6
3,Acidobacteria,Acidobacteria_Gp1,Acidipila,_,2
4,Acidobacteria,Acidobacteria_Gp1,Acidobacterium,_,3
...,...,...,...,...,...
678,Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR18,_,_,1
679,Woesearchaeota,Woesearchaeota_Incertae_Sedis_AR20,_,_,1
680,candidate_division_WPS-1,WPS-1_genera_incertae_sedis,_,_,2
681,candidate_division_WPS-2,WPS-2_genera_incertae_sedis,_,_,1


# Сохраняем группы в таблицу Excel

In [9]:
# Write the per-group counts to the "AllGroups" sheet of a workbook stored
# next to the dataset.
xlsx_name = "groups.xlsx"
# os.path.join uses the separator of the running OS; the previous hard-coded
# "\\" produced a path that only resolves on Windows.
xlsx_path = os.path.join(DIRECTORY, xlsx_name)
with pd.ExcelWriter(xlsx_path) as writer:
    groups.to_excel(writer,
                    sheet_name="AllGroups",
                    index=False)

# Статистические характеристики групп организмов

In [10]:
# Summary statistics of the group sizes; 'count' is the only numeric column
# of `groups`, so it is selected explicitly.
groups_describe = groups[['count']].describe()
groups_describe

Unnamed: 0,count
count,683.0
mean,31.030747
std,82.819738
min,1.0
25%,2.0
50%,6.0
75%,21.0
max,1135.0


# По умолчанию мы отбрасываем все группы, количество элементов в которых не превышает среднее значение
## Если вы хотите использовать другой порог — измените значение переменной reduce_factor, расположенной ниже

In [11]:
# Size threshold for keeping a group: the mean group size taken from the
# summary table. Assign another number here to use a different cut-off.
reduce_factor = groups_describe.loc['mean', 'count']
reduce_factor

31.0307467057101

# Количество элементов в оставшихся группах микроорганизмов

In [12]:
# Recount the IDs per group and keep only the groups whose count is strictly
# greater than the threshold.
group_sizes = df.groupby(groups_key)['ID'].count()
large_group_sizes = group_sizes[group_sizes > reduce_factor]
reduced_groups = large_group_sizes.reset_index(name='count')
reduced_groups

Unnamed: 0,Phylum,Class,Order,Family,count
0,Acidobacteria,Acidobacteria_Gp1,Gp1,_,38
1,Acidobacteria,Acidobacteria_Gp3,Gp3,_,34
2,Acidobacteria,Acidobacteria_Gp6,Gp6,_,57
3,Actinobacteria,Actinobacteria,Actinomycetales,Actinomycetaceae,96
4,Actinobacteria,Actinobacteria,Bifidobacteriales,Bifidobacteriaceae,111
...,...,...,...,...,...
132,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,212
133,Spirochaetes,Spirochaetia,Spirochaetales,Spirochaetaceae,93
134,Tenericutes,Mollicutes,Entomoplasmatales,Spiroplasmataceae,50
135,Tenericutes,Mollicutes,Mycoplasmoidales,Metamycoplasmataceae,122


# Добавляем новую таблицу в excel-файл

In [13]:
# Reopen the workbook in append mode and add the filtered groups as a second
# sheet named 'ReducedGroups'.
with pd.ExcelWriter(xlsx_path, mode='a') as xlsx_writer:
    reduced_groups.to_excel(xlsx_writer, sheet_name='ReducedGroups', index=False)

# Статистические характеристики оставшихся групп организмов

In [14]:
# Summary statistics of the remaining group sizes; the smallest size is kept
# separately because the sampling cell uses it as the per-group sample size.
reduced_groups_describe = reduced_groups[['count']].describe()
rg_min_value = reduced_groups_describe.loc['min', 'count']
reduced_groups_describe

Unnamed: 0,count
count,137.0
mean,128.372263
std,149.180655
min,32.0
25%,47.0
50%,77.0
75%,149.0
max,1135.0


# Организмы, принадлежащие оставшимся группам

In [15]:
def _exceeds_threshold(group):
    """Return True when the group has more rows than `reduce_factor`."""
    return len(group) > reduce_factor


# Keep only the rows that belong to groups larger than the threshold.
reduced_df = df.groupby(groups_key).filter(_exceeds_threshold)
reduced_df

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
0,EF599163,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,0.0150,0.0176,0.0209,...,0.0052,0.0202,0.0183,0.0248,0.0144,0.0104,0.0078,0.0150,0.0065,0.5394
1,AY859683,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Mycobacteriaceae,Mycobacterium,0.0101,0.0176,0.0196,...,0.0054,0.0142,0.0176,0.0318,0.0149,0.0061,0.0095,0.0142,0.0054,0.5796
2,AY883036,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,0.0095,0.0154,0.0212,...,0.0066,0.0220,0.0139,0.0308,0.0190,0.0088,0.0088,0.0147,0.0073,0.5545
3,DQ656489,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Aquincola,0.0152,0.0179,0.0214,...,0.0062,0.0186,0.0200,0.0255,0.0117,0.0097,0.0055,0.0131,0.0062,0.5686
4,D85479,Bacteria,Actinobacteria,Actinobacteria,Micromonosporales,Micromonosporaceae,Couchioplanes,0.0136,0.0170,0.0218,...,0.0068,0.0143,0.0150,0.0252,0.0116,0.0061,0.0102,0.0116,0.0054,0.6014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21188,NR_074344,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Corynebacteriaceae,Corynebacterium,0.0107,0.0168,0.0195,...,0.0060,0.0141,0.0174,0.0322,0.0154,0.0087,0.0107,0.0154,0.0080,0.5680
21189,NR_074339,Bacteria,Bacteroidetes,Cytophagia,Cytophagales,Cytophagaceae,Runella,0.0127,0.0161,0.0268,...,0.0054,0.0174,0.0188,0.0255,0.0114,0.0121,0.0074,0.0141,0.0107,0.5224
21191,NR_074244,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,0.0102,0.0163,0.0197,...,0.0068,0.0183,0.0142,0.0299,0.0075,0.0102,0.0088,0.0129,0.0081,0.5752
21192,NR_074234,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,0.0136,0.0197,0.0204,...,0.0054,0.0244,0.0177,0.0143,0.0102,0.0136,0.0102,0.0163,0.0075,0.5302


In [16]:
# Preview of reduced_df indexed by 'ID'. The result is only displayed, not
# assigned, so reduced_df itself is left unchanged.
reduced_df.set_index('ID')

Unnamed: 0_level_0,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,AAT,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EF599163,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,0.0150,0.0176,0.0209,0.0189,...,0.0052,0.0202,0.0183,0.0248,0.0144,0.0104,0.0078,0.0150,0.0065,0.5394
AY859683,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Mycobacteriaceae,Mycobacterium,0.0101,0.0176,0.0196,0.0108,...,0.0054,0.0142,0.0176,0.0318,0.0149,0.0061,0.0095,0.0142,0.0054,0.5796
AY883036,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Acetobacteraceae,Acetobacter,0.0095,0.0154,0.0212,0.0117,...,0.0066,0.0220,0.0139,0.0308,0.0190,0.0088,0.0088,0.0147,0.0073,0.5545
DQ656489,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Aquincola,0.0152,0.0179,0.0214,0.0152,...,0.0062,0.0186,0.0200,0.0255,0.0117,0.0097,0.0055,0.0131,0.0062,0.5686
D85479,Bacteria,Actinobacteria,Actinobacteria,Micromonosporales,Micromonosporaceae,Couchioplanes,0.0136,0.0170,0.0218,0.0075,...,0.0068,0.0143,0.0150,0.0252,0.0116,0.0061,0.0102,0.0116,0.0054,0.6014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NR_074344,Bacteria,Actinobacteria,Actinobacteria,Mycobacteriales,Corynebacteriaceae,Corynebacterium,0.0107,0.0168,0.0195,0.0114,...,0.0060,0.0141,0.0174,0.0322,0.0154,0.0087,0.0107,0.0154,0.0080,0.5680
NR_074339,Bacteria,Bacteroidetes,Cytophagia,Cytophagales,Cytophagaceae,Runella,0.0127,0.0161,0.0268,0.0161,...,0.0054,0.0174,0.0188,0.0255,0.0114,0.0121,0.0074,0.0141,0.0107,0.5224
NR_074244,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,0.0102,0.0163,0.0197,0.0122,...,0.0068,0.0183,0.0142,0.0299,0.0075,0.0102,0.0088,0.0129,0.0081,0.5752
NR_074234,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,0.0136,0.0197,0.0204,0.0129,...,0.0054,0.0244,0.0177,0.0143,0.0102,0.0136,0.0102,0.0163,0.0075,0.5302


# Производим стратифицированную выборку

In [17]:
# Stratified sample: draw the same number of rows (the size of the smallest
# remaining group) from every group, then index the result by 'ID'.
# The sampling previously had no seed, so every run produced a different
# sample. A seeded generator is created inside this cell so that re-running
# the cell, or the whole notebook, reproduces the same rows.
SAMPLING_SEED = 42
sample_size = int(rg_min_value)
sampling_rng = np.random.RandomState(SAMPLING_SEED)
stratified_sampling = (
    reduced_df
    .groupby(groups_key)
    .apply(lambda group: group.sample(sample_size, random_state=sampling_rng))
    .set_index('ID')
)
stratified_sampling

Unnamed: 0_level_0,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,AAT,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EF457399,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0205,0.0247,0.0205,0.0151,...,0.0068,0.0219,0.0151,0.0260,0.0096,0.0041,0.0082,0.0110,0.0055,0.5820
AY326562,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0169,0.0225,0.0197,0.0162,...,0.0042,0.0197,0.0148,0.0254,0.0106,0.0063,0.0063,0.0120,0.0042,0.5703
AF523990,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0183,0.0223,0.0199,0.0143,...,0.0048,0.0183,0.0135,0.0263,0.0120,0.0080,0.0088,0.0127,0.0048,0.5752
EF457384,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0207,0.0303,0.0193,0.0124,...,0.0014,0.0262,0.0138,0.0207,0.0083,0.0055,0.0083,0.0110,0.0041,0.5810
EF457443,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0174,0.0254,0.0187,0.0174,...,0.0000,0.0201,0.0147,0.0214,0.0054,0.0040,0.0054,0.0080,0.0040,0.6115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X64380,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0211,0.0230,0.0201,0.0144,...,0.0086,0.0172,0.0182,0.0259,0.0134,0.0105,0.0086,0.0172,0.0144,0.5382
U68671,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0241,0.0181,0.0201,0.0161,...,0.0121,0.0201,0.0181,0.0302,0.0101,0.0121,0.0161,0.0141,0.0060,0.5351
AY326524,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0187,0.0215,0.0201,0.0139,...,0.0097,0.0174,0.0181,0.0236,0.0160,0.0083,0.0076,0.0111,0.0062,0.5555
AJ289983,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0148,0.0215,0.0256,0.0141,...,0.0081,0.0182,0.0168,0.0256,0.0128,0.0067,0.0101,0.0121,0.0061,0.5410


# Сохраняем данные в формате csv

In [28]:
# Turn the 'ID' index back into an ordinary column ahead of the exports below.
reset_sample = stratified_sampling.reset_index(drop=False)
reset_sample

Unnamed: 0,ID,Domain,Phylum,Class,Order,Family,Genus,AAA,AAC,AAG,...,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT,CG
0,EF457399,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0205,0.0247,0.0205,...,0.0068,0.0219,0.0151,0.0260,0.0096,0.0041,0.0082,0.0110,0.0055,0.5820
1,AY326562,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0169,0.0225,0.0197,...,0.0042,0.0197,0.0148,0.0254,0.0106,0.0063,0.0063,0.0120,0.0042,0.5703
2,AF523990,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0183,0.0223,0.0199,...,0.0048,0.0183,0.0135,0.0263,0.0120,0.0080,0.0088,0.0127,0.0048,0.5752
3,EF457384,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0207,0.0303,0.0193,...,0.0014,0.0262,0.0138,0.0207,0.0083,0.0055,0.0083,0.0110,0.0041,0.5810
4,EF457443,Bacteria,Acidobacteria,Acidobacteria_Gp1,Gp1,_,_,0.0174,0.0254,0.0187,...,0.0000,0.0201,0.0147,0.0214,0.0054,0.0040,0.0054,0.0080,0.0040,0.6115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4379,X64380,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0211,0.0230,0.0201,...,0.0086,0.0172,0.0182,0.0259,0.0134,0.0105,0.0086,0.0172,0.0144,0.5382
4380,U68671,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0241,0.0181,0.0201,...,0.0121,0.0201,0.0181,0.0302,0.0101,0.0121,0.0161,0.0141,0.0060,0.5351
4381,AY326524,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0187,0.0215,0.0201,...,0.0097,0.0174,0.0181,0.0236,0.0160,0.0083,0.0076,0.0111,0.0062,0.5555
4382,AJ289983,Bacteria,Verrucomicrobia,Spartobacteria,Spartobacteria_genera_incertae_sedis,_,_,0.0148,0.0215,0.0256,...,0.0081,0.0182,0.0168,0.0256,0.0128,0.0067,0.0101,0.0121,0.0061,0.5410


In [26]:
# Export the sample as a semicolon-separated UTF-8 file without the row index.
csv_options = {'sep': ';', 'encoding': 'utf-8', 'index': False}
reset_sample.to_csv('sampling_data.csv', **csv_options)

# Сохраняем в формате dat 

In [47]:
# Export the sample as a .dat file: a "<columns>\t<rows>" line, one
# "<name>\t<TYPE>" line per column, then the tab-separated rows.
reduced_dat = "reduced_" + dataset_name
# One path is used for the existence check and for both writes. Previously the
# check looked at DIRECTORY with a Windows-only '\\' separator while the writes
# used a bare relative name.
reduced_dat_path = os.path.join(DIRECTORY, reduced_dat)
# An existing file is left untouched, so delete it to export a fresh sample.
if not exists(reduced_dat_path):
    n_rows, n_columns = reset_sample.shape
    with open(reduced_dat_path, 'w') as dat:
        dat.write(f"{n_columns}\t{n_rows}\n")
        for index, value in enumerate(reset_sample.columns):
            if index == 0:
                dat.write("ID\tSTRING\n")
            elif 1 <= index <= 6:
                # The six taxonomy columns are declared as s1..s6.
                dat.write(f"s{index}\tSTRING\n")
            else:
                dat.write(f"{value}\tFLOAT\n")
    # index=False: the header above declares exactly n_columns fields per row.
    # Writing the DataFrame index added an undeclared leading field.
    reset_sample.to_csv(reduced_dat_path, sep='\t', mode='a', header=False, index=False)