In [1]:
import pandas as pd
# читаем содержимое фалов
bulls = pd.read_csv('bulls.csv').sort_values('ebv', ascending=False)  # быки-кандидаты на осеменение
cows = pd.read_csv('cows.csv').sort_values('ebv', ascending=False)  # коровы, которых нужно осеменить
date_map = {'id': 'str', 'mother_id': 'str', 'father_id': 'str'}
pedigree = pd.read_csv('pedigree.csv', dtype=date_map)  # родословные животных

In [2]:
def count_data(df: pd.DataFrame):
    """ Проверяю чистоту данных, сколько записей с пропущенными значениями """
    ln = len(df)
    null_val = sum(df.isna().sum(axis=1))
    print('Количество строк: {ln}, количество строк с пропущенными значениями: {null_val}'.format(ln=ln, null_val=null_val))

In [3]:
print(pedigree.head())
count_data(pedigree)
pedigree.set_index(['mother_id', 'father_id'])
# пропусков ~12%, заполняем наиболее часто повторяющимися значениями
# big_mother = pedigree['mother_id'].mode().iloc[0]
# big_father = pedigree['father_id'].mode().iloc[0]
# pedigree['mother_id'] =  pedigree.apply(
#     lambda row: big_mother if pd.isna(row['mother_id']) and row['id'] != big_mother else row['mother_id'],
#     axis=1
# )
# pedigree['father_id'] = pedigree.apply(
#     lambda row: big_mother if pd.isna(row['father_id']) and row['id'] != big_mother else row['father_id'],
#     axis=1
# )
# count_data(pedigree)

# id: идентификатор животного
# mother_id: идентификатор матери
# father_id: идентификатор отца

              id      mother_id      father_id
0  GB00000090350  GB00000070596  FR00000051087
1  DE00000090351  GB00000085021  GB00000051158
2  US00000090352  US00000087323  GB00000091078
3  FR00000090353            NaN            NaN
4  GB00000090358  US00000056066  NL00000050889
Количество строк: 94400, количество строк с пропущенными значениями: 11416


Unnamed: 0_level_0,Unnamed: 1_level_0,id
mother_id,father_id,Unnamed: 2_level_1
GB00000070596,FR00000051087,GB00000090350
GB00000085021,GB00000051158,DE00000090351
US00000087323,GB00000091078,US00000090352
,,FR00000090353
US00000056066,NL00000050889,GB00000090358
...,...,...
FR00000026340,GB00000000557,FR00000046829
GB00000022627,US00000001653,FR00000048066
NL00000005019,GB00000001788,RU00000048290
DE00000028637,DE00000003760,US00000013052


In [4]:
# смотрим, что вообще есть
print(bulls.head())
count_data(bulls)
bulls_with_nan = bulls.loc[bulls.isna().any(axis=1), 'id'].values[0]
parents = pedigree.loc[pedigree['id'] == bulls_with_nan, ['mother_id', 'father_id']]
father_id = parents.father_id.values[0]
mother_id = parents.mother_id.values[0]
# bulls.loc[bulls.id == father_id]
# cows[cows.id == mother_id]
bulls = bulls.fillna(bulls.mean(numeric_only=True))
# count_data(bulls)

# id: идентификатор быка
# ebv: селекционная ценность (Estimated Breeding Value)

               id  descendants_count     ebv
18  US00000000795                299  1867.9
17  US00000003507                 84  1782.9
2   US00000003013                436  1528.6
4   DE00000001742                496  1510.6
19  US00000003459                 51  1506.6
Количество строк: 39, количество строк с пропущенными значениями: 1


In [5]:
print(cows.head())
count_data(cows)
# Процент пропущенных значений небольшой (~10%), можем посчитать среднюю селекционную ценность
cows = cows.fillna(cows.mean(numeric_only=True))

# print(cows.head())
# id: идентификатор коровы
# ebv: селекционная ценность

                 id     ebv
4838  DE00000028635  2046.2
6810  FR00000011732  1981.1
9716  DE00000023145  1840.5
8011  DE00000014083  1814.8
5699  NL00000004886  1788.7
Количество строк: 17177, количество строк с пропущенными значениями: 1849


In [6]:
import numpy as np
max_cows_for_bull = int(np.floor(0.1 * len(cows)))
print('Один бык может осеменить {max_cows} коров'.format(max_cows=max_cows_for_bull))

Один бык может осеменить 1717 коров


In [6]:
# Пример структуры данных для животных
# Каждое животное представлено словарём с ключами 'id', 'father', 'mother'
# 'father' и 'mother' — id родителей или None, если неизвестны

animals = {
    'A': {'father': None, 'mother': None},
    'B': {'father': None, 'mother': None},
    'C': {'father': 'A', 'mother': 'B'},
    'D': {'father': 'A', 'mother': 'B'},
    'E': {'father': 'C', 'mother': 'D'},
}

def get_ancestors(animal_id, animals, generation=0, ancestors=None):
    if ancestors is None:
        ancestors = {}
    if animal_id is None:
        return ancestors
    if animal_id not in ancestors or generation < ancestors[animal_id]:
        ancestors[animal_id] = generation
        father = animals[animal_id]['father']
        mother = animals[animal_id]['mother']
        get_ancestors(father, animals, generation + 1, ancestors)
        get_ancestors(mother, animals, generation + 1, ancestors)
    return ancestors

def coefficient_of_relatedness(animal1, animal2, animals):
    ancestors1 = get_ancestors(animal1, animals)
    ancestors2 = get_ancestors(animal2, animals)

    # Общие предки
    common_ancestors = set(ancestors1.keys()) & set(ancestors2.keys())

    r = 0.0
    for ancestor in common_ancestors:
        n1 = ancestors1[ancestor]
        n2 = ancestors2[ancestor]
        # Формула: 2^-(n1 + n2)
        r += 2 ** (-(n1 + n2))
    return r

# Пример использования
animal1 = 'E'
animal2 = 'D'
r = coefficient_of_relatedness(animal1, animal2, animals)
print(f"Коэффициент родства между {animal1} и {animal2}: {r:.4f}")

Коэффициент родства между E и D: 0.7500


In [22]:
def get_ancestors(animal_id):
    ancestors = set()
    visited = set()
    stack = [animal_id]
    while stack:
        current = stack.pop()
        if current in visited or pd.isna(current):
            continue
        visited.add(current)
        row = pedigree[pedigree['id'] == current]
        if not row.empty:
            m, f = row.iloc[0]['mother_id'], row.iloc[0]['father_id']
            # Добавляем только валидные id
            for parent in [m, f]:
                if pd.notna(parent) and parent != current:
                    print(parent)
                    stack.append(parent)
                    ancestors.add(parent)
    return ancestors

top_bull = bulls.head(1)
get_ancestors(top_bull.iloc[0]['id'])

RU00000000432
NL00000001279
NL00000001176
US00000000916
DE00000000510
DE00000000020
US00000000079
US00000000653
DE00000000414
FR00000000371
FR00000000879
RU00000001144
GB00000001305
US00000000988
NL00000003601
NL00000000439
FR00000001299
US00000001231
NL00000000358
GB00000000208
RU00000000202
FR00000000534
US00000000549
RU00000001030
GB00000044158
US00000007635
RU00000000920
US00000000197
DE00000000983
RU00000001065
DE00000002175
DE00000001137
DE00000000770
FR00000000007
FR00000000009
DE00000000402
NL00000001558
NL00000000439
FR00000000814
DE00000000703
FR00000002228
RU00000000936
RU00000000768
US00000000158
US00000000919
US00000001231
US00000000788
GB00000000299
GB00000000853
DE00000001142
GB00000000053
GB00000000742
NL00000000918
FR00000001165
US00000000929
NL00000000567
DE00000001862
FR00000002045
RU00000002513
GB00000001443
RU00000001485
NL00000001444
US00000000158
DE00000002620
US00000000382
NL00000000446
RU00000000092
NL00000000666
GB00000000890
NL00000001160
GB00000000208
GB0000

{'DE00000000020',
 'DE00000000082',
 'DE00000000144',
 'DE00000000151',
 'DE00000000152',
 'DE00000000402',
 'DE00000000414',
 'DE00000000510',
 'DE00000000703',
 'DE00000000770',
 'DE00000000830',
 'DE00000000944',
 'DE00000000983',
 'DE00000001117',
 'DE00000001137',
 'DE00000001142',
 'DE00000001518',
 'DE00000001552',
 'DE00000001670',
 'DE00000001862',
 'DE00000001874',
 'DE00000002175',
 'DE00000002428',
 'DE00000002475',
 'DE00000002620',
 'FR00000000007',
 'FR00000000009',
 'FR00000000081',
 'FR00000000110',
 'FR00000000117',
 'FR00000000164',
 'FR00000000179',
 'FR00000000221',
 'FR00000000371',
 'FR00000000416',
 'FR00000000445',
 'FR00000000521',
 'FR00000000534',
 'FR00000000652',
 'FR00000000780',
 'FR00000000814',
 'FR00000000847',
 'FR00000000879',
 'FR00000000912',
 'FR00000000982',
 'FR00000001165',
 'FR00000001178',
 'FR00000001224',
 'FR00000001273',
 'FR00000001299',
 'FR00000001704',
 'FR00000001908',
 'FR00000002038',
 'FR00000002045',
 'FR00000002182',
 'FR000000

In [13]:
def get_ancestors(animal_id):
    ancestors = set()
    visited = set()
    stack = [animal_id]
    while stack:
        current = stack.pop()
        if current in visited or pd.isna(current):
            continue
        visited.add(current)
        row = pedigree[pedigree['id'] == current]
        if not row.empty:
            m, f = row.iloc[0]['mother_id'], row.iloc[0]['father_id']
            # Добавляем только валидные id
            for parent in [m, f]:
                if pd.notna(parent) and parent != current:
                    stack.append(parent)
                    ancestors.add(parent)
    return ancestors

def calc_rel(cow_id, bull_id):
    pedigree_bull = pedigree[pedigree['id'] == bull_id].iloc[0]
    pedigree_cow = pedigree[pedigree['id'] == cow_id].iloc[0]
    if pedigree_bull['mother_id'] == pedigree_cow['mother_id'] or pedigree_bull['father_id'] == pedigree_cow['father_id']:
        return 0.8  # Либо родные, либо двоюродные, мало подходит. Сделано, для ускорения.
    bulls_anc = get_ancestors(bull_id)
    cows_anc = get_ancestors(cow_id)
    # Родство — доля общих предков (очень упрощённо)
    common = cows_anc & bulls_anc
    total = cows_anc | bulls_anc
    if not total:
        return 0.0
    return len(common) / len(total)

pairs = []
for bull in bulls.itertuples(index=False):
    for cow in cows.itertuples(index=False):
        rel = calc_rel(cow.id, bull.id)
        print(cow.id, bull.id, rel)
        if rel <= 0.05:
            ebv_offspring = (cow.ebv + bull.ebv) / 2
            pairs.append({'cow_id': cow.id, 'bull_id': bull.id, 'ebv': ebv_offspring, 'rel': rel})

# pairs_df
# pd.DataFrame(pairs)


DE00000028635 US00000000795 0.6904761904761905
FR00000011732 US00000000795 0.8039538714991763
DE00000023145 US00000000795 0.7589424572317263
DE00000014083 US00000000795 0.7612687813021702
NL00000004886 US00000000795 0.7374392220421394
DE00000027440 US00000000795 0.7479806138933764
US00000069140 US00000000795 0.5614886731391586
GB00000083297 US00000000795 0.5735537190082645
GB00000024686 US00000000795 0.7587354409317804
GB00000017595 US00000000795 0.6517006802721088
NL00000027867 US00000000795 0.7660910518053375
DE00000023455 US00000000795 0.8
GB00000009133 US00000000795 0.7167182662538699
RU00000045502 US00000000795 0.8413793103448276
US00000011575 US00000000795 0.7174603174603175


KeyboardInterrupt: 

In [ ]:
from collections import defaultdict
bull_usage = defaultdict(int)
selected_pairs = []
used_cows = set()

# Сортируем по EBV потомка
pairs_df = pairs_df.sort_values(by='ebv', ascending=False)

for _, row in pairs_df.iterrows():
    if row['cow_id'] in used_cows:
        continue
    if bull_usage[row['bull_id']] >= max_cows_for_bull:
        continue
    selected_pairs.append(row)
    used_cows.add(row['cow_id'])
    bull_usage[row['bull_id']] += 1
    if len(used_cows) == len(cows):
        break

# Анализ разброса EBV
ebvs = [p['ebv'] for p in selected_pairs]
mean_ebv = sum(ebvs) / len(ebvs)
std_ebv = pd.Series(ebvs).std()

print(f'Средний EBV: {mean_ebv:.2f}, Разброс EBV: {std_ebv:.2f}')