# Cortellari 2021
Deal with [Cortellari et al. 2021](https://doi.org/10.1038/s41598-021-89900-2) goat dataset.

In [1]:
import csv
import logging
import pathlib

import pandas as pd

from tqdm.notebook import tqdm

from src.features.plinkio import BinaryPlinkIO, CodingException
from src.features.smarterdb import global_connection, SampleGoat, Dataset, Breed
from src.data.common import WORKING_ASSEMBLIES

In [2]:
# Keep the handle returned by global_connection() (presumably the connection
# to the SMARTER database - confirm against src.features.smarterdb).
conn = global_connection()
# Assembly entry used later as `src_assembly` when fetching SNP coordinates.
ARS1 = WORKING_ASSEMBLIES["ARS1"]

In [3]:
class CustomMixin():
    """Mixin adding a progress bar to the coding checks of a plink IO class.

    The host class is expected to provide ``read_pedfile()`` and
    ``_process_genotypes(line, coding)``; both are called by
    :meth:`process_pedfile`.
    """

    # Expected number of samples: only used as ``total`` of the progress bar.
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Run ``_process_genotypes`` on every line returned by ``read_pedfile``.

        Args:
            coding: coding name forwarded to ``_process_genotypes``.

        Returns:
            True when every line has been processed without raising.
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            _ = self._process_genotypes(line, coding)

        return True

    # Name-mangled on purpose, so that this helper can't shadow (or be
    # shadowed by) a method of the class the mixin is combined with.
    def __matches_coding(self, coding):
        """Return True if the whole file can be processed with ``coding``.

        A ``CodingException`` (presumably raised while processing genotypes
        that don't match the requested coding) is logged and turned into a
        False result; any other exception propagates to the caller.
        """
        try:
            return self.process_pedfile(coding=coding)

        except CodingException as exc:
            # no module-level ``logger`` exists in this notebook: ask the
            # logging module for one instead of relying on an undefined name
            logging.getLogger(__name__).error(exc)
            return False

    def is_top(self):
        """Check the genotypes against the 'top' coding."""
        return self.__matches_coding('top')

    def is_forward(self):
        """Check the genotypes against the 'forward' coding."""
        return self.__matches_coding('forward')

    def is_affymetrix(self):
        """Check the genotypes against the 'affymetrix' coding."""
        return self.__matches_coding('affymetrix')

    def is_illumina(self):
        """Check the genotypes against the 'illumina' coding."""
        return self.__matches_coding('illumina')
        
class CustomBinaryPlinkIO(CustomMixin, BinaryPlinkIO):
    """Binary plink reader whose coding checks display a progress bar."""

    def process_pedfile(self, coding="top"):
        """Process every genotype line, sizing the progress bar on the
        number of samples reported by the opened plink file.

        Returns True once all the lines have been processed.
        """
        genotype_lines = self.read_pedfile()
        n_samples = len(self.plink_file.get_samples())

        for genotype_line in tqdm(genotype_lines, total=n_samples):
            self._process_genotypes(genotype_line, coding)

        return True

In [4]:
# Fetch the dataset record registered with this archive name, then display
# its `contents` attribute (the paths listed in the output below).
cortellari_dataset = Dataset.objects.get(file="cortellari_et_al_2021.zip")
cortellari_dataset.contents

['s41598-021-89900-2/',
 's41598-021-89900-2/Cortellari2021.bim',
 's41598-021-89900-2/Cortellari2021.fam',
 's41598-021-89900-2/41598_2021_89900_MOESM8_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM7_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM10_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM2_ESM.png',
 's41598-021-89900-2/41598_2021_89900_MOESM6_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM9_ESM.xlsx',
 's41598-021-89900-2/Cortellari2021.bed',
 's41598-021-89900-2/41598_2021_89900_MOESM4_ESM.tiff',
 's41598-021-89900-2/41598_2021_89900_MOESM13_ESM.docx',
 's41598-021-89900-2/s41598-021-89900-2.pdf',
 's41598-021-89900-2/41598_2021_89900_MOESM3_ESM.png',
 's41598-021-89900-2/41598_2021_89900_MOESM12_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM11_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM5_ESM.xlsx',
 's41598-021-89900-2/41598_2021_89900_MOESM1_ESM.tiff',
 's41598-021-89900-2/king.con',
 's41598-021-89900-2/to_remove.csv']

Ok, open dataset and do the standard checks

In [5]:
# plink prefix: path of the .bed/.bim/.fam files without extension
prefix = str(
    cortellari_dataset.working_dir / "s41598-021-89900-2/Cortellari2021"
)

# species and chip are taken from the dataset record itself
plinkio = CustomBinaryPlinkIO(
    prefix=prefix,
    species=cortellari_dataset.species,
    chip_name=cortellari_dataset.chip_name,
)

# used by CustomMixin as the total of the progress bar
plinkio.n_of_individuals = cortellari_dataset.n_of_individuals

In [6]:
# Read the map file (presumably this is what fills `plinkio.mapdata`, which
# is inspected in the next cell), then fetch SNP coordinates using ARS1 as
# source assembly.
plinkio.read_mapfile()
plinkio.fetch_coordinates(src_assembly=ARS1)

In [7]:
# SNPs in the map file, and how many of them are not in the `filtered` collection
n_mapdata = len(plinkio.mapdata)
n_filtered = len(plinkio.filtered)

snps_found = n_mapdata - n_filtered
perc_missing = round(100 - (snps_found / n_mapdata * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 53347 of 53347 SNPs using 'name' (0.0% missing)


Is this dataset in top coding?

In [8]:
# Process every sample requesting the 'top' coding: the result is True when
# all the genotype lines are processed without errors.
plinkio.is_top()

  0%|          | 0/1071 [00:00<?, ?it/s]

True

Ok, now it is time to collect information about the samples:

In [9]:
# Parse the HTML tables of the article's "Table 1" page and keep the first
# one. NOTE: this cell needs network access to www.nature.com.
breeds = pd.read_html("https://www.nature.com/articles/s41598-021-89900-2/tables/1")[0]

In [10]:
# Discard the row labelled 34 (presumably the totals row which closes the
# published table - TODO confirm). Rebinding the name with errors="ignore",
# instead of dropping in place, lets this cell be executed again without
# raising a KeyError for the row that is already gone.
breeds = breeds.drop(index=34, errors="ignore")

# Replace the table headers with short, code-friendly column names.
breeds.columns = ['fid', 'breed', "raw_dataset_count", "haplotype_sharing_count", "population_structure_count", "landscape_genomics_count"]
breeds.head()

Unnamed: 0,fid,breed,raw_dataset_count,haplotype_sharing_count,population_structure_count,landscape_genomics_count
0,ALP,Camosciata delle Alpi,143,117,30,43
1,ARG,Argentata dell'Etna,48,46,30,41
2,ASP,Capra dell'Aspromonte,24,24,24,18
3,BEZ,Bezoar (outgroup),7,7,7,0
4,BIA,Bianca Monticellana,24,23,23,17


Ensure that the codes are either new or associated with the same breed within the database:

In [11]:
# (code, name) pairs of the paper's breeds without a match in the database
not_found = []

for code, name in zip(breeds['fid'], breeds['breed']):
    if Breed.objects.filter(code=code, species="Goat").count() == 0:
        not_found.append((code, name))
        continue

    # the code is already in use: show which breed owns it
    breed = Breed.objects.get(code=code, species="Goat")
    print(f"Found code '{code}': '{breed}' for cortellari '{name}'")

Found code 'ALP': 'Alpine (Camosciata delle Alpi) (ALP) Goat' for cortellari 'Camosciata delle Alpi'
Found code 'ARG': 'Argentata (ARG) Goat' for cortellari 'Argentata dell'Etna'
Found code 'ASP': 'Aspromontana (ASP) Goat' for cortellari 'Capra dell'Aspromonte'
Found code 'BEZ': 'Bezoar (BEZ) Goat' for cortellari 'Bezoar (outgroup)'
Found code 'BIO': 'Bionda dell'Adamello (BIO) Goat' for cortellari 'Bionda dell'Adamello'
Found code 'GAR': 'Garganica (GAR) Goat' for cortellari 'Garganica'
Found code 'JON': 'Jonica (JON) Goat' for cortellari 'Jonica'
Found code 'MAL': 'Mallorquina (MAL) Goat' for cortellari 'Maltese'
Found code 'NIC': 'Nicastrese (NIC) Goat' for cortellari 'Nicastrese'
Found code 'NVE': 'Nera Verzasca (NVE) Goat' for cortellari 'Nera di Verzasca'
Found code 'ORO': 'Orobica (ORO) Goat' for cortellari 'Orobica'
Found code 'RME': 'Rossa Mediterranea (RME) Goat' for cortellari 'Rossa Mediterranea'
Found code 'SAA': 'Saanen (SAA) Goat' for cortellari 'Saanen'
Found code 'SAR'

Here are the breeds I couldn't find:

In [12]:
# one "<code> <name>" line for each breed missing from the database
for missing_breed in not_found:
    print(*missing_breed)

BIA Bianca Monticellana
CAP Capestrina
DDS Derivata di Siria
FAC Facciuta della Valnerina
FUL Fulva del Lazio
GCI Grigia Ciociara
GIR Girgentana
GRF Garfagnana
LIV Capra di Livo
MES Messinese
MNT_M Montecristo (mainland)
MNT_I Montecristo (island)
MON Capra di Montefalcone
MXS Incrocio Maltese e Sarda
RCC Roccaverano
SAM Maltese sampled in Sardinia
TER Capra di Teramo
VLS Vallesana
VPS Capra della Val Passiria


Ok, I need to rename some breed names or codes to be consistent with the SMARTER database:

In [13]:
# duplicate code columns
breeds['code'] = breeds['fid']

# fix breed codes
breeds.set_index('fid', inplace=True)
breeds.at["MAL", "code"] = "MLT"
breeds.at["GCI", "code"] = "CCG"
breeds.at["GIR", "code"] = "GGT"
breeds.at["TER", "code"] = "DIT"
breeds.at["VPS", "code"] = "VSS"

# fix breed names
for code in ["ALP", "ARG", "ASP", "NVE"]:
    breed = Breed.objects.get(species="Goat", code=code)
    breeds.at[code, 'breed'] = breed.name

breeds.reset_index(inplace=True)
breeds.set_index('code', inplace=True)

for code in ["CCG", "DIT", "VSS"]:
    breed = Breed.objects.get(species="Goat", code=code)
    breeds.at[code, 'breed'] = breed.name

breeds.reset_index(inplace=True)
breeds.head()

Unnamed: 0,code,fid,breed,raw_dataset_count,haplotype_sharing_count,population_structure_count,landscape_genomics_count
0,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
1,ARG,ARG,Argentata,48,46,30,41
2,ASP,ASP,Aspromontana,24,24,24,18
3,BEZ,BEZ,Bezoar (outgroup),7,7,7,0
4,BIA,BIA,Bianca Monticellana,24,23,23,17


In [14]:
# same check as before, this time using the fixed 'code' column
not_found = []

for code, name in zip(breeds['code'], breeds['breed']):
    if Breed.objects.filter(code=code, species="Goat").count() == 0:
        not_found.append((code, name))
        continue

    # the code is already in use: show which breed owns it
    breed = Breed.objects.get(code=code, species="Goat")
    print(f"Found code '{code}': '{breed}' for cortellari '{name}'")

Found code 'ALP': 'Alpine (Camosciata delle Alpi) (ALP) Goat' for cortellari 'Alpine (Camosciata delle Alpi)'
Found code 'ARG': 'Argentata (ARG) Goat' for cortellari 'Argentata'
Found code 'ASP': 'Aspromontana (ASP) Goat' for cortellari 'Aspromontana'
Found code 'BEZ': 'Bezoar (BEZ) Goat' for cortellari 'Bezoar (outgroup)'
Found code 'BIO': 'Bionda dell'Adamello (BIO) Goat' for cortellari 'Bionda dell'Adamello'
Found code 'GAR': 'Garganica (GAR) Goat' for cortellari 'Garganica'
Found code 'CCG': 'Ciociara Grigia (CCG) Goat' for cortellari 'Ciociara Grigia'
Found code 'GGT': 'Girgentana (GGT) Goat' for cortellari 'Girgentana'
Found code 'JON': 'Jonica (JON) Goat' for cortellari 'Jonica'
Found code 'MLT': 'Maltese (MLT) Goat' for cortellari 'Maltese'
Found code 'NIC': 'Nicastrese (NIC) Goat' for cortellari 'Nicastrese'
Found code 'NVE': 'Nera Verzasca (NVE) Goat' for cortellari 'Nera Verzasca'
Found code 'ORO': 'Orobica (ORO) Goat' for cortellari 'Orobica'
Found code 'RME': 'Rossa Medite

In [15]:
# one "<code> <name>" line for each breed still missing from the database
for missing_breed in not_found:
    print(*missing_breed)

BIA Bianca Monticellana
CAP Capestrina
DDS Derivata di Siria
FAC Facciuta della Valnerina
FUL Fulva del Lazio
GRF Garfagnana
LIV Capra di Livo
MES Messinese
MNT_M Montecristo (mainland)
MNT_I Montecristo (island)
MON Capra di Montefalcone
MXS Incrocio Maltese e Sarda
RCC Roccaverano
SAM Maltese sampled in Sardinia
VLS Vallesana


Ok, there are some samples that seem to be already in the SMARTER database: those samples were identified using the [king](https://www.kingrelatedness.com/) software. Read those sample names from file:

In [16]:
# Load the samples to discard; the first CSV column becomes the index and the
# sample names are in the 'ID2' column.
to_remove = pd.read_csv(cortellari_dataset.working_dir / "s41598-021-89900-2/to_remove.csv", index_col=0)
to_remove.head()

Unnamed: 0,ID2
0,ALP215
1,ALP216
2,ALP217
3,ALP218
4,ALP219


Read genotypes and create a simple table for fid and iid:

In [17]:
# the first two fields of each genotype line are the family and sample ids
tmp = pd.DataFrame(
    [(fid, original_id) for fid, original_id, *_ in plinkio.read_pedfile()],
    columns=['fid', 'original_id'],
)
tmp.head()

Unnamed: 0,fid,original_id
0,ARG,ARG1
1,ARG,ARG2
2,ARG,ARG3
3,ARG,ARG4
4,ARG,ARG5


Merge samples with breed information:

In [18]:
# attach the breed information to every sample through the shared 'fid',
# then index the result by sample name
samples = (
    tmp.set_index('fid')
    .join(breeds.set_index('fid'))
    .reset_index()
    .set_index("original_id")
)
samples.head()

Unnamed: 0_level_0,fid,code,breed,raw_dataset_count,haplotype_sharing_count,population_structure_count,landscape_genomics_count
original_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ALP128,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
ALP129,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
ALP130,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
ALP131,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
ALP132,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43


Now transform the *ID2* series into a list, then drop those indexes:

In [19]:
# sample names to discard (rows of `samples` are labelled by original_id)
ids_to_drop = to_remove['ID2'].to_list()

filtered_samples = samples.drop(index=ids_to_drop).reset_index()
filtered_samples.head()

Unnamed: 0,original_id,fid,code,breed,raw_dataset_count,haplotype_sharing_count,population_structure_count,landscape_genomics_count
0,ALP211,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
1,ALP212,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
2,ALP213,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
3,ALP214,ALP,ALP,Alpine (Camosciata delle Alpi),143,117,30,43
4,ARG10,ARG,ARG,Argentata,48,46,30,41


Remove unused columns and add country

In [20]:
# Per-breed counters coming from the paper's table: not needed any more.
count_columns = ["raw_dataset_count", "haplotype_sharing_count", "population_structure_count", "landscape_genomics_count"]

# Build a new frame instead of mutating in place: with errors="ignore" this
# cell can be executed again after the columns have already been removed.
filtered_samples = (
    filtered_samples
    .drop(columns=count_columns, errors="ignore")
    .assign(country="Italy")
)
filtered_samples.head()

Unnamed: 0,original_id,fid,code,breed,country
0,ALP211,ALP,ALP,Alpine (Camosciata delle Alpi),Italy
1,ALP212,ALP,ALP,Alpine (Camosciata delle Alpi),Italy
2,ALP213,ALP,ALP,Alpine (Camosciata delle Alpi),Italy
3,ALP214,ALP,ALP,Alpine (Camosciata delle Alpi),Italy
4,ARG10,ARG,ARG,Argentata,Italy


Write samples into a file:

In [21]:
# Export the final sample table to an Excel file; the path is relative, so
# the file is created in the current working directory.
filtered_samples.to_excel("cortellari_samples_fix.xlsx")

In [22]:
# Final check: number of rows, columns, non-null counts and dtypes.
filtered_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   original_id  523 non-null    object
 1   fid          523 non-null    object
 2   code         523 non-null    object
 3   breed        523 non-null    object
 4   country      523 non-null    object
dtypes: object(5)
memory usage: 20.6+ KB
