# Goat metadata description
Try to understand which metadata we have and which metadata we still need.

In [None]:
import json

import pandas as pd

from src.features.smarterdb import global_connection, SampleGoat, Dataset

In [None]:
# Open the project database connection. The returned handle is kept in `conn`
# but is not referenced again in the cells shown here.
# NOTE(review): presumably the document classes queried below rely on this
# connection being established — confirm in src.features.smarterdb
conn = global_connection()

Get all goat samples:

In [None]:
# Serialize every goat sample document to JSON, then flatten the nested
# fields into a DataFrame with dotted column names (e.g. "phenotype.purpose")
goats = pd.json_normalize(json.loads(SampleGoat.objects.all().to_json()))
goats.head()

In [None]:
# Print column names, non-null counts and dtypes for the full goat sample table
goats.info()

Get only informative columns

In [None]:
# Columns kept for the metadata overview: identifiers, origin/breed,
# phenotype fields, coordinates, sex and the reference to the owning dataset
goat_columns = [
    "original_id",
    "smarter_id",
    "country",
    "breed",
    "breed_code",
    "type",
    "phenotype.purpose",
    "phenotype.köppen_group",
    "phenotype.chest_girth",
    "phenotype.height",
    "phenotype.length",
    "phenotype.widthofpinbones",
    "phenotype.famacha",
    "phenotype.coat_color",
    "locations.coordinates",
    "sex",
    "dataset_id.$oid",
]
goats_useful = goats[goat_columns]
goats_useful.head()

In [None]:
# Non-null counts per selected column show how complete each metadata field is
goats_useful.info()

Summarize info for `background` and `foreground` data. For every column (text columns included), the count is simply the number of rows that have a (non-null) value:

In [None]:
# For each sample "type", count the non-null values in every other column
goats_useful.groupby("type").count()

Get info on goat genotype datasets:

In [None]:
# Select the genotype datasets registered for goats ...
goat_genotypes = Dataset.objects.filter(species="Goat", type_="genotypes").all()

# ... and flatten their JSON representation into a DataFrame
datasets = pd.json_normalize(json.loads(goat_genotypes.to_json()))
datasets.head()

Get only the columns I need for datasets:

In [None]:
# "_id.$oid" is kept because it is the join key against the samples' "dataset_id.$oid"
dataset_columns = ["_id.$oid", "file", "n_of_individuals", "partner", "chip_name"]
datasets_useful = datasets[dataset_columns]

In [None]:
# Check dtypes and non-null counts of the selected dataset columns
datasets_useful.info()

Try to merge dataframes on object ids:

In [None]:
# Inner join: keep only samples whose dataset reference matches one of the
# selected datasets (and only datasets that have at least one sample)
samples = datasets_useful.merge(
    goats_useful,
    how="inner",
    left_on="_id.$oid",
    right_on="dataset_id.$oid",
)

In [None]:
# Preview the first rows of the merged dataset/sample table
samples.head()

In [None]:
# Row count, dtypes and non-null counts of the merged table
samples.info()

Group samples by dataset file (and partner). Then count, for each column, the rows which have a (non-null) value:

In [None]:
# For every (file, partner) pair, count the rows holding a non-null value in each column
counts_by_file = samples.groupby(["file", "partner"]).count()
print(counts_by_file.columns)

# Keep "file" as the only index level; "partner" goes back to being a regular column
dataset_samples = counts_by_file.reset_index().set_index("file")
dataset_samples.head()

Focus only on some columns and add a total row at the bottom:

In [None]:
# Build the per-file summary table, append a "total" row and export it to Excel.
columns = ["partner", "n_of_individuals", "phenotype.purpose", "phenotype.köppen_group", "phenotype.chest_girth",
           "phenotype.height", "phenotype.length", "phenotype.widthofpinbones", "phenotype.famacha", "phenotype.coat_color",
           "locations.coordinates", "sex"]

# Per-file rows. Every column except "partner" holds counts produced by the
# groupby(...).count() step, so "n_of_individuals" here is the number of
# sample rows per file, not the value stored in the dataset record.
d1 = dataset_samples[columns]

# Sum only the count columns: summing the whole frame would also "add up" the
# partner strings (string concatenation) and yield an object-dtype result that
# turns every count column of the summary into object dtype after the concat.
count_columns = [name for name in columns if name != "partner"]
d2 = d1[count_columns].sum().to_frame("total").transpose()

# The "total" row has no partner, so that cell is left empty (NaN)
summary = pd.concat([d1, d2], axis=0)
summary.to_excel("goat_summary.xlsx")
summary

How many phenotypes do we have? Can we define a set of allowed values?

In [None]:
# Frequency of each distinct "phenotype.purpose" value; rows where it is
# missing are excluded by value_counts' default (dropna=True)
samples.value_counts("phenotype.purpose")

In [None]:
# Frequency of each distinct "phenotype.köppen_group" value; rows where it is
# missing are excluded by value_counts' default (dropna=True)
samples.value_counts("phenotype.köppen_group")

In [None]:
# Descriptive statistics for the merged dataset/sample table
samples.describe()