## Code used to generate data for ontology annotations
#### Values needed for ontology table:
* `trait`
* `trait_description`
* `units`
* `method_name`

In [1]:
import pandas as pd

In [6]:
# Load the raw season 4 trait export and report its (rows, columns) shape.
df = pd.read_csv('data/raw/mac_season_4.csv')
frame_shape = df.shape
print(f'Shape of dataframe: {frame_shape}')

Shape of dataframe: (372363, 39)


In [7]:
# Distinct trait names; the per-trait loops further down iterate over this.
unique_traits = df['trait'].unique()

# nunique() skips missing values, so it is reported separately from the array above.
trait_count = df['trait'].nunique()
print(f'{trait_count} unique traits within dataset')

75 unique traits within dataset


### I. Find all traits that have more than one unique value for:
* `trait_description`
* `units`
* `method_name`

In [25]:
# Annotation columns expected to hold a single value per trait, mapped to the
# wording used for each in the report line. Dict order fixes the order in
# which the checks are reported for a given trait.
annotation_labels = {
    'trait_description': 'trait description',
    'units': 'value for units',
    'method_name': 'method name',
}

# Count distinct non-null values per trait for all three columns in a single
# pass instead of re-filtering the full frame three times for every trait.
# sort=False keeps traits in order of first appearance, matching the order of
# df.trait.unique().
distinct_per_trait = (
    df.groupby('trait', sort=False)[list(annotation_labels)].nunique()
)

for trait, distinct_counts in distinct_per_trait.iterrows():
    for column, label in annotation_labels.items():
        if distinct_counts[column] > 1:
            print(f'Trait with more than one unique {label}: {trait}')

Trait with more than one unique method name: canopy_height


### II. Trait descriptions
Four traits have ontology identifiers in their trait description:
* `lodging_present`
* `stem_elongated_internodes_number`
* `emergence_count`
* `plant_basal_tiller_number`

In [1]:
# Switch: set to True to list every distinct trait description for each trait.
want_to_print_unique_descriptions = False

if want_to_print_unique_descriptions:
    for trait in unique_traits:
        # Distinct descriptions recorded on the rows belonging to this trait.
        descriptions = df.loc[df['trait'] == trait, 'trait_description'].unique()
        print(f'All unique descriptions for {trait}: {descriptions}')

#### Find all trait descriptions containing ontology identifiers
* Search for string values containing `CO_`

In [20]:
# Select the rows whose trait description contains the 'CO_' marker.
# na=False makes rows with a missing description count as non-matches, so the
# mask is strictly boolean without comparing the result against True;
# regex=False matches 'CO_' as a literal substring.
has_identifier = df['trait_description'].str.contains('CO_', na=False, regex=False)
ontology_identifiers = df.loc[has_identifier]
print(f'Traits with ontology identifiers in trait description: {ontology_identifiers.trait.unique()}')

Traits with ontology identifiers in trait description: ['lodging_present' 'stem_elongated_internodes_number' 'emergence_count'
 'plant_basal_tiller_number']


### III. Units

In [2]:
# Switch: set to True to list every distinct units value for each trait.
want_to_print_unique_traits = False

if want_to_print_unique_traits:
    for trait in unique_traits:
        # Distinct units recorded on the rows belonging to this trait.
        trait_units = df.loc[df['trait'] == trait, 'units'].unique()
        print(f'All unique units for {trait}: {trait_units}')

#### Example exploration code for null values
* View one complete row for more information

In [14]:
# Display every column of the first row recorded for 'leaf_desiccation_present'.
df[df['trait'].eq('leaf_desiccation_present')].iloc[0]

Unnamed: 0                                                           1
checked                                                              0
result_type                                                     traits
id                                                          6001958927
citation_id                                                      6e+09
site_id                                                     6000005673
treatment_id                                                     6e+09
sitename                  MAC Field Scanner Season 4 Range 11 Column 5
city                                                          Maricopa
lat                                                            33.0749
lon                                                           -111.975
scientificname                                         Sorghum bicolor
commonname                                                     sorghum
genus                                                          Sorghum
specie

### IV. Method Name
* `canopy_height` is the only trait with more than one method

In [3]:
# Switch: set to True to list every distinct method name for each trait.
want_to_print_methods = False

if want_to_print_methods:
    for trait in unique_traits:
        # Distinct method names recorded on the rows belonging to this trait.
        method_names = df.loc[df['trait'] == trait, 'method_name'].unique()
        print(f'All unique method names for {trait}: {method_names}')