### Code for Creating Cultivar Lookup Table
* MAC Season 4 Cultivars (2017)
* MAC Season 6 Cultivars (2018)
* KSU Cultivars (2016)
* Clemson Cultivars (2014)

In [None]:
import datetime
import numpy as np
import pandas as pd

In [None]:
# Load the MAC season 4 cultivar table and report its (rows, columns) shape.
season_4 = pd.read_csv('data/interim/season_4_cultivars.csv')
print(season_4.shape)
# season_4.head()

In [None]:
# Load the MAC season 6 cultivar table and report its (rows, columns) shape.
season_6 = pd.read_csv('data/interim/season_6_cultivars.csv')
print(season_6.shape)
# season_6.head()

In [None]:
# Load the KSU cultivar table and report its (rows, columns) shape.
ksu = pd.read_csv('data/interim/ksu_cultivars.csv')
print(ksu.shape)
# ksu.head()

In [None]:
# Load the table of cultivars that have genomic data and report its
# (rows, columns) shape.
genomics = pd.read_csv('data/interim/genomics_cultivars.csv')
print(genomics.shape)
# genomics.head()

In [None]:
# Load the Clemson cultivar table and report its (rows, columns) shape.
clemson = pd.read_csv('data/interim/clemson_cultivars.csv')
print(clemson.shape)
# clemson.head()

In [None]:
# Discard the 'Unnamed: 0' column (presumably an index saved with the CSV --
# confirm) and keep the result as a new frame.
clemson_1 = clemson.drop(columns='Unnamed: 0')
print(clemson_1.shape)
# clemson_1.head()

In [None]:
# Order the Clemson rows by cultivar name and renumber the index from 0.
clemson_sorted = clemson_1.sort_values('clemson_cultivars')
clemson_2 = clemson_sorted.reset_index(drop=True)
print(clemson_2.shape)
# clemson_2.head()

#### Create lists of all cultivar column values

In [None]:
# Pull the cultivar column out of each source table as a plain array of values.
s4_values = season_4['season_4_cultivars'].values
s6_values = season_6['season_6_cultivars'].values
ksu_values = ksu['ksu_cultivars'].values
genomics_values = genomics['with_genomic_data'].values
clemson_values = clemson_2['clemson_cultivars'].values

#### Create Union of all cultivars for new index

In [None]:
# Collapse every source into a set of distinct cultivars, then fold the sets
# together left to right so each cultivar appears exactly once in the result.
source_sets = [
    set(s4_values),
    set(s6_values),
    set(ksu_values),
    set(genomics_values),
    set(clemson_values),
]
combined = source_sets[0]
for source_set in source_sets[1:]:
    combined = combined | source_set
all_cultivars = list(combined)

In [None]:
len(all_cultivars)

#### Create boolean lists for new column values

In [None]:
# season 4

# Flag every cultivar in the combined list: 1 if it appears in the season 4
# list, 0 otherwise. The season 4 values are hashed once into a set so each
# membership test is a constant-time lookup instead of a scan of the array.
s4_lookup = set(s4_values)
new_s4 = [1 if cultivar in s4_lookup else 0 for cultivar in all_cultivars]

print(len(new_s4))

In [None]:
# season 6

# Flag every cultivar in the combined list: 1 if it appears in the season 6
# list, 0 otherwise. The season 6 values are hashed once into a set so each
# membership test is a constant-time lookup instead of a scan of the array.
s6_lookup = set(s6_values)
new_s6 = [1 if cultivar in s6_lookup else 0 for cultivar in all_cultivars]

print(len(new_s6))

In [None]:
# ksu

# Flag every cultivar in the combined list: 1 if it appears in the KSU list,
# 0 otherwise. The KSU values are hashed once into a set so each membership
# test is a constant-time lookup instead of a scan of the array.
ksu_lookup = set(ksu_values)
new_ksu = [1 if cultivar in ksu_lookup else 0 for cultivar in all_cultivars]

print(len(new_ksu))

In [None]:
# cultivars with genomics data

# Flag every cultivar in the combined list: 1 if it appears in the genomics
# list, 0 otherwise. The genomics values are hashed once into a set so each
# membership test is a constant-time lookup instead of a scan of the array.
genomics_lookup = set(genomics_values)
new_geno = [1 if cultivar in genomics_lookup else 0 for cultivar in all_cultivars]

print(len(new_geno))

In [None]:
# clemson

# Flag every cultivar in the combined list: 1 if it appears in the Clemson
# list, 0 otherwise. The Clemson values are hashed once into a set so each
# membership test is a constant-time lookup instead of a scan of the array.
clemson_lookup = set(clemson_values)
new_clemson = [1 if cultivar in clemson_lookup else 0 for cultivar in all_cultivars]

print(len(new_clemson))

#### New DataFrame with All Cultivars

In [None]:
# One 0/1 column per source; rows are labelled by cultivar and then sorted
# by that label.
presence_flags = {
    'season_4': new_s4,
    'season_6': new_s6,
    'ksu': new_ksu,
    'clemson': new_clemson,
    'genomic_data': new_geno,
}
cultivar_df = pd.DataFrame(data=presence_flags, index=all_cultivars)
cultivar_df = cultivar_df.sort_index()

print(cultivar_df.shape)
# cultivar_df.head()

#### New column with the total count for each cultivar (i.e. a value of 2 indicates that the cultivar is present in 2 of the source columns)

In [None]:
# For every cultivar, count how many of the five source columns flag it.
# A single row-wise sum over the flag columns replaces a Python-level loop
# over iterrows(); tolist() yields plain Python ints in row order.
source_columns = ['season_4', 'season_6', 'ksu', 'clemson', 'genomic_data']
totals = cultivar_df[source_columns].sum(axis=1).tolist()

print(len(totals))
print(totals[:5])

In [None]:
# Build a new frame carrying the per-cultivar totals as 'total_count';
# cultivar_df itself is left unmodified.
cultivar_df_1 = cultivar_df.assign(total_count=totals)
print(cultivar_df_1.shape)
cultivar_df_1.tail(3)

In [None]:
# cultivar_df_1.sample(n=10)

#### Write to `.csv`

In [None]:
# Tag the output file with the current local time at one-second resolution.
timestamp = datetime.datetime.now().isoformat(timespec='seconds')
# Colons from the ISO time are removed from the resulting path string.
output_filename = f'data/processed/cultivar_lookup_table_{timestamp}.csv'.replace(':', '')

# The cultivar names live in the index, so the index is written out too.
cultivar_df_1.to_csv(output_filename, index=True)