Join V2 data from TERRA program recveved directly from Roman
(Season 6; copied notebook TERRA-3)

In [4]:
import pandas as pd

In [7]:
#heights = pd.read_csv("/Volumes/Curt-MacPro-Backup/D3M/terra/Raw-data/V2-Roman-1108/s6.csv")
#genids = pd.read_csv("/Volumes/Curt-MacPro-Backup/D3M/terra/Raw-data/V2-Roman-1108/s6_genotypes.csv")
#gennames = pd.read_csv("/Volumes/Curt-MacPro-Backup/D3M/terra/Raw-data/V2-Roman-1108/genotype_names.csv")
heights = pd.read_csv("/media/clisle/Backup Plus/terra/Raw-data/V2-Roman-1108/s6.csv")
genids = pd.read_csv("/media/clisle/Backup Plus/terra/Raw-data/V2-Roman-1108/s6_genotypes.csv")
gennames = pd.read_csv("/media/clisle/Backup Plus/terra/Raw-data/V2-Roman-1108/genotype_names.csv")

In [8]:
heights.head()

Unnamed: 0,day_number,range,column,sensor,height(cm)
0,115,3,2,1,4.4
1,115,3,2,2,6.16
2,115,3,3,1,5.76
3,115,3,3,2,6.4
4,115,3,4,1,5.79


In [9]:
def returnUniqueCounts(dframe):
    return pd.DataFrame.from_records([(col, dframe[col].nunique()) for col in dframe.columns],
                          columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])

In [10]:
returnUniqueCounts(heights)

Unnamed: 0,Column_Name,Num_Unique
3,sensor,2
2,column,14
1,range,50
0,day_number,68
4,height(cm),25752


In [11]:
returnUniqueCounts(genids)

Unnamed: 0,Column_Name,Num_Unique
1,column,14
0,range,50
2,genotype_id,326


In [12]:
returnUniqueCounts(gennames)

Unnamed: 0,Column_Name,Num_Unique
0,genotype_id,350
1,genotype_string,350


Now do a join to first add the genID to each height measurement.  This is an inner join so we fill in only values we already have in the left dataframe.  We don't want to concatenate rows onto the left dataframe.  By specifying range,column to match, this will add the genotype_id (cultivar ID) to the height data.

Info on joins in Pandas: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging

In [13]:
join1 = pd.merge(left=heights,right=genids,on=['range','column'],how='inner')

In [14]:
returnUniqueCounts(join1)

Unnamed: 0,Column_Name,Num_Unique
3,sensor,2
2,column,14
1,range,50
0,day_number,68
5,genotype_id,326
4,height(cm),25752


In [15]:
join1.tail()

Unnamed: 0,day_number,range,column,sensor,height(cm),genotype_id
87745,205,52,15,2,264.58,350
87746,206,52,15,1,272.36,350
87747,206,52,15,2,261.45,350
87748,208,52,15,1,277.41,350
87749,208,52,15,2,261.61,350


Now we add the genotype (cultivar) name by doing a join on the genotype_id column

In [16]:
join2 = pd.merge(left=join1,right=gennames,on='genotype_id',how='inner')
returnUniqueCounts(join2)

Unnamed: 0,Column_Name,Num_Unique
3,sensor,2
2,column,14
1,range,50
0,day_number,68
5,genotype_id,326
6,genotype_string,326
4,height(cm),25752


In [17]:
join2.tail()

Unnamed: 0,day_number,range,column,sensor,height(cm),genotype_id,genotype_string
87745,205,50,12,2,285.89,21,PI569090
87746,206,50,12,1,286.85,21,PI569090
87747,206,50,12,2,288.15,21,PI569090
87748,208,50,12,1,291.18,21,PI569090
87749,208,50,12,2,289.07,21,PI569090


In [18]:
join2.head()

Unnamed: 0,day_number,range,column,sensor,height(cm),genotype_id,genotype_string
0,115,3,2,1,4.4,350,SP1516
1,115,3,2,2,6.16,350,SP1516
2,116,3,2,1,5.56,350,SP1516
3,116,3,2,2,6.27,350,SP1516
4,119,3,2,1,5.66,350,SP1516


In [19]:
join2.loc[(join2['genotype_string'] == 'PI145619') ].describe()

Unnamed: 0,day_number,range,column,sensor,height(cm),genotype_id
count,256.0,256.0,256.0,256.0,256.0,256.0
mean,158.917969,22.5,5.5,1.496094,107.313242,340.0
std,24.962818,6.512733,3.506856,0.500964,97.792519,0.0
min,115.0,16.0,2.0,1.0,4.41,340.0
25%,139.75,16.0,2.0,1.0,11.825,340.0
50%,157.5,22.5,5.5,1.0,76.655,340.0
75%,178.0,29.0,9.0,2.0,207.2425,340.0
max,208.0,29.0,9.0,2.0,291.84,340.0


Therefore, the join2 dataframe contains the new canopy height data.  Save it out:

In [21]:
join2.to_csv('/media/clisle/Backup Plus/terra/processing/V2/s6_height.csv')

In [22]:
# lets just pick one of the sensors first to simplify.  It will be better to average the sensors, but that will take a while. 
sensor1 = join2.loc[(join2['sensor'] == 1)]
print(join2.shape)
print(sensor1.shape)

(87750, 7)
(43986, 7)


In [23]:
sensor1.to_csv('/media/clisle/Backup Plus/terra/processing/V2/s6_height_s1.csv')

Since it takes a long time to run, lets add a date field to the S4 height information to further facilitate time sequence modeling

In [24]:
import arrow
import itertools

count = 0
startdate = arrow.get("2019-01-01T00:29:00.655800-05:00")

def convertDayToDate(startdate,dayOffset):
    return startdate.shift(days=int(dayOffset))

sensor1['date'] = startdate
    
for i in range(len(sensor1)):
    sensor1['date'][i] = convertDayToDate(startdate,sensor1['day_number'][i])
    count += 1
    if (count % 5000) == 0:
        print(count)


sensor1.to_csv('/media/clisle/Backup Plus/terra/processing/V2/s6_height_s1_date.csv')


ModuleNotFoundError: No module named 'arrow'

In [46]:
arrow.now()

<Arrow [2019-11-09T00:29:00.655800-05:00]>

In [48]:
sensor1['day_number'][4]

123

In [51]:
sensor1.head()

Unnamed: 0,day_number,range,column,sensor,height(cm),genotype_id,genotype_string,date
0,118,3,2,1,5.89,1,PI329465,2019-01-01T00:29:00.655800-05:00
2,121,3,2,1,6.15,1,PI329465,2019-01-01T00:29:00.655800-05:00
4,123,3,2,1,5.98,1,PI329465,2019-01-01T00:29:00.655800-05:00
6,124,3,2,1,5.99,1,PI329465,2019-01-01T00:29:00.655800-05:00
8,125,3,2,1,6.02,1,PI329465,2019-01-01T00:29:00.655800-05:00
