# Manual Changes

## template mapping files are in the git repository
## original data in _CyVerse Discovery Environment_ 
### data file is: "J.Biogeo.2008.AllData.Final.csv"

### _catalogNumber_
- in Specimen.Number column (new catalogNumber)
- separate out institutionCode from Specimen.Number
- create new column titled institutionCode

### _measurementUnit_
- either in "g" or "mm"

### _otherCatalogNumbers_
- concatenated list of:
    - Proxy.Specimen.Number
    - Annual.Specimen.Number
    - YOC.Specimen.Number

### _unused columns_
- datum (units for latitude)

## To Code
### _elevationInMeters_
- in _elevation.ft_
- convert ot meters

In [34]:
import pandas as pd
import numpy as np
import re
import uuid

In [35]:
#Import Biogeo Data Locally
biogeo = pd.read_csv("../Original Data/biogeo.csv")

In [36]:
#Preliminary data cleaning

#Convert elevation.ft values from feet to meters
#1 foot is exactly 0.3048 meters
biogeo['elevation.ft']=biogeo['elevation.ft'].multiply(0.3048)
#it's not being renamed here because it is renamed later in the script

In [37]:
#Preliminary data cleaning

#Creating a new column called institutionCode and moving from Specimen.Number to institutionCode.  
biogeo=biogeo.assign(institutionCode = "")
for ind in biogeo.index:
    x=biogeo['Specimen.Number'][ind]
    y=str(x)
    z=str(y).split()
    biogeo['institutionCode'][ind]=z[0]
    y=re.sub(z[0],'',y)
    biogeo['Specimen.Number'][ind]=y


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [38]:
#Add measurementUnit column 
biogeo=biogeo.assign(measurementUnit = "")

In [39]:
#Add otherCatalogNumbers

biogeo=biogeo.assign(otherCatalogNumbers = biogeo['Proxy.Specimen.Number'].fillna('')+biogeo['Annual.Specimen.Number'].fillna('')+biogeo['Annual.Specimen.Number'].fillna('') )


#otherCatalogNumbers
#concatenated list of:
#Proxy.Specimen.Number
#Annual.Specimen.Number
#YOC.Specimen.Number
biogeo

#df2 = df.assign(ColumnA = df.Column2.astype(str) + ', ' + \
#  df.Column3.astype(str) + ', ' df.Column4.astype(str) + ', ' \
#  df.Column4.astype(str) + ', ' df.Column5.astype(str))

Unnamed: 0,Specimen.Number,Proxy.Specimen.Number,Annual.Specimen.Number,YOC.Specimen.Number,dec.lat,dec.long,max.error,datum,elevation.ft,c.diastema.1.mm,...,spr.precip.in,sum.max.c,sum.min.c,sum.precip.in,win.max.c,win.min.c,win.precip.in,institutionCode,measurementUnit,otherCatalogNumbers
0,100739,MVZ 100739,,,36.458730,-121.234230,0.16089,NAD27,457.20,8.17,...,,,,,,,,MVZ,,MVZ 100739
1,100740,MVZ 100740,,,35.328304,-119.845250,4.02300,NAD27,822.96,7.52,...,,,,,,,,MVZ,,MVZ 100740
2,101240,,MVZ 101240,MVZ 101240,37.850522,-122.536923,1.55600,NAD27,,8.20,...,2.106667,71.27667,52.82667,0.103333,58.02667,44.04667,4.473333,MVZ,,MVZ 101240MVZ 101240
3,101332,,MVZ 101332,MVZ 101332,38.107071,-122.841182,0.65000,NAD27,,9.78,...,1.956667,79.35000,47.49000,0.093333,60.95667,38.16333,4.840000,MVZ,,MVZ 101332MVZ 101332
4,101333,MVZ 101333,MVZ 101333,MVZ 101333,38.119278,-122.821322,2.17580,NAD27,45.72,9.52,...,1.956667,79.35000,47.49000,0.093333,60.95667,38.16333,4.840000,MVZ,,MVZ 101333MVZ 101333MVZ 101333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,99936,MVZ 99936,,,33.925100,-116.681400,1.00000,NAD27,365.76,9.20,...,,,,,,,,MVZ,,MVZ 99936
284,99937,MVZ 99937,,,33.925100,-116.681400,1.00000,NAD27,365.76,8.52,...,,,,,,,,MVZ,,MVZ 99937
285,99938,MVZ 99938,,,33.925100,-116.681400,1.00000,NAD27,365.76,7.86,...,,,,,,,,MVZ,,MVZ 99938
286,99940,MVZ 99940,,,33.877686,-116.621661,1.60900,NAD27,335.28,8.10,...,,,,,,,,MVZ,,MVZ 99940


In [40]:
#Rearrange columns so that template columns are first, followed by measurement values

#Create column list
cols = biogeo.columns.tolist()

#Specify desired columns
cols = ['Specimen.Number',
        'institutionCode',
        'otherCatalogNumbers',
        'dec.lat',
        'dec.long',  
        'max.error',
        'elevation.ft',
        'ear.length.mm',
        'hind.foot.length.mm',
        'tail.length.mm',
        'total.length.mm',
        'body.mass.g',
        'measurementUnit']

#Subset dataframe
biogeo = biogeo[cols]

In [41]:
#Matching template and column terms

#Renaming columns 
biogeo = biogeo.rename(columns = {'Specimen.Number':'catalogNumber', 
                                  'dec.lat':'decimalLatitude', 
                                  'dec.long':'decimalLongitude',  
                                  'max.error':'coordinateUncertaintyInMeters', 
                                  'elevation.ft':'pointElevationInMeters'})

In [42]:
#Matching trait and ontology terms

#Renaming columns
biogeo = biogeo.rename(columns={'ear.length.mm':'ear length',
                                'hind.foot.length.mm':'hind foot length',
                                'tail.length.mm': 'tail length',
                                'total.length.mm':'full body length',
                                'body.mass.g':'body mass'})

In [43]:
#create long version so that each trait has its own row

#creating long version, first specifiying keep variables, then naming variable and value
longVers=pd.melt(biogeo, 
                id_vars=['catalogNumber',
                         'institutionCode',
                         'otherCatalogNumbers',
                         'decimalLatitude',
                         'decimalLongitude',  
                         'coordinateUncertaintyInMeters',
                         'pointElevationInMeters',
                         'measurementUnit'], 
                          var_name = 'measurementType', 
                          value_name = 'measurementValue')

#Populating measurementUnit column with appropriate measurement units in long version
for ind in longVers.index:
    if longVers['measurementType'][ind] == "body mass":
        longVers['measurementUnit'][ind]="g"
    else:
        longVers['measurementUnit'][ind]="mm"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [44]:
#create materialSampleID which is a UUID for each measurement
longVers=longVers.assign(materialSampleID = '')
longVers['materialSampleID'] = [uuid.uuid4() for _ in range(len(longVers.index))]

In [45]:
#Writing long data csv file
longVers.to_csv('../Mapped Data/Biogeo_Data_Long.csv');