# Sort Data

## imports

In [1]:
import pandas as pd
import numpy  as np
import json

from geopy.distance import geodesic

In [2]:
def census_diff(df):
    """Summarise the sampling window of every census and the gap between censuses.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``plot``, ``census`` and ``date``. ``date`` may be
        datetime-like or anything ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        One row per (plot, census) with the columns ``census``, ``plot``,
        ``min``, ``max`` (first / last date of the census), ``mid`` (calendar
        date halfway between them), ``difference`` (days since the previous
        census of the *same plot*, NaN for a plot's first census),
        ``diff_yrs`` (``difference / 365``) and ``step``
        ("<previous census>-<census>", NaN when there is no gap).
        The index is "<plot>_<step>" (NaN where ``step`` is NaN).
    """
    dated = df.assign(date=pd.to_datetime(df["date"]))
    cen = dated.groupby(["plot", "census"])["date"].agg(["min", "max"])

    # midpoint of the census window, truncated to whole days
    mid_ts = (cen["min"] + (cen["max"] - cen["min"]) / 2).dt.normalize()
    cen["mid"] = mid_ts.dt.date

    # Gap to the previous census *within each plot*: a plain .diff() would
    # also subtract the last census of one plot from the first census of the
    # next plot. Dividing by a one-day Timedelta yields float days and avoids
    # astype('timedelta64[D]'), which recent pandas rejects.
    cen["difference"] = mid_ts.groupby(level="plot").diff() / pd.Timedelta(days=1)
    # a negative gap means the dates run against the census order; treat as unknown
    cen.loc[cen["difference"] < 0, "difference"] = np.nan
    cen["diff_yrs"] = cen["difference"] / 365

    # move both index levels back to columns (census first, then plot)
    cen = cen.reset_index(level="plot").reset_index()

    census_str = cen["census"].astype(str)
    cen["step"] = census_str.shift() + "-" + census_str
    # no gap -> no step label
    cen.loc[cen["difference"].isna(), "step"] = np.nan
    cen.index = cen["plot"] + "_" + cen["step"]

    return cen


### plot locations and fractal nesting

In [3]:
# open plot locations (GeoJSON is UTF-8 by specification, so do not rely on the locale default)
with open('../Data/rows.geojson', encoding='utf-8') as f:
    data = json.load(f)

# one row per geojson feature, built in a single pass: growing a frame with
# DataFrame.append inside a loop is quadratic, and append no longer exists in pandas 2.x
properties = pd.DataFrame([feature['properties'] for feature in data['features']])

# only really care about these columns
properties = properties.loc[: , ["plot_size",
                                 "centroid_y",
                                 "centroid_x",
                                 "fractal_order",
                                 "location"]]

# NOTE(review): centroid_y is renamed to "longitude" and centroid_x to "latitude", so
# `longlat` below holds [centroid_y, centroid_x]. geopy reads a coordinate pair as
# (latitude, longitude) - confirm whether the column names are simply swapped.
properties = properties.rename(columns={'centroid_y':'longitude', 'centroid_x':'latitude'})
properties["longlat"] = properties.apply(lambda x: [x.longitude, x.latitude], axis=1)

# separate dataframe for only second order fractal points
second_order = properties.loc[properties.fractal_order == 2, : ]

# function to find point closest to given point
def closest(pt, others):
    """Return the `location` label(s) of the entry in `others` nearest to `pt`.

    Parameters
    ----------
    pt : coordinate pair accepted by ``geodesic``
    others : DataFrame with a ``longlat`` column (coordinate pairs) and a
        ``location`` column.

    Returns
    -------
    pandas.Series
        ``location`` of every row whose ``longlat`` equals the nearest
        coordinate pair, with the index reset to 0..k-1.
    """
    def metres_from_pt(candidate):
        return geodesic(pt, candidate).meters

    nearest = min(others.longlat, key = metres_from_pt)

    # rows sharing exactly the nearest coordinates (normally a single row)
    is_nearest = others.longlat.apply(lambda coords: coords == nearest)

    return others.location.loc[is_nearest].reset_index(drop = True)


# fractal nesting table
fpn = pd.read_csv("../Data/Fractal_point_nesting.csv")

# above-ground biomass: keep the wanted columns and give them short
# lower-case names ***(going with Chave moist)***
agb = pd.read_csv("../Data/AGB.csv")[
    ["field_name", "Plot", "Date", "AGB_Chave_moist", "ForestQuality"]
].rename(columns = {"Plot"            : "plot",
                    "Date"            : "date",
                    "AGB_Chave_moist" : "agb",
                    "ForestQuality"   : "forestquality"})

## Mammals

### read in and sort raw data

In [37]:
# read in the raw trapping records, one CSV per plot

E, F, D, OG = (
    pd.read_csv(raw_csv)
    for raw_csv in (
        "../Data/small_mammals/E_2011-2016.csv",
        "../Data/small_mammals/F_2011-2012.csv",
        "../Data/small_mammals/D_2011-2016.csv",
        "../Data/small_mammals/OG2_2011-2016.csv",
    )
)

# sort and combine into one df
def sort_mams(df):
    """Tidy one raw small-mammal trapping table.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose first four columns are, in order, the trapping
        occasion, the date, the trap label and the species code. Any further
        columns are ignored. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``occasion``, ``date`` (datetime),
        ``trap``, ``species``, ``census`` (year of ``date``) and ``subplot``
        (integer category code of the trap label up to its last hyphen,
        -1 where the label is missing).
    """
    # only need first 4 columns; copy so the assignments below never write
    # into (or warn about) a slice of the caller's frame
    df = df.iloc[:, 0:4].copy()
    df.columns = ["occasion", "date", "trap", "species"]

    # stupid formatting sorted: collapse doubled hyphens (literal match, not a regex)
    df["occasion"] = df["occasion"].str.replace("--", "-", regex=False)
    df["trap"]     = df["trap"].str.replace("--", "-", regex=False)
    # drop the last character of the trap label; .str slicing keeps a
    # missing label as NaN instead of raising like x[:-1] does
    df["trap"]     = df["trap"].str[:-1]
    df["date"]     = pd.to_datetime(df["date"])
    df["census"]   = df["date"].dt.year
    # text before the last hyphen, encoded as integer category codes
    df["subplot"]  = df["trap"].str.rpartition("-")[0].astype("category").cat.codes

    return df

# tidy each plot's table and tag every row with the plot it came from
E  = sort_mams(E).assign(plot = "E")
F  = sort_mams(F).assign(plot = "F")
D  = sort_mams(D).assign(plot = "D")
OG = sort_mams(OG).assign(plot = "OG")

# stack the four plots into one capture table (row labels repeat across plots)
frames = [E, F, D, OG]
mamls_df = pd.concat(frames, sort = False)

# species lookup and dealing with ? and unknowns
m_lkup = pd.read_csv("../Data/small_mammals/mammals_lookup.csv")
m_lkup.columns = ["code", "species", "scientific"]

mamls_df["species"] = mamls_df.species.str.strip()

# if its a questionmark - I just go with it
# if its an either or I go with the first one!
# Every raw spelling is mapped straight to its final code in one pass.
species_fixes = {
    "CTRS-but see notes" : "CTRS",
    "SS?"                : "SS",
    "PR?"                : "PR",
    "RR?"                : "RR",
    "MR?"                : "MR",
    "RS?"                : "RS",
    "LGTRS?"             : "LGTRS",
    "BS?"                : "BS",
    "PSQ"                : "LSQ",      # not confident on this
    "RS or SS"           : "RS",
    "WH or SS"           : "WH",
    "BS/RS?"             : "BS",
    "PTSQ?"              : "PTSQ",
    "LETRS"              : "LETRS?",   # for some reason the lookup table has a ?
    "CBS?"               : "CBS",
    "SL?TRS"             : "SLTRS",
    "SLTRS?"             : "SLTRS",
    "L?TRS"              : "SLTRS",
    "LSQ?"               : "LSQ",
    "LTRS or CTRS"       : "CTRS",     # went with CTRS as LTRS could refer to a couple
    "LTRS"               : "LETRS?",   # not convinced about this one
    "Squirrel"           : "unknown",  # ***mmm?*** any squirrel ends up as unknown
    "squirrel"           : "unknown",
    "?"                  : "unknown",
    "Unknown"            : "unknown",
}
mamls_df["species"] = mamls_df["species"].replace(species_fixes)

# attach the scientific name for every cleaned code (codes absent from the lookup stay NaN)
mamls_df = mamls_df.merge(m_lkup[["code", "scientific"]],
                          how      = "left",
                          left_on  = "species",
                          right_on = "code")

In [41]:
# coordinates of every trap, looked up by trap label
mamls_df = mamls_df.merge(
    properties[["location", "longlat"]],
    how      = "left",
    left_on  = "trap",
    right_on = "location",
)

# label of the nearest second-order fractal point, then keep only the
# integer after the first underscore of that label
mamls_df["SecondOrder"] = mamls_df["longlat"].apply(lambda coords: closest(coords, second_order))
mamls_df["SecondOrder"] = mamls_df["SecondOrder"].str.partition("_")[2].astype(int)

# biomass / forest quality of that second-order point; both frames carry a
# `plot` column, so the mammal one comes out of the merge as `plot_x`
mamls_df = mamls_df.merge(
    agb[["plot", "agb", "forestquality"]],
    how      = "left",
    left_on  = "SecondOrder",
    right_on = "plot",
)

# keep the analysis columns, restore the `plot` name and stringify the row labels
mamls_df = mamls_df[[
    "occasion", "date", "trap", "subplot", "species", "plot_x",
    "census", "scientific", "longlat", "SecondOrder", "agb", "forestquality",
]].rename(index = str, columns = {"plot_x" : "plot"})

# two-character subplot code, then the plot_subplot_census key
mamls_df["subplot"] = mamls_df["subplot"].map(lambda code: str(code).zfill(2))

mamls_df["plt_sub_cen"] = (
    mamls_df["plot"] + "_" + mamls_df["subplot"] + "_c" + mamls_df["census"].astype(str)
)

In [16]:
# Persist the cleaned capture table; the row labels are written as the first CSV column.
# NOTE(review): a later cell reads "../Results/mammals_sorted.csv" (no "2") - confirm which file is intended.
mamls_df.to_csv("../Results/mammals_sorted2.csv")

In [21]:
# Same two derivations that close the previous processing cell: pad the
# subplot code to two characters (a no-op on already padded values) and
# rebuild the plot_subplot_census key.
mamls_df["subplot"] = mamls_df["subplot"].map(lambda code: str(code).zfill(2))
mamls_df["plt_sub_cen"] = (
    mamls_df["plot"] + "_" + mamls_df["subplot"] + "_c" + mamls_df["census"].astype(str)
)

In [22]:
# site-by-species matrix: capture counts per plot_subplot_census key,
# with 0 where a species was never caught
mamls_mx = (
    mamls_df
    .groupby(["plt_sub_cen", "species"])
    .size()
    .unstack()
    .fillna(value = 0)
)
mamls_mx.to_csv("../Results/mammals_matrix2.csv")

In [11]:
# Per plot/census date summary built by census_diff from the plot, census and
# date columns of the capture table, then written to disk.
mamls_cen = census_diff(mamls_df)
mamls_cen.to_csv("../Results/mammals_census_dates.csv")

## read in data

In [78]:
# Reload the sorted tables from disk. This replaces the in-memory `mamls_df`.
# NOTE(review): the mammal file read here is mammals_sorted.csv, while the
# cell above writes mammals_sorted2.csv - confirm which one is meant.
trees_df, mamls_df, btles_df = (
    pd.read_csv(sorted_csv)
    for sorted_csv in (
        "../Results/trees_sorted.csv",
        "../Results/mammals_sorted.csv",
        "../Results/beetles_sorted.csv",
    )
)

In [79]:
# Keep only the integer that follows the underscore in the first-order label.
# Working on the stringified value and splitting at the *last* underscore makes
# the cell safe to re-run: once the column holds plain integers, "12" still
# parses to 12, whereas .str.partition on the int column raised on a second run.
fpn["FirstOrder"] = fpn["FirstOrder"].astype(str).str.rpartition("_")[2].astype(int)

# first-order -> second-order lookup used to place beetle traps
fractals = fpn.loc[:, ["FirstOrder", "SecondOrder"]]

## sort beetles (match to second order fragment)

In [80]:
# second-order point of every beetle trap, reduced to the integer after the underscore
btles_df = btles_df.merge(fractals, how = "left", left_on = "trap_N", right_on = "FirstOrder")
btles_df["SecondOrder"] = btles_df.SecondOrder.str.partition("_")[2].astype(int)

# `agb` was given lower-case column names when it was loaded, so selecting
# "Plot" / "AGB_Chave_moist" / "ForestQuality" directly raises a KeyError.
# Map the short names back first; rename ignores keys that are absent, so this
# also works if `agb` still carries the original headers.
btles_df = btles_df.merge(
    agb.rename(columns = {"plot"          : "Plot",
                          "agb"           : "AGB_Chave_moist",
                          "forestquality" : "ForestQuality"})[["Plot", "AGB_Chave_moist", "ForestQuality"]],
    how = "left", left_on = "SecondOrder", right_on = "Plot")

In [120]:
# median Chave-moist AGB of the traps in each beetle block, saved as a one-column table
btles_agb = btles_df.groupby("block").AGB_Chave_moist.median().to_frame()

btles_agb.to_csv("../Results/btles_agb.csv")

## sort mammals

In [81]:
# Drop the last character of every trap id so it lines up with `location`.
# Careful: the column is overwritten in place, so every re-run of this cell
# removes one more character.
mamls_df["Trap_ID"] = mamls_df.Trap_ID.map(lambda trap_id: trap_id[:-1])

# coordinates of every trap, looked up by the trimmed id
mamls_df = mamls_df.merge(
    properties[["location", "longlat"]],
    how      = "left",
    left_on  = "Trap_ID",
    right_on = "location",
)

In [82]:
# Nearest second-order fractal point for every capture. The table of
# second-order points is `second_order`; the name `SO` used here before is
# never defined in this notebook and fails on a fresh kernel.
mamls_df["SecondOrder"] = mamls_df.longlat.apply(lambda x: closest(x, second_order))
# keep only the integer after the first underscore of the location label
mamls_df.SecondOrder = mamls_df.SecondOrder.str.partition("_")[2].astype(int)

In [83]:
# `agb` was given lower-case column names when it was loaded, so selecting
# "Plot" / "AGB_Chave_moist" / "ForestQuality" directly raises a KeyError.
# Map the short names back first; rename ignores keys that are absent, so this
# also works if `agb` still carries the original headers.
mamls_df = mamls_df.merge(
    agb.rename(columns = {"plot"          : "Plot",
                          "agb"           : "AGB_Chave_moist",
                          "forestquality" : "ForestQuality"})[["Plot", "AGB_Chave_moist", "ForestQuality"]],
    how = "left", left_on = "SecondOrder", right_on = "Plot")

In [128]:
# plot label = the part of the trap id before its last hyphen
mamls_df["plot"] = mamls_df.Trap_ID.str.rpartition("-")[0]

In [160]:
# median Chave-moist AGB of the captures in each mammal plot, saved as a one-column table
mamls_agb = mamls_df.groupby("plot").AGB_Chave_moist.median().to_frame()
mamls_agb.to_csv("../Results/mamls_agb.csv")

## sort trees

In [103]:
# total stem_C per plot and census
trees_agb = trees_df.groupby(["plot", "census"]).stem_C.sum().to_frame()

# mean of those census totals per plot, rescaled.
# NOTE(review): the factors 0.0625 and 0.001 are undocumented - presumably a
# plot-area and a unit conversion; confirm and name them.
trees_agb = trees_agb.groupby("plot").mean() * 0.0625 * 0.001

trees_agb.to_csv("../Results/trees_agb.csv")