# Sort Data 

## Imports

In [78]:
import pandas as pd
import numpy as np
import json

from geopy.distance import geodesic

## Functions

In [79]:
def census_diff(df, census):
    """Summarise the survey dates of every plot/census and the gap between censuses.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``plot`` column, a datetime ``date`` column and the column
        named by ``census``.
    census : str
        Name of the column that identifies the sampling period
        (e.g. ``"census"`` or ``"year"``).

    Returns
    -------
    pandas.DataFrame
        One row per (plot, census) with the columns ``<census>``, ``plot``,
        ``min``, ``max``, ``mid`` (midpoint date), ``difference`` (days since
        the previous census of the same plot, float), ``diff_yrs``
        (``difference / 365``) and ``step`` (``"<previous>-<current>"``).
        The index is ``"<plot>_<step>"``; rows without a previous census have
        NaN in ``difference``, ``diff_yrs``, ``step`` and the index.
    """
    cen = df.groupby(["plot", census])["date"].agg(["min", "max"])

    # midpoint of each census, truncated to a whole day
    mid = (cen["min"] + (cen["max"] - cen["min"]) / 2).dt.normalize()
    cen["mid"] = mid.dt.date

    # Gap to the previous census *within the same plot*.  A plain diff() over
    # the whole table also compared the first census of a plot with the last
    # census of the previous plot, which gave a spurious positive gap whenever
    # the next plot happened to start later.
    gap_days = mid.groupby(level="plot").diff() / pd.Timedelta(days=1)

    # negative gaps (census labels that do not sort chronologically) are
    # treated as missing, as before
    cen["difference"] = gap_days.where(~(gap_days < 0))
    cen["diff_yrs"] = cen["difference"] / 365

    # move plot/census out of the index, keeping the historical column order
    cen = cen.reset_index()
    cen = cen[[census, "plot", "min", "max", "mid", "difference", "diff_yrs"]]

    labels = cen[census].astype(str)
    previous = labels.groupby(cen["plot"]).shift()
    cen["step"] = (previous + "-" + labels).where(cen["difference"].notna())
    cen.index = cen["plot"] + "_" + cen["step"]

    return cen

## Open general data

In [80]:
# open plot locations
with open('../Data/rows.geojson') as f:
    data = json.load(f)

# one row per geojson feature, built in a single pass.  Growing the frame with
# DataFrame.append inside a loop was quadratic, and DataFrame.append no longer
# exists in pandas 2.0.
properties = pd.DataFrame([feature['properties'] for feature in data['features']])

# only really care about these columns
properties = properties.loc[: , ["plot_size",
                                 "centroid_y",
                                 "centroid_x",
                                 "fractal_order",
                                 "location"]]

# NOTE(review): the summary tables further down show "long" values around 4.7
# and "lat" values around 117.6, so these two names look swapped.  The values
# themselves are kept in [centroid_y, centroid_x] order, which the rest of the
# notebook relies on - confirm before renaming anything.
properties = properties.rename(columns={'centroid_y':'longitude', 'centroid_x':'latitude'})
properties["longlat"] = properties.apply(lambda x: [x.longitude, x.latitude], axis=1)

# separate dataframe for only second order fractal points
second_order = properties.loc[properties.fractal_order == 2, : ]

# function to find point closest to given point
def closest(pt, others):
    """Find the entry of ``others`` that lies nearest to ``pt``.

    ``others`` needs a ``longlat`` column (coordinate pairs) and a
    ``location`` column (names).  Returns a two element Series: the location
    name of the nearest entry and its geodesic distance from ``pt`` in metres.
    """
    def metres_from_pt(candidate):
        return geodesic(pt, candidate).meters

    nearest_coords = min(others.longlat, key=metres_from_pt)
    nearest_metres = metres_from_pt(nearest_coords)

    # look the name up again through the coordinates; the first match wins
    same_coords = others.longlat.apply(lambda coords: coords == nearest_coords)
    names = others.location.loc[same_coords].reset_index(drop=True)

    return pd.Series([names.values[0], nearest_metres])


# fractal nesting and agb data
fpn = pd.read_csv("../Data/Fractal_point_nesting.csv")
agb = pd.read_csv("../Data/AGB.csv")

# keep only the integer that follows the first underscore of the first order label
fpn["FirstOrder"] = fpn["FirstOrder"].str.partition("_")[2].astype(int)

# shorter, consistent names for every column of the nesting table
fpn.columns = ["site", "habitat", "logging", "frag_area",
               "first_order", "second_order", "third_order",
               "fourth_order", "fifth_order"]

# lookup from first order point to its second order point
fractals = fpn.loc[:, ["site", "first_order", "second_order"]]

# specific wanted columns - and rename ***(going with Chave moist)***
agb = agb[["field_name", "Plot", "Date", "AGB_Chave_moist", "ForestQuality"]]
agb.columns = ["field_name", "second_order", "date", "agb", "forestquality"]

## Mammals Data

#### Open and sort the data

In [81]:
# open each separate plot (one CSV per trapping plot)
E, F, D, OG = (
    pd.read_csv(plot_csv)
    for plot_csv in ("../Data/small_mammals/test/E_test.csv",
                     "../Data/small_mammals/test/F_test.csv",
                     "../Data/small_mammals/test/D_test.csv",
                     "../Data/small_mammals/test/OG_test.csv")
)

#### function to sort out each plot in turn

In [82]:
def sort_mams(df):
    """Tidy one raw small-mammal trapping table.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with exactly seven columns, in the order occasion, date,
        grid, point, trap, trap_id, species (the names themselves are ignored
        and replaced).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the seven renamed
        columns plus ``plot`` (first character of the occasion), ``year`` and
        ``census`` (the occasion without its first two hyphen-separated
        parts).
    """
    # work on a copy so the caller's frame keeps its original column names
    df = df.copy()
    df.columns = ["occasion", "date", "grid", "point", "trap", "trap_id", "species"]

    # the raw identifiers sometimes contain a doubled hyphen
    for col in ("occasion", "grid", "trap_id"):
        df[col] = df[col].str.replace("--", "-", regex=False)

    df["plot"] = df["occasion"].str[0]
    #df["plot"]      = df.grid  **think it would be worth giving this another go!**

    # drop the last character of the trap id; .str slicing leaves a missing
    # id as NaN instead of raising like the element-wise x[:-1] did
    df["trap_id"] = df["trap_id"].str[:-1]

    # NOTE(review): parsed without dayfirst=True, unlike the tree and beetle
    # dates elsewhere in this notebook - confirm the raw date format
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year

    # e.g. "E1-1-2011-1" -> "2011-1"
    df["census"] = df["occasion"].str.partition("-")[2].str.partition("-")[2]

    return df

# tidy every plot with the same function
E, F, D, OG = (sort_mams(plot_df) for plot_df in (E, F, D, OG))

# stack the four plots into one table
frames = [E, F, D, OG]
mamls_df = pd.concat(frames, sort = False)

# a trap with no species recorded is kept as the literal "None"
mamls_df["species"] = mamls_df["species"].fillna("None")

#### match with species names from lookup

In [83]:
# mammals species lookup table
m_lkup = pd.read_csv("../Data/small_mammals/mammals_lookup.csv")
m_lkup.columns = ["code", "species", "scientific"]

mamls_df["species"] = mamls_df["species"].str.strip()


# my fairly questionable decisions...

# if its a questionmark - I just go with it
# if its an either or I go with the first one!
# (recorded value, code to use) - applied one after the other, in this order,
# so a later row can pick up the result of an earlier one
species_fixes = [
    ("CTRS-but see notes", "CTRS"),
    ("SS?",          "SS"),
    ("WH?",          "WH"),
    ("PR?",          "PR"),
    ("RR?",          "RR"),
    ("MR?",          "MR"),
    ("MR??",         "MR"),
    ("RS?",          "RS"),
    ("LGTRS?",       "LGTRS"),
    ("BS?",          "BS"),
    ("PSQ",          "LSQ"),      # not confident on this
    ("BSQ",          "BSQ?"),
    ("SSQ",          "SSQ?"),
    ("RS or SS",     "RS"),
    ("WH or SS",     "WH"),
    ("BS/RS?",       "BS"),
    ("PTSQ?",        "PTSQ"),
    ("LETRS",        "LETRS?"),   # for some reason the lookup table has a ?
    ("CBS?",         "CBS"),
    ("SL?TRS",       "SLTRS"),
    ("SLTRS?",       "SLTRS"),
    ("L?TRS",        "SLTRS"),
    ("LSQ?",         "LSQ"),
    ("CTRS?",        "CTRS"),
    ("LTRS or CTRS", "CTRS"),     # went with CTRS as LTRS could refer to a couple
    ("LTRS",         "LETRS?"),   # not convinced about this one
    ("Squirrel",     "squirrel"),
    ("DTT_DEAD",     "DTT"),
    ("LSQ?_DEAD",    "LSQ"),
    ("squirrel",     "unknown"),  # ***mmm?***
    ("See notes",    "unknown"),  # i'm effectively treating 'unknown' as a separate species which seems spurious at best
    ("??",           "unknown"),
    ("?",            "unknown"),
    ("Unknown",      "unknown"),
]

for recorded_as, lookup_code in species_fixes:
    mamls_df.loc[mamls_df["species"] == recorded_as, "species"] = lookup_code

# merge
mamls_df = mamls_df.merge(m_lkup[["code", "scientific"]],
                          how      = "left",
                          left_on  = "species",
                          right_on = "code")

# get rid of the leftovers... (there were a couple of birds/reptiles)
mamls_df = mamls_df.loc[mamls_df["code"].notna(), :]

#### find closest f2 point to each trap and get agb measure

In [84]:
# get all the unique trap names
trap_locs = pd.DataFrame({"trap_id" : mamls_df.trap_id.unique()})

# attach the coordinates of every trap (matched on the location name)
trap_locs = trap_locs.merge(properties[["location", "longlat"]],
                            how      = "left",
                            left_on  = "trap_id",
                            right_on = "location")

# find the closest second order fractal point
trap_locs[["second_order", "distance_so"]] = trap_locs.longlat.apply(lambda x: closest(x, second_order))

# merge back to master dataframe
mamls_df = mamls_df.merge(trap_locs, how = "left", on = "trap_id")

# just want the point number (last three characters of the location name)
mamls_df.second_order = mamls_df.second_order.str[-3:]
mamls_df.second_order = mamls_df.second_order.astype(int)

# merge to get agb and forest quality.  This merge had been commented out, but
# the cleanup cell selects "agb" and "forestquality" and the agb summary reads
# mamls_df.agb, so a fresh run failed without it.
# NOTE(review): this cell is not re-runnable on its own (second_order is
# already an integer after the first run) - re-run from the concat cell.
mamls_df = mamls_df.merge(agb[["second_order", "agb", "forestquality"]], how = "left",
                          on = "second_order")

In [96]:
#trap_locs["plot"] = trap_locs.second_order.str.partition("_")[0]
#trap_locs["long"] = trap_locs.longlat.apply(lambda x: x[0])
#trap_locs["lat"]  = trap_locs.longlat.apply(lambda x: x[1])

#trap_locs.groupby("plot").mean()

Unnamed: 0_level_0,distance_so,long,lat
plot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D,46.15232,4.711225,117.586573
E,45.700009,4.692741,117.580786
F,45.283478,4.694528,117.539172
OG2,43.919643,4.748936,116.965892


#### final cleanup and save

In [8]:
# string index labels; "plot_x" (if a merge produced it) becomes "plot"
mamls_df = mamls_df.rename(index=str, columns={"plot_x": "plot"})

# keep the columns of interest, in this order
mamls_df = mamls_df.loc[:, ["occasion", "date", "grid", "point", "trap",
                            "trap_id", "species", "year", "plot", "census",
                            "scientific", "longlat", "second_order",
                            "distance_so", "agb", "forestquality"]]

mamls_df.to_csv("../Results/mammals_sorted.csv")

#### Make the species/plot matrix

In [9]:
# i'll give four different combinations a go...
# each key is "<plot>_<trap or grid>_<year or census>"
mamls_df["trap_year"]   = (mamls_df["plot"] + "_" + mamls_df["trap_id"]) + "_" + mamls_df["year"].astype(str)
mamls_df["grid_year"]   = (mamls_df["plot"] + "_" + mamls_df["grid"])    + "_" + mamls_df["year"].astype(str)
mamls_df["trap_census"] = (mamls_df["plot"] + "_" + mamls_df["trap_id"]) + "_" + mamls_df["census"]
mamls_df["grid_census"] = (mamls_df["plot"] + "_" + mamls_df["grid"])    + "_" + mamls_df["census"]

# function to make matrix
def make_matrix(df, what):
    """Build a sample-by-species count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``species`` column and the column named by ``what``.
    what : str
        Name of the column whose values become the rows of the matrix.

    Returns
    -------
    pandas.DataFrame
        Number of records per (``what``, species), 0 where a species was not
        recorded.  The placeholder species ``"None"`` is removed from the
        columns.
    """
    counts = df.groupby([what, "species"]).size().unstack()  # groupby whatever i've chosen and species
    counts = counts.fillna(value = 0)                        # fill with 0's

    # "None" rows are only there so a sample keeps its row even if nothing was
    # trapped.  errors="ignore": without it a table with no empty traps raised
    # a KeyError here.
    return counts.drop("None", axis = 1, errors = "ignore")

# do - one count matrix per grouping key
mamls_TY, mamls_GY, mamls_TC, mamls_GC = (
    make_matrix(mamls_df, grouping)
    for grouping in ("trap_year", "grid_year", "trap_census", "grid_census")
)

# save
# NOTE(review): the last file name uses "grid_census" while the other files
# (and the matching presence/absence file) use a hyphen - left as it is in
# case something downstream reads this exact name
for _matrix, _target in ((mamls_TY, "../Results/m_trap-year.csv"),
                         (mamls_GY, "../Results/m_grid-year.csv"),
                         (mamls_TC, "../Results/m_trap-census.csv"),
                         (mamls_GC, "../Results/m_grid_census.csv")):
    _matrix.to_csv(_target)

#### Presence/Absence matrix

In [10]:
# 1 where a species was recorded at least once, 0 otherwise
mamls_TY_PA, mamls_GY_PA, mamls_TC_PA, mamls_GC_PA = (
    counts.gt(0) * 1
    for counts in (mamls_TY, mamls_GY, mamls_TC, mamls_GC)
)

for _matrix, _target in ((mamls_TY_PA, "../Results/m_trap-year_PA.csv"),
                         (mamls_GY_PA, "../Results/m_grid-year_PA.csv"),
                         (mamls_TC_PA, "../Results/m_trap-census_PA.csv"),
                         (mamls_GC_PA, "../Results/m_grid-census_PA.csv")):
    _matrix.to_csv(_target)

#### agb

In [11]:
# get the agb per plot: describe() gives the full summary, of which only the
# median (the "50%" column) is kept
mamls_agb = mamls_df.groupby("plot")["agb"].describe()[["50%"]]
mamls_agb.to_csv("../Results/mamls_agb.csv")

#### standardise time

In [12]:
# and time difference between censuses - i'm doing year or occasion...
mamls_cn_diff = census_diff(mamls_df, "census")

# same table by calendar year; the year column is renamed so both files share a layout
mamls_yr_diff = census_diff(mamls_df, "year").rename(index=str, columns={"year": "census"})

mamls_cn_diff.to_csv("../Results/mamls_census_dates.csv")
mamls_yr_diff.to_csv("../Results/mamls_years_dates.csv")

## Tree Data

#### read in raw data

In [13]:
# read in the RAW tree + liana census table (the censuses sit side by side in
# column blocks and are split apart by position further down)
trees_df = pd.read_csv("../Data/SAFE_CarbonPlots_Tree+LianaCensus.csv")

#### function to sort everything out for each census

In [14]:
def sort_data(df, census_no):
    """Tidy the columns of one tree census.

    Parameters
    ----------
    df : pandas.DataFrame
        The 24 columns belonging to a single census, in the order listed in
        ``new_Cnames`` below.
    census_no : int
        Census number stored in the new ``census`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the renamed columns
        plus ``ID`` (plot + tag number) and ``census``; rows in which every
        measurement column is missing, and rows whose ``alive`` is not 1, are
        removed, and ``date`` is parsed day-first.
    """
    # consistent and better column names
    new_Cnames = ['f_type',       # forest type
                  'plot',
                  'subplot',
                  'date',         # date of measurements
                  'observers',
                  'tag_no',
                  'd_pom',        # diameter of tree (cm)
                  'h_pom',        # height diameter is taken (m) 1.3 by default
                  'height',
                  'flag',         # condition of trees (see flag list)
                  'alive',        # 1 = yes, NaN = no
                  'stem_C',       # aboveground biomass of tree (kg)
                  'root_C',       # root biomass of tree
                  'field_cmnts',  # comments from field
                  'data_cmnts',   # comments from data entry
                  'sbplt_X',
                  'sbplt_Y',
                  'CPA',          # projected area of the crown of the stem
                  'X_FMC',        # plot level X coordinate
                  'Y_FMC',        # plot level Y coordinate
                  'Z_FMC',        # plot level elevation
                  'family',
                  'binomial',
                  'wood_density']

    # rename on a copy so the caller's frame keeps its own column names
    df = df.copy()
    df.columns = new_Cnames

    # unique ID (plot + tag_no) and the census number
    df = df.assign(ID = df['plot'] + df['tag_no'].map(str),
                   census = census_no)

    # delete rows in which ALL of the important columns are missing
    impt_cols = ['tag_no', 'd_pom', 'h_pom', 'height', 'flag', 'alive',
                 'stem_C', 'root_C']
    df = df.dropna(subset = impt_cols, how = 'all')

    # keep living trees only (alive == 1)
    df = df[df['alive'] == 1]

    # sort out dates; assign() builds a new frame, so nothing is written into
    # the filtered slice (which triggered SettingWithCopyWarning)
    df = df.assign(date = pd.to_datetime(df['date'], dayfirst = True))

    return df

#### subset each census - weird column things...

In [15]:
# subset for each census: columns 0-2 and 53-61 are the same for all censuses,
# each census then has its own run of 12 columns starting at 3, 15, 27 and 39
census_1, census_2, census_3, census_4 = (
    trees_df.iloc[:, list(range(0, 3))
                     + list(range(first_col, first_col + 12))
                     + list(range(53, 62))]
    for first_col in (3, 15, 27, 39)
)

#### apply the function and combine the censuses

In [16]:
# sort data for each census (numbered 1 to 4)
census_1, census_2, census_3, census_4 = (
    sort_data(subset, number)
    for number, subset in enumerate((census_1, census_2, census_3, census_4), start = 1)
)

# recombine all census data (stack on top of each other)
trees_df = pd.concat([census_1, census_2, census_3, census_4], ignore_index = True)

#### add extra columns and save

In [17]:
trees_df["binomial"] = trees_df.binomial.str.strip()

# tidy the identifiers: no spaces in plot names, subplot as zero padded text
trees_df['plot']        = trees_df['plot'].replace(" ", "", regex=True)
trees_df['subplot']     = trees_df['subplot'].apply(lambda x: str(x).zfill(2))

# add genus column (first word of the binomial).  Vectorised .str access
# replaces the row-by-row apply and leaves a missing binomial as NaN instead
# of raising.
trees_df['genus']       = trees_df['binomial'].str.split(" ").str[0]

# grouping keys: plot/subplot, plot/subplot/census and plot/census
trees_df['plt_sub']     = trees_df['plot'] + "_sp" + trees_df['subplot'].astype(str)
trees_df['plt_sub_cen'] = trees_df['plt_sub'] + "_c" + trees_df['census'].astype(str)
trees_df['plot_c']      = trees_df['plot'] + "_c" + trees_df['census'].astype(str)

# census becomes a label ("c1", "c2", ...) - done last because the keys above
# add their own "c" prefix to the number
trees_df['census']      = "c" + trees_df.census.astype(str)

# save to csv
trees_df.to_csv("../Results/trees_sorted.csv", index = False)

#### make matrix

In [18]:
# species matrix: stems per (plot/subplot/census, binomial), 0 where absent
trees_matrix = (trees_df.groupby(['plt_sub_cen', 'binomial'])
                        .size()
                        .unstack()
                        .fillna(value = 0))
trees_matrix.to_csv("../Results/trees_matrix.csv")

# same at genus level
trees_genus_matrix = (trees_df.groupby(['plt_sub_cen', 'genus'])
                              .size()
                              .unstack()
                              .fillna(value = 0))
trees_genus_matrix.to_csv("../Results/trees_genus_matrix.csv")

# same at family level
trees_family_matrix = (trees_df.groupby(['plt_sub_cen', 'family'])
                               .size()
                               .unstack()
                               .fillna(value = 0))
trees_family_matrix.to_csv("../Results/trees_family_matrix.csv")

#### Presence/Absence matrix

In [19]:
# 1 where a taxon occurs at least once in a sample, 0 otherwise
trees_matrix_PA, trees_genus_PA, trees_family_PA = (
    counts.gt(0) * 1
    for counts in (trees_matrix, trees_genus_matrix, trees_family_matrix)
)

for _matrix, _target in ((trees_matrix_PA, "../Results/trees_matrix_PA.csv"),
                         (trees_genus_PA,  "../Results/trees_genus_matrix_PA.csv"),
                         (trees_family_PA, "../Results/trees_family_matrix_PA.csv")):
    _matrix.to_csv(_target)

#### trees agb (different from the others as calculated from data)

In [20]:
# total biomass (summed stem_C) at each census for each plot
trees_agb = trees_df.groupby(["plot", "census"])["stem_C"].sum().to_frame()

# take the median over the censuses of each plot, then scale by 0.0625 and
# 0.001 to get it into Mg/0.0625 ha
trees_agb = trees_agb.groupby("plot").median() * 0.0625 * 0.001

trees_agb.to_csv("../Results/trees_agb.csv")

#### trees census standardise time

In [21]:
# date range of every plot/census and the time elapsed between censuses
trees_cen = census_diff(trees_df, "census")
trees_cen.to_csv("../Results/trees_census_dates.csv")

## Beetles Data

#### read in raw data

In [22]:
# beetle records; the first column of the file is used as the index
btles_df = pd.read_csv("../Data/family_list.csv", index_col = 0)

# convert dates to datetime (day comes first in the raw file)
btles_df["date"] = pd.to_datetime(btles_df["date"], dayfirst = True)

#### initial sorting

In [23]:
btles_df = (
    btles_df
    # trap number -> first order point -> its second order point
    .merge(fractals, how = "left", left_on = "trap_N", right_on = "first_order")
    # keep only the integer after the first underscore of the second order label
    .assign(second_order = lambda d: d["second_order"].str.partition("_")[2].astype(int))
    # agb and forest quality of that second order point
    .merge(agb[["second_order", "agb", "forestquality"]], how = "left", on = "second_order")
    .drop("block", axis = 1)
    .rename(index = str, columns = {"site": "plot"})
)

# subplot label: "<plot>-<first order point>"
btles_df["subplot"] = btles_df["plot"] + "-" + btles_df["first_order"].astype(str)

#### sorting out censuses

In [24]:
# sampling periods: every record starts out as "incomplete"; records whose
# date falls inside one of the sampling windows are relabelled below and the
# rest are dropped
btles_df['census'] = "incomplete"


def sample_period(df, s_date, e_date, period):
    """Label, in place, the rows of ``df`` dated within [s_date, e_date).

    Rows whose ``date`` is on or after ``s_date`` and before ``e_date`` get
    ``period`` written to their ``census`` column.  Nothing is returned.
    """
    on_or_after_start = df.date.ge(s_date)
    before_end = df.date.lt(e_date)
    df.loc[on_or_after_start & before_end, 'census'] = period

# start (inclusive) and end (exclusive) of the three sampling periods
s1 = pd.to_datetime("01/01/2011", dayfirst = True)  # might be good to check!!
e1 = pd.to_datetime("01/04/2011", dayfirst = True)
s2 = pd.to_datetime("01/09/2011", dayfirst = True)
e2 = pd.to_datetime("01/01/2012", dayfirst = True)
s3 = pd.to_datetime("01/04/2012", dayfirst = True)
e3 = pd.to_datetime("01/09/2012", dayfirst = True)

sample_period(btles_df, s1, e1, "P1")
sample_period(btles_df, s2, e2, "P2")
sample_period(btles_df, s3, e3, "P3")

# drop everything outside the sampling periods; .copy() gives an independent
# frame so the column added next is not written into a slice of the old one
# (SettingWithCopyWarning)
btles_df = btles_df[btles_df.census != "incomplete"].copy()

btles_df['plt_sub_cen'] = btles_df["plot"] + "_" + btles_df.subplot + "_" + btles_df.census

btles_df.to_csv("../Results/btles_sorted.csv", index = False)


# family counts per plot/subplot/census, 0 where a family is absent
btles_mx = btles_df.groupby(['plt_sub_cen', 'family']).size().unstack()
btles_mx = btles_mx.fillna(value = 0)

# presence/absence version
btles_mx_PA = (btles_mx > 0)*1


In [25]:
def rmv_identical(df):
    """Drop every plot/census that has fewer than three distinct samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Sample-by-taxon matrix whose index labels contain at least two
        underscores; the text before the first underscore and the text after
        the second one are combined into the plot/census key.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose plot/census has at least three rows that
        differ from one another.  ``df`` itself is not modified (helper
        columns used to be left behind on it).
    """
    # e.g. "A_A-12_P1" -> "A_P1"
    pieces = (label.split('_', 2) for label in df.index)
    plotcen = [piece[0] + "_" + piece[2] for piece in pieces]

    # helper frame: a row is a repeat if an earlier row with the same
    # plot/census holds exactly the same values
    work = df.assign(plotcen = plotcen)
    is_repeat = work.duplicated().values

    # number of distinct rows per plot/census
    n_distinct = pd.Series(~is_repeat).groupby(work["plotcen"].values).sum()
    too_few = n_distinct.index[(n_distinct < 3).values]

    keep = ~work["plotcen"].isin(too_few).values
    return df.loc[keep]

In [26]:
# apply the same filter to the count matrix and the presence/absence matrix
btles_mx, btles_mx_PA = rmv_identical(btles_mx), rmv_identical(btles_mx_PA)

for _matrix, _target in ((btles_mx,    "../Results/btles_matrix.csv"),
                         (btles_mx_PA, "../Results/btles_matrix_PA.csv")):
    _matrix.to_csv(_target)

#### time between censuses

In [27]:
# date range of every plot/sampling period and the time elapsed between periods
btles_cen = census_diff(btles_df, "census")
btles_cen.to_csv("../Results/btles_census_dates.csv")

#### agb

In [28]:
# median agb of the beetle records in each plot
btles_agb = btles_df.groupby("plot")["agb"].median().to_frame()

btles_agb.to_csv("../Results/btles_agb.csv")

### Messing around for Table S1

In [109]:
# subplot labels with "_" instead of "-", to match the names in `properties`
btles_df["subplot"] = btles_df["subplot"].str.replace("-", "_")

# one row per beetle subplot, with its coordinates where a location matches
trap_locs = (
    pd.DataFrame({"subplot" : btles_df["subplot"].unique()})
      .merge(properties[["location", "longlat"]],
             how      = "left",
             left_on  = "subplot",
             right_on = "location")
)

# find the closest second order fractal point
trap_locs[["second_order", "distance_so"]] = trap_locs["longlat"].apply(
    lambda coords: closest(coords, second_order)
)

In [111]:
# plot = text before the first underscore of the nearest second order point
trap_locs["plot"] = trap_locs.second_order.str.partition("_")[0]

# NOTE(review): the table printed below has "long" around 4.7 and "lat" around
# 117.6, so the two labels look swapped - confirm
trap_locs["long"] = trap_locs.longlat.apply(lambda x: x[0])
trap_locs["lat"]  = trap_locs.longlat.apply(lambda x: x[1])

# numeric_only=True: the frame also holds text and list columns, which pandas
# 2.x refuses to average (older versions dropped them silently)
trap_locs.groupby("plot").mean(numeric_only=True)

Unnamed: 0_level_0,distance_so,long,lat
plot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,30.580543,4.709516,117.652067
B,30.570079,4.729947,117.61882
C,30.594762,4.710094,117.62371
D,30.570999,4.711176,117.586112
E,30.580928,4.693053,117.580392
F,30.559209,4.695445,117.541185
LF1,30.598656,4.769815,117.684144
LF2,30.58828,4.768426,117.700087
LF3,30.590402,4.755333,117.690906
LFE,30.676971,4.733184,117.592916


## Mozie

In [29]:
# daily counts, one file per sampling season
mozie1, mozie2 = (
    pd.read_csv(season_csv)
    for season_csv in ("../Data/DailyHLC2012-2013.csv",
                       "../Data/DailyHLC2013-2014.csv")
)

In [30]:
# each input file is treated as one census
mozie1["census"] = "c1"
mozie2["census"] = "c2"

In [31]:
# stack both files; sort=False keeps the column order instead of sorting the
# column names alphabetically
frames = [mozie1, mozie2]
mozie  = pd.concat(frames, sort = False)

In [32]:
# columns that are not used further on
mozie = mozie.drop(columns = ['field_name', 'Collector', 'Moonlight',
                              'Forest_cover', 'Height', 'Tree_Height',
                              'Wind', 'Rain', 'Temperature', 'Humidity'])

In [33]:
# split into descriptive columns ...
mozie_meta = mozie[['Date', 'Location', 'Disturbance', 'census']]
mozie_meta.columns = ["date", "location", "disturbance", "census"]

# ... and count columns (everything else); a missing count means none
mozie_count = mozie.drop(columns = ['Date', 'Location', 'Disturbance', 'census'])
mozie_count = mozie_count.fillna(0)

# keep the part of each column name before its last underscore
# NOTE(review): a name without any underscore would become an empty string
# here - confirm that every count column has one
mozie_count.columns = pd.Series(mozie_count.columns).str.rpartition("_")[0]

In [34]:
# put the descriptive and count columns back side by side
frames = [mozie_meta, mozie_count]
mozie = pd.concat(frames, axis = 1, sort = False)

# fresh 0..n-1 row numbers; the old index is discarded
mozie = mozie.reset_index(drop = True)

In [35]:
# NOTE(review): parsed without dayfirst=True, unlike the tree and beetle dates
# elsewhere in this notebook - confirm the raw date format
mozie["date"] = pd.to_datetime(mozie.date)

In [36]:
# get rid of Da_Tree (locations whose name before the last underscore is "Da_Tree")
not_da_tree = mozie["location"].str.rpartition("_")[0].ne("Da_Tree")
mozie = mozie.loc[not_da_tree, :]

In [37]:
# location is "<plot>_<second order point>": split at the first underscore
location_parts = mozie["location"].str.partition("_")
mozie["plot"] = location_parts[0]
mozie["second_order"] = location_parts[2]

mozie["second_order"] = mozie["second_order"].astype(int)

# agb and forest quality of that second order point
mozie = mozie.merge(agb[["second_order", "agb", "forestquality"]],
                    how = "left",
                    on  = "second_order")

In [38]:
# sample label: "<plot>_<second order point>_<census>"
mozie['plt_sub_cen'] = mozie["plot"] + "_" + mozie.second_order.astype(str) + "_" + mozie.census

In [39]:
# save the tidy table (the row index is written as the first column)
mozie.to_csv("../Results/mozie_sorted.csv")

In [40]:
# label every row with its sample name ...
mozie.index = mozie["plt_sub_cen"]

# ... and keep only the count columns
mozie_mx = mozie.drop(columns = ["date", "location", "disturbance", "census",
                                 "agb", "forestquality", "plot",
                                 "second_order", "plt_sub_cen"])

In [41]:
# the index was set from plt_sub_cen, so this adds up the counts of all rows
# that share a sample label
mozie_mx = mozie_mx.groupby("plt_sub_cen").sum()

In [42]:
# save the sample-by-taxon count matrix
mozie_mx.to_csv("../Results/mozie_matrix.csv")

In [43]:
# date range of every plot/census and the time elapsed between censuses
mozie_cen = census_diff(mozie, "census")
mozie_cen.to_csv("../Results/mozie_census_dates.csv")

In [44]:
# median agb of the records in each plot
mozie_agb = mozie.groupby("plot")["agb"].median().to_frame()

mozie_agb.to_csv("../Results/mozie_agb.csv")

In [45]:
def census_diff(df, census):
    """Summarise the survey dates of every plot/census and the gap between censuses.

    NOTE(review): this cell re-defines ``census_diff``, which is already
    defined in the Functions section near the top of the notebook.  The copy
    that used to live here had no ``return`` statement, so every call made
    after running this cell received ``None``.  Consider deleting this cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``plot`` column, a datetime ``date`` column and the column
        named by ``census``.
    census : str
        Name of the column that identifies the sampling period.

    Returns
    -------
    pandas.DataFrame
        One row per (plot, census) with the columns ``<census>``, ``plot``,
        ``min``, ``max``, ``mid``, ``difference`` (days since the previous
        census of the same plot), ``diff_yrs`` and ``step``, indexed by
        ``"<plot>_<step>"``.
    """
    cen = df.groupby(["plot", census])["date"].agg(["min", "max"])

    # midpoint of each census, truncated to a whole day
    mid = (cen["min"] + (cen["max"] - cen["min"]) / 2).dt.normalize()
    cen["mid"] = mid.dt.date

    # gap to the previous census within the same plot, so the first census of
    # a plot is never compared with the last census of the previous plot
    gap_days = mid.groupby(level="plot").diff() / pd.Timedelta(days=1)

    # negative gaps are treated as missing
    cen["difference"] = gap_days.where(~(gap_days < 0))
    cen["diff_yrs"] = cen["difference"] / 365

    cen = cen.reset_index()
    cen = cen[[census, "plot", "min", "max", "mid", "difference", "diff_yrs"]]

    labels = cen[census].astype(str)
    previous = labels.groupby(cen["plot"]).shift()
    cen["step"] = (previous + "-" + labels).where(cen["difference"].notna())
    cen.index = cen["plot"] + "_" + cen["step"]

    return cen

In [46]:
# scratch check: group the mosquito records by plot and census
grp = mozie.groupby(["plot", "census"])

In [47]:
# first and last sampling date of every plot/census
grp.date.agg(['min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
plot,census,Unnamed: 2_level_1,Unnamed: 3_level_1
D,c1,2012-01-12,2013-12-03
E,c1,2012-01-11,2013-10-04
E,c2,2014-01-06,2014-12-04
OG2,c1,2012-05-11,2013-12-01
OG3,c1,2013-01-04,2013-09-01
OP1,c1,2012-11-19,2013-02-25
OP2,c1,2012-06-12,2013-06-04
OP3,c1,2012-11-22,2013-02-26
VJR,c2,2014-01-07,2014-08-05


In [48]:
# which sampling periods are left in the beetle table
btles_df["census"].unique()

array(['P3', 'P1', 'P2'], dtype=object)

In [49]:
# first and last measurement date of every tree plot/census
trees_df.groupby(["plot", "census"]).date.apply(lambda x: [min(x), max(x)])

plot    census
BNorth  c1        [2011-07-12 00:00:00, 2011-07-14 00:00:00]
        c2        [2012-07-04 00:00:00, 2013-05-09 00:00:00]
        c3        [2015-09-14 00:00:00, 2015-10-13 00:00:00]
        c4        [2016-09-19 00:00:00, 2016-09-22 00:00:00]
BSouth  c1        [2011-07-27 00:00:00, 2011-07-30 00:00:00]
        c2        [2012-08-04 00:00:00, 2012-08-30 00:00:00]
        c3        [2015-10-01 00:00:00, 2015-10-05 00:00:00]
        c4        [2016-09-20 00:00:00, 2016-09-22 00:00:00]
Belian  c1        [2011-06-10 00:00:00, 2011-08-06 00:00:00]
        c2        [2012-11-10 00:00:00, 2013-12-17 00:00:00]
        c3        [2016-01-19 00:00:00, 2016-10-19 00:00:00]
DC1     c1        [2011-04-10 00:00:00, 2012-12-10 00:00:00]
        c2        [2016-10-17 00:00:00, 2016-10-17 00:00:00]
DC2     c1        [2013-02-09 00:00:00, 2014-12-03 00:00:00]
        c2        [2016-10-12 00:00:00, 2016-10-12 00:00:00]
E       c1        [2011-05-28 00:00:00, 2011-07-01 00:00:00]
        c

In [50]:
# earliest tree measurement date
min(trees_df.date)

Timestamp('2011-04-10 00:00:00')

In [57]:
# first and last trapping date of every mammal plot/year
mamls_df.groupby(["plot", "year"]).date.apply(lambda x: [min(x), max(x)])

plot  year
D     2011    [2011-08-20 00:00:00, 2011-12-09 00:00:00]
      2012    [2012-01-02 00:00:00, 2012-12-03 00:00:00]
      2014    [2014-06-02 00:00:00, 2014-12-02 00:00:00]
      2015    [2015-01-06 00:00:00, 2015-05-31 00:00:00]
      2016    [2016-01-03 00:00:00, 2016-02-29 00:00:00]
      2017    [2017-01-04 00:00:00, 2017-12-03 00:00:00]
E     2011    [2011-01-06 00:00:00, 2011-12-16 00:00:00]
      2012    [2012-01-12 00:00:00, 2012-12-01 00:00:00]
      2014    [2014-01-06 00:00:00, 2014-12-01 00:00:00]
      2015    [2015-03-04 00:00:00, 2015-12-07 00:00:00]
      2016    [2016-01-07 00:00:00, 2016-12-06 00:00:00]
      2017    [2017-01-08 00:00:00, 2017-12-08 00:00:00]
F     2011    [2011-01-08 00:00:00, 2011-12-08 00:00:00]
      2012    [2012-01-07 00:00:00, 2012-11-07 00:00:00]
O     2011    [2011-05-13 00:00:00, 2011-12-06 00:00:00]
      2013    [2013-01-10 00:00:00, 2013-09-30 00:00:00]
      2014    [2014-06-15 00:00:00, 2014-06-28 00:00:00]
      2015    [2015-

In [77]:
# per-plot medians of the numeric columns; numeric_only=True because the table
# also holds text, date and list columns, which pandas 2.x refuses to reduce
mamls_df.groupby("plot").median(numeric_only=True)

Unnamed: 0_level_0,point,year,second_order,distance_so,agb
plot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D,25.0,2014.0,641.0,42.561358,1.557695
E,24.5,2015.0,653.0,43.317961,2.381293
F,24.5,2011.0,673.0,43.63881,3.752427
O,24.0,2014.0,726.0,36.883596,23.456775


In [66]:
# quick look at the final mammals table
# NOTE(review): this displays the whole frame; mamls_df.head() would keep the output short
mamls_df

Unnamed: 0,occasion,date,grid,point,trap,trap_id,species,year,plot,census,scientific,longlat,second_order,distance_so,agb,forestquality,trap_year,grid_year,trap_census,grid_census
0,E1-1-2011-1,2011-05-22,E1-1,1,A,E1-1-1,,2011,E,2011-1,,"[4.69563250038444, 117.581947506955]",649,83.045556,0.676988,Very poor,E_E1-1-1_2011,E_E1-1_2011,E_E1-1-1_2011-1,E_E1-1_2011-1
1,E1-1-2011-1,2011-05-22,E1-1,1,B,E1-1-1,,2011,E,2011-1,,"[4.69563250038444, 117.581947506955]",649,83.045556,0.676988,Very poor,E_E1-1-1_2011,E_E1-1_2011,E_E1-1-1_2011-1,E_E1-1_2011-1
2,E1-1-2011-1,2011-05-22,E1-1,2,A,E1-1-2,,2011,E,2011-1,,"[4.69568477552689, 117.581746856828]",649,77.852218,0.676988,Very poor,E_E1-1-2_2011,E_E1-1_2011,E_E1-1-2_2011-1,E_E1-1_2011-1
3,E1-1-2011-1,2011-05-22,E1-1,2,B,E1-1-2,,2011,E,2011-1,,"[4.69568477552689, 117.581746856828]",649,77.852218,0.676988,Very poor,E_E1-1-2_2011,E_E1-1_2011,E_E1-1-2_2011-1,E_E1-1_2011-1
4,E1-1-2011-1,2011-05-22,E1-1,3,A,E1-1-3,WH,2011,E,2011-1,Maxomys whiteheadi,"[4.69573705060623, 117.581546206659]",649,79.267769,0.676988,Very poor,E_E1-1-3_2011,E_E1-1_2011,E_E1-1-3_2011-1,E_E1-1_2011-1
5,E1-1-2011-1,2011-05-22,E1-1,3,B,E1-1-3,MR,2011,E,2011-1,Sundamys muelleri,"[4.69573705060623, 117.581546206659]",649,79.267769,0.676988,Very poor,E_E1-1-3_2011,E_E1-1_2011,E_E1-1-3_2011-1,E_E1-1_2011-1
6,E1-1-2011-1,2011-05-22,E1-1,4,A,E1-1-4,WH,2011,E,2011-1,Maxomys whiteheadi,"[4.69578932562244, 117.581345556448]",649,86.970101,0.676988,Very poor,E_E1-1-4_2011,E_E1-1_2011,E_E1-1-4_2011-1,E_E1-1_2011-1
7,E1-1-2011-1,2011-05-22,E1-1,4,B,E1-1-4,,2011,E,2011-1,,"[4.69578932562244, 117.581345556448]",649,86.970101,0.676988,Very poor,E_E1-1-4_2011,E_E1-1_2011,E_E1-1-4_2011-1,E_E1-1_2011-1
8,E1-1-2011-1,2011-05-22,E1-1,5,A,E1-1-5,CBS,2011,E,2011-1,Maxomys ochraceiventer,"[4.69558801019568, 117.581293454213]",649,67.266150,0.676988,Very poor,E_E1-1-5_2011,E_E1-1_2011,E_E1-1-5_2011-1,E_E1-1_2011-1
9,E1-1-2011-1,2011-05-22,E1-1,5,B,E1-1-5,,2011,E,2011-1,,"[4.69558801019568, 117.581293454213]",649,67.266150,0.676988,Very poor,E_E1-1-5_2011,E_E1-1_2011,E_E1-1-5_2011-1,E_E1-1_2011-1
