**Neighborhood Growth Model**

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import category_encoders as ce
import xgboost as xgb


In [2]:

# load all data
raw_data_t1 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1985.csv")
raw_data_t2 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1990.csv")

# load the first quadrat that we will be working with
quad1_t1 = raw_data_t1.loc[raw_data_t1['quadrat']==1]
quad1_t2 = raw_data_t2.loc[raw_data_t2['quadrat']==1]




  raw_data_t1 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1985.csv")
  raw_data_t2 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1990.csv")


In [3]:
# we only care about the tree IDs and DBHs of quad1_t2
expected_labels = quad1_t2[['treeID', 'dbh']]
expected_labels = expected_labels.rename(columns={"dbh": "dbh2", "treeID": "treeID2"})

#quad1_t1.head()
# simplify the data to have less features
quad1_t1 = quad1_t1[['treeID', 'sp', 'gx', 'gy', 'dbh']]
quad1_t1 = quad1_t1.rename(columns={"dbh": "dbh1", "treeID":"treeID1"})



In [4]:
encoder= ce.BinaryEncoder(cols=['sp'],return_df=True)
quad1_t1 = encoder.fit_transform(quad1_t1)
df_combined = pd.concat([quad1_t1, expected_labels], axis=1)

# Drop rows with any NaN values
df_combined_clean = df_combined.dropna()

# Now has more values because species is encoded to 7 different categories



In [5]:
# Separate the cleaned DataFrame and labels
df_clean = df_combined_clean[quad1_t1.columns]
labels_clean = df_combined_clean[expected_labels.columns]

quad1_t1 = df_clean.to_numpy()
expected_labels = labels_clean.to_numpy()

quad1_t1 = quad1_t1.astype(np.float32)
expected_labels = expected_labels.astype(np.float32)

BELOW: attempt with just one tree - will become the body of the loop below

In [6]:
ids = quad1_t1[:,0]
x_coordinates = quad1_t1[:, 8]  
y_coordinates = quad1_t1[:, 9]
coord_matrix = np.column_stack((x_coordinates, y_coordinates))

spatial_tree = sp.spatial.KDTree(coord_matrix)
focal_tree_loc = coord_matrix[0]
dist, ind = spatial_tree.query(focal_tree_loc, k=6)
nn_ids = ids[ind]

BELOW: attempting with LOOP - works great!

In [7]:
nn_dist_matrix = np.zeros((len(coord_matrix),6))
nn_ind_matrix = np.zeros((len(coord_matrix),6))

In [8]:
for i, tree in enumerate(coord_matrix):
    dist2, ind2 = spatial_tree.query(tree, k=6)
    nn_ind_matrix[i] = ids[ind2]
    nn_dist_matrix[i]= dist2

# remove the columns with the focal tree's OWN distance and IDS
nn_ind_matrix = nn_ind_matrix[:,1:]
nn_dist_matrix = nn_dist_matrix[:,1:]

  # if we wanted to just use the ids instead of the distances of the nearest 
  # neighbors (mostly just helps with making sure the algorithm is running 
  # consistently, so used during testing), we would set ..[i] = ids[ind2]

BELOW: trying to increase efficiency by using numpy broadcasting isntead of loops. will get back to this later on because it adds another dimension such that the data is nx1x50 instead of nx50.

In [9]:
#dist3, ind3 = tree2.query(coord_matrix, k=50)
#ind_array = np.arange(len(coord_matrix))

#nn_dist_matrix2 = dist3[ind_array[:,None],:]
#before = ids[ind3]
#nn_dist_matrix3 = before[ind_array[:,None],:]
#ADDS ANOTHER DIMENSION
#np.zeros((len(coord_matrix),50))
#nn_dist_matrix2[ind_array] = ids[ind3]

Next: add the focal tree's species and dbh. This model will consider the focal tree's features as well as the distances from other trees. It will NOT consider the species or DBH of those other trees yet. I am removing the raw x and y coordinates here because distance is far more important.

In [10]:
dbhs = quad1_t1[:, 10]
species = quad1_t1[:,1:8]
features = np.column_stack([ids, dbhs])
#print (np.shape(features))
#print (np.shape(species))
features = np.column_stack([features, species, nn_dist_matrix])
#print (np.shape(features))

NEXT, use a loop to add the dbh of each tree. To do this, must add extra dimensions. Will not include species yet because it will add many more dimensions. 

In [11]:
feat_w_dbh = features
for i in range(5):
    n = np.shape(nn_ind_matrix)[0]
    relevant_inds = nn_ind_matrix[:,i]
    #print (relevant_inds)
    
    corresp_dbhs = np.zeros(n)
    for j in range(n):
        index = int(relevant_inds[j])
        row =  quad1_t1[quad1_t1[:, 0] == index][:,10]
        # print (index)
        # print (row)
        corresp_dbhs[j] = row
        #print("correct is" + str(index))

    feat_w_dbh = np.column_stack((feat_w_dbh, corresp_dbhs))

Things to try: is error higher or lower when adding the dbh right after the distance? alternated throughout?

Prepare Data For ML Algorithm

In [12]:
# change the labels so that they show GROWTH, not future DBH - better for readability of loss
expected_labels[:,1] = expected_labels[:,1] - quad1_t1[:,10]
expected_labels = np.where (expected_labels<0, 0, expected_labels)
X_train, X_test, y_train, y_test = train_test_split(feat_w_dbh, expected_labels, test_size=0.3)

feats = X_train[:,1:]
labels = y_train[:,1]

test_ids = X_test[:,0]
test_feats = X_test[:,1:]

Using SKLearn's Random Forest Regressor

In [13]:
random_forest = RandomForestRegressor()
random_forest.fit(feats,labels)

preds = random_forest.predict(test_feats)
preds_matrix = np.column_stack((test_ids, preds))

error1 = sklearn.metrics.mean_squared_error(y_test[:,1], preds)
print (np.sqrt(error1))
error1a = r2_score(y_test[:,1],preds)
print (error1a)

8.712857904106027
-0.16286558055070288


Using XGBoost Regressor

In [15]:
xgb_tree = xgb.XGBRegressor()
xgb_tree.fit(feats,labels)
preds2 = xgb_tree.predict(test_feats)
predictions_matrix = np.column_stack((test_ids, preds2))
error2 = sklearn.metrics.mean_squared_error(y_test[:,1], preds2)
print (np.sqrt(error2))
error2a = r2_score (y_test[:,1], preds2)
print (error2a)

6.802576
0.29114872161262717


Can now attempt to add species too, and then train on all of this - how does the error change when adding species? NOTE: current error with column 9 (all zeros) 

In [16]:
feat_w_dbh_species = features
for i in range(5):
    n = np.shape(nn_ind_matrix)[0]
    relevant_inds = nn_ind_matrix[:,i]
    #print (relevant_inds)
    
    corresp_dbhs = np.zeros((n,7))
    for j in range(n):
        index = int(relevant_inds[j])
        row =  quad1_t1[quad1_t1[:, 0] == index][:,2:9]
        # print (index)
        # print (row)
        corresp_dbhs[j] = row
        #print("correct is" + str(index))

    feat_w_dbh_species = np.hstack((feat_w_dbh_species, corresp_dbhs))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(feat_w_dbh_species, expected_labels, test_size=0.3)

feats = X_train[:,1:]
labels = y_train[:,1]

test_ids = X_test[:,0]
test_feats = X_test[:,1:]

Using SKLearn's Random Forest Regressor

In [18]:
random_forest = RandomForestRegressor()
random_forest.fit(feats,labels)

preds = random_forest.predict(test_feats)
preds_matrix = np.column_stack((test_ids, preds))

error1 = sklearn.metrics.mean_squared_error(y_test[:,1], preds)
print (np.sqrt(error1))
error1a = r2_score(y_test[:,1],preds)
print (error1a)

22.690216372869145
0.2049755215848027


Using XGBoost Regressor

In [19]:
xgb_tree = xgb.XGBRegressor()
xgb_tree.fit(feats,labels)
preds2 = xgb_tree.predict(test_feats)
predictions_matrix = np.column_stack((test_ids, preds2))
error2 = sklearn.metrics.mean_squared_error(y_test[:,1], preds2)
print (np.sqrt(error2))
error2a = r2_score (y_test[:,1], preds2)
print (error2a)

22.615017
0.2102364404481949


Randomization: at least as good as the model without neighbors.