**CREATING DISTANCE MATRICES**

In [10]:
import numpy as np
import pandas as pd
import scipy as sp
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics.pairwise import euclidean_distances
import category_encoders as ce
import xgboost as xgb


In [11]:

# load all data
raw_data_t1 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1985.csv")
raw_data_t2 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1990.csv")

# load the first quadrat that we will be working with
quad1_t1 = raw_data_t1.loc[raw_data_t1['quadrat']==1]
quad1_t2 = raw_data_t2.loc[raw_data_t2['quadrat']==1]




  raw_data_t1 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1985.csv")
  raw_data_t2 = pd.read_csv("/Users/dylanvanbramer/indresearch/xu/deep_learning/bci_census/bci_1990.csv")


In [12]:
# we only care about the tree IDs and DBHs of quad1_t2
expected_labels = quad1_t2[['treeID', 'dbh']]
expected_labels = expected_labels.rename(columns={"dbh": "dbh2", "treeID": "treeID2"})

#quad1_t1.head()
# simplify the data to have less features
quad1_t1 = quad1_t1[['treeID', 'sp', 'gx', 'gy', 'dbh']]
quad1_t1 = quad1_t1.rename(columns={"dbh": "dbh1", "treeID":"treeID1"})



In [13]:
encoder= ce.BinaryEncoder(cols=['sp'],return_df=True)
quad1_t1 = encoder.fit_transform(quad1_t1)
df_combined = pd.concat([quad1_t1, expected_labels], axis=1)

# Drop rows with any NaN values
df_combined_clean = df_combined.dropna()

# Now has more values because species is encoded to 7 different categories



In [14]:
# Separate the cleaned DataFrame and labels
df_clean = df_combined_clean[quad1_t1.columns]
labels_clean = df_combined_clean[expected_labels.columns]

quad1_t1 = df_clean.to_numpy()
expected_labels = labels_clean.to_numpy()

quad1_t1 = quad1_t1.astype(np.float32)
expected_labels = expected_labels.astype(np.float32)

In [15]:
#ids = quad1_t1[:,0]
#x_coordinates = quad1_t1[:, 8]  
#y_coordinates = quad1_t1[:, 9]
#coord_matrix = np.column_stack((x_coordinates, y_coordinates))

#tree1 = quad1_t1[0,8:10].reshape(1,-1)
#distances_matrix_1 = euclidean_distances(tree1, coord_matrix)

#sorted_indices = np.argsort(distances_matrix_1, axis = 1)
#sorted_distances = np.sort(distances_matrix_1, axis=1)

#sorted_matrix = np.column_stack(sorted_indices, sorted_distances)
#sorted_indices_matrix = np.argsort(distances_matrix, axis=1)
#sorted_indices_matrix = sorted_indices_matrix[:, :51]

#sorted_distances_matrix = ids[:, np.newaxis].repeat(51, axis=1)
#sorted_distances_matrix = np.column_stack((sorted_distances_matrix, sorted_indices_matrix))

#sorted_distances_matrix = sorted_distances_matrix[np.argsort(sorted_distances_matrix[:, 1:])]

In [16]:
#np.sort(distances_matrix_1, axis=1)

#sorted_matrix = np.column_stack(sorted_indices, sorted_distances)
#sorted_indices_matrix = np.argsort(distances_matrix, axis=1)
#sorted_indices_matrix = sorted_indices_matrix[:, :51]

#sorted_distances_matrix = ids[:, np.newaxis].repeat(51, axis=1)
#sorted_distances_matrix = np.column_stack((sorted_distances_matrix, sorted_indices_matrix))

#sorted_distances_matrix = sorted_distances_matrix[np.argsort(sorted_distances_matrix[:, 1:])]

BELOW: attempt with just tree 1

In [17]:


ids = quad1_t1[:,0]
x_coordinates = quad1_t1[:, 8]  
y_coordinates = quad1_t1[:, 9]
coord_matrix = np.column_stack((x_coordinates, y_coordinates))

tree2 = sp.spatial.KDTree(coord_matrix)

In [18]:
focal_tree_loc = coord_matrix[0]
dist, ind = tree2.query(focal_tree_loc, k =50)

In [19]:
nn_ids = ids[ind]
#this is just the nearest neighbors of the first tree. we want to do this for each tree.

BELOW: attempting with LOOP - works great!

In [20]:
nn_dist_matrix = np.zeros((len(coord_matrix),50))
nn_ind_matrix = np.zeros((len(coord_matrix),50))

In [21]:
for i, tree in enumerate(coord_matrix):
    dist2, ind2 = tree2.query(tree, k=50)
    nn_ind_matrix[i] = ids[ind2]
    nn_dist_matrix[i]= dist2


  # if we wanted to just use the ids instead of the distances of the nearest 
  # neighbors (mostly just helps with making sure the algorithm is running 
  # consistently, so used during testing), we would set ..[i] = ids[ind2]

BELOW: trying to increase efficiency by using numpy broadcasting isntead of loops. will get back to this later on because it adds another dimension such that the data is nx1x50 instead of nx50.

In [22]:
#dist3, ind3 = tree2.query(coord_matrix, k=50)
#ind_array = np.arange(len(coord_matrix))

#nn_dist_matrix2 = dist3[ind_array[:,None],:]
#before = ids[ind3]
#nn_dist_matrix3 = before[ind_array[:,None],:]
#ADDS ANOTHER DIMENSION
#np.zeros((len(coord_matrix),50))
#nn_dist_matrix2[ind_array] = ids[ind3]

Next: add the focal tree's species and dbh. This model will consider the focal tree's features as well as the distances from other trees. It will NOT consider the species or DBH of those other trees yet. I am removing the raw x and y coordinates here because distance is far more important.

In [23]:
dbhs = quad1_t1[:, 10]
species = quad1_t1[:,1:8]
features = np.column_stack([ids, dbhs])
#print (np.shape(features))
#print (np.shape(species))
features = np.column_stack([features, species, nn_dist_matrix])
#print (np.shape(features))

Actually first, let's train a regression tree on this data and see how it works! Ideally better than the error of the intrinsic growth using the same model (in attempt1nn)

In [24]:
kdtree = DecisionTreeRegressor()
X_train, X_test, y_train, y_test = train_test_split(features, expected_labels, test_size=0.3)


In [25]:
# don't include the tree ID tags as numerical data, then train.
kdtree.fit(X_train[:,1:], y_train[:,1])

In [26]:
# test the KD tree
test_ids = X_test[:, 0]  
preds = kdtree.predict(X_test[:,1:])
preds_matrix = np.column_stack((test_ids, preds))

In [27]:
# how well does it work?!
test_error = sklearn.metrics.mean_squared_error(y_test[:,1], preds)
print(test_error)

1570.642857142857


Error is still quite high, BUT we can try to change that by adding more test data soon. Still training on a very small amount of data (only 100 points)

NEXT, use a loop to add the dbh of each tree. To do this, must add extra dimensions. Will not species yet because it will add many more dimensions. 

In [44]:
for i in range(50):
    #first find the dbhs associated with those tree IDs - need to use the 
    #information from ids[ind], as the distances do not tell us which tree they correspond to.
    
    #below are the indices of the 50 closest trees to tree i
    relevant_inds = nn_ind_matrix[:,i]
    n = np.shape(nn_ind_matrix)[0]
    corresp_dbhs = np.zeros(n)
    for j in np.nditer(relevant_inds):
        # turn j into int
        corresp_dbhs[j] = quad1_t1[j,10]
    print(corresp_dbhs)
    # corresp_dbhs = quad1_t1[relevant_inds,10]


    feat_w_dbh = np.insert(features, 10+i, corresp_dbhs, axis =1 )

    



8224.0


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

just numerically , stop using encoding