## DSGA 1001 - Tree Analysis (Classification)
### December 16, 2022
Code by: Leo Chen and Xinyue Ma


In [50]:
# import libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [49]:
# Reproducibility: a single seed value drives the legacy global NumPy RNG and a
# dedicated Generator instance. `rs` keeps the same value as a plain int
# (presumably for `random_state=` arguments; not used in the cells shown here).
rs = 18962882
np.random.seed(rs)
rng = np.random.default_rng(rs)

In [51]:
# Load the raw street-tree census CSV; the relative path expects the file one
# directory above the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../2015StreetTreesCensus_TREES.csv')

In [52]:
# Dimensions of the raw table as (rows, columns), then a preview of its first five rows
print(df.shape)
df.head(n=5)

(683788, 42)


Unnamed: 0,created_at,tree_id,block_id,the_geom,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,...,st_assem,st_senate,nta,nta_name,boro_ct,state,Latitude,longitude,x_sp,y_sp
0,08/27/2015,180683,348711,POINT (-73.84421521958048 40.723091773924274),3,0,OnCurb,Alive,Fair,Acer rubrum,...,28,16,QN17,Forest Hills,4073900,New York,40.723092,-73.844215,1027431.0,202756.768749
1,09/03/2015,200540,315986,POINT (-73.81867945834878 40.79411066708779),21,0,OnCurb,Alive,Fair,Quercus palustris,...,27,11,QN49,Whitestone,4097300,New York,40.794111,-73.818679,1034456.0,228644.837379
2,09/05/2015,204026,218365,POINT (-73.93660770459083 40.717580740099116),3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,...,50,18,BK90,East Williamsburg,3044900,New York,40.717581,-73.936608,1001823.0,200716.891267
3,09/05/2015,204337,217969,POINT (-73.93445615919741 40.713537494833226),10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,...,53,18,BK90,East Williamsburg,3044900,New York,40.713537,-73.934456,1002420.0,199244.253136
4,08/30/2015,189565,223043,POINT (-73.97597938483258 40.66677775537875),21,0,OnCurb,Alive,Good,Tilia americana,...,44,21,BK37,Park Slope-Gowanus,3016500,New York,40.666778,-73.975979,990913.8,182202.425999


In [53]:
# features available: the column labels of the raw table
df.columns

Index(['created_at', 'tree_id', 'block_id', 'the_geom', 'tree_dbh',
       'stump_diam', 'curb_loc', 'status', 'health', 'spc_latin', 'spc_common',
       'steward', 'guards', 'sidewalk', 'user_type', 'problems', 'root_stone',
       'root_grate', 'root_other', 'trnk_wire', 'trnk_light', 'trnk_other',
       'brnch_ligh', 'brnch_shoe', 'brnch_othe', 'address', 'zipcode',
       'zip_city', 'cb_num', 'borocode', 'boroname', 'cncldist', 'st_assem',
       'st_senate', 'nta', 'nta_name', 'boro_ct', 'state', 'Latitude',
       'longitude', 'x_sp', 'y_sp'],
      dtype='object')

In [54]:
# Distinct levels within the `status` column
df.loc[:, 'status'].unique()

array(['Alive', 'Dead', 'Stump'], dtype=object)

In [55]:
# Peek at the first three records whose status is 'Stump'
df.loc[df['status'].eq('Stump')].head(n=3)

Unnamed: 0,created_at,tree_id,block_id,the_geom,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,...,st_assem,st_senate,nta,nta_name,boro_ct,state,Latitude,longitude,x_sp,y_sp
307,09/07/2015,208525,103974,POINT (-73.98974807814508 40.764580563550844),0,9,OnCurb,Stump,,,...,75,27,MN15,Clinton,1013300,New York,40.764581,-73.989748,987089.907694,217834.227439
494,08/26/2015,176343,230511,POINT (-73.95830423336267 40.637879214299566),0,17,OnCurb,Stump,,,...,42,21,BK42,Flatbush,3051601,New York,40.637879,-73.958304,995822.209127,171675.729205
598,09/19/2015,239221,225750,POINT (-73.97718356345842 40.655987310935444),0,16,OnCurb,Stump,,,...,44,21,BK40,Windsor Terrace,3017100,New York,40.655987,-73.977184,990580.736999,178271.075594


In [56]:
# Peek at the first three records whose status is 'Dead'
df.loc[df['status'].eq('Dead')].head(n=3)

Unnamed: 0,created_at,tree_id,block_id,the_geom,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,...,st_assem,st_senate,nta,nta_name,boro_ct,state,Latitude,longitude,x_sp,y_sp
61,09/01/2015,194505,341521,POINT (-73.88725059080657 40.75902778398188),2,0,OnCurb,Dead,,,...,34,13,QN28,Jackson Heights,4030903,New York,40.759028,-73.887251,1015486.0,215831.103492
370,09/13/2015,220914,105556,POINT (-73.97827071291084 40.74529871427063),4,0,OnCurb,Dead,,,...,73,28,MN20,Murray Hill-Kips Bay,1007000,New York,40.745299,-73.978271,990271.0,210809.804152
556,08/27/2015,179963,224926,POINT (-74.02148258671878 40.6463105957943),2,0,OnCurb,Dead,,,...,51,23,BK32,Sunset Park West,3002200,New York,40.646311,-74.021483,978288.5,174745.487829


In [57]:
# Peek at the first three records whose status is 'Alive'
df.loc[df['status'].eq('Alive')].head(n=3)

Unnamed: 0,created_at,tree_id,block_id,the_geom,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,...,st_assem,st_senate,nta,nta_name,boro_ct,state,Latitude,longitude,x_sp,y_sp
0,08/27/2015,180683,348711,POINT (-73.84421521958048 40.723091773924274),3,0,OnCurb,Alive,Fair,Acer rubrum,...,28,16,QN17,Forest Hills,4073900,New York,40.723092,-73.844215,1027431.0,202756.768749
1,09/03/2015,200540,315986,POINT (-73.81867945834878 40.79411066708779),21,0,OnCurb,Alive,Fair,Quercus palustris,...,27,11,QN49,Whitestone,4097300,New York,40.794111,-73.818679,1034456.0,228644.837379
2,09/05/2015,204026,218365,POINT (-73.93660770459083 40.717580740099116),3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,...,50,18,BK90,East Williamsburg,3044900,New York,40.717581,-73.936608,1001823.0,200716.891267


In [58]:
# Restrict the working set to rows whose status is exactly 'Alive',
# then report its dimensions and column labels.
alive = df.loc[df['status'].eq('Alive')]
print(alive.shape)
alive.columns

(652173, 42)


Index(['created_at', 'tree_id', 'block_id', 'the_geom', 'tree_dbh',
       'stump_diam', 'curb_loc', 'status', 'health', 'spc_latin', 'spc_common',
       'steward', 'guards', 'sidewalk', 'user_type', 'problems', 'root_stone',
       'root_grate', 'root_other', 'trnk_wire', 'trnk_light', 'trnk_other',
       'brnch_ligh', 'brnch_shoe', 'brnch_othe', 'address', 'zipcode',
       'zip_city', 'cb_num', 'borocode', 'boroname', 'cncldist', 'st_assem',
       'st_senate', 'nta', 'nta_name', 'boro_ct', 'state', 'Latitude',
       'longitude', 'x_sp', 'y_sp'],
      dtype='object')

In [59]:
# Show rows that exactly repeat an earlier row (an empty frame means there are none)
alive.loc[alive.duplicated(keep='first')]

Unnamed: 0,created_at,tree_id,block_id,the_geom,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,...,st_assem,st_senate,nta,nta_name,boro_ct,state,Latitude,longitude,x_sp,y_sp


In [60]:
# Discard every row that has a missing value in any column, then summarise the result
alive = alive.dropna(axis=0, how='any')
alive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 652118 entries, 0 to 683787
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   created_at  652118 non-null  object 
 1   tree_id     652118 non-null  int64  
 2   block_id    652118 non-null  int64  
 3   the_geom    652118 non-null  object 
 4   tree_dbh    652118 non-null  int64  
 5   stump_diam  652118 non-null  int64  
 6   curb_loc    652118 non-null  object 
 7   status      652118 non-null  object 
 8   health      652118 non-null  object 
 9   spc_latin   652118 non-null  object 
 10  spc_common  652118 non-null  object 
 11  steward     652118 non-null  object 
 12  guards      652118 non-null  object 
 13  sidewalk    652118 non-null  object 
 14  user_type   652118 non-null  object 
 15  problems    652118 non-null  object 
 16  root_stone  652118 non-null  object 
 17  root_grate  652118 non-null  object 
 18  root_other  652118 non-null  object 
 19  tr

In [61]:
# Remove the columns that are not kept as features, then summarise what remains
alive = alive.drop(
    ['created_at', 'tree_id', 'stump_diam', 'the_geom', 'address',
     'state', 'x_sp', 'y_sp', 'status'],
    axis='columns',
)
alive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 652118 entries, 0 to 683787
Data columns (total 33 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   block_id    652118 non-null  int64  
 1   tree_dbh    652118 non-null  int64  
 2   curb_loc    652118 non-null  object 
 3   health      652118 non-null  object 
 4   spc_latin   652118 non-null  object 
 5   spc_common  652118 non-null  object 
 6   steward     652118 non-null  object 
 7   guards      652118 non-null  object 
 8   sidewalk    652118 non-null  object 
 9   user_type   652118 non-null  object 
 10  problems    652118 non-null  object 
 11  root_stone  652118 non-null  object 
 12  root_grate  652118 non-null  object 
 13  root_other  652118 non-null  object 
 14  trnk_wire   652118 non-null  object 
 15  trnk_light  652118 non-null  object 
 16  trnk_other  652118 non-null  object 
 17  brnch_ligh  652118 non-null  object 
 18  brnch_shoe  652118 non-null  object 
 19  br

In [63]:
# Names of the ten most frequent species (by common name), most frequent first
top10 = list(alive['spc_common'].value_counts().index[:10])
top10

['London planetree',
 'honeylocust',
 'Callery pear',
 'pin oak',
 'Norway maple',
 'littleleaf linden',
 'cherry',
 'Japanese zelkova',
 'ginkgo',
 'Sophora']

In [65]:
# Number of trees for each of the ten most frequent species
alive['spc_common'].value_counts().iloc[:10]

London planetree     87012
honeylocust          64258
Callery pear         58924
pin oak              53179
Norway maple         34189
littleleaf linden    29742
cherry               29271
Japanese zelkova     29258
ginkgo               21019
Sophora              19338
Name: spc_common, dtype: int64

In [66]:
# Keep only the trees whose common-name species appears in `top10`,
# then confirm the new size and the species that remain.
alive = alive.loc[alive['spc_common'].isin(top10)]
print(alive.shape)
alive.loc[:, 'spc_common'].unique()

(426190, 33)


array(['pin oak', 'honeylocust', 'London planetree', 'ginkgo',
       'Norway maple', 'Sophora', 'Callery pear', 'Japanese zelkova',
       'littleleaf linden', 'cherry'], dtype=object)

In [67]:
# convert columns to appropriate types
# Every column is first cast to the generic `object` dtype so that numeric-looking
# codes (e.g. block_id, zipcode) are not treated as quantities. The three
# measurement columns are then converted back to numeric dtypes.
# A single `astype` call plus `assign` replaces the former column-by-column loop:
# the resulting dtypes are the same, but the frame is rebuilt rather than mutated
# in place, so the cell gives the same result when re-executed.
cols = alive.columns
alive = alive.astype({name: "object" for name in cols})
alive = alive.assign(
    tree_dbh=pd.to_numeric(alive['tree_dbh']),
    Latitude=pd.to_numeric(alive['Latitude']),
    longitude=pd.to_numeric(alive['longitude']),
)

In [68]:
# summarise the frame (row count, non-null counts and dtype of each column)
alive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426190 entries, 1 to 683783
Data columns (total 33 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   block_id    426190 non-null  object 
 1   tree_dbh    426190 non-null  int64  
 2   curb_loc    426190 non-null  object 
 3   health      426190 non-null  object 
 4   spc_latin   426190 non-null  object 
 5   spc_common  426190 non-null  object 
 6   steward     426190 non-null  object 
 7   guards      426190 non-null  object 
 8   sidewalk    426190 non-null  object 
 9   user_type   426190 non-null  object 
 10  problems    426190 non-null  object 
 11  root_stone  426190 non-null  object 
 12  root_grate  426190 non-null  object 
 13  root_other  426190 non-null  object 
 14  trnk_wire   426190 non-null  object 
 15  trnk_light  426190 non-null  object 
 16  trnk_other  426190 non-null  object 
 17  brnch_ligh  426190 non-null  object 
 18  brnch_shoe  426190 non-null  object 
 19  br

In [69]:
# Health (Good - 1, Not Good - 0)
# Binarise the target in place: 'Good' -> 1, every other label -> 0.
# Guard against re-execution: once the column holds only 0/1, comparing it with
# 'Good' again would silently overwrite every value with 0.
if not alive['health'].isin([0, 1]).all():
    alive['health'] = np.where(alive['health'] == 'Good', 1, 0)

In [70]:
# Inspect the encoded target column
alive.loc[:, 'health']

1         0
2         1
3         1
5         1
6         1
         ..
683779    1
683780    0
683781    1
683782    1
683783    1
Name: health, Length: 426190, dtype: int32

### Save the processed data frame

In [71]:
# Export is currently disabled; uncomment the line below to pickle the processed
# frame to a file named 'df_alive' in the current working directory.
#alive.to_pickle('df_alive')