# NYC trees dataset challenge

### Import the pandas library and read the CSV file

In [40]:
import pandas as pd


# Load the tree dataset from the local assets folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./assets/data_100000.csv')

### Print general info about data types and null values

In [41]:
# Summarize the frame: one line per column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   created_at  100000 non-null  object 
 1   tree_id     100000 non-null  int64  
 2   block_id    100000 non-null  int64  
 3   the_geom    100000 non-null  object 
 4   tree_dbh    100000 non-null  int64  
 5   stump_diam  100000 non-null  int64  
 6   curb_loc    100000 non-null  object 
 7   status      100000 non-null  object 
 8   health      95007 non-null   object 
 9   spc_latin   95008 non-null   object 
 10  spc_common  95008 non-null   object 
 11  steward     95008 non-null   object 
 12  guards      95008 non-null   object 
 13  sidewalk    95008 non-null   object 
 14  user_type   100000 non-null  object 
 15  problems    95008 non-null   object 
 16  root_stone  100000 non-null  object 
 17  root_grate  100000 non-null  object 
 18  root_other  100000 non-null  object 
 19  trn

### Convert the created_at field from string to datetime

In [42]:
# Convert the whole column in a single vectorized call instead of invoking
# pd.to_datetime once per row through apply(); the resulting dtype is the same
# (datetime64[ns]) but the parsing work is done in bulk.
# NOTE(review): assumes every created_at value uses the same date format — confirm.
df["created_at"] = pd.to_datetime(df["created_at"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   created_at  100000 non-null  datetime64[ns]
 1   tree_id     100000 non-null  int64         
 2   block_id    100000 non-null  int64         
 3   the_geom    100000 non-null  object        
 4   tree_dbh    100000 non-null  int64         
 5   stump_diam  100000 non-null  int64         
 6   curb_loc    100000 non-null  object        
 7   status      100000 non-null  object        
 8   health      95007 non-null   object        
 9   spc_latin   95008 non-null   object        
 10  spc_common  95008 non-null   object        
 11  steward     95008 non-null   object        
 12  guards      95008 non-null   object        
 13  sidewalk    95008 non-null   object        
 14  user_type   100000 non-null  object        
 15  problems    95008 non-null   object        
 16  roo

### Check for null values in columns and rows

In [43]:
# Per-column count of missing values, restricted to columns that have at least one.
df_col_nulls = df.isna().sum().loc[lambda counts: counts > 0]
print(df_col_nulls)

health        4993
spc_latin     4992
spc_common    4992
steward       4992
guards        4992
sidewalk      4992
problems      4992
dtype: int64


In [44]:
# Per-row count of missing values, restricted to rows that have at least one.
df_row_nulls = df.isna().sum(axis="columns").loc[lambda counts: counts > 0]
print(df_row_nulls)

630      7
631      7
634      7
635      7
636      7
        ..
99291    7
99293    7
99294    7
99295    7
99296    7
Length: 4993, dtype: int64


In [45]:
# Preview the first rows of only those columns that contain missing values.
df.loc[:, df_col_nulls.index].head()

Unnamed: 0,health,spc_latin,spc_common,steward,guards,sidewalk,problems
0,Fair,Acer rubrum,red maple,,,NoDamage,
1,Fair,Quercus palustris,pin oak,,,Damage,Stones
2,Good,Gleditsia triacanthos var. inermis,honeylocust,1or2,,Damage,
3,Good,Gleditsia triacanthos var. inermis,honeylocust,,,Damage,Stones
4,Good,Tilia americana,American linden,,,Damage,Stones


### Missing data cannot be imputed, so rows with null values are removed

In [47]:
# Drop every row that contains at least one missing value.
# `how` and `thresh` are mutually exclusive in pandas: supplying both (even
# thresh=None) triggers a FutureWarning on pandas 1.5 and a TypeError on
# pandas >= 2.0, so only `how` is passed. Rebinding the result instead of using
# inplace=True keeps the cell safe to re-run and avoids mutating the frame
# behind earlier outputs.
df = df.dropna(axis=0, how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95007 entries, 0 to 99999
Data columns (total 42 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  95007 non-null  datetime64[ns]
 1   tree_id     95007 non-null  int64         
 2   block_id    95007 non-null  int64         
 3   the_geom    95007 non-null  object        
 4   tree_dbh    95007 non-null  int64         
 5   stump_diam  95007 non-null  int64         
 6   curb_loc    95007 non-null  object        
 7   status      95007 non-null  object        
 8   health      95007 non-null  object        
 9   spc_latin   95007 non-null  object        
 10  spc_common  95007 non-null  object        
 11  steward     95007 non-null  object        
 12  guards      95007 non-null  object        
 13  sidewalk    95007 non-null  object        
 14  user_type   95007 non-null  object        
 15  problems    95007 non-null  object        
 16  root_stone  95007 non-

### Finding duplicate data by tree_id

In [48]:
# Number of rows whose tree_id repeats an earlier row's tree_id.
df["tree_id"].duplicated().sum()

0

### Exporting processed dataframe to a CSV file

In [49]:
# index=False: the row index is only a positional label (with gaps after the
# null rows were dropped), not data. Writing it would add an unnamed extra
# column, so the output would no longer match the 42-column input schema.
df.to_csv('./assets/data_100000_output.csv', encoding='utf-8', index=False)