# cx2313_ly2637 Final Project

### Setup

In [2]:
import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base
from sqlalchemy import Column, Integer, Float, String, DateTime, create_engine
from sqlalchemy.ext.declarative import declarative_base

In [3]:
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "ZIP_CODE_040114.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "4P7xr8685SCdZVFOLXScTCqJi"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_SCHEMA_FILE = "schema.sql"
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

### Part 1: Data Preprocessing

The first part of the project involves two main activities. The initial step is to download specific datasets manually. This is followed by using Python scripts for automated data downloads. Once the data is collected, the next step is to sort through it. This includes selecting the relevant information, fixing any missing or incorrect data, and creating samples from these datasets for further analysis.

#### 1.1 Load and clean data for zipcode file
- For the zipcode file, first, we'll remove any columns that are not needed. Next, we'll review the basic information of the dataset. Following that, we'll identify the parts that need cleaning and proceed with the cleaning process.

In [8]:
# Step 1: Load data and remove unnecessary columns
zipcode_data_file = DATA_DIR / "nyc_zipcodes.shp"
gdf = gpd.read_file(zipcode_data_file)

columns_to_keep = ['ZIPCODE', 'AREA', 'STATE', 'COUNTY', 'geometry']
zipcode_gdf = gdf[columns_to_keep]

# Rename columns for consistency
zipcode_gdf = zipcode_gdf.rename(columns={'ZIPCODE': 'zipcode', 'City': 'city','COUNTY': 'county','STATE': 'state','AREA': 'area'})

In [9]:
# Step 2: basic information of the data.
print(zipcode_gdf.info())
zipcode_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zipcode   263 non-null    object  
 1   area      263 non-null    float64 
 2   state     263 non-null    object  
 3   county    263 non-null    object  
 4   geometry  263 non-null    geometry
dtypes: float64(1), geometry(1), object(3)
memory usage: 10.4+ KB
None


Unnamed: 0,zipcode,area,state,county,geometry
0,11436,22699300.0,NY,Queens,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,29631000.0,NY,Kings,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,41972100.0,NY,Kings,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,23698630.0,NY,Kings,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,36868800.0,NY,Kings,"POLYGON ((991997.113 176307.496, 992042.798 17..."


- From the results we observed, it's clear that the geometry column in the zipcode file is not in the WGS coordinate system. Therefore, we will need to include a transformation in our clean function to convert the geometries to the WGS system for consistency and analysis compatibility.
- Furthermore, we will drop any duplicate rows to ensure data integrity and accuracy for our analysis and reporting. This step is crucial for maintaining the quality of our dataset and providing reliable insights.

In [10]:
# Step 3: Clean function
def clean_zipcode_data(gdf):
    gdf_cleaned = gdf.copy()
    
    # Change to WGS system for consistency
    gdf_cleaned = gdf_cleaned.to_crs(epsg=4326)
    
    # Validate geometric data in 'geometry'
    gdf_cleaned = gdf_cleaned[gdf_cleaned['geometry'].is_valid]
    
    # Remove duplicates
    gdf_cleaned = gdf_cleaned.drop_duplicates()
    
    # Ensure categorical consistency
    categorical_columns = ['state', 'county']
    for col in categorical_columns:
        gdf_cleaned.loc[:, col] = gdf_cleaned[col].str.title()
    
    return gdf_cleaned

cleaned_zipcode_gdf = clean_zipcode_data(zipcode_gdf)

- Check cleaned data in details

In [11]:
print(cleaned_zipcode_gdf.info())
cleaned_zipcode_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 262 entries, 0 to 262
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zipcode   262 non-null    object  
 1   area      262 non-null    float64 
 2   state     262 non-null    object  
 3   county    262 non-null    object  
 4   geometry  262 non-null    geometry
dtypes: float64(1), geometry(1), object(3)
memory usage: 12.3+ KB
None


Unnamed: 0,zipcode,area,state,county,geometry
0,11436,22699300.0,Ny,Queens,"POLYGON ((-73.80585 40.68291, -73.80569 40.682..."
1,11213,29631000.0,Ny,Kings,"POLYGON ((-73.93740 40.67973, -73.93487 40.679..."
2,11212,41972100.0,Ny,Kings,"POLYGON ((-73.90294 40.67084, -73.90223 40.668..."
3,11225,23698630.0,Ny,Kings,"POLYGON ((-73.95797 40.67066, -73.95576 40.670..."
4,11218,36868800.0,Ny,Kings,"POLYGON ((-73.97208 40.65060, -73.97192 40.650..."


Now that we have all the zipcode information for New York City, we need to extract this information to use as a reference. This will help us filter the data needed for `311`, `tree`, and `Zillow`. We will include this step in the cleaning steps for `tree`, `311`, and `Zillow` files.

#### 1.2 Download and clean data for trees 
- For the download part, due to the large volume of data, a script is implemented to download, process, and store New York City tree data in a GeoJSON file, handling the data in chunks to manage the size and complexity efficiently. 

In [None]:
def download_nyc_geojson_data(url_base, filename, force=False):
    limit = 50000  
    offset = 0  
    all_data = []

    if force or not filename.exists():
        print(f"Downloading data to {filename}...")

        try:
            while True:
                params = {
                    '$limit': limit,
                    '$offset': offset,
                    '$$app_token': "4P7xr8685SCdZVFOLXScTCqJi"
                }
                url_with_params = f"{url_base}?{urllib.parse.urlencode(params)}"
                print("Requesting URL:", url_with_params)
                response = requests.get(url_with_params)
                response.raise_for_status()

                data = response.json()
                if not data['features']: 
                    break

                all_data.extend(data['features'])
                offset += limit
            
            geojson_feature_collection = {
                "type": "FeatureCollection",
                "features": all_data
            }

            with open(filename, "w") as f:
                json.dump(geojson_feature_collection, f)

            print(f"Done downloading data to {filename}.")
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    else:
        print(f"Reading from {filename}...")

    return filename

nyc_tree_data_file = DATA_DIR / "nyc_tree_data.geojson"
download_nyc_geojson_data("https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson", nyc_tree_data_file)

- Next, we will use GeoPandas to read the GeoJSON file, and then examine the data for any inconsistencies or areas that may require cleaning.
- Upon understanding the issues that require analysis, we will also select the necessary columns in this step, The columns we will use are `tree_id`, `zipcode`, `latitude`, `longitude`, `status`, `health`, `spc_common`, and `geometry`.

In summary, regarding the tree data, our approach will be as follows: First, we will remove unnecessary columns. Second, we will examine the basic information of the data. Third, we will clean the data where necessary. Since we are using GeoPandas to read the GeoJSON data, the dataframe will include a column named 'geometry', which is already in the WGS coordinate system, so no further processing is required for this column.

In [4]:
# Step 1: Load data and remove unnecessary columns.
nyc_tree_data_file = DATA_DIR / "nyc_tree_data.geojson"

geodf_tree_data=gpd.read_file(nyc_tree_data_file)
columns_to_keep = ['tree_id', 'zipcode', 'latitude', 'longitude', 'status', 'health', 'spc_common', 'geometry']
tree_gdf = geodf_tree_data[columns_to_keep]

In [5]:
# Step 2: basic information of the data.
print(tree_gdf.info())
tree_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 683788 entries, 0 to 683787
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   tree_id     683788 non-null  object  
 1   zipcode     683788 non-null  object  
 2   latitude    683788 non-null  object  
 3   longitude   683788 non-null  object  
 4   status      683788 non-null  object  
 5   health      652172 non-null  object  
 6   spc_common  652169 non-null  object  
 7   geometry    683788 non-null  geometry
dtypes: geometry(1), object(7)
memory usage: 41.7+ MB
None


Unnamed: 0,tree_id,zipcode,latitude,longitude,status,health,spc_common,geometry
0,180683,11375,40.72309177,-73.84421522,Alive,Fair,red maple,POINT (-73.84422 40.72309)
1,200540,11357,40.79411067,-73.81867946,Alive,Fair,pin oak,POINT (-73.81868 40.79411)
2,204026,11211,40.71758074,-73.9366077,Alive,Good,honeylocust,POINT (-73.93661 40.71758)
3,204337,11211,40.71353749,-73.93445616,Alive,Good,honeylocust,POINT (-73.93446 40.71354)
4,189565,11215,40.66677776,-73.97597938,Alive,Good,American linden,POINT (-73.97598 40.66678)


In [6]:
tree_with_nan = tree_gdf[tree_gdf.isnull().any(axis=1)]
num_rows_with_nan = len(tree_with_nan)
total_rows = len(tree_gdf)
percent_with_nan = (num_rows_with_nan / total_rows) * 100

print(f"percentage with nan: {percent_with_nan:.2f}%")
print(f"Number of duplicates: {tree_gdf.duplicated().sum()}")

percentage with nan: 4.62%
Number of duplicates: 0


The output indicates that the GeoDataFrame contains 683,788 entries across 7 columns. Of these, only the `health` and 
`spc_common` columns have missing values, as all other columns have a non-null count of 683,788. 

Given that the `health` and `spc_common` columns cannot be effectively imputed, and after assessing that missing values account for only 4.62% of the total data, the decision to delete all rows with missing values is deemed reasonable. 

In [12]:
# Step 3: Clean function
def clean_tree_data(gdf):
    gdf_cleaned = gdf.copy()
    
    # Filter based on zip codes present in the 'zipcode' dataframe
    gdf_cleaned = gdf_cleaned[gdf_cleaned['zipcode'].isin(cleaned_zipcode_gdf['zipcode'])]
    
    # Convert 'latitude' and 'longitude' to float
    gdf_cleaned['latitude'] = pd.to_numeric(gdf_cleaned['latitude'], errors='coerce')
    gdf_cleaned['longitude'] = pd.to_numeric(gdf_cleaned['longitude'], errors='coerce')
    
    # Ensure categorical consistency
    categorical_columns = ['status', 'health', 'spc_common']
    for col in categorical_columns:
        gdf_cleaned.loc[:, col] = gdf_cleaned[col].str.title() 
    
    # Validate geometric data in 'geometry'
    gdf_cleaned = gdf_cleaned[gdf_cleaned['geometry'].is_valid]
    
    # Remove duplicates
    gdf_cleaned = gdf_cleaned.drop_duplicates()
    
    # Remove rows with missing values in 'health' and 'spc_common'
    gdf_cleaned = gdf_cleaned.dropna(subset=['health', 'spc_common'])
    
    return gdf_cleaned

cleaned_tree_gdf = clean_tree_data(tree_gdf)

- Check clean data for details

In [13]:
print(cleaned_tree_gdf.info())
cleaned_tree_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 652167 entries, 0 to 683787
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   tree_id     652167 non-null  object  
 1   zipcode     652167 non-null  object  
 2   latitude    652167 non-null  float64 
 3   longitude   652167 non-null  float64 
 4   status      652167 non-null  object  
 5   health      652167 non-null  object  
 6   spc_common  652167 non-null  object  
 7   geometry    652167 non-null  geometry
dtypes: float64(2), geometry(1), object(5)
memory usage: 44.8+ MB
None


Unnamed: 0,tree_id,zipcode,latitude,longitude,status,health,spc_common,geometry
0,180683,11375,40.723092,-73.844215,Alive,Fair,Red Maple,POINT (-73.84422 40.72309)
1,200540,11357,40.794111,-73.818679,Alive,Fair,Pin Oak,POINT (-73.81868 40.79411)
2,204026,11211,40.717581,-73.936608,Alive,Good,Honeylocust,POINT (-73.93661 40.71758)
3,204337,11211,40.713537,-73.934456,Alive,Good,Honeylocust,POINT (-73.93446 40.71354)
4,189565,11215,40.666778,-73.975979,Alive,Good,American Linden,POINT (-73.97598 40.66678)
