In [None]:
# Import libraries
import pandas as pd
from sodapy import Socrata
from dotenv import load_dotenv
import os
import pyvin
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import shape
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
tqdm.pandas()

In [76]:
def run_client(client_url):
    '''
    Input:
        - client_url: domain passed to Socrata as its first argument
          (this notebook calls it with 'data.ny.gov' and 'data.cityofnewyork.us')
    Output:
        - Socrata client built from credentials stored in environment variables
    '''
    # Read the .env file so its entries become visible to os.getenv
    load_dotenv()

    # Look the credentials up by name; os.getenv returns None for any that are unset
    app_token = os.getenv('nys_dmv_app_token')
    api_key_id = os.getenv('nys_dmv_api_key_id')
    secret = os.getenv('nys_dmv_secret')

    return Socrata(client_url, app_token, username=api_key_id, password=secret)

def load_dmv_api(client_url, num_records=1000):
    '''
    Input:
        - client_url: Socrata domain that serves the DMV registration dataset
          (this notebook passes 'data.ny.gov')
        - num_records: maximum number of rows to request; defaults to 1000,
          the cap that used to be hard-coded inside the function
    Output:
        - Dataframe with the vin, county and zip of every returned record where
          record_type is 'VEH', county is 'NEW YORK' and fuel_type is 'ELECTRIC'
            - NOTE(review): the filter matches the single county 'NEW YORK',
              not all five New York City counties - confirm this is intended
    '''

    # Set up endpoint
    endpoint = 'w4pv-hbkt'

    # Get client information
    client = run_client(client_url)

    # int() makes sure only a whole number is interpolated into the query text
    row_limit = int(num_records)

    # Set up query for api. Select only electric vehicles registered in NEW YORK county
    query = f"""
        SELECT vin, county, zip
        WHERE record_type = 'VEH'
        AND county in ('NEW YORK')
        AND fuel_type = 'ELECTRIC'
        LIMIT {row_limit}
    """

    try:
        # Pull results from DMV registration that are vehicles
        results = client.get(endpoint, query=query)
    finally:
        # Release the underlying HTTP session even when the request raises
        client.close()

    # Change results into dataframe
    return pd.DataFrame.from_records(results)

def load_zip_api(client_url):
    '''
    Input:
        - client_url: Socrata domain that serves the zipcode dataset
          (this notebook passes 'data.cityofnewyork.us')
    Output:
        - Dataframe with NYC zipcodes (every record and column of the
          'pri4-ifjk' dataset)
    '''

    # Set up endpoint
    endpoint = 'pri4-ifjk'

    # Get client information
    client = run_client(client_url)

    try:
        # Get total number of records in api so the LIMIT below covers all of them
        query_count = "SELECT COUNT(*)"
        num_records = int(client.get(endpoint, query=query_count)[0]['COUNT'])

        # Set up query for api. Select every column of every zipcode record
        query = f"""
            SELECT *
            LIMIT {num_records}
        """

        # Pull the zipcode records
        results = client.get(endpoint, query=query)
    finally:
        # Release the underlying HTTP session even when a request raises
        client.close()

    # Change results into dataframe
    return pd.DataFrame.from_records(results)

In [77]:
# Load the registrations, keep only rows whose VIN has the 17 characters of a
# passenger-vehicle VIN, then discard rows with any missing value.
# The trailing .copy() leaves df as an independent frame, so adding columns to
# it later does not act on a slice of the loaded data.
df = (
    load_dmv_api('data.ny.gov')
    .loc[lambda frame: frame['vin'].str.len() == 17]
    .dropna()
    .copy()
)


In [None]:
# Decode each VIN into its model name, showing a tqdm progress bar.
# NOTE(review): error_handling=pyvin.PASS presumably suppresses decode errors
# instead of raising - confirm against the pyvin documentation.
df['model'] = df['vin'].progress_apply(
    lambda vin: pyvin.VIN(vin, error_handling=pyvin.PASS).Model
)

  0%|          | 0/996 [00:00<?, ?it/s]

In [None]:
# Drop values if model could not be calculated
# Rows whose 'model' entry is missing are removed. df is modified in place,
# so every existing reference to this frame sees the reduced set of rows.
# NOTE(review): this only helps if the VIN decoder leaves None/NaN (rather
# than e.g. an empty string) for VINs it cannot resolve - confirm.
df.dropna(subset=['model'], inplace = True) 

In [73]:
# Fetch the zipcode records from the NYC open-data domain
zip_df = load_zip_api('data.cityofnewyork.us')

# Turn each 'the_geom' entry into a shapely geometry object
zip_df['geometry'] = zip_df['the_geom'].apply(shape)

# Wrap the frame as a GeoDataFrame, tagging the geometry column as EPSG:4326
gdf_zip = gpd.GeoDataFrame(
    zip_df,
    geometry='geometry',
    crs='EPSG:4326',
)

In [72]:
# Features: every column except the target ('model') and the raw 'vin'.
# LogisticRegression needs numeric input, so the remaining string columns are
# one-hot encoded. The VIN is left out because it is a per-vehicle identifier
# string; encoding it would add roughly one column per row.
X = pd.get_dummies(df.drop(columns=['model', 'vin']))

# Target: the model name decoded from the VIN
y = df['model']

# Fixed random_state so the split (and everything downstream) is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit on the training split only, so X_test / y_test stay unseen for evaluation
clf = LogisticRegression().fit(X_train, y_train)

Unnamed: 0_level_0,vin,county,model
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10001,7SAYGDEE5PA023568,NEW YORK,Model Y
10002,7SAYGDEE1PA039041,NEW YORK,Model Y
10003,7SAYGDEE2PF611379,NEW YORK,Model Y
10004,7SAXCBE58PF370813,NEW YORK,Model X
10005,7SAYGDEE2NF424348,NEW YORK,Model Y
10007,7SAYGAEE2NF515571,NEW YORK,Model Y
10011,7PDSGABL4NN001771,NEW YORK,R1S
10012,5YJ3E1EC0LF736062,NEW YORK,Model 3
10013,7PDSGABL5NN001729,NEW YORK,R1S
10014,7SAXCAE59NF346530,NEW YORK,Model X
