In [None]:
import argparse
import json
import os
from time import time

import pandas as pd
import pyspark
import requests
from sodapy import Socrata
from sqlalchemy import create_engine
from tenacity import retry, wait_exponential, stop_after_attempt
#import psycopg2


In [None]:
# Socrata client for the NYC Open Data portal.
# The app token is read from the SOCRATA_APP_TOKEN environment variable so a real
# token never has to be saved inside the notebook; the original placeholder value
# is kept as the fallback so behaviour is unchanged when the variable is not set.
client = Socrata("data.cityofnewyork.us", os.environ.get("SOCRATA_APP_TOKEN", "token_alphanumeric"))

In [None]:
#Get API between two dates
# reraise=True makes tenacity re-raise the last underlying exception once the
# attempts are exhausted (instead of wrapping it in RetryError), so the caller's
# `except requests.exceptions.RequestException` can actually catch it.
@retry(wait=wait_exponential(multiplier=2, min=2, max=16), stop=stop_after_attempt(5), reraise=True)
def get_data_from_api(api_url,datasetid,year_from,year_to):
    """Fetch the rows of a Socrata dataset whose ``yr`` value lies in a year range.

    Args:
        api_url: Host name of the portal, e.g. ``data.cityofnewyork.us``.
        datasetid: Dataset identifier used in the ``/resource/<id>.json`` path.
        year_from: First year to include (inclusive), compared against ``yr``.
        year_to: Last year to include (inclusive), compared against ``yr``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with anything other than HTTP 200 on every
            retry attempt.
    """
    # Define the API endpoint
    url = f"https://{api_url}/resource/{datasetid}.json"
    params = {
        "$where": f"yr >= '{year_from}' AND yr <= '{year_to}'"
    }
    # Make the GET request; the timeout keeps a stalled connection from
    # blocking the notebook forever.
    response = requests.get(url, params=params, timeout=30)

    # Anything other than 200 is a failure. Raising (rather than falling through
    # to `return data` with `data` unbound) gives tenacity a meaningful error to
    # retry on and the caller a RequestException to handle.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        response.raise_for_status()
        # raise_for_status() only raises for 4xx/5xx; reject other statuses too.
        raise requests.exceptions.HTTPError(
            f"Unexpected status code: {response.status_code}", response=response
        )

    data = response.json()
    # Report the size of the payload instead of printing every record.
    print(f"Fetched {len(data)} records")
    return data
# Request the 2021 records and report either a successful connection or the
# request error that prevented it.
try:
    results = get_data_from_api('data.cityofnewyork.us','7ym2-wayt',2021,2021)
except requests.exceptions.RequestException as e:
    print(f"Failed to fetch data from API: {e}")
else:
    print("Connected to API")

In [None]:
# Show how many records were fetched plus a small sample; printing the whole
# payload floods the cell output and bloats the saved notebook.
print(f"Fetched {len(results)} records")
print(results[:5])

In [None]:
# reraise=True makes tenacity re-raise the last underlying exception once the
# attempts are exhausted (instead of wrapping it in RetryError), so the caller's
# `except requests.exceptions.RequestException` can actually catch it.
@retry(wait=wait_exponential(multiplier=2, min=2, max=16), stop=stop_after_attempt(5), reraise=True)
def get_data_from_api(client,data_set,limit_rows):
    """Fetch up to ``limit_rows`` records of a dataset through a Socrata client.

    Note: this definition reuses the name of the requests-based function defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        client: Object exposing ``get(dataset_id, limit=...)`` (the notebook
            passes its ``Socrata`` client).
        data_set: Dataset identifier to read.
        limit_rows: Maximum number of rows to request.

    Returns:
        Whatever ``client.get`` returns for the request.
    """
    results = client.get(data_set,limit=limit_rows)
    return results
# Fetch up to 10000 records through the Socrata client and report either a
# successful connection or the request error that prevented it.
try:
    results = get_data_from_api(client,"7ym2-wayt",10000)
except requests.exceptions.RequestException as e:
    print(f"Failed to fetch data from API: {e}")
else:
    print("Connected to API")

## Pandas

In [None]:
# Load the fetched API records into a pandas DataFrame for a first look.
# Note: the name `df` is rebound to a Spark DataFrame in the PySpark section below.
df = pd.DataFrame.from_records(results)

In [None]:
# Preview the first rows of the pandas DataFrame.
df.head()

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

## PySpark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    concat,
    length,
    lit,
    to_date,
    to_timestamp,
    upper,
    when,
)

# Start (or reuse) the Spark session used for the traffic-data transformations
# and print the Spark version it is running.
spark = (
    SparkSession.builder
    .appName("Transformations_NYC_Traffic_Data")
    .getOrCreate()
)
print(spark.version)

In [None]:
# Build the Spark DataFrame from the API records.
# Each record is serialised with json.dumps so Spark receives real JSON text (one
# object per RDD element). Passing the raw Python list made PySpark fall back to
# str(list), i.e. a Python repr, which is not valid JSON as soon as a value is
# None/True/False.
df = spark.read.json(spark.sparkContext.parallelize([json.dumps(record) for record in results]))

In [None]:
# Display the leading rows of the Spark DataFrame.
df.show()

In [None]:
# Register the Spark DataFrame as the temporary view Temp_Tbl so it can be queried with spark.sql.
df.createOrReplaceTempView('Temp_Tbl')

In [None]:
# Summed volume per borough/street for each 2021 date and time, ordered by street and time.
volume_by_street_sql = "SELECT boro,street,sum(vol),yr,m,d,hh,mm FROM Temp_Tbl where yr = 2021 Group By boro,street,yr,m,d,hh,mm order by street,yr,m,d,hh,mm asc;"
query = spark.sql(volume_by_street_sql)
query.show()

In [None]:
# Count of 2021 rows per month, ordered by year and month.
monthly_count_sql = "SELECT yr,m,count(m) FROM Temp_Tbl where yr = 2021 Group By yr,m order by yr,m asc;"
query = spark.sql(monthly_count_sql)
query.show()

In [None]:
# Left-pad single-character month (m), day (d), hour (hh) and minute (mm) values
# with "0" so every date/time part is two characters wide.
for part in ("m", "d", "hh", "mm"):
    df = df.withColumn(
        part,
        when(length(col(part)) == 1, concat(lit("0"), col(part))).otherwise(col(part)),
    )

# Assemble the text "yr-m-d hh:mm:00" from the padded parts and parse it into the
# timestamp column report_date_time.
date_time_text = concat(
    col("yr"), lit("-"), col("m"), lit("-"), col("d"),
    lit(" "), col("hh"), lit(":"), col("mm"), lit(":00"),
)
df = df.withColumn("report_date_time", to_timestamp(date_time_text, "yyyy-MM-dd HH:mm:ss"))

In [None]:
# Normalise the borough name to upper case.
df = df.withColumn("boro", upper(col("boro")))

In [None]:
# Cast the vol column to an integer type.
df = df.withColumn("vol", col("vol").cast("int"))

In [None]:
# Display the leading rows of the transformed Spark DataFrame.
df.show()

In [None]:
# Re-register Temp_Tbl so the view points at the transformed DataFrame
# (df has been reassigned since the view was first created).
df.createOrReplaceTempView('Temp_Tbl')

In [None]:
# Spot-check the cleaned columns on a few rows.
# The query previously selected `dd`, a column that no step in this notebook
# creates; the minute part is stored in `mm`.
query = spark.sql("SELECT vol,yr,m,d,hh,mm FROM Temp_Tbl limit 5;")
query.show()

In [None]:
# Check the derived timestamp next to the parts it was built from.
# The query previously selected `report_time` and `report_date`, which are never
# created; the transformation cell adds a single `report_date_time` column.
query = spark.sql("SELECT report_date_time,yr,m,d,hh,mm FROM Temp_Tbl order by d desc limit 5;")
query.show()

In [None]:
# Print the column names and types of the transformed Spark DataFrame.
df.printSchema()

## Postgres Load

In [None]:
from sqlalchemy import create_engine
import pandas as pd

In [None]:
#Creating the engine: postgresql://username:password@host:port/db_name
# Connection settings can be overridden through environment variables so real
# credentials do not have to be stored in the notebook; the defaults are the
# values that were previously hardcoded here.
username = os.environ.get("POSTGRES_USER", "root")
password = os.environ.get("POSTGRES_PASSWORD", "root")
host = os.environ.get("POSTGRES_HOST", "fire_incidents_db")
port = int(os.environ.get("POSTGRES_PORT", 5432))
database = os.environ.get("POSTGRES_DB", "fire_incidents")
engine = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{database}')

In [None]:
# Convert the transformed Spark DataFrame to a pandas DataFrame for loading through SQLAlchemy.
pandas_df = df.toPandas()

In [None]:
#Prints the CREATE TABLE statement pandas would generate for pandas_df under the name fire_incidents_schema, using the engine for type mapping. Nothing is created in Postgres by this call.
# NOTE(review): the table written in the following cells is named nyc_traffic_tbl, not fire_incidents_schema - confirm which name is intended.
print(pd.io.sql.get_schema(pandas_df,name='fire_incidents_schema',con=engine))

In [None]:
#Creates the empty table in postgres (column names only). name = nyc_traffic_tbl, con is the postgres engine, if_exists = 'replace' replaces a table that already exists with this name.
#pandas_df is used because `df` is the Spark DataFrame at this point: its head(n=0) returns a plain list, which has no to_sql method.
pandas_df.head(n=0).to_sql(name='nyc_traffic_tbl',con=engine,if_exists='replace')

In [None]:
# Module-level values kept for backward compatibility; create_batches_of_rows
# works only with its own arguments and does not read them.
start = 0
batchsize = 1000
def create_batches_of_rows(dataframe,batchsize):
    """Yield consecutive row slices of ``dataframe`` with at most ``batchsize`` rows.

    Args:
        dataframe: Object supporting ``len()`` and ``.iloc`` slicing
            (a pandas DataFrame in this notebook).
        batchsize: Maximum number of rows per batch; must be positive.

    Yields:
        ``dataframe.iloc[start:start + batchsize]`` for each batch, in order.
        The final batch may be shorter; no empty batch is produced.

    Raises:
        ValueError: If ``batchsize`` is not positive (raised when iteration starts).
    """
    if batchsize <= 0:
        raise ValueError("batchsize must be a positive integer")
    # Slice the frame that was passed in (not a global), stepping by batchsize so
    # the loop stops exactly at the last row.
    for start in range(0, len(dataframe), batchsize):
        yield dataframe.iloc[start:start + batchsize]

In [None]:
#Creates a list of batches by passing the pandas DataFrame and a batch size of 100 rows through create_batches_of_rows.
#pandas_df is used because `df` is the Spark DataFrame at this point, which supports neither len() nor .iloc.
batches = list(create_batches_of_rows(pandas_df,100))

In [None]:
#Loops through the batches and appends each one to the nyc_traffic_tbl table in PostgreSQL.
#enumerate supplies the 1-based batch number used in the progress message.
for counter, batch in enumerate(batches, start=1):
    batch.to_sql(name='nyc_traffic_tbl', con=engine, if_exists='append')
    print(f'Batch Loaded..... {counter}')
    