In [1]:
# Import modules
import json
import logging
import sqlite3
from unicodedata import normalize

import certifi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import urllib3
from urllib3 import request

In [18]:
def source_data_from_csv(csv_file_name, n_rows=5):
    """Load a CSV file and return a preview of it as a DataFrame.

    Parameters
    ----------
    csv_file_name : str or path-like
        Location of the CSV file; handed to ``pd.read_csv`` unchanged.
    n_rows : int or None, default 5
        Number of leading rows to return. ``None`` returns every row read.

    Returns
    -------
    pandas.DataFrame
        The first ``n_rows`` rows of the file, or an empty DataFrame when the
        file could not be read.
    """
    try:
        df_csv = pd.read_csv(csv_file_name)
    except Exception as exc:
        # Best-effort contract: the caller still receives a DataFrame, but the
        # failure is logged instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Could not read CSV %r: %s", csv_file_name, exc
        )
        df_csv = pd.DataFrame()
    return df_csv if n_rows is None else df_csv.head(n_rows)

In [19]:
def source_data_from_parquet(parquet_file_name, n_rows=5):
    """Load a Parquet file and return a preview of it as a DataFrame.

    Parameters
    ----------
    parquet_file_name : str or path-like
        Location of the Parquet file; handed to ``pd.read_parquet`` unchanged.
    n_rows : int or None, default 5
        Number of leading rows to return. ``None`` returns every row read.

    Returns
    -------
    pandas.DataFrame
        The first ``n_rows`` rows of the file, or an empty DataFrame when the
        file could not be read.
    """
    try:
        df_parquet = pd.read_parquet(parquet_file_name)
    except Exception as exc:
        # Best-effort contract: the caller still receives a DataFrame, but the
        # failure is logged instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Could not read Parquet file %r: %s", parquet_file_name, exc
        )
        df_parquet = pd.DataFrame()
    return df_parquet if n_rows is None else df_parquet.head(n_rows)

In [20]:
def source_data_from_api(api_endpoint, n_rows=5, timeout=30.0):
    """Fetch JSON from an HTTP endpoint and return a preview as a DataFrame.

    Parameters
    ----------
    api_endpoint : str
        URL requested with an HTTP GET.
    n_rows : int or None, default 5
        Number of leading rows to return. ``None`` returns every row.
    timeout : float, default 30.0
        Timeout in seconds passed to the urllib3 request so an unresponsive
        server cannot block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The JSON payload flattened with ``pd.json_normalize`` and truncated to
        ``n_rows`` rows. An empty DataFrame is returned when the response
        status is not 200 or when any step raises.
    """
    log = logging.getLogger(__name__)
    try:
        # The context manager releases the pooled connections once the
        # response body has been consumed.
        with urllib3.PoolManager(
            cert_reqs='CERT_REQUIRED', ca_certs=certifi.where()
        ) as http:
            api_response = http.request('GET', api_endpoint, timeout=timeout)
            api_status = api_response.status
            if api_status == 200:
                data = json.loads(api_response.data.decode('utf-8'))
                df_api = pd.json_normalize(data)
            else:
                log.warning(
                    "GET %s returned HTTP status %s", api_endpoint, api_status
                )
                df_api = pd.DataFrame()
    except Exception as exc:
        # Best-effort contract: the caller still receives a DataFrame, but the
        # failure is logged instead of being silently discarded.
        log.warning("Could not load data from %s: %s", api_endpoint, exc)
        df_api = pd.DataFrame()
    return df_api if n_rows is None else df_api.head(n_rows)

In [21]:
def source_data_from_db(db_name, table_name, n_rows=5):
    """Read a whole SQLite table and return a preview of it as a DataFrame.

    Parameters
    ----------
    db_name : str or path-like
        SQLite database passed to ``sqlite3.connect``.
    table_name : str
        Table to select from. It is interpolated directly into the SQL text,
        so it must come from a trusted source.
    n_rows : int or None, default 5
        Number of leading rows to return. ``None`` returns every row.

    Returns
    -------
    pandas.DataFrame
        The first ``n_rows`` rows of the table, or an empty DataFrame when the
        database could not be opened or the query failed.
    """
    try:
        conn = sqlite3.connect(db_name)
        try:
            # NOTE: identifiers cannot be bound as SQL parameters; never pass
            # an untrusted table_name here.
            df_db = pd.read_sql(f"SELECT * FROM {table_name}", conn)
        finally:
            # sqlite3's own `with conn:` only scopes a transaction; the
            # connection must be closed explicitly to release the handle.
            conn.close()
    except Exception as exc:
        # Best-effort contract: the caller still receives a DataFrame, but the
        # failure is logged instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Could not read table %r from %r: %s", table_name, db_name, exc
        )
        df_db = pd.DataFrame()
    return df_db if n_rows is None else df_db.head(n_rows)

In [22]:
# Preview the CSV source. The file name is given literally so this cell runs
# on a fresh kernel; no notebook-level variable holds it.
source_data_from_csv("h9gi-nx95.csv")

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,2022-03-26T00:00:00.000,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,2023-11-01T00:00:00.000,1:29,BROOKLYN,11230.0,40.62179,-73.970024,"\n, \n(40.62179, -73.970024)",OCEAN PARKWAY,AVENUE K,,...,Unspecified,Unspecified,,,4675373,Moped,Sedan,Sedan,,
3,2022-06-29T00:00:00.000,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
4,2022-09-21T00:00:00.000,13:21,,,,,,BROOKLYN BRIDGE,,,...,Unspecified,,,,4566131,Station Wagon/Sport Utility Vehicle,,,,


In [23]:
# Preview the Parquet source. The file name is given literally so this cell
# runs on a fresh kernel; no notebook-level variable holds it.
source_data_from_parquet("yellow_tripdata_2022-01.parquet")

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [24]:
# Preview the API source. The endpoint is given literally so this cell runs
# on a fresh kernel; no notebook-level variable holds it.
source_data_from_api('https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500')

Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,contributing_factor_vehicle_3,vehicle_type_code_3,location.latitude,location.longitude,location.human_address,cross_street_name,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
1,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
2,2023-11-01T00:00:00.000,1:29,OCEAN PARKWAY,AVENUE K,1,0,0,0,0,0,...,Unspecified,Sedan,40.62179,-73.970024,"{""address"": """", ""city"": """", ""state"": """", ""zip""...",,,,,
3,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4,2022-09-21T00:00:00.000,13:21,BROOKLYN BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,


In [25]:
# Preview the SQLite source. Database and table are given literally so this
# cell runs on a fresh kernel; no notebook-level variables hold them.
source_data_from_db("movies.sqlite", "movies")

Unnamed: 0,id,original_title,budget,popularity,release_date,revenue,title,vote_average,vote_count,overview,tagline,uid,director_id
0,43597,Avatar,237000000,150,2009-12-10,2787965087,Avatar,7.2,11800,"In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,19995,4762
1,43598,Pirates of the Caribbean: At World's End,300000000,139,2007-05-19,961000000,Pirates of the Caribbean: At World's End,6.9,4500,"Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.",285,4763
2,43599,Spectre,245000000,107,2015-10-26,880674609,Spectre,6.3,4466,A cryptic message from Bond’s past sends him o...,A Plan No One Escapes,206647,4764
3,43600,The Dark Knight Rises,250000000,112,2012-07-16,1084939099,The Dark Knight Rises,7.6,9106,Following the death of District Attorney Harve...,The Legend Ends,49026,4765
4,43601,John Carter,260000000,43,2012-03-07,284139100,John Carter,6.1,2124,"John Carter is a war-weary, former military ca...","Lost in our world, found in another.",49529,4766


In [28]:
def extracted_data(
    csv_file_name="h9gi-nx95.csv",
    parquet_file_name="yellow_tripdata_2022-01.parquet",
    api_endpoint='https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500',
    db_name="movies.sqlite",
    table_name="movies",
):
    """Run every extractor once and return their results together.

    Called with no arguments, each source falls back to the default shown in
    the signature. Any source can be overridden by keyword.

    Parameters
    ----------
    csv_file_name : str
        Passed to ``source_data_from_csv``.
    parquet_file_name : str
        Passed to ``source_data_from_parquet``.
    api_endpoint : str
        Passed to ``source_data_from_api``.
    db_name, table_name : str
        Passed to ``source_data_from_db``.

    Returns
    -------
    tuple
        ``(df_csv, df_parquet, df_api, df_db)``: whatever each
        ``source_data_from_*`` helper returned, in that order.
    """
    # The helpers are invoked in a fixed order: CSV, Parquet, API, database.
    df_csv = source_data_from_csv(csv_file_name)
    df_parquet = source_data_from_parquet(parquet_file_name)
    df_api = source_data_from_api(api_endpoint)
    df_db = source_data_from_db(db_name, table_name)
    return df_csv, df_parquet, df_api, df_db

In [29]:
# Run the CSV, Parquet, API and SQLite extractors in one call; the cell
# displays the returned 4-tuple.
extracted_data()

(                crash_date crash_time   borough  zip_code  latitude  \
 0  2021-09-11T00:00:00.000       2:39       NaN       NaN       NaN   
 1  2022-03-26T00:00:00.000      11:45       NaN       NaN       NaN   
 2  2023-11-01T00:00:00.000       1:29  BROOKLYN   11230.0  40.62179   
 3  2022-06-29T00:00:00.000       6:55       NaN       NaN       NaN   
 4  2022-09-21T00:00:00.000      13:21       NaN       NaN       NaN   
 
    longitude                       location           on_street_name  \
 0        NaN                            NaN    WHITESTONE EXPRESSWAY   
 1        NaN                            NaN  QUEENSBORO BRIDGE UPPER   
 2 -73.970024  \n,  \n(40.62179, -73.970024)            OCEAN PARKWAY   
 3        NaN                            NaN       THROGS NECK BRIDGE   
 4        NaN                            NaN          BROOKLYN BRIDGE   
 
   off_street_name cross_street_name  ...  contributing_factor_vehicle_2  \
 0       20 AVENUE               NaN  ...         