# Preparación de datos
## Importar bibliotecas

In [1]:
import pandas as pd 
import numpy as np 
import scipy as sp
import math
import re
from tqdm import tqdm
import json
import gzip
from unidecode import unidecode
from pyspark.sql.functions import udf
import matplotlib.pyplot as plt

import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, BooleanType
from pyspark.sql.functions import from_json, col
from pyspark.sql.functions import explode

# Importar funciones necesarias
from pyspark.sql.functions import col, to_date, weekofyear,year,trim, month, dayofmonth, sum
from pyspark.sql.functions import col,  count, coalesce, sum as spark_sum
from pyspark.sql.functions import regexp_replace, col, when, explode_outer,lit, to_timestamp,regexp_extract,lower,split
from pyspark.sql.functions import format_number
from pyspark.sql.types import IntegerType,FloatType
# Puedes obtener estadísticas específicas para una columna
from pyspark.sql.functions import mean, min, max
from pyspark.sql.functions import approx_count_distinct
from pyspark.sql.window import Window
from pyspark.sql.functions import log1p
from pyspark.sql import functions as F

from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *


## Inicio de Sessión Cluster

In [2]:
spark = SparkSession.builder.appName("MVP_NY").getOrCreate()
#spark = SparkSession.builder.master("spark://localhost:7077").appName("MLops1").getOrCreate()

In [3]:
spark

In [4]:
# Librerías para Scraping
import requests
import gzip
import shutil
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import timedelta, date
import math
from pathlib import Path

# Librerías de Normalización y Limpieza
import pandas as pd
import numpy as np
import re
from unidecode import unidecode

# Librerías para Gráficos
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import Image, display

import geopandas as gpd
import geojson
from shapely.geometry import Point, Polygon
from shapely.wkt import loads

# Modelado y Procesamiento
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
from bs4 import BeautifulSoup
import requests
from pathlib import Path

class DatasetDownloader:
    def __init__(self, base_url, download_folder):
        self.base_url = base_url
        self.download_folder = Path(download_folder)
        # Asegúrate de que el directorio de descarga exista
        self.download_folder.mkdir(parents=True, exist_ok=True)

    def scrape_siniestros_urls(self):
        response = requests.get(self.base_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a')

        file_urls = {}
        for link in links:
            href = link.get('href')
            if href:
                if href.endswith(('.zip', '.xlsx', '.csv', '.gz', '.geojson', '.pdf', '.parquet')):
                    file_name = href.split('/')[-1]
                    # Filtrar por años 2021, 2022 y 2023
                    if any(year in file_name for year in ['2021', '2022', '2023']):
                        file_urls[file_name] = href
                elif 'taxi_zone_lookup.csv' in href or 'taxi_zones.zip' in href:
                    file_name = href.split('/')[-1]
                    file_urls[file_name] = href

        return file_urls
    
    def scrape_especial_urls(self):
        response = requests.get(self.base_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a')

        file_urls = {}
        for link in links:
            href = link.get('href')
            if href and href.endswith(('.zip', '.xlsx', '.csv', '.gz', '.geojson', '.pdf')):
                file_name = href.split('/')[-1]
                file_urls[file_name] = href
        return file_urls
    
    def download_file(self, url, save_path):
        try:
            if not save_path.exists():
                response = requests.get(url, stream=True)
                response.raise_for_status()
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f"Archivo guardado: {save_path}")
            else:
                print(f"El archivo {save_path} ya existe, no es necesario descargarlo nuevamente.")
        except Exception as e:
            print(f"Error al descargar el archivo: {url}")
            print(f"Error details: {e}")

    def download_datasets(self):
        file_urls_dict = self.scrape_siniestros_urls()
        for file_name, url in file_urls_dict.items():
            save_path = self.download_folder / file_name
            self.download_file(url, save_path)

# Uso de ejemplo
base_url = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'
download_folder = '../datasets/raw'
downloader = DatasetDownloader(base_url, download_folder)
downloader.download_datasets()


El archivo ../datasets/raw/yellow_tripdata_2023-01.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/green_tripdata_2023-01.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/fhv_tripdata_2023-01.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/fhvhv_tripdata_2023-01.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/yellow_tripdata_2023-02.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/green_tripdata_2023-02.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/fhv_tripdata_2023-02.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/fhvhv_tripdata_2023-02.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datasets/raw/yellow_tripdata_2023-03.parquet ya existe, no es necesario descargarlo nuevamente.
El archivo ../datase

In [None]:
from bs4 import BeautifulSoup
import requests
from pathlib import Path

class DatasetDownloader:
    def __init__(self, base_url, download_folder):
        self.base_url = base_url
        self.download_folder = Path(download_folder)
        # Asegúrate de que el directorio de descarga exista
        self.download_folder.mkdir(parents=True, exist_ok=True)

    def scrape_siniestros_urls(self):
        response = requests.get(self.base_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a')

        file_urls = {}
        for link in links:
            href = link.get('href')
            if href and href.endswith(('.zip', '.xlsx', '.csv', '.gz', '.geojson', '.pdf')):
                file_name = href.split('/')[-1]
                file_urls[file_name] = href
        return file_urls
    
    
    def download_file(self, url, save_path):
        try:
            if not save_path.exists():
                response = requests.get(url, stream=True)
                response.raise_for_status()
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f"Archivo guardado: {save_path}")
            else:
                print(f"El archivo {save_path} ya existe, no es necesario descargarlo nuevamente.")
        except Exception as e:
            print(f"Error al descargar el archivo: {url}")
            print(f"Error details: {e}")

    def download_datasets(self):
        file_urls_dict = self.scrape_siniestros_urls()
        for file_name, url in file_urls_dict.items():
            save_path = self.download_folder / file_name
            self.download_file(url, save_path)

# Uso de ejemplo
base_url = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'
download_folder = '../datasets/raw'
downloader = DatasetDownloader(base_url, download_folder)
downloader.download_datasets()


Error al descargar el archivo: /assets/tlc/downloads/pdf/trip_record_user_guide.pdf
Error details: Invalid URL '/assets/tlc/downloads/pdf/trip_record_user_guide.pdf': No scheme supplied. Perhaps you meant https:///assets/tlc/downloads/pdf/trip_record_user_guide.pdf?
Error al descargar el archivo: /assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
Error details: Invalid URL '/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf': No scheme supplied. Perhaps you meant https:///assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf?
Error al descargar el archivo: /assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf
Error details: Invalid URL '/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf': No scheme supplied. Perhaps you meant https:///assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf?
Error al descargar el archivo: /assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf
Error details: Invalid URL '/a

In [None]:
"""
Archivo guardado: ..\datasets\raw\yellow_tripdata_2023-01.parquet
Archivo guardado: ..\datasets\raw\green_tripdata_2023-01.parquet
Archivo guardado: ..\datasets\raw\fhv_tripdata_2023-01.parquet
Archivo guardado: ..\datasets\raw\fhvhv_tripdata_2023-01.parquet
"""

In [None]:
# Ruta al archivo Parquet
ruta_archivo = "..\\datasets\\raw\\yellow_tripdata_2023-12.parquet"
ruta_archivo2 = "..\\datasets\\raw\\green_tripdata_2023-01.parquet"
ruta_archivo3 = "..\\datasets\\raw\\fhv_tripdata_2023-01.parquet"
ruta_archivo4 = "..\\datasets\\raw\\fhvhv_tripdata_2023-01.parquet"

# Leer el archivo Parquet en un DataFrame de Spark
df = spark.read.parquet(ruta_archivo)
df2 = spark.read.parquet(ruta_archivo2)
df3 = spark.read.parquet(ruta_archivo3)
df4 = spark.read.parquet(ruta_archivo4)

In [None]:
df.show(15)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-12-01 00:06:06|  2023-12-01 00:15:47|              0|          1.1|         1|                 N|         230|          48|           1|       10.0|  3.5|    0.5|       1.

In [None]:
df2.show(15)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2023-01-01 00:26:10|  2023-01-01 00:37:11|                 N|       1.0|         166|         143|            1.0|         2.58|       14.9|  1.0|    0.

In [None]:
df3.show(15)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00008|2023-01-01 00:30:00|2023-01-01 01:00:00|        NULL|        NULL|   NULL|                B00008|
|              B00078|2023-01-01 00:01:00|2023-01-01 03:15:00|        NULL|        NULL|   NULL|                B00078|
|              B00111|2023-01-01 00:30:00|2023-01-01 01:05:00|        NULL|        NULL|   NULL|                B03406|
|              B00112|2023-01-01 00:34:45|2023-01-01 00:52:03|        NULL|        14.0|   NULL|                B00112|
|              B00112|2023-01-01 00:11:20|2023-01-01 00:22:03|        NULL|        14.0|   NULL|                B00112|
|              B00112|2023-01-01 00:33:2

In [None]:
df4.show(15)

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+-----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee| tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+

In [None]:
print("Cantidad de registros", df.count(), "Cantidad de las Columnas", len(df.columns))
print("Cantidad de registros", df2.count(), "Cantidad de las Columnas", len(df2.columns))
print("Cantidad de registros", df3.count(), "Cantidad de las Columnas", len(df3.columns))
print("Cantidad de registros", df4.count(), "Cantidad de las Columnas", len(df4.columns))

Cantidad de registros 3376567 Cantidad de las Columnas 19
Cantidad de registros 68211 Cantidad de las Columnas 20
Cantidad de registros 1114320 Cantidad de las Columnas 7
Cantidad de registros 18479031 Cantidad de las Columnas 24


In [None]:
df.describe().show()

+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+---------------------+------------------+--------------------+-------------------+
|summary|          VendorID|   passenger_count|     trip_distance|       RatecodeID|store_and_fwd_flag|      PULocationID|      DOLocationID|      payment_type|       fare_amount|             extra|            mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        Airport_fee|
+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+---------------------+------------------+--------------------+----

In [None]:
df2.describe().show()

+-------+------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+------------------+-------------------+---------+---------------------+------------------+------------------+-------------------+--------------------+
|summary|          VendorID|store_and_fwd_flag|        RatecodeID|     PULocationID|     DOLocationID|   passenger_count|    trip_distance|       fare_amount|             extra|            mta_tax|        tip_amount|       tolls_amount|ehail_fee|improvement_surcharge|      total_amount|      payment_type|          trip_type|congestion_surcharge|
+-------+------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+------------------+-------------------+---------+---------------------+------------------+-----------------

In [None]:
df3.describe().show()

+-------+--------------------+------------------+------------------+-------+----------------------+
|summary|dispatching_base_num|      PUlocationID|      DOlocationID|SR_Flag|Affiliated_base_number|
+-------+--------------------+------------------+------------------+-------+----------------------+
|  count|             1114320|            234892|            935715|      0|               1114320|
|   mean|                NULL|138.75656471910494|136.96903330608146|   NULL|              323201.0|
| stddev|                NULL|  76.3625039271574|  80.1436222993559|   NULL|                   0.0|
|    min|              B00001|               1.0|               1.0|   NULL|                      |
|    max|              B03482|             265.0|             265.0|   NULL|                 ubber|
+-------+--------------------+------------------+------------------+-------+----------------------+



In [None]:
df4.describe().show()

+-------+-----------------+--------------------+--------------------+------------------+------------------+-----------------+------------------+-------------------+-----------------+------------------+------------------+--------------------+-------------------+------------------+------------------+-------------------+-----------------+------------------+----------------+--------------+
|summary|hvfhs_license_num|dispatching_base_num|originating_base_num|      PULocationID|      DOLocationID|       trip_miles|         trip_time|base_passenger_fare|            tolls|               bcf|         sales_tax|congestion_surcharge|        airport_fee|              tips|        driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-------+-----------------+--------------------+--------------------+------------------+------------------+-----------------+------------------+-------------------+-----------------+------------------+------------------+--

In [None]:
df.shape

(3376567, 19)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3376567 entries, 0 to 3376566
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           