# Big Data Analytics: NYC Crashes

## Einleitung

### Rahmenbedingungen des Projekts

### Aufbau der Datenpipeline

#### Installation der benötigten Docker-Container

### Datenquelle

### Analyseziele

### Installation der Python Packages

In [None]:
pip install kafka-python pymongo

### Importieren benötigter Module

In [None]:
from kafka import KafkaProducer, KafkaConsumer
from pymongo import MongoClient
import datetime as dt
import requests
import os
from pathlib import Path

# Stretch the notebook container to the full browser width.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Herunterladen der Datensätze (H)
Die Datensätze werden mithilfe des nachfolgenden Python-Skripts von der Datenquelle heruntergeladen. Das Resultat sind drei `.csv`-Dateien, die die Unfalldaten zeilenweise enthalten.

In [None]:
# Download the three CSV exports (crashes, vehicles, persons) into ./data.
# Files that already exist locally are not downloaded again.
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)

for file_name, download_url in [
    ('crashes.csv', 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'),
    ('vehicles.csv', 'https://data.cityofnewyork.us/api/views/bm4k-52h4/rows.csv?accessType=DOWNLOAD'),
    ('persons.csv', 'https://data.cityofnewyork.us/api/views/f55k-p6yu/rows.csv?accessType=DOWNLOAD'),
]:
    fp = data_dir / file_name
    if fp.is_file():
        continue

    # Write to a temporary ".part" file and rename it only after the download
    # succeeded, so an interrupted run never leaves a truncated file that the
    # existence check above would treat as complete.
    tmp_fp = fp.with_name(fp.name + '.part')
    with requests.get(download_url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving the error body as CSV.
        response.raise_for_status()
        with open(tmp_fp, 'wb') as crash_file:
            # Stream in 1 MiB chunks rather than holding the whole file in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                crash_file.write(chunk)
    tmp_fp.replace(fp)

## Einlesen der Daten und Senden über den Producer (F)

Hier wird der Producer erstellt, mit dem anschließend die Daten gesendet werden.

In [None]:
# Kafka producer used by the sending cells below; connects to the broker at localhost:9092.
producer = KafkaProducer(bootstrap_servers=['localhost:9092'])

### Einlesen von Testdaten (nur vorübergehend – diese Zelle muss später entfernt werden; die vollständigen Datensätze brauchen deutlich länger)

In [None]:
# Publish every data row of the local test CSV to the 'nyc_crashes' topic.
# The message key is the zero-based index of the row (header excluded).
with open('Motor_Vehicle_Collisions_-_Crashes.csv', encoding='utf-8') as dataset:
    next(dataset, None)  # skip the CSV header line (no-op for an empty file)
    # Iterate the file lazily instead of loading every line into memory first.
    for i, row in enumerate(dataset):
        producer.send('nyc_crashes', value=row.encode('utf-8'), key=str(i).encode('utf-8'))

# send() is asynchronous; wait until all buffered records have been sent.
producer.flush()

### Einlesen der Crashes (F)

Achtung: Viele Datensätze, dauert einige Minuten

In [None]:
# Publish every data row of data/crashes.csv to the 'nyc_crashes' topic.
# The message key is the zero-based index of the row (header excluded).
with open('data/crashes.csv', encoding='utf-8') as dataset:
    next(dataset, None)  # skip the CSV header line (no-op for an empty file)
    # Iterate the file lazily instead of loading every line into memory first.
    for i, row in enumerate(dataset):
        producer.send('nyc_crashes', value=row.encode('utf-8'), key=str(i).encode('utf-8'))

# send() is asynchronous; wait until all buffered records have been sent.
producer.flush()

### Einlesen der Fahrzeuge (F)

Achtung: Viele Datensätze, dauert einige Minuten

In [None]:
# Publish every data row of data/vehicles.csv to Kafka.
# The message key is the zero-based index of the row (header excluded).
# NOTE(review): vehicle rows are sent to the 'nyc_crashes' topic, the same
# topic the crash rows use and the crash consumer reads — confirm that this
# is intended and not a copy-paste leftover.
with open('data/vehicles.csv', encoding='utf-8') as dataset:
    next(dataset, None)  # skip the CSV header line (no-op for an empty file)
    # Iterate the file lazily instead of loading every line into memory first.
    for i, row in enumerate(dataset):
        producer.send('nyc_crashes', value=row.encode('utf-8'), key=str(i).encode('utf-8'))

# send() is asynchronous; wait until all buffered records have been sent.
producer.flush()

### Einlesen der Personen (F)

Achtung: Viele Datensätze, dauert einige Minuten

In [None]:
# Publish every data row of data/persons.csv to Kafka.
# The message key is the zero-based index of the row (header excluded).
# NOTE(review): person rows are sent to the 'nyc_crashes' topic, the same
# topic the crash rows use and the crash consumer reads — confirm that this
# is intended and not a copy-paste leftover.
with open('data/persons.csv', encoding='utf-8') as dataset:
    next(dataset, None)  # skip the CSV header line (no-op for an empty file)
    # Iterate the file lazily instead of loading every line into memory first.
    for i, row in enumerate(dataset):
        producer.send('nyc_crashes', value=row.encode('utf-8'), key=str(i).encode('utf-8'))

# send() is asynchronous; wait until all buffered records have been sent.
producer.flush()

## Lesen der Daten mit dem Consumer und Import in die Datenbank

In [None]:
import csv

# Column layout of a crash row: the n-th CSV column is stored under the n-th
# field name after conversion with the given type. Defined once instead of
# being rebuilt for every consumed message.
CRASH_SCHEMA = (
    ('crash_date', str),
    ('crash_time', str),
    ('borough', str),
    ('zip_code', int),
    ('latitude', float),
    ('longitude', float),
    ('location', str),
    ('on_street_name', str),
    ('cross_street_name', str),
    ('off_street_name', str),
    ('persons_injured', int),
    ('persons_killed', int),
    ('pedestrians_injured', int),
    ('pedestrians_killed', int),
    ('cyclists_injured', int),
    ('cyclists_killed', int),
    ('motorists_injured', int),
    ('motorists_killed', int),
    ('contributing_factor_vehicle_1', str),
    ('contributing_factor_vehicle_2', str),
    ('contributing_factor_vehicle_3', str),
    ('contributing_factor_vehicle_4', str),
    ('contributing_factor_vehicle_5', str),
    ('_id', int),
    ('vehicle_type_code_1', str),
    ('vehicle_type_code_2', str),
    ('vehicle_type_code_3', str),
    ('vehicle_type_code_4', str),
    ('vehicle_type_code_5', str),
)


def _parse_crash_row(line):
    """Turn one raw CSV line into a crash document for MongoDB.

    The line is parsed with the csv module so that quoted fields containing
    commas stay in one column and the trailing line break is not stored in
    the last field. Empty columns are omitted from the document.

    Args:
        line: One CSV line as text, with or without a trailing newline.

    Returns:
        A dict with the non-empty, type-converted fields plus empty
        'vehicles' and 'persons' lists.

    Raises:
        ValueError: If the line has fewer columns than CRASH_SCHEMA or a
            value cannot be converted to its field type.
    """
    columns = next(csv.reader([line]))
    if len(columns) < len(CRASH_SCHEMA):
        raise ValueError(
            f'expected {len(CRASH_SCHEMA)} columns, got {len(columns)}'
        )

    document = {}
    for (db_field, field_type), raw_value in zip(CRASH_SCHEMA, columns):
        if raw_value:
            document[db_field] = field_type(raw_value)

    document['vehicles'] = []
    document['persons'] = []
    return document


client = MongoClient("localhost:27017")

# Start from an empty collection on every run of this cell.
crashes = client['nyc_crashes']['crashes']
crashes.delete_many({})

consumer = KafkaConsumer('nyc_crashes', bootstrap_servers=['localhost:9092'], auto_offset_reset="earliest")

for message in consumer:
    try:
        res = _parse_crash_row(message.value.decode('utf-8'))
        crashes.insert_one(res)
    except Exception as e:
        # Best effort: report the problem and continue with the next message.
        print(e)

In [None]:


# Publish the monthly yellow-cab CSV files (2019: months 01-12, 2020: months
# 01-06) to the 'yellow-cabs' topic, one message per data row. The message key
# is the zero-based row index within its file.
producer = KafkaProducer(bootstrap_servers=['localhost:9092'])

# NOTE(review): the 2020 files are read from ./data/2019/ as well — confirm
# that this directory is really where they are stored.
for year, months in ((2019, range(1, 13)), (2020, range(1, 7))):
    for month in months:
        csv_path = f'./data/2019/yellow-cabs-{year}-{month:02d}.csv'
        with open(csv_path, encoding='utf-8') as dataset:
            next(dataset, None)  # skip the CSV header line
            # Iterate the file lazily instead of loading every line into memory.
            for i, row in enumerate(dataset):
                producer.send('yellow-cabs', value=row.encode('utf-8'), key=str(i).encode('utf-8'))

# send() is asynchronous; wait until all buffered records have been sent.
producer.flush()

In [None]:


# Consume the 'yellow-cabs' topic and store one MongoDB document per message.
# The broker address matches the one the 'yellow-cabs' producer cell uses, and
# reading starts at the earliest offset so that rows produced before this cell
# was started are imported too.
# NOTE(review): the broker and MongoDB addresses were empty strings; localhost
# is assumed here to match the other cells — adjust if the warehouse is remote.
consumer = KafkaConsumer('yellow-cabs', bootstrap_servers=['localhost:9092'], auto_offset_reset="earliest")

client = MongoClient("localhost:27017")

# Start from an empty collection on every run of this cell.
yellow_collection = client['datawarehouse']['bg-yellowcabs']
yellow_collection.delete_many({})

count = 0

for msg in consumer:
    count += 1
    print('Received new message: %s' % count)
    values = msg.value.decode('utf-8').split(',')

    try:
        yellow_collection.insert_one({
            'pickup_datetime': dt.datetime.strptime(values[1], "%Y-%m-%d %H:%M:%S"),
            'dropoff_datetime': dt.datetime.strptime(values[2], "%Y-%m-%d %H:%M:%S"),
            'passenger_count': int(values[3]),
            'trip_distance': float(values[4]),
            'PULocationID': values[5],
            'DOLocationID': values[6],
            'payment_type': int(values[9]),
            'fare_amount': float(values[10]),
            'tip_amount': float(values[15]),
            'total_amount': float(values[16])
        })
    except (ValueError, IndexError) as e:
        # A row with an unparsable value or too few columns is reported and
        # skipped so that it does not abort the whole import.
        print(e)