In [None]:
from datetime import datetime

# Lab 2 - Task 1

We must analyze historical data about the stations of the bike-sharing system in Barcelona. Your task consists of identifying the most “critical” timeslot (day of the week, hour) for each station, storing the result in a KML file and visualizing its content on a map. The analysis is based on two files:
1. data/register.csv
2. data/stations.csv

Write a single Spark application that identifies the most “critical” timeslot for each station. This analysis can support the planning of the rebalancing operations among stations. Solve the problem using RDDs. **Do not use DataFrames and the other Spark SQL features for this first part**.
In this application, each combination _“day of the week – hour”_ is a timeslot and is associated with all the readings associated with that combination, independently of the date. For
instance, the timeslot “Wednesday - 15” corresponds to all the readings made on Wednesday from 15:00:00 to 15:59:59.
A station Si is in the critical state if the number of free slots is equal to 0 (i.e., the station is full).
The “criticality” of a station Si in the timeslot Tj is defined as

$$\frac{\#\: readings\: with\: free\: slots = 0\: (S_i, T_j)}{total\: \#\: readings\: (S_i, T_j)}$$

Write an application that:
- Computes the criticality value for each combination (Si,Tj).
- Selects only the combinations having a criticality value greater than a minimum criticality threshold. The minimum criticality threshold is an argument of the application.
- Selects the most critical timeslot for each station (consider only timeslots with a criticality greater than the minimum criticality threshold). If there are two or more timeslots characterized by the highest criticality value for a station, select only one of those timeslots. Specifically, select the one associated with the earliest hour. If the hour is also the same, consider the lexicographical order of the name of the weekday.
- Stores in one single (KML) file the information about the most critical timeslot for each station. Specifically, the output (KML) file must contain one marker of type
Placemark for each combination $(S_i,most\: critical\: timeslot\: for\: S_i)$ characterized by the following features:
    - StationId
    - Day of the week and hour of the critical timeslot
    - Criticality value
    - Coordinates of the station (longitude, latitude)

Do not include in the output (KML) file the stations for which there are no timeslots
satisfying the minimum criticality threshold.

### Hints

- To extract hour and weekday from a timestamp, consider using the datetime package of Python (e.g., functions `strptime()` and `strftime()`)
    - Example
```
from datetime import datetime
timestamp = "2008-05-15 12:01:00"
datetimeObject = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
dayOfTheWeek = datetimeObject.strftime("%a")
hour = datetimeObject.hour
```
- To create one single output file, set the number of partitions of the final RDD to 1 by
using `coalesce(1)` before invoking the `saveAsTextFile()` method.

In [None]:
# Application parameters: input files, criticality threshold and output folder.
# The args[i] notes give the position each value would have as an application argument.

inputPath  = "data/register.csv" # args[0]: register.csv input file
inputPath2 = "data/stations.csv" # args[1]: stations file (locations of the stations)
threshold  = 0.4 # args[2]: minimum criticality threshold
outputPath = "out1.2" # args[3]: output folder where the KML placemarks are stored

# Read the content of the input file register.csv
# NOTE(review): `sc` (the SparkContext) is not created in the cells shown here;
# presumably it is provided by the notebook environment - confirm.
inputRDD = sc.textFile(inputPath)

def extractStationLongLat(line):
    """Parse one tab-separated line of the stations file.

    Returns a pair whose
    - key is the stationId (first field)
    - value is the tuple (long, lat) (second and third field)
    All values are kept as the strings read from the line.
    """
    columns = line.split("\t")
    stationId, longitude, latitude = columns[0], columns[1], columns[2]

    return (stationId, (longitude, latitude))

# Read the location of the stations.
# Each line of the stations file is mapped to a pair (stationId, (long, lat)).
# NOTE(review): no header filtering is applied here; if the file has a header line
# it is mapped like any other line - confirm it is discarded downstream.
stationLocation = sc.textFile(inputPath2).map(extractStationLongLat)

## Suggestions

Try to think about this problem; if you get stuck, you can have a look at a list of suggested steps... But please try to do it **by yourself** first!

In [None]:
# Load the list of suggested steps, then display it
with open('suggestions1.1.txt', 'r') as suggestionsFile:
    suggestions = suggestionsFile.read()

print(suggestions)

### Output format

The output (KML) file must have the following format (one KML Placemark per line):

```
<Placemark>
    <name>44</name>
    <ExtendedData>
        <Data name="DayWeek"><value>Mon</value></Data>
        <Data name="Hour"><value>3</value></Data>
        <Data name="Criticality"><value>0.5440729483282675</value></Data>
    </ExtendedData>
    <Point>
        <coordinates>2.189700,41.379047</coordinates>
    </Point>
</Placemark>
<Placemark>
    <name>9</name>
    <ExtendedData>
        <Data name="DayWeek"><value>Sat</value></Data>
        <Data name="Hour"><value>10</value></Data>
        <Data name="Criticality"><value>0.5215827338129496</value></Data>
    </ExtendedData>
    <Point>
        <coordinates>2.185294,41.385006</coordinates>
    </Point>
</Placemark>
```

Copy and paste the output inside a KML file formatted as follows:

```
<kml xmlns="http://www.opengis.net/kml/2.2">
    <Document>
        ***Copy and paste here the output generated by your application***
    </Document>
</kml>
```

In [None]:
# Return a string that represents a KML marker
def formatKMLMarker(pair):
    """Build the single-line KML Placemark describing one station.

    pair has the form
        (stationId, ( (weekday, hour, criticality), (long, lat) ) )
    stationId, weekday, long and lat must be strings; hour and criticality
    are converted with str().
    """
    stationId = pair[0]
    timeslot, location = pair[1][0], pair[1][1]

    weekday = timeslot[0]
    hour = str(timeslot[1])
    criticality = str(timeslot[2])
    coordinates = location[0] + "," + location[1]

    # The pieces are joined without any separator: the whole marker stays on one line
    pieces = [
        "<Placemark><name>", stationId, "</name>",
        "<ExtendedData>",
        "<Data name=\"DayWeek\"><value>", weekday, "</value></Data>",
        "<Data name=\"Hour\"><value>", hour, "</value></Data>",
        "<Data name=\"Criticality\"><value>", criticality, "</value></Data>",
        "</ExtendedData>",
        "<Point>", "<coordinates>", coordinates, "</coordinates>", "</Point>",
        "</Placemark>",
    ]

    return "".join(pieces)

# Create a string containing the description of a marker, in the KML format, for each
# station and the associated information.
# NOTE(review): resultLocations is not defined in the cells shown here; formatKMLMarker
# expects its elements to be (stationId, ( (weekday, hour, criticality), (long, lat) ) ).
resultKML = resultLocations.map(formatKMLMarker)

In [None]:
# Set the number of partitions to 1 for resultKML and store it in the output folder,
# so that one single output file is created.
# NOTE(review): Spark normally refuses to write into an already existing folder;
# presumably outputPath must be removed before re-running this cell - confirm.
resultKML.coalesce(1).saveAsTextFile(outputPath)

## Visualization

In [None]:
from pathlib import Path

# Wrap the content of the first (only) output part file in the KML document
# header/footer and store the result as map.kml in the same output folder
outputDir = Path(outputPath)
partFile = next(outputDir.glob('part*'))

with partFile.open('r') as f:
    placemarks = f.read()

content = f'''<kml xmlns="http://www.opengis.net/kml/2.2">
                    <Document>
                        {placemarks}
                    </Document>
                </kml>'''

with (outputDir / 'map.kml').open('w') as fw:
    fw.write(content)


Installing packages for visualization

In [None]:
!pip install geopandas fiona ipyleaflet

In [None]:
import geopandas as gpd
import fiona

# Enable the KML driver (read/write) in fiona's driver registry.
# The imported `fiona` module is used directly: going through geopandas' internal
# `gpd.io.file.fiona` reference is fragile, because that attribute may not be
# populated until geopandas has performed its first file read.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

# Load the placemarks of the generated KML file and preview the first rows
df = gpd.read_file(Path(outputPath) / 'map.kml', driver='KML')
df.head()

In [None]:
from ipyleaflet import Map, basemaps, basemap_to_tiles, Marker, MarkerCluster

# Interactive OpenStreetMap view; center is given as (latitude, longitude)
m = Map(
    basemap=basemap_to_tiles(basemaps.OpenStreetMap.Mapnik),
    center=(41.3874, 2.1686),
    zoom=13
    )

# One marker per row of df. Latitude/longitude are read from the point's y/x, so the
# location is always (lat, long) even when the KML driver returns 3D points (x, y, z);
# reversing the raw coordinate tuple would yield (z, lat, long) in that case.
markers = [
    Marker(location=(geolocation.geometry.y, geolocation.geometry.x), title=geolocation.Name)
    for _, geolocation in df.iterrows()
]

m.add_layer(MarkerCluster(markers=markers))
m