In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statistics
from get_bounds import *
from plotting_utility import *
from misc_utility import *

### Grid Points Load (output from scoring.ipynb)
Grid Points are now scored in `scoring.ipynb`.

In [None]:
# Scored grid points (see the section note above: produced by scoring.ipynb).
# Later cells read the "lat", "long" and "score" columns of this frame.
YYZ_GridPoints = pd.read_csv('../res/grid_points/yyz_grid_points.csv', sep = ',')
# Number of distinct latitude values in the grid. Used below as the grid resolution:
# it sets the longitude snapping step, the number of samples per regression line,
# and is forwarded to plotHeatmapPoints.
PT_DENSITY = len(YYZ_GridPoints["lat"].unique())

### Clustered Grid Points Load (output from clustering.ipynb)
Clustered Grid Points (a subset of Grid Points) are now generated in `clustering.ipynb`.

In [None]:
# Grid points annotated with a "cluster" column (see the section note above:
# produced by clustering.ipynb).
YYZ_Clustered_GridPoints = pd.read_csv('../res/grid_points/yyz_clustered_grid_points.csv', sep = ',')
# Number of distinct cluster labels. The code below builds one line per cluster and
# iterates cluster ids with range(NUM_CLUSTERS), i.e. it assumes labels are 0..NUM_CLUSTERS-1.
NUM_CLUSTERS = len(YYZ_Clustered_GridPoints["cluster"].unique())

### Parameters

In [None]:
# Multiplier applied to the score of every interchange candidate in generateLines,
# before the candidates are added to the next cluster's regression input.
INTERCHANGE_CANDIDATE_BONUS_FACTOR = 5
# Degree of the polynomial fitted by getLine / polynomialRegression.
POLY_REGRESSION_ORDER = 5
# Maximum number of interchange candidates picked per generated line.
NUMBER_INTERCHANGE_CANDIDATES = 3
# Minimum spacing between consecutive stations on a line, in getDistance units
# (the station-generation notes below say km).
STATION_MIN_DISTANCE = 0.9
# Minimum spacing between interchange candidates on a line, in getDistance units
# (presumably km as well -- TODO confirm against getDistance).
INTERCHANGE_MIN_DISTANCE = 5

### Filtering and Processing

In [None]:
def trimOutOfBounds(points, upperLatBound, bottomLatBound, leftLongBound, rightLongBound):
    """Drop every point that lies outside the bounding box (edges are kept).

    points is read as points[0] -> latitudes and points[1] -> longitudes.
    Returns a (lats, longs) tuple of NumPy arrays with the surviving points
    in their original order.
    """
    frame = pd.DataFrame({'lat': points[0], 'long': points[1]})
    latInside = frame['lat'].ge(bottomLatBound) & frame['lat'].le(upperLatBound)
    longInside = frame['long'].ge(leftLongBound) & frame['long'].le(rightLongBound)
    kept = frame[latInside & longInside]
    return (kept['lat'].values, kept['long'].values)

In [None]:
def snapLongToGrid(points, leftLongBound, rightLongBound):
    """Snap each longitude down onto the grid column at or immediately west of it.

    The grid columns start at leftLongBound and are spaced
    (rightLongBound - leftLongBound) / (PT_DENSITY - 1) apart, rounded to 4 decimals.
    Latitudes are returned unchanged.

    points is read as points[0] -> latitudes and points[1] -> longitudes.
    Returns a (lats, longs) tuple of NumPy arrays.
    Raises ZeroDivisionError when PT_DENSITY is 1 (a single column has no spacing).
    """
    step = round((rightLongBound - leftLongBound) / (PT_DENSITY - 1), 4)
    pointsDf = pd.DataFrame({'lat': points[0], 'long': points[1]})
    if step == 0:
        # A zero step would turn every longitude into NaN via "% 0"; there is no
        # grid to snap to, so hand the coordinates back untouched.
        return (pointsDf['lat'].values, pointsDf['long'].values)
    # Vectorised over the whole column instead of a row-by-row .at loop.
    offsetIntoCell = (pointsDf['long'] - leftLongBound) % step
    snappedLongs = pointsDf['long'] - offsetIntoCell
    return (pointsDf['lat'].values, snappedLongs.values)

### Regression

In [None]:
def linearRegression(scores):
    """Fit a score-weighted straight line long = f(lat) through the higher-scoring points.

    Rows whose score is below the median score are discarded first. The remaining
    rows are fitted with LinearRegression, passing their score as sample_weight.

    Returns (x_begin, x_end, y_begin, y_end): the latitude of the first and of the
    last retained row (in row order, not sorted), and the fitted longitude at each.
    """
    # Only rows at or above the median score take part in the fit.
    cutoff = statistics.median(scores['score'])
    kept = scores[scores['score'] >= cutoff]

    latColumn = kept["lat"].values.reshape(-1, 1)
    longColumn = kept["long"].values.reshape(-1, 1)

    model = LinearRegression()
    model.fit(latColumn, longColumn, sample_weight=kept["score"])

    firstLat = latColumn[0]
    lastLat = latColumn[-1]
    firstLong = model.predict(firstLat.reshape(1, -1)).item()
    lastLong = model.predict(lastLat.reshape(1, -1)).item()

    return (firstLat.item(), lastLat.item(), firstLong, lastLong)

In [None]:
def polynomialRegression(scores, degree, upperLatBound, bottomLatBound):
    """Fit a score-weighted polynomial long = f(lat) and sample it along the latitude range.

    scores must provide "lat", "long" and "score" columns; each row is weighted by
    its score (passed as sample_weight). The fitted curve is evaluated at PT_DENSITY
    evenly spaced latitudes between the smallest and largest latitude in scores.

    upperLatBound and bottomLatBound are accepted for interface compatibility but
    are not used: the sampling range comes from the data itself.

    Returns (x, y, intercept, coefficients) where x are the sampled latitudes,
    y the predicted longitudes, and intercept/coefficients come from the fitted model.
    """
    poly = PolynomialFeatures(degree, include_bias=False)
    poly_features = poly.fit_transform(scores["lat"].values.reshape(-1, 1))
    poly_reg_model = LinearRegression().fit(poly_features, scores["long"], sample_weight=scores["score"])
    x = np.linspace(scores['lat'].min(), scores['lat'].max(), PT_DENSITY)
    # Reuse the already-fitted feature expansion; prediction inputs must only be
    # transformed, not used to re-fit the transformer.
    y = poly_reg_model.predict(poly.transform(x.reshape(-1, 1)))
    return (x, y, poly_reg_model.intercept_, poly_reg_model.coef_)

In [None]:
def getLine(scores, upper, bottom, left, right):
    """Build one line from scored points: fit, clip to the bounding box, snap to the grid.

    Runs polynomialRegression with POLY_REGRESSION_ORDER, removes sampled points
    outside [bottom, upper] x [left, right], then snaps the longitudes with
    snapLongToGrid.

    Returns (line, intercept, coefficients) where line is a (lats, longs) tuple.
    """
    lats, longs, line_intercept, line_coefficients = polynomialRegression(
        scores, POLY_REGRESSION_ORDER, upper, bottom
    )
    inBounds = trimOutOfBounds([lats, longs], upper, bottom, left, right)
    snapped = snapLongToGrid(inBounds, left, right)
    return (snapped, line_intercept, line_coefficients)

### Query Functions

In [None]:
def getHighestNScorePointsWithMinDist(pts, n, dist):
    """Pick up to n of the highest-scoring points that are pairwise at least dist apart.

    Points are visited in descending "score" order. A point is kept only if its
    getDistance to EVERY point already kept is >= dist, so the result really is
    separated by at least dist (checking only the most recently kept point would
    let the 1st and 3rd picks sit next to each other).

    At most n rows are returned; fewer when the spacing constraint rules points
    out. An empty frame (same columns as pts) is returned when pts is empty or
    n <= 0.

    pts must provide "lat", "long" and "score" columns. The returned rows keep
    their original index labels and column dtypes.
    """
    if n <= 0 or len(pts) == 0:
        return pts.iloc[0:0]

    sortedPoints = pts.sort_values(by=['score'], ascending=False)
    keptPositions = []
    keptCoords = []
    for position, currentPoint in enumerate(zip(sortedPoints["lat"], sortedPoints["long"])):
        if len(keptPositions) >= n:
            break
        # all() over an empty list is True, so the top-scoring point is always kept.
        if all(getDistance(previousPoint, currentPoint) >= dist for previousPoint in keptCoords):
            keptPositions.append(position)
            keptCoords.append(currentPoint)

    # One positional selection instead of growing a DataFrame row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    return sortedPoints.iloc[keptPositions]

In [None]:
def getGridPointsFromLine(scores, line, tolerance=0.005):
    """Look up the scored grid point that sits under each point of a line.

    For every (lat, long) pair of line (line[0] -> latitudes, line[1] -> longitudes),
    the first row of scores whose "lat" and "long" are both within tolerance of the
    pair is selected. Line points with no matching row are skipped; a grid point
    matched by several line points appears once per match.

    Returns the selected rows of scores, in line order, with their original index
    labels and dtypes. When nothing matches, the result is an empty frame that
    still carries the columns of scores.
    """
    latValues = scores["lat"]
    longValues = scores["long"]
    matchedPositions = []
    for lati, longi in zip(line[0], line[1]):
        nearby = (abs(latValues - lati) <= tolerance) & (abs(longValues - longi) <= tolerance)
        # Work with row POSITIONS: index labels are not valid .iloc arguments once
        # scores has anything other than a default 0..n-1 index.
        positions = np.flatnonzero(nearby.to_numpy())
        if len(positions) > 0:
            matchedPositions.append(positions[0])

    # Single positional selection instead of repeated DataFrame.append
    # (removed in pandas 2.x, and quadratic when used in a loop).
    return scores.iloc[matchedPositions]

### Line Generation

In [None]:
def generateLines(gridPoints, clusterGridPointsArray, upperBound, bottomBound, leftBound, rightBound):
    """Generate one line per cluster and collect the interchange candidates along the way.

    Clusters are processed in order 0..NUM_CLUSTERS-1. For each cluster:
      1. its grid points are combined with all interchange candidates found on the
         previously generated lines (whose scores were already boosted);
      2. a line is fitted through that combined set with getLine;
      3. the grid points lying under the line are looked up in gridPoints and up to
         NUMBER_INTERCHANGE_CANDIDATES of the highest-scoring ones, spaced by
         INTERCHANGE_MIN_DISTANCE, become this line's interchange candidates, with
         their score multiplied by INTERCHANGE_CANDIDATE_BONUS_FACTOR;
      4. the system built so far is plotted with plotHeatmapPoints.

    The input list clusterGridPointsArray and its frames are NOT modified, so the
    function can be re-run on the same inputs with the same result.

    Returns (lines, interchangeCandidates): lines is a list of (lats, longs) tuples,
    one per cluster; interchangeCandidates is a list of DataFrames, one per line.
    """
    lines = []
    interchangeCandidates = []
    for i in range(NUM_CLUSTERS):
        # pd.concat builds a fresh frame; the caller's per-cluster frame stays as it was.
        clusterPoints = pd.concat([clusterGridPointsArray[i]] + interchangeCandidates)
        line, _, _ = getLine(clusterPoints, upperBound, bottomBound, leftBound, rightBound)
        lines.append(line)

        linePoints = getGridPointsFromLine(gridPoints, line)
        highestPoints = getHighestNScorePointsWithMinDist(linePoints, NUMBER_INTERCHANGE_CANDIDATES, INTERCHANGE_MIN_DISTANCE)
        highestPoints = highestPoints.assign(score=highestPoints['score'] * INTERCHANGE_CANDIDATE_BONUS_FACTOR)
        interchangeCandidates.append(highestPoints)

        # Plot how the subway system is being built: every line so far plus all candidates.
        allCandidates = pd.concat(interchangeCandidates)
        interchangeCandidatePoints = [(allCandidates["lat"].to_numpy(), allCandidates["long"].to_numpy())]
        plotHeatmapPoints(gridPoints, lines + interchangeCandidatePoints, PT_DENSITY)
    return (lines, interchangeCandidates)

In [None]:
def splitGridPointsByClusters(gridPointsWithCluster):
    """Split the clustered grid points into one DataFrame per cluster id.

    Returns a list of length NUM_CLUSTERS whose element i holds the rows of
    gridPointsWithCluster with cluster == i (possibly empty).
    """
    clusterLabels = gridPointsWithCluster['cluster']
    return [
        gridPointsWithCluster.loc[clusterLabels == clusterId]
        for clusterId in range(NUM_CLUSTERS)
    ]

### Station Generation

In [None]:
def generateStationsOnLine(statDistance, subwayLines, lineNum):
    """Place stations along one line, each at least statDistance from the previous one.

    statDistance: minimum spacing between consecutive stations, in the units
        returned by getDistance (this notebook treats them as km).
    subwayLines: sequence of lines, each indexable as line[0] -> latitudes and
        line[1] -> longitudes.
    lineNum: 0-based index of the line to process.

    The first point of the line always becomes a station. The line is then walked
    in order and a point becomes the next station as soon as its distance to the
    most recently placed station reaches statDistance.

    Returns a one-element list [(lats, longs)] of NumPy arrays so that results for
    several lines can be concatenated with += and plotted directly. For an empty
    line both arrays are empty.
    """
    lineLats = subwayLines[lineNum][0]
    lineLongs = subwayLines[lineNum][1]

    if len(lineLats) == 0:
        # Nothing to place; keep the return shape callers concatenate and plot.
        return [(np.array([], dtype=float), np.array([], dtype=float))]

    stationLats = [lineLats[0]]
    stationLongs = [lineLongs[0]]
    placeStation = (lineLats[0], lineLongs[0])

    for lati, longi in zip(lineLats[1:], lineLongs[1:]):
        nextStation = (lati, longi)
        if getDistance(nextStation, placeStation) >= statDistance:
            stationLats.append(lati)
            stationLongs.append(longi)
            # Spacing is always measured from the last station actually placed.
            placeStation = nextStation

    # Plain lists are converted once at the end instead of growing a DataFrame
    # row by row (DataFrame.append no longer exists in pandas 2.x).
    return [(np.array(stationLats), np.array(stationLongs))]

In [None]:
def generateStationsInSystem(statDistance, subwayLines=None):
    """Generate the stations for every line of a system.

    statDistance: minimum spacing between consecutive stations on a line, passed
        through to generateStationsOnLine.
    subwayLines: the lines to process, each as (lats, longs). Defaults to the
        notebook-level YYZ_Lines so existing calls keep working; pass it explicitly
        to avoid depending on that global.

    Lines are addressed by their 0-based position. Returns a list with one
    (lats, longs) tuple of station coordinates per line, in line order.
    """
    if subwayLines is None:
        subwayLines = YYZ_Lines
    stationsInSystem = []
    for idx in range(len(subwayLines)):
        # generateStationsOnLine returns a one-element list, so += appends one entry per line.
        stationsInSystem += generateStationsOnLine(statDistance, subwayLines, idx)
    return stationsInSystem

### Toronto (YYZ)

In [None]:
# Bounding box of the Toronto grid, computed from the grid points' lat/long columns.
# getBounds is not defined in this notebook (it comes from one of the star imports);
# NOTE(review): the meaning of the third argument (4) is not visible here -- confirm
# against its definition.
[YYZ_UpperLatBound, YYZ_BottomLatBound, YYZ_LeftLongBound, YYZ_RightLongBound] = getBounds(YYZ_GridPoints['lat'], YYZ_GridPoints['long'], 4)

In [None]:
# YYZ_Cluster_GP_Array is a list with one DataFrame per cluster id (0..NUM_CLUSTERS-1);
# element i holds that cluster's grid points and is the input for line i.
YYZ_Cluster_GP_Array = splitGridPointsByClusters(YYZ_Clustered_GridPoints)

In [None]:
# Build one line per cluster inside the Toronto bounding box.
# YYZ_Lines: list of (lats, longs) tuples, one per cluster.
# YYZ_Interchange_Candidates: list of DataFrames, one per line.
# generateLines also draws a heatmap plot after each line is added.
YYZ_Lines, YYZ_Interchange_Candidates = generateLines(
    YYZ_GridPoints, YYZ_Cluster_GP_Array, YYZ_UpperLatBound, YYZ_BottomLatBound, YYZ_LeftLongBound, YYZ_RightLongBound
)

In [None]:
# Draw the System
# All interchange candidates are flattened into a single (lats, longs) entry and
# plotted alongside the generated lines on top of the grid-point heatmap.
interchangeCandidatePoints = [(pd.concat(YYZ_Interchange_Candidates)["lat"].to_numpy(), pd.concat(YYZ_Interchange_Candidates)["long"].to_numpy())]
plotHeatmapPoints(YYZ_GridPoints, YYZ_Lines + interchangeCandidatePoints, PT_DENSITY)

In [None]:
# Place stations on every generated line, at least STATION_MIN_DISTANCE apart.
# Requires YYZ_Lines from the line-generation cell above.
YYZ_Stations = generateStationsInSystem(STATION_MIN_DISTANCE)

In [None]:
# Plot the generated stations (one (lats, longs) entry per line) over the grid-point heatmap.
plotHeatmapPoints(YYZ_GridPoints, YYZ_Stations, PT_DENSITY)

### Export Cells