In [6]:
import Pkg; Pkg.add("JLD2")
using DataFrames, CSV, Dates, JLD2

# Read both input tables into DataFrames: the 202210 Bluebikes trip file and the
# station information file.
trips, station_information = map(
    path -> CSV.read(path, DataFrame),
    (
        "../../data/trips/202210-bluebikes-tripdata.csv",
        "../../data/stations/station_information.csv",
    ),
);

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.7/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.7/Manifest.toml`


# Trips 

In [7]:
# Index the stations: every distinct station_id gets a position in 1:nb_stations,
# which is the coordinate used along the first two axes of trips_matrix.
station_ids = unique(station_information[!, :station_id]);
nb_stations = length(station_ids);
stations_match = Dict(station_ids .=> 1:nb_stations);

# Drop every trip whose start or end station is absent from station_information.
trips = let known_ids = Set(station_ids)
    start_known = in.(trips[!, "start station id"], Ref(known_ids))
    end_known = in.(trips[!, "end station id"], Ref(known_ids))
    trips[start_known .& end_known, :]
end;

# Parse the textual timestamps into DateTime values.
trips.starttime = DateTime.(trips.starttime, dateformat"yyyy-mm-dd HH:MM:SS.ssss");
trips.stoptime = DateTime.(trips.stoptime, dateformat"yyyy-mm-dd HH:MM:SS.ssss");

# Day-of-month and hour-of-day for both ends of each trip.
trips.startday = Dates.day.(trips.starttime);
trips.starthour = Dates.hour.(trips.starttime);
trips.stopday = Dates.day.(trips.stoptime);
trips.stophour = Dates.hour.(trips.stoptime);

# trips_matrix[i, j, k] counts the trips from station index i to station index j
# that started in hour slot k, where k = 24 * (day - 1) + hour + 1.
nb_days = maximum(trips.startday);
trips_matrix = zeros(Int, nb_stations, nb_stations, 24 * nb_days);
for (from_id, to_id, day, hour) in zip(
    trips[!, "start station id"],
    trips[!, "end station id"],
    trips.startday,
    trips.starthour,
)
    trips_matrix[stations_match[from_id], stations_match[to_id], 24 * (day - 1) + hour + 1] += 1
end

In [11]:
# Persist the hourly trip counts under the key "trips_matrix"; the positional `true`
# is jldsave's compression flag.
jldsave(
    "../../data/parameters/202210-trips-matrix.jld2",
    true;
    trips_matrix=trips_matrix,
)

In [12]:
# Read the stored "trips_matrix" dataset back from the JLD2 file.
trips_matrix = jldopen("../../data/parameters/202210-trips-matrix.jld2", "r") do file
    file["trips_matrix"]
end;

# Capacity

In [13]:
# Per-station capacity, copied out of the station table in row order.
# NOTE(review): estimate_demand indexes this vector with the station indices from
# stations_match, which matches row order only if station_information has exactly
# one row per station_id — confirm.
capacity = copy(station_information.capacity);

# Demand

In [14]:
"""
    estimate_demand(trips_matrix, station_capacity=capacity)

Inflate observed trip counts for station/hour pairs whose traffic exceeded the
station's capacity, and return the result as a new array.

`trips_matrix[i, j, k]` is the observed number of trips from station index `i` to
station index `j` in hour slot `k`. For every hour slot `k`:

- if the total outgoing trips of station `i` exceed `station_capacity[i]`, each
  `demand[i, j, k]` is increased by `ceil(t - t * station_capacity[i] / total)`,
  where `t = trips_matrix[i, j, k]` and `total` is that outgoing total;
- if the total incoming trips of station `j` exceed `station_capacity[j]`, each
  `demand[i, j, k]` is increased in the same way using the incoming total.

Both corrections are computed from the observed counts, so an entry affected by
both receives the sum of the two increments (they do not compound).

# Arguments
- `trips_matrix`: 3-d array of observed counts, indexed `[origin, destination, hour]`.
- `station_capacity`: capacity per station index. Defaults to the global `capacity`,
  which keeps one-argument calls working as before.

# Returns
An array with the same size and element type as `trips_matrix`; the input is not
modified.
"""
function estimate_demand(trips_matrix, station_capacity=capacity)
    nb_stations, _, nb_hours = size(trips_matrix)
    demand = copy(trips_matrix)
    for k in 1:nb_hours

        # outgoing demand
        for i in 1:nb_stations
            total_outgoing = sum(@view trips_matrix[i, :, k])
            # total_outgoing is strictly positive past this guard whenever the
            # capacity is non-negative, so the division below is well defined.
            total_outgoing > station_capacity[i] || continue
            for j in 1:nb_stations
                observed = trips_matrix[i, j, k]
                demand[i, j, k] += ceil(
                    observed - observed * station_capacity[i] / total_outgoing
                )
            end
        end

        # incoming demand
        for j in 1:nb_stations
            total_incoming = sum(@view trips_matrix[:, j, k])
            total_incoming > station_capacity[j] || continue
            for i in 1:nb_stations
                observed = trips_matrix[i, j, k]
                demand[i, j, k] += ceil(
                    observed - observed * station_capacity[j] / total_incoming
                )
            end
        end
    end
    return demand
end

estimate_demand (generic function with 1 method)

In [15]:
# Apply the capacity-based correction to the observed hourly trip counts; the
# result has the same dimensions as trips_matrix.
demand = estimate_demand(trips_matrix);

In [16]:
# Persist the estimated demand under the key "demand"; the positional `true` is
# jldsave's compression flag.
jldsave(
    "../../data/parameters/202210-demand.jld2",
    true;
    demand=demand,
)

In [20]:
n_stations, _, nb_hours = size(demand)
# Activity score of each station: the demand leaving the station plus the demand
# arriving at it, summed over all hour slots (a trip from a station to itself is
# therefore counted twice). Stored as Float64.
stations_score = Float64[
    sum(@view demand[i, :, :]) + sum(@view demand[:, i, :]) for i in 1:n_stations
];

In [48]:
# Station indices ordered from the highest score to the lowest.
stations_score_sorted = sortperm(stations_score; rev=true);

# Leading 30, 50 and 200 station indices of that ranking.
top30_stations, top50_stations, top200_stations =
    (stations_score_sorted[1:n] for n in (30, 50, 200));

In [49]:
# Station index (position in stations_score) ranked 200th by score, i.e. the last
# entry kept in top200_stations.
stations_score_sorted[200]

218

In [50]:
# Score of station index 218, the index displayed by the previous cell.
stations_score[218]

1298.0

In [51]:
# Reverse lookup of stations_match: station index => station_id.
stations_match_inv = Dict{valtype(stations_match),keytype(stations_match)}(
    idx => id for (id, idx) in stations_match
);

In [52]:
# Translate the ranked station indices back into station_id values
# (top 30, top 50 and top 200 respectively).
top30_stations_id = getindex.(Ref(stations_match_inv), top30_stations);
top50_stations_id = getindex.(Ref(stations_match_inv), top50_stations);
top200_stations_id = getindex.(Ref(stations_match_inv), top200_stations);

In [53]:
# Write the top-30 station ids to CSV as a single `station_id` column.
CSV.write(
    "../../data/stations/top30-stations.csv",
    DataFrame(:station_id => top30_stations_id),
)

"../../data/stations/top30-stations.csv"

In [54]:
# Write the top-50 station ids to CSV as a single `station_id` column.
CSV.write(
    "../../data/stations/top50-stations.csv",
    DataFrame(:station_id => top50_stations_id),
)

"../../data/stations/top50-stations.csv"

In [55]:
# Write the top-200 station ids to CSV as a single `station_id` column.
CSV.write(
    "../../data/stations/top200-stations.csv",
    DataFrame(:station_id => top200_stations_id),
)

"../../data/stations/top200-stations.csv"

In [56]:
# Station indices ordered from the lowest score to the highest.
stations_score_sorted_incr = sortperm(stations_score);
# Keep the 100 lowest-scoring station indices.
worse_top100_stations = stations_score_sorted_incr[begin:100];

In [57]:
# Translate the 100 lowest-scoring station indices back into station_id values.
worse_top100_stations_id = getindex.(Ref(stations_match_inv), worse_top100_stations);

In [58]:
# Write the 100 lowest-scoring station ids to CSV as a single `station_id` column.
CSV.write(
    "../../data/stations/worse_top100-stations.csv",
    DataFrame(:station_id => worse_top100_stations_id),
)

"../../data/stations/worse_top100-stations.csv"