In [1]:
import sys
sys.path.append("..") # Adds higher directory to python modules path.
from utils import *

### Task 1
Deriving new data and attributes from large datasets, such as the bike data below, is often time-expensive, so it is important to seek out approaches that optimise and streamline this processing. The `Pool` object in Python's `multiprocessing` module offers a way to parallelise the execution of a function across many input values by distributing those inputs between worker processes. This data parallelism is used here to calculate usage statistics from the bike data. By approaching the problem non-sequentially, the individual tasks can be completed more efficiently.

 1. Read in tables as dataframes

In [3]:
# Bike observations: keep the queried frame untouched and work on a copy.
# (query_db() with no argument presumably returns the bike table - confirm in utils.)
core_bike_df = query_db()
bike_df = core_bike_df.copy()

# Weather observations from the "live_weather_data" table, same pattern.
core_weather_df = query_db("live_weather_data")
weather_df = core_weather_df.copy()

 2. Print the number of rows in each table

In [4]:
# Format the row counts with thousands separators before reporting them.
bike_row_count = format(len(bike_df), ',')
weather_row_count = format(len(weather_df), ',')

print("\tBikes dataframe contains", bike_row_count, "rows.")
print("\tWeather dataframe contains", weather_row_count, "rows.")

	Bikes dataframe contains 1,077,071 rows.
	Weather dataframe contains 41,616 rows.


 3. Remove any rows related to stations that are not currently in use.

In [5]:
# Keep only the rows whose status is 0 (the step description treats these as the
# stations currently in use).
# .copy() gives bike_df its own data, so the column assignments made in later
# cells write to an independent frame instead of to a slice of the unfiltered
# one (this is what triggers pandas' SettingWithCopyWarning).
bike_df = bike_df.loc[bike_df['status'] == 0].copy()

# Sanity check: report how many distinct status values remain after filtering.
print("\t", bike_df['status'].nunique())

1


 4. Generate the supporting attribute `dd_mm_yy`, which holds the date of each record formatted as dd-mm-yy.

In [6]:
# Interpret 'dt' as a count of seconds (unit='s') to get timestamps, then render
# each one as a day-month-two-digit-year label.
observed_at = pd.to_datetime(bike_df['dt'], unit='s')
bike_df['dd_mm_yy'] = observed_at.dt.strftime('%d-%m-%y')

 5. Generate a city indicator attribute for each dataframe.

In [7]:
# Give both frames a numeric city key.
# Bikes: station ids above 4000 -> 4, below 3000 -> 2, all remaining rows -> 3.
station_ids = bike_df['station_id']
bike_df['city_id'] = np.select(
    [station_ids > 4000, station_ids < 3000],
    [4, 2],
    default=3,
)

# Weather: "Cork" -> 2, "Limerick" -> 3, any other name -> 4.
location_names = weather_df.name
weather_df['city_id'] = np.select(
    [location_names == "Cork", location_names == "Limerick"],
    [2, 3],
    default=4,
)

In [8]:
# Argument tuples for pool_process: (dataframe, station_id or city_id, date label).
# The unique-value arrays are computed once up front: in a nested comprehension the
# inner iterable expression is re-evaluated on every iteration of the outer loop,
# which would repeat a full-column .unique() scan for each day.
unique_days = bike_df['dd_mm_yy'].unique()
unique_stations = bike_df['station_id'].unique()
unique_cities = bike_df['city_id'].unique()

# Day-major ordering: every station (or city) for the first day, then the next day, ...
stations_data = [(bike_df, station, day) for day in unique_days for station in unique_stations]
city_data = [(bike_df, city, day) for day in unique_days for city in unique_cities]

 6. Generate a `bikes_available_citywide` attribute that represents the total number of bikes available within each city at every recorded moment.

In [9]:
%%time
# Create the target column with a placeholder of 0; the next cell overwrites the
# rows for which a result comes back.
bike_df['bikes_available_citywide'] = 0
# Run calculate_available_bikes_citywide over every (bike_df, city, day) tuple in
# city_data. pool_size=8 is presumably the number of worker processes - confirm in
# utils. The result is consumed by the next cell as an iterable of dictionaries.
ba_citywide_output = pool_process(func=calculate_available_bikes_citywide, data=city_data, pool_size=8)

CPU times: user 23.7 s, sys: 6.2 s, total: 29.9 s
Wall time: 2min 13s


In [10]:
%%time
# Each result maps row labels of bike_df to the value computed for that row;
# scatter them into the column one result at a time.
for dictionary in ba_citywide_output:
    row_labels = list(dictionary)
    citywide_counts = list(dictionary.values())
    bike_df.loc[row_labels, 'bikes_available_citywide'] = citywide_counts

CPU times: user 3.3 s, sys: 686 ms, total: 3.98 s
Wall time: 4.01 s


 7. Generate a count attribute named count_1 that cumulatively tracks bike usage at each station across every day.

In [11]:
# Create the per-station usage column with a placeholder of 0; rows are filled in
# later from count_1_output.
bike_df['count_1'] = 0

In [12]:
%%time
# Run track_station_usage_by_date over every (bike_df, station, day) tuple in
# stations_data. pool_size=8 is presumably the number of worker processes - confirm
# in utils. The result is consumed below as an iterable of dictionaries.
count_1_output = pool_process(func=track_station_usage_by_date, data=stations_data, pool_size=8)

CPU times: user 25.8 s, sys: 6.34 s, total: 32.2 s
Wall time: 7min 46s


In [None]:
%%time
# Write each result back into bike_df: the dictionary keys select the rows and the
# dictionary values become those rows' count_1 entries.
for dictionary in count_1_output:
    station_rows = list(dictionary)
    station_counts = list(dictionary.values())
    bike_df.loc[station_rows, 'count_1'] = station_counts

 8. Generate a count attribute named count_2 that displays the total bike usage at each city across every day.

In [None]:
# Create the per-city usage column with a placeholder of 0; rows are filled in
# later from count_2_output.
bike_df['count_2'] = 0

In [None]:
%%time
# Run track_city_usage_by_date over every (bike_df, city, day) tuple in city_data.
# pool_size=8 is presumably the number of worker processes - confirm in utils.
# The result is consumed below as an iterable of dictionaries.
count_2_output = pool_process(func=track_city_usage_by_date, data=city_data, pool_size=8)

In [None]:
%%time
# Write each result back into bike_df: the dictionary keys select the rows and the
# dictionary values become those rows' count_2 entries.
for dictionary in count_2_output:
    city_rows = list(dictionary)
    city_counts = list(dictionary.values())
    bike_df.loc[city_rows, 'count_2'] = city_counts