In [None]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import os
import json
import pymongo
import datetime
from datetime import datetime
import requests
from zipfile import ZipFile
from io import BytesIO
import matplotlib.pyplot as plt
import meteostat
from meteostat import Point, Daily
from pymongo import MongoClient

In [None]:
# Open the MongoDB connection and select the target database / collection.
# Port 27017 is the local instance, 27027 the sharded setup.
client = MongoClient(host='localhost', port=27017)
mydb = client['Citibike']
mycol = mydb['NewYork']

In [None]:
# Build the list of 'YYMM' keys (two-digit year followed by a zero-padded month)
# that drives the monthly download / import loops.
years = ['19', '20']
combo = [f'{yy}{mm:02d}' for yy in years for mm in range(1, 13)]

In [None]:
# Option 1 without download (unzip through python)
# advice: do not upload everything one-shot, better in groups (in particular for sharding)
# NOTE: Option 1 and Option 2 insert the same months into `mycol`; run only one of them.
for month in combo:
    csv_name = f'20{month}-citibike-tripdata.csv'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(f'https://s3.amazonaws.com/tripdata/{csv_name}.zip', timeout=60)
    # Fail fast with a clear HTTP error instead of handing an error page to ZipFile.
    r.raise_for_status()

    # Unzip in memory and close both the archive and the member once pandas has read it.
    with zipfile.ZipFile(io.BytesIO(r.content)) as ar:
        with ar.open(csv_name) as csv_file:
            trip_data = pd.read_csv(csv_file)

    # Preprocessing: shorten the field names stored in every document.
    trip_data = trip_data.rename(columns={'start station id': 'S',
                                          'end station id': 'E',
                                          'birth year': 'BY',
                                          'bikeid': 'B',
                                          'usertype': 'U',
                                          'gender': 'G',
                                          'tripduration': 'D'})

    # Parse the start / stop timestamps into datetime columns.
    trip_data['ST'] = pd.to_datetime(trip_data['starttime'])
    trip_data['ET'] = pd.to_datetime(trip_data['stoptime'])

    # Keep only the fields that are stored in the collection.
    columns = ['S', 'E', 'ST', 'ET', 'B', 'U', 'BY', 'G', 'D']
    data_ready = trip_data[columns]

    # One document per trip.
    docs = data_ready.to_dict(orient='records')

    mycol.insert_many(docs)

In [None]:
# Option 2 with download
# NOTE: Option 1 and Option 2 insert the same months into `mycol`; run only one of them.
# `root` is the folder holding the extracted monthly CSV files; a trailing separator is optional.
root = r"path"  # ex. windows C:\Users\...\data\citibike
for month in combo:
    # os.path.join inserts the separator when `root` does not already end with one.
    csv_path = os.path.join(root, f'20{month}-citibike-tripdata.csv')
    trip_data = pd.read_csv(csv_path)

    # Shorten the field names stored in every document.
    trip_data = trip_data.rename(columns={'start station id': 'S',
                                          'end station id': 'E',
                                          'birth year': 'BY',
                                          'bikeid': 'B',
                                          'usertype': 'U',
                                          'gender': 'G',
                                          'tripduration': 'D'})

    # Parse the start / stop timestamps into datetime columns.
    trip_data['ST'] = pd.to_datetime(trip_data['starttime'])
    trip_data['ET'] = pd.to_datetime(trip_data['stoptime'])

    # Keep only the fields that are stored in the collection, one document per trip.
    columns = ['S', 'E', 'ST', 'ET', 'B', 'U', 'BY', 'G', 'D']
    data_ready = trip_data[columns]
    docs = data_ready.to_dict(orient='records')
    mycol.insert_many(docs)

In [None]:
# Station data: download the station information feed and keep its list of stations.
stations = requests.get("https://gbfs.citibikenyc.com/gbfs/en/station_information.json", timeout=30)
# Surface HTTP errors directly instead of failing later while decoding the body.
stations.raise_for_status()
# Direct indexing raises a clear KeyError if the payload lacks the expected keys.
st = stations.json()['data']['stations']

In [None]:
# Cleaning data: keep five fields per station (missing keys become None)
# and load them into a DataFrame with friendlier column names.
station_fields = ('lat', 'lon', 'name', 'station_id', 'capacity')
st_clean = [[entry.get(field) for field in station_fields] for entry in st]
df_stations = pd.DataFrame(
    st_clean,
    columns=['latitude', 'longitude', 'name', 'id', 'capacity'],
)
df_stations.head()

In [None]:
# Persist the station table as CSV, leaving out the DataFrame index.
df_stations.to_csv(path_or_buf='stations.csv', index=False)

In [None]:
# Weather data: average the station coordinates to get a single reference location.
lat = df_stations.loc[:, 'latitude'].mean()
lon = df_stations.loc[:, 'longitude'].mean()
print(f"\nThe average latitude is {lat}\nThe average longitude is {lon}\n")

In [None]:
# Reference point for the weather query: the mean station coordinates
center = Point(lat, lon)
# Requested period
start, end = datetime(2019, 1, 1), datetime(2020, 12, 31)
# Build the daily query and record its coverage before any post-processing
daily_query = Daily(center, start, end)
coverage = daily_query.coverage()
# Normalize, interpolate, fetch the frame and turn the index back into a regular column
weather = daily_query.normalize().interpolate().fetch().reset_index()

In [None]:
# Cleaning and exploration: add a datetime 'date' column and keep the variables of interest.
weather['date'] = pd.to_datetime(weather['time'])
df_weather = weather[['date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd']]
# Call head() -- without the parentheses the cell only shows the bound-method repr.
df_weather.head()

In [None]:
# Temperature visualization: plot against the date column rather than the integer row index.
ax = df_weather.plot(x='date', y=['tavg', 'tmin', 'tmax'], title='Daily temperature')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature')
plt.show()

In [None]:
# Precipitation visualization: plot against the date column rather than the integer row index.
ax = df_weather.plot(x='date', y=['prcp'], title='Daily precipitation')
ax.set_xlabel('Date')
ax.set_ylabel('Precipitation')
plt.show()

In [None]:
# Persist the cleaned weather table as CSV, leaving out the DataFrame index.
df_weather.to_csv(path_or_buf='weather.csv', index=False)

In [None]:
# COVID-19 data: read the daily trend file straight from the nychealth GitHub repository.
covid_url = 'https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/data-by-day.csv'
df_covid = pd.read_csv(covid_url)
df_covid.head()

In [None]:
# Cleaning and saving data: parse the date column, keep the first ten columns, write to CSV.
parsed_dates = pd.to_datetime(df_covid['date_of_interest'])
df_covid = df_covid.assign(date_of_interest=parsed_dates).iloc[:, :10]
df_covid.to_csv('covid_nyc.csv', index=False)