# Imports

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Matplotlib basic Chart

In [None]:
# Two seconds of samples at 10 ms spacing, and a sine wave lifted by 1
time = np.arange(0, 2, 0.01)
voltage = np.sin(2 * np.pi * time) + 1

fig, ax = plt.subplots()
ax.plot(time, voltage)

# Label the axes one property at a time
ax.set_xlabel('time (s)')
ax.set_ylabel('voltage (mV)')
ax.set_title('OOh nice graph')
ax.grid()

plt.show()

# Write the same figure to disk
fig.savefig('simple wavey.png')

## Scatter

In [None]:
# Number of random points to draw
random_num = 50

# Uniform [0, 1) coordinates for every point
x = np.random.rand(random_num)
y = np.random.rand(random_num)

# One random colour value per point; marker area is the square of a random size below 30
colours = np.random.rand(random_num)
area = np.square(30 * np.random.rand(random_num))

In [None]:
# Semi-transparent scatter: colour and marker size come from the arrays built above
plt.scatter(
    x,
    y,
    s=area,
    c=colours,
    alpha=0.5,
)
plt.show()

## Scatter Plot - Polar

In [None]:
# Dummy data: N random points given as (angle, radius)
N = 150
radius = 2 * np.random.rand(N)
theta = 2 * np.pi * np.random.rand(N)

# Marker area scales with the squared radius; colour follows the angle
area = 200 * np.square(radius)
colours = theta

# Single polar axes on a 10x10 figure
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1, projection='polar')
ax.scatter(
    theta,
    radius,
    s=area,
    c=colours,
    cmap='hsv',
    alpha=0.5,
)
plt.show()

# Seaborn

In [None]:
import seaborn as sns

In [None]:
# Load the 'penguins' dataset through seaborn's load_dataset helper
# and display the resulting frame as the cell output.
df = sns.load_dataset('penguins')
df

In [None]:
# Show the summary returned by df.describe()
df.describe()

In [None]:
# Flipper length against bill depth, one colour per species.
# NOTE(review): the title talks about a "longer bill" but the y-axis plots
# bill_depth_mm - confirm which measurement was intended.
sns.scatterplot(
    data=df,
    x='flipper_length_mm',
    y='bill_depth_mm',
    hue='species',
).set_title('Bigger bod = longer bill?')
plt.show()

In [None]:
# Display the frame again before plotting every pairwise relationship
df

In [None]:
# Grid of pairwise plots for the frame's columns, coloured by species
sns.pairplot(data=df, hue='species')

# Maps

In [None]:
import folium
from branca.element import Figure
from folium.plugins import HeatMapWithTime

In [None]:
# Map centred on a [latitude, longitude] pair, opened at zoom level 19
fmap = folium.Map(
    location=[51.508446, -0.113687],
    zoom_start=19,
)
fmap

In [None]:
# Same centre as the first map, this time with the default zoom
fmap2 = folium.Map(location=[51.508446, -0.113687])

# Extra base layers, followed by the control that lets the viewer switch between them.
# NOTE(review): newer folium releases may no longer ship the Stamen tile sets under
# these names - confirm against the installed folium version.
for tile_name in ('Stamen Terrain', 'Stamen Toner', 'Stamen Water Color'):
    folium.TileLayer(tile_name).add_to(fmap2)
folium.LayerControl().add_to(fmap2)

# (location, popup text, tooltip text) for each marker
marker_specs = [
    ([51.5, -0.113], 'Food market', 'Check this out'),
    ([51.49, -0.12], 'Hipster Bar', 'Check this out too'),
]
for marker_location, marker_popup, marker_tooltip in marker_specs:
    folium.Marker(
        location=marker_location,
        popup=marker_popup,
        tooltip=marker_tooltip,
    ).add_to(fmap2)

fmap2

# New York Mass Transit

In [34]:
from IPython.display import HTML

# Embed the remote image by rendering a raw <img> tag as the cell output
gif_markup = '<img src="https://i.giphy.com/media/HudvNjWRtXEBi/giphy.webp">'
HTML(gif_markup)

### Data Options
**1** You can start at the beginning with the rawest data I've been able to find, mta_1706.csv. It is, however, 1.4GB!
This will let you learn and run the pre-processing steps yourself.
It illustrates the data wrangling we need to do to refine data for specific visualisations.
 - Either download directly from Kaggle: https://www.kaggle.com/stoney71/new-york-city-transport-statistics
 - Or grab from the Cog Tech Community box: https://ibm.ent.box.com/folder/127757715737

**2** You can skip the pre-processing and use the much smaller pre-processed file 'pre-processed buses.csv'. If you do, skip to the 'Shortcut without Pre-Processing' section.
 - Grab from the Cog Tech Community box: https://ibm.ent.box.com/folder/127757715737

In [35]:
# this will take some time
%%time
df = pd.read_csv('mta_1706.csv', error_bad_lines=False, warn_bad_lines=False)
df

### Pre-Processing

In [36]:
# Data is from 2017 for the month of June
# We want just one day's data and only specific features from the dataset

In [None]:
# Keep only the rows whose timestamp's date part is 2017-06-01.
# `.str[0]` takes the text before the first space; a missing timestamp stays NaN,
# compares unequal and is filtered out instead of raising inside a per-row lambda.
is_first_of_june = df['RecordedAtTime'].str.split(' ').str[0] == '2017-06-01'
df = df[is_first_of_june]
df.shape

In [None]:
# Display the frame as it stands after the date filter
df

In [None]:
# Keep only the timestamp, the vehicle id and the two coordinate columns.
# `.copy()` makes the result an independent frame, so the clean-up and column
# assignments in later cells do not act on a slice (SettingWithCopyWarning).
df = df[['RecordedAtTime', 'VehicleRef', 'VehicleLocation.Latitude', 'VehicleLocation.Longitude']].copy()
df

In [None]:
# Count how many rows are exact repeats of an earlier row (True) versus unique (False)
duplicate_flags = df.duplicated()
duplicate_flags.value_counts()

In [None]:
# If we wanted to drop duplicates
# Rebinding `df` instead of using inplace=True avoids mutating a frame that was
# produced by slicing, which is what triggers SettingWithCopyWarning.
df = df.drop_duplicates()

In [None]:
# Check for nulls: True for every column that holds at least one missing value
df.isna().any()

In [None]:
# If we want to drop null values
# Rebinding `df` instead of using inplace=True avoids mutating a frame that was
# produced by slicing, which is what triggers SettingWithCopyWarning.
df = df.dropna()

In [None]:
# List the dtype of every column in df
df.dtypes

In [None]:
# Looks like our RecordedAtTime isn't in a datetime format

In [None]:
# Parse RecordedAtTime into datetimes, using the explicit
# year-month-day hour:minute:second layout
parsed_times = pd.to_datetime(df['RecordedAtTime'], format='%Y-%m-%d %H:%M:%S')
df = df.assign(RecordedAtTime=parsed_times)

In [None]:
# Re-check the column dtypes after the datetime conversion.
# (A bare `dtypes` is an undefined name and raises NameError.)
df.dtypes

In [None]:
# We only want to track buses every hour, not every minute or second
# So our final map will show 24 segments (you can break it down to minutes if you really want)

# Create the Hour column: clock hours 0-23 shifted to 1-24.
# The vectorised `.dt` accessor replaces a Python-level call per row.
df['Hour'] = df['RecordedAtTime'].dt.hour + 1
df

In [None]:
# For every (Hour, VehicleRef) pair keep the latest RecordedAtTime,
# i.e. the timestamp of each bus's last position in that hour
latest_per_bus_hour = df.groupby(['Hour', 'VehicleRef'])['RecordedAtTime'].max()
df2 = latest_per_bus_hour.to_frame()
df2

In [None]:
# Turn the (Hour, VehicleRef) index levels back into ordinary columns
# so the frame gets a plain positional index again
df2 = df2.reset_index()
df2

In [None]:
# Join the per-hour "latest timestamp" rows back onto the full records to pick up
# each bus's coordinates at the end of every hour
merge_keys = ['Hour', 'VehicleRef', 'RecordedAtTime']
df3 = df2.merge(df, on=merge_keys)
df3

In [None]:
# Save as our pre-processed csv.
# index=False leaves out the positional index, which would otherwise come back
# as an extra unnamed column when the file is read again.
# NOTE(review): the notebook text refers to this file under other spellings
# ('pre-processed buses.csv', 'Pre-Processed Buses.csv') - confirm the intended name.
df3.to_csv('PreProcessedBuses.csv', index=False)

# Shortcut without Pre-Processing

In [None]:
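# Uncomment to load the pre-processed file instead of running the steps above.
# The result must be named df3, because the cells below read from df3.
# Adjust the file name to match your download.
# df3 = pd.read_csv('Pre-Processed Buses.csv')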

In [None]:
# HeatMapWithTime takes data in a specific format
# Needs lat and long in nested arrays for each hour position
#
# Illustration only - kept as comments because it is not valid Python and
# would stop the notebook with a SyntaxError if left as code:
#
# [
#     [[Bus1 LL], [Bus2 LL], [Bus3 LL], ... ], # Hour 1
#     [[Bus1 LL], [Bus2 LL], [Bus3 LL], ... ], # Hour 2
#     [[Bus1 LL], [Bus2 LL], [Bus3 LL], ... ], # Hour 3 ...
#     ....
# ]

In [None]:
# We need to refine our pre-processed data into this format for our HeatMap

In [None]:
# Don't worry if you don't understand this
# For each hour (1-24) we select that hour's rows from our pre-processed data
# and turn their coordinates into a list of [latitude, longitude] pairs.
# The 24 per-hour lists together form the nested array HeatMapWithTime needs.
# An hour with no rows contributes an empty list.
#
# Selecting the two columns and calling .values.tolist() replaces a row-by-row
# iterrows() loop that also printed every single row, flooding the cell output.
coordinate_columns = ['VehicleLocation.Latitude', 'VehicleLocation.Longitude']

lat_long_na = [
    df3.loc[df3['Hour'] == hour, coordinate_columns].values.tolist()
    for hour in range(1, 25)
]

In [None]:
# Check our lat/long nested array without dumping every coordinate:
# show how many positions were collected for each hour, in hour order
[len(hour_positions) for hour_positions in lat_long_na]

# Presentation

In [None]:
# Core map: a 700x700 Figure holding a folium map opened at zoom level 10
NYfig = Figure(width=700, height=700)
NYmap = folium.Map(
    location=[40.712, -74.005],
    zoom_start=10,
)
NYfig.add_child(NYmap)

# Time-animated heat map built from the per-hour coordinate lists,
# then attached to the core map
heatmap_options = {
    'radius': 5,
    'auto_play': True,
    'position': 'bottomright',
}
heatmap = HeatMapWithTime(lat_long_na, **heatmap_options)
heatmap.add_to(NYmap)

In [None]:
# Render the map, which now carries the heat-map layer added in the previous cell
NYmap