# 02806 - Social Data Analysis and Visualization - Project Assignment B

# 0. Package and data import

## 0.1 Package import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
import seaborn as sns
import json
import urllib.request

## 0.2 Data import

### Trip Data

In [3]:
def getDataByYear(year, data_dir='data'):
    '''
    Combine the twelve monthly trip files of one year into a single dataframe.

    The files are read from "<data_dir>/<year><month>-citibike-tripdata.csv",
    with the month zero-padded to two digits. Due to different headers in the
    individual files (spaces, upper and lower case letters, etc.), the column
    names are normalized to lower case without spaces before combining.

    Parameters:
        year: Year to load, e.g. 2016.
        data_dir: Directory that contains the monthly csv files. Defaults to
            'data', so existing calls with only a year behave as before.

    Returns:
        A Pandas Dataframe with the records of all twelve months and a
        continuous 0..n-1 index. The columns "starttime" and "stoptime" are
        converted to datetime.

    Raises:
        KeyError: if a file has no "starttime" or "stoptime" column after the
            header normalization.
        Errors from pd.read_csv (e.g. a missing monthly file) are not caught.
    '''
    monthly_frames = []

    for month in range(1, 13):
        path = '{}/{}{:02d}-citibike-tripdata.csv'.format(data_dir, year, month)
        frame = pd.read_csv(path)

        #Normalize headers: lower case and without spaces
        frame.columns = [col.lower().replace(" ", "") for col in frame.columns]

        #Parse the timestamps file by file, before concatenation. The
        #deprecated infer_datetime_format argument is not passed any more;
        #it triggers a warning on pandas 2.x, where format inference is the
        #default behaviour.
        for col in ("starttime", "stoptime"):
            frame[col] = pd.to_datetime(frame[col])

        monthly_frames.append(frame)

    return pd.concat(monthly_frames, ignore_index=True)

In [None]:
#Load the trip data of the years 2016-2018, one dataframe per year
bd16, bd17, bd18 = [getDataByYear(year) for year in (2016, 2017, 2018)]

In [None]:
#Merge all trip data. The yearly dataframes are named bd16, bd17 and bd18;
#the names df16, df17 and df18 are not defined anywhere and raised a NameError.
frames = [bd16, bd17, bd18]
#ignore_index gives the merged frame one continuous index instead of three
#overlapping ones, in line with what getDataByYear does for the months.
df_trips = pd.concat(frames, ignore_index=True)

### Station data

In [None]:
#Import data from json url. Only the list of stations under 'stationBeanList'
#is kept; the other top-level fields of the response are simply not used, so
#the import no longer fails when the optional 'executionTime' field is absent.
with urllib.request.urlopen("https://feeds.citibikenyc.com/stations/stations.json") as url:
    data = json.loads(url.read().decode())['stationBeanList']

#Add data from json objects to list. Only include necessary fields, in the
#same order as the dataframe columns below
stations = [
    [station['id'], station['stationName'], station['latitude'], station['longitude'], station['totalDocks']]
    for station in data
]

#make dataframe of station info from list derived from json object
df_stations = pd.DataFrame(stations, columns=["id", "stationname", "lat", "long", "capacity"])

# 1. Motivation.
What is your dataset?

Why did you choose this/these particular dataset(s)?

What was your goal for the end user's experience?


# 2. Basic stats. 

Let's understand the dataset better


Write about your choices in data cleaning and preprocessing


Write a short section that discusses the dataset stats, containing key points/plots from your exploratory data analysis.

# 3. Data Analysis

Describe your data analysis and explain what you've learned about the dataset.

If relevant, talk about your machine-learning.

## 3.1. Hotspots

## 3.2. Demand

#### Delete unnecessary columns and extract weekday and hour from the datetime columns

In [None]:
#Columns of the trip data that are not used in the demand analysis
del_cols = ['tripduration', 'startstationname', 'startstationlatitude', 'startstationlongitude', 'endstationname',
       'endstationlatitude', 'endstationlongitude', 'usertype', 'birthyear', 'gender', 'bikeid']

#drop returns a new dataframe, so df_trips itself stays untouched
df_demand = df_trips.drop(columns=del_cols)

#Split each timestamp: the weekday goes to a new column (startday/stopday)
#and the timestamp column itself is replaced by the hour of the day
for day_col, time_col in (('startday', 'starttime'), ('stopday', 'stoptime')):
    timestamps = df_demand[time_col]
    df_demand[day_col] = timestamps.dt.dayofweek
    df_demand[time_col] = timestamps.dt.hour

#### Find stations that have both incoming and outgoing bikes

In [None]:
#Unique station ids seen as start station and as end station of a trip
df2_demand = df_demand[['startstationid']].drop_duplicates()
df3_demand = df_demand[['endstationid']].drop_duplicates()

#The inner merge keeps only ids that occur on both sides. Afterwards the
#result is reduced to one clean column of unique, non-missing ids named 'id'.
df4_demand = (
    df2_demand
    .merge(df3_demand, how='inner', left_on='startstationid', right_on='endstationid')
    .loc[:, ['startstationid']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'startstationid': 'id'})
)
#Convert id to int
df4_demand['id'] = df4_demand['id'].apply(int)

#### Find stations that are in both datasets, and merge these. Save data to csv.

In [None]:
#Keep only the stations of the station dataframe (df_stations) whose id also
#occurs in the trip data (df4_demand). df_stations is the frame built from
#the json feed; the name df1_stations is not defined and raised a NameError.
df_demand_common_stations = pd.merge(df_stations, df4_demand, how='inner', on=['id'])

#Save the common stations without the dataframe index
df_demand_common_stations.to_csv('common_stations.csv', index=False)

#### Count bikes in and out for weekdays and hours.

In [None]:
#Number of trips that start at each station, per weekday and hour
df_demand_start = (
    df_demand
    .groupby(['startstationid', 'startday', 'starttime'])
    .size()
    .reset_index(name='startcount')
)

#Number of trips that end at each station, per weekday and hour
df_demand_stop = (
    df_demand
    .groupby(['endstationid', 'stopday', 'stoptime'])
    .size()
    .reset_index(name='stopcount')
)

#### Merge in and out data

In [None]:
#Match the outgoing and incoming counts of the same station, weekday and hour.
#NOTE(review): pd.merge defaults to an inner join, so a station/weekday/hour
#slot that only has starts or only has stops is dropped here - confirm that
#this is intended.
start_keys = ['startstationid', 'startday', 'starttime']
stop_keys = ['endstationid', 'stopday', 'stoptime']
df_demand_count = (
    df_demand_start
    .merge(df_demand_stop, left_on=start_keys, right_on=stop_keys)
    #the stop keys equal the start keys after the merge, so they are redundant
    .drop(columns=stop_keys)
    .rename(columns={'startstationid': 'id', 'startday': 'weekday', 'starttime': 'hour'})
)

#### Calculate netcount and demand

In [None]:
#Net flow per station, weekday and hour: incoming minus outgoing bikes
df_demand_count['netcount'] = df_demand_count['stopcount'] - df_demand_count['startcount']

#Attach the station capacity. With a left merge, stations that are not in the
#capacity dataset get a missing capacity
df_demand_count = df_demand_count.merge(df_demand_common_stations[['id', 'capacity']], on='id', how='left')

#Keep only rows with a usable capacity: known (finite) and not 0
station_capacity = df_demand_count['capacity']
has_usable_capacity = np.isfinite(station_capacity) & (station_capacity != 0)

df_demand_count = (
    df_demand_count[has_usable_capacity]
    .assign(
        #Demand function. Function is negated to give stations in higher demand a higher value
        demand=lambda rows: -rows['netcount'] / rows['capacity'],
        #Convert id to int
        id=lambda rows: rows['id'].apply(int),
    )
    #drop duplicates
    .drop_duplicates()
)

#### Save filtered data to .csv

In [None]:
#Write the demand table to disk; the dataframe index is not saved
df_demand_count.to_csv(path_or_buf='capacity.csv', index=False)

## 3.3. Business prediction

# 4. Genre. Which genre of data story did you use?

Which tools did you use from each of the 3 categories of Visual Narrative (Figure 7 in Segel and Heer)? Why?

Which tools did you use from each of the 3 categories of Narrative Structure (Figure 7 in Segel and Heer)? Why?

# 5. Visualizations.

Explain the visualizations you've chosen.

Why are they right for the story you want to tell?

# 6. Discussion. Think critically about your creation
What went well?
What is still missing? What could be improved? Why?