#### Feature Engineering ####
The process of `feature engineering` includes the following steps:

- Brainstorming or Testing features;
- Deciding what features to create;
- Creating features;
- Checking how the features work with your model;
- Improving your features if needed;
- Going back to brainstorming/creating more features until the work is done.

In [24]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
import holidays as hd
import calendar
from datetime import datetime, date
from pprint import pprint

In [25]:
# Load the cleansed cycle-usage export into the working frame.
CYCLE_USAGE_CSV = "cycleusage_cleansed.csv"
cycle_usage = pd.read_csv(CYCLE_USAGE_CSV)
# Non-null count per column, as a quick completeness check.
cycle_usage.count()

StartStation Id               33927
Start Date                    33927
EndStation Id                 33927
End Date                      33927
Duration                      33927
StartStation Id Used          33927
EndStation Id Used            33927
Frequency                     33927
StartStation Address          33927
StartStation latitude         33927
StartStation longitude        33927
StartStation capacity         33927
EndStation Address            33927
EndStation latitude           33927
EndStation longitude          33927
EndStation capacity           33927
distance (geodesic)           33927
Daily Weather                 33927
Hourly Weather                33927
Humidity                      33927
Windspeed                     33927
Apparent Temperature (Avg)    33927
dtype: int64

In [26]:
# Show the journeys whose start station is 191.
starts_at_191 = cycle_usage["StartStation Id"] == 191
cycle_usage[starts_at_191]

Unnamed: 0,StartStation Id,Start Date,EndStation Id,End Date,Duration,StartStation Id Used,EndStation Id Used,Frequency,StartStation Address,StartStation latitude,...,EndStation Address,EndStation latitude,EndStation longitude,EndStation capacity,distance (geodesic),Daily Weather,Hourly Weather,Humidity,Windspeed,Apparent Temperature (Avg)
0,191,04/09/2015 11:24,172,04/09/2015 11:41,1020,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-day,"[{'time': 1441321200, 'summary': 'Partly Cloud...",0.76,1.62,5395
1,191,02/08/2015 16:23,172,02/08/2015 19:15,10320,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-night,"[{'time': 1438470000, 'summary': 'Clear', 'ico...",0.65,1.86,69165
2,191,23/10/2017 19:55,172,23/10/2017 20:13,1080,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-day,"[{'time': 1508713200, 'summary': 'Overcast', '...",0.89,3.17,59145
3,191,19/08/2016 21:14,172,19/08/2016 21:33,1140,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-day,"[{'time': 1471561200, 'summary': 'Partly Cloud...",0.88,2.57,6246
4,191,07/08/2015 21:05,172,07/08/2015 21:25,1200,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-day,"[{'time': 1438902000, 'summary': 'Clear', 'ico...",0.67,0.41,6611
5,191,28/11/2017 11:19,223,28/11/2017 11:39,1200,239846,39262,48,"Hyde Park Corner, Hyde Park",51.503118,...,"Rodney Road , Walworth",51.491485,-0.090221,24,4.582428,partly-cloudy-day,"[{'time': 1511827200, 'summary': 'Clear', 'ico...",0.82,2.97,36725
6,191,21/06/2015 20:09,172,21/06/2015 20:30,1260,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-day,"[{'time': 1434841200, 'summary': 'Mostly Cloud...",0.70,3.76,6033
7,191,08/05/2016 19:55,223,08/05/2016 20:16,1260,239846,39262,48,"Hyde Park Corner, Hyde Park",51.503118,...,"Rodney Road , Walworth",51.491485,-0.090221,24,4.582428,clear-day,"[{'time': 1462662000, 'summary': 'Clear', 'ico...",0.55,2.89,68235
8,191,21/06/2015 20:08,172,21/06/2015 20:30,1320,239846,30287,104,"Hyde Park Corner, Hyde Park",51.503118,...,"Sumner Place, South Kensington",51.491212,-0.173716,19,1.929121,partly-cloudy-day,"[{'time': 1434841200, 'summary': 'Mostly Cloud...",0.70,3.76,6033
9,191,03/07/2017 19:51,223,03/07/2017 20:13,1320,239846,39262,48,"Hyde Park Corner, Hyde Park",51.503118,...,"Rodney Road , Walworth",51.491485,-0.090221,24,4.582428,partly-cloudy-day,"[{'time': 1499036400, 'summary': 'Clear', 'ico...",0.70,3.07,6469


In [27]:
# Columns removed before the features are built.
# Still kept at this point: "StartStation Id", "Start Date", "EndStation Id",
# "StartStation capacity", "EndStation capacity", "Duration", "Frequency",
# "Humidity", "Windspeed", "Apparent Temperature (Avg)",
# "distance (geodesic)" and "Daily Weather".
rm_columns = [
    "StartStation Address",
    "End Date",
    "EndStation Address",
    "StartStation Id Used",
    "EndStation Id Used",
    "StartStation latitude",
    "StartStation longitude",
    "EndStation latitude",
    "EndStation longitude",
    "Hourly Weather",
]

cycle_usage = cycle_usage.drop(columns=rm_columns)
cycle_usage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33927 entries, 0 to 33926
Data columns (total 12 columns):
StartStation Id               33927 non-null int64
Start Date                    33927 non-null object
EndStation Id                 33927 non-null int64
Duration                      33927 non-null int64
Frequency                     33927 non-null int64
StartStation capacity         33927 non-null int64
EndStation capacity           33927 non-null int64
distance (geodesic)           33927 non-null float64
Daily Weather                 33927 non-null object
Humidity                      33927 non-null float64
Windspeed                     33927 non-null float64
Apparent Temperature (Avg)    33927 non-null object
dtypes: float64(3), int64(6), object(3)
memory usage: 3.1+ MB


In [28]:
# Check for empty values and empty strings.
# A notebook cell renders only its last expression, so both results are
# collected and shown together; previously the null check ran but its
# result was silently discarded.
null_positions = np.where(cycle_usage.isna())
# Vectorised comparison instead of calling a Python lambda on every cell.
empty_positions = np.where(cycle_usage.eq(""))
# Each entry is a (row positions, column positions) pair; empty arrays mean
# nothing was found.
null_positions, empty_positions

(array([], dtype=int64), array([], dtype=int64))

#### Darksky note ####
> Our system is presently very simple: it finds the “worst” weather condition that will happen during the day (4AM to 4AM), and uses the icon for it. The only case where a daily icon will show a *-night value is partly-cloudy-night, and this is done to match the daily summary text. We already have intentions to change this behavior, because it is confusing. 
In the meantime, you can assume that if partly-cloudy-night is the worst weather condition that was found, that it was clear during the day. So you can just treat partly-cloudy-night as an alias for clear-day. 

In [29]:
#print(cycle_usage.iloc[452861]["Start Date"][:10], cycle_usage.iloc[452861]["Start Date"][-5:])

In [30]:
# Per the Darksky note, a daily "partly-cloudy-night" icon means the day itself
# was clear, so it is treated as an alias for "clear-day".
is_night_icon = cycle_usage["Daily Weather"] == "partly-cloudy-night"
# Assign through a single .loc call on the frame. The previous form,
# cycle_usage[col].loc[mask] = ..., is chained indexing: pandas may apply the
# write to a temporary object and leave the frame unchanged.
cycle_usage.loc[is_night_icon, "Daily Weather"] = "clear-day"
# Show the per-category row counts after the recode (the earlier count was
# computed before the change and never displayed).
cycle_usage.groupby(by="Daily Weather").count()

In [31]:
# Inconsistent dates e.g. 00:00 01:25:00
# Bring every "Start Date" string to the 16-character "dd/mm/YYYY HH:MM" shape
# in one vectorised pass. The previous row loop wrote through
# cycle_usage[col].iloc[index], which is chained indexing, treats the index
# label as a position, and raised on any missing value.
start_date = cycle_usage["Start Date"]
start_len = start_date.str.len()
has_seconds = start_len == 19  # presumably a trailing ":SS"; keep the first 16 characters
is_anomaly = start_len > 19    # longer than expected; keep the date and the last HH:MM

# Report the anomalous rows before they are rewritten.
for index, value in start_date[is_anomaly].items():
    print("anomaly", index, value)

# Both replacements are computed from the untouched column, then applied where
# the matching mask is True; all other rows pass through unchanged.
cycle_usage["Start Date"] = (
    start_date
    .mask(has_seconds, start_date.str[:16])
    .mask(is_anomaly, start_date.str[:10] + " " + start_date.str[-5:])
)

cycle_usage.dropna(inplace=True)
cycle_usage.count()


StartStation Id               33927
Start Date                    33927
EndStation Id                 33927
Duration                      33927
Frequency                     33927
StartStation capacity         33927
EndStation capacity           33927
distance (geodesic)           33927
Daily Weather                 33927
Humidity                      33927
Windspeed                     33927
Apparent Temperature (Avg)    33927
dtype: int64

Adding weekdays (Monday, Tuesday...)

In [32]:
# Add weekdays
# Parse the day-first timestamp strings into real datetimes.
cycle_usage["Start Date"] = pd.to_datetime(cycle_usage["Start Date"], format='%d/%m/%Y %H:%M')
# Use the vectorised datetime accessor instead of a row-wise apply. It yields
# English day names by default, whereas calendar.day_name follows the current
# locale, so the feature values no longer depend on the machine's locale.
cycle_usage['Weekday'] = cycle_usage["Start Date"].dt.day_name()

In [33]:
# Preview the first rows, including the new Weekday column.
cycle_usage.head()

Unnamed: 0,StartStation Id,Start Date,EndStation Id,Duration,Frequency,StartStation capacity,EndStation capacity,distance (geodesic),Daily Weather,Humidity,Windspeed,Apparent Temperature (Avg),Weekday
0,191,2015-09-04 11:24:00,172,1020,104,28,19,1.929121,partly-cloudy-day,0.76,1.62,5395,Friday
1,191,2015-08-02 16:23:00,172,10320,104,28,19,1.929121,clear-day,0.65,1.86,69165,Sunday
2,191,2017-10-23 19:55:00,172,1080,104,28,19,1.929121,partly-cloudy-day,0.89,3.17,59145,Monday
3,191,2016-08-19 21:14:00,172,1140,104,28,19,1.929121,partly-cloudy-day,0.88,2.57,6246,Friday
4,191,2015-08-07 21:05:00,172,1200,104,28,19,1.929121,partly-cloudy-day,0.67,0.41,6611,Friday



`Meteorological seasons` <br>
Northern hemisphere <br>
Spring: 1 March to 31 May <br>
Summer: 1 June to 31 August <br>
Autumn: 1 September to 30 November <br>
Winter: 1 December to 28 February (29 February in leap years) <br>

In [34]:
#Add seasons
def seasons(p):
    """Return the meteorological season (northern hemisphere) of a journey.

    Parameters
    ----------
    p : mapping or row
        Must provide a "Start Date" entry that is datetime-like, i.e. exposes
        a ``month`` attribute.

    Returns
    -------
    str
        "Spring" (March-May), "Summer" (June-August),
        "Autumn" (September-November) or "Winter" (December-February).
    """
    # Decide on the month alone. Comparing against 23:59:59 end-of-day bounds
    # left the final second of each season's last day unclassified (None), and
    # the year had to be parsed out of the timestamp's string form.
    month = p["Start Date"].month
    if 3 <= month <= 5:
        return "Spring"
    if 6 <= month <= 8:
        return "Summer"
    if 9 <= month <= 11:
        return "Autumn"
    return "Winter"
        
# Label every journey with its season by applying seasons() row by row,
# then preview the result.
cycle_usage["Season"] = cycle_usage.apply(seasons, axis=1)
cycle_usage.head()

Unnamed: 0,StartStation Id,Start Date,EndStation Id,Duration,Frequency,StartStation capacity,EndStation capacity,distance (geodesic),Daily Weather,Humidity,Windspeed,Apparent Temperature (Avg),Weekday,Season
0,191,2015-09-04 11:24:00,172,1020,104,28,19,1.929121,partly-cloudy-day,0.76,1.62,5395,Friday,Autumn
1,191,2015-08-02 16:23:00,172,10320,104,28,19,1.929121,clear-day,0.65,1.86,69165,Sunday,Summer
2,191,2017-10-23 19:55:00,172,1080,104,28,19,1.929121,partly-cloudy-day,0.89,3.17,59145,Monday,Autumn
3,191,2016-08-19 21:14:00,172,1140,104,28,19,1.929121,partly-cloudy-day,0.88,2.57,6246,Friday,Summer
4,191,2015-08-07 21:05:00,172,1200,104,28,19,1.929121,partly-cloudy-day,0.67,0.41,6611,Friday,Summer


 ##### Day & Night cycle ####
 From 6 am it counts as day, from 6 pm it counts as night → values “day” and “night”

In [35]:
#Day night state
def daynight(p):
    """Classify a journey as "day" or "night" from its start hour.

    Parameters
    ----------
    p : mapping or row
        Must provide a "Start Date" entry that is datetime-like, i.e. exposes
        an ``hour`` attribute.

    Returns
    -------
    str
        "day" for starts from 06:00 up to 17:59, otherwise "night".
    """
    hour = p["Start Date"].hour
    # Lower bound is inclusive: the rule is "day from 6 am, night from 6 pm".
    # The previous strict `> 6` put 06:00-06:59 into "night".
    return "day" if 6 <= hour < 18 else "night"
# Apply daynight() to each row to fill the "Day & Night" column.
cycle_usage["Day & Night"] = cycle_usage.apply(daynight, axis=1)

###### Holidays ######
Check whether the day of the ride is a public holiday.

In [36]:
#Consider holidays (e.g. Good Friday in UK)
# Build the calendar once and share it between calls instead of constructing a
# new one for every row. With the library's default settings a year is filled
# in the first time a date from that year is looked up.
# NOTE(review): hd.UK() without a region appears to include days observed only
# in Scotland or Northern Ireland - confirm whether that is intended for
# London journeys.
_UK_HOLIDAYS = hd.UK()


def holiday(p):
    """Tell whether a journey started on a UK holiday.

    Parameters
    ----------
    p : mapping or row
        Must provide a "Start Date" entry with a ``date()`` method.

    Returns
    -------
    bool
        True if the start date is contained in the UK holiday calendar.
    """
    return p["Start Date"].date() in _UK_HOLIDAYS
    
# List the holidays the library knows for 2015-2017 as a visual sanity check.
listed_holidays = hd.UK(state='London', years=[2015,2016,2017], observed=False)
for day, label in sorted(listed_holidays.items()):
    print(day, label)

# Flag each journey via holiday() and preview the result.
cycle_usage["Holiday"] = cycle_usage.apply(holiday, axis=1)
cycle_usage.head()

2015-01-01 New Year's Day
2015-01-02 New Year Holiday [Scotland]
2015-03-17 St. Patrick's Day [Northern Ireland]
2015-04-03 Good Friday
2015-04-06 Easter Monday [England, Wales, Northern Ireland]
2015-05-04 May Day
2015-05-25 Spring Bank Holiday
2015-07-12 Battle of the Boyne [Northern Ireland]
2015-08-03 Summer Bank Holiday [Scotland]
2015-08-31 Late Summer Bank Holiday [England, Wales, Northern Ireland]
2015-11-30 St. Andrew's Day [Scotland]
2015-12-25 Christmas Day
2015-12-26 Boxing Day
2016-01-01 New Year's Day
2016-01-02 New Year Holiday [Scotland]
2016-03-17 St. Patrick's Day [Northern Ireland]
2016-03-25 Good Friday
2016-03-28 Easter Monday [England, Wales, Northern Ireland]
2016-05-02 May Day
2016-05-30 Spring Bank Holiday
2016-07-12 Battle of the Boyne [Northern Ireland]
2016-08-01 Summer Bank Holiday [Scotland]
2016-08-29 Late Summer Bank Holiday [England, Wales, Northern Ireland]
2016-11-30 St. Andrew's Day [Scotland]
2016-12-25 Christmas Day
2016-12-26 Boxing Day
2017-01-01

Unnamed: 0,StartStation Id,Start Date,EndStation Id,Duration,Frequency,StartStation capacity,EndStation capacity,distance (geodesic),Daily Weather,Humidity,Windspeed,Apparent Temperature (Avg),Weekday,Season,Day & Night,Holiday
0,191,2015-09-04 11:24:00,172,1020,104,28,19,1.929121,partly-cloudy-day,0.76,1.62,5395,Friday,Autumn,day,False
1,191,2015-08-02 16:23:00,172,10320,104,28,19,1.929121,clear-day,0.65,1.86,69165,Sunday,Summer,day,False
2,191,2017-10-23 19:55:00,172,1080,104,28,19,1.929121,partly-cloudy-day,0.89,3.17,59145,Monday,Autumn,night,False
3,191,2016-08-19 21:14:00,172,1140,104,28,19,1.929121,partly-cloudy-day,0.88,2.57,6246,Friday,Summer,night,False
4,191,2015-08-07 21:05:00,172,1200,104,28,19,1.929121,partly-cloudy-day,0.67,0.41,6611,Friday,Summer,night,False


In [37]:
# Months
def months_names(p):
    """Look up the English month name for the row's "Start Date".

    Falls back to "not defined" when the month number has no entry.
    """
    names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    # Month numbers are 1-based, so enumerate from 1.
    by_number = dict(enumerate(names, start=1))
    month_number = p["Start Date"].month
    return by_number.get(month_number, "not defined")

# Fill "Month" with the month name of each journey via months_names().
# NOTE(review): a later cell assigns the numeric month to this same column,
# so these names do not survive to the exported features - confirm which
# representation is wanted.
cycle_usage["Month"] = cycle_usage.apply(months_names, axis=1)

##### Split Start Date #####
> Dates are difficult for ML models to handle directly. Idea: split them into several columns.

In [38]:
# Extract only the calendar date (timestamp at midnight).
# Taken directly from the parsed "Start Date" rather than slicing its string
# form and re-parsing it: the sliced strings are dash-separated
# ("2015-09-04") while the old format string was "%Y/%m/%d", a mismatch that
# pandas releases with strict format matching reject.
cycle_usage['Date'] = cycle_usage["Start Date"].dt.normalize()
# Extracting Year
cycle_usage['Year'] = cycle_usage['Date'].dt.year
# Extracting Month (numeric; replaces whatever the column held before)
cycle_usage['Month'] = cycle_usage['Date'].dt.month
# Read the clock once so all "passed" features refer to the same day.
# NOTE: these two features depend on the day the notebook is run, so the
# exported values change between runs.
today = date.today()
# Extracting passed years since the date
cycle_usage['Passed_Years'] = today.year - cycle_usage['Date'].dt.year
# Extracting passed months since the date
cycle_usage['Passed_Months'] = (today.year - cycle_usage['Date'].dt.year) * 12 + today.month - cycle_usage['Date'].dt.month
cycle_usage.head()

Unnamed: 0,StartStation Id,Start Date,EndStation Id,Duration,Frequency,StartStation capacity,EndStation capacity,distance (geodesic),Daily Weather,Humidity,...,Apparent Temperature (Avg),Weekday,Season,Day & Night,Holiday,Month,Date,Year,Passed_Years,Passed_Months
0,191,2015-09-04 11:24:00,172,1020,104,28,19,1.929121,partly-cloudy-day,0.76,...,5395,Friday,Autumn,day,False,9,2015-09-04,2015,4,43
1,191,2015-08-02 16:23:00,172,10320,104,28,19,1.929121,clear-day,0.65,...,69165,Sunday,Summer,day,False,8,2015-08-02,2015,4,44
2,191,2017-10-23 19:55:00,172,1080,104,28,19,1.929121,partly-cloudy-day,0.89,...,59145,Monday,Autumn,night,False,10,2017-10-23,2017,2,18
3,191,2016-08-19 21:14:00,172,1140,104,28,19,1.929121,partly-cloudy-day,0.88,...,6246,Friday,Summer,night,False,8,2016-08-19,2016,3,32
4,191,2015-08-07 21:05:00,172,1200,104,28,19,1.929121,partly-cloudy-day,0.67,...,6611,Friday,Summer,night,False,8,2015-08-07,2015,4,44


In [39]:
# Summarise column dtypes and non-null counts after the new features were added.
cycle_usage.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33927 entries, 0 to 33926
Data columns (total 21 columns):
StartStation Id               33927 non-null int64
Start Date                    33927 non-null datetime64[ns]
EndStation Id                 33927 non-null int64
Duration                      33927 non-null int64
Frequency                     33927 non-null int64
StartStation capacity         33927 non-null int64
EndStation capacity           33927 non-null int64
distance (geodesic)           33927 non-null float64
Daily Weather                 33927 non-null object
Humidity                      33927 non-null float64
Windspeed                     33927 non-null float64
Apparent Temperature (Avg)    33927 non-null object
Weekday                       33927 non-null object
Season                        33927 non-null object
Day & Night                   33927 non-null object
Holiday                       33927 non-null bool
Month                         33927 non-null int64
Date 

In [40]:
# Second pruning pass. Several of these columns were already removed earlier,
# which is why missing labels are ignored below.
# Not dropped here: "StartStation Id", "EndStation Id", "Humidity",
# "Windspeed", "Apparent Temperature (Avg)", "distance (geodesic)" and
# "Daily Weather".
rm_columns = [
    "Start Date",
    "StartStation Address",
    "StartStation capacity",
    "End Date",
    "EndStation Address",
    "EndStation capacity",
    "Duration",
    "Frequency",
    "StartStation Id Used",
    "EndStation Id Used",
    "StartStation latitude",
    "StartStation longitude",
    "EndStation latitude",
    "EndStation longitude",
    "Hourly Weather",
]

cycle_usage = cycle_usage.drop(columns=rm_columns, errors="ignore")
# De-duplication via cycle_usage.drop_duplicates(...) is currently disabled.
cycle_usage.head()

Unnamed: 0,StartStation Id,EndStation Id,distance (geodesic),Daily Weather,Humidity,Windspeed,Apparent Temperature (Avg),Weekday,Season,Day & Night,Holiday,Month,Date,Year,Passed_Years,Passed_Months
0,191,172,1.929121,partly-cloudy-day,0.76,1.62,5395,Friday,Autumn,day,False,9,2015-09-04,2015,4,43
1,191,172,1.929121,clear-day,0.65,1.86,69165,Sunday,Summer,day,False,8,2015-08-02,2015,4,44
2,191,172,1.929121,partly-cloudy-day,0.89,3.17,59145,Monday,Autumn,night,False,10,2017-10-23,2017,2,18
3,191,172,1.929121,partly-cloudy-day,0.88,2.57,6246,Friday,Summer,night,False,8,2016-08-19,2016,3,32
4,191,172,1.929121,partly-cloudy-day,0.67,0.41,6611,Friday,Summer,night,False,8,2015-08-07,2015,4,44


In [41]:
# Non-null count per remaining column.
cycle_usage.count()

StartStation Id               33927
EndStation Id                 33927
distance (geodesic)           33927
Daily Weather                 33927
Humidity                      33927
Windspeed                     33927
Apparent Temperature (Avg)    33927
Weekday                       33927
Season                        33927
Day & Night                   33927
Holiday                       33927
Month                         33927
Date                          33927
Year                          33927
Passed_Years                  33927
Passed_Months                 33927
dtype: int64

In [42]:
# Calculate new frequency of rented bikes
cycle_usage = pd.merge(cycle_usage, cycle_usage.groupby(["Date"])["Humidity"].count().reset_index(name="Frequency"), how='left', on="Date", 
         left_index=False, right_index=False, sort=True)

In [43]:
#cycle_usage.iloc[67]

In [44]:
# Get the most frequently occurring start station: count rows per station,
# order by the "Frequency" count (largest first) and keep the top entry.
rows_per_station = cycle_usage.groupby(by="StartStation Id").count()
rows_per_station.sort_values(by="Frequency", ascending=False).iloc[:1]

Unnamed: 0_level_0,EndStation Id,distance (geodesic),Daily Weather,Humidity,Windspeed,Apparent Temperature (Avg),Weekday,Season,Day & Night,Holiday,Month,Date,Year,Passed_Years,Passed_Months,Frequency
StartStation Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
191,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927,33927


In [45]:
# Keep only the journeys that start at station 191.
from_station_191 = cycle_usage["StartStation Id"] == 191
cycle_usage = cycle_usage[from_station_191]

In [46]:
# Write the engineered features to disk with a header row and without the index.
FEATURES_CSV = "features.csv"
cycle_usage.to_csv(FEATURES_CSV, index=False)