# Building Model For Predicting Bike Availability

# Loading data and merging into one data frame

In [22]:
import pandas as pd

# Read the three raw CSV exports: station metadata, availability snapshots, weather snapshots.
stations = pd.read_csv("static/csv/stations.csv")
availability = pd.read_csv("static/csv/availability.csv")
weather = pd.read_csv("static/csv/current_weather.csv")

# Only these columns are carried over from the weather and station tables.
weather_cols = ["weather_desc", "temp", "visibility", "wind_speed", "wind_deg", "timestamp"]
station_cols = ["bike_stands", "number"]

# Left joins keep every availability row; rows with no exact-timestamp weather match
# (or no matching station number) get NaN in the joined columns.
df = (
    availability
    .merge(weather[weather_cols], on="timestamp", how="left")
    .merge(stations[station_cols], on="number", how="left")
)
df.head()

Unnamed: 0,number,available_bikes,available_bike_stands,status,last_update,timestamp,weather_desc,temp,visibility,wind_speed,wind_deg,bike_stands
0,42,30,0,OPEN,1677185535000,1677185702,,,,,,30.0
1,30,4,16,OPEN,1677185621000,1677185702,,,,,,20.0
2,54,5,28,OPEN,1677185522000,1677185702,,,,,,33.0
3,108,13,22,OPEN,1677185161000,1677185702,,,,,,35.0
4,20,6,24,OPEN,1677185529000,1677185702,,,,,,30.0


# Cleaning data frame

## Columns - dropping and renaming

In [23]:
# Remove the columns that are not used further on and give the rest clearer names.
renamed_columns = {
    "bike_stands": "total_stands",
    "available_bike_stands": "available_stands",
    "number": "station_num",
}
df = (
    df
    .drop(columns=["status", "last_update"])
    .rename(columns=renamed_columns)
)
df.head()

Unnamed: 0,station_num,available_bikes,available_stands,timestamp,weather_desc,temp,visibility,wind_speed,wind_deg,total_stands
0,42,30,0,1677185702,,,,,,30.0
1,30,4,16,1677185702,,,,,,20.0
2,54,5,28,1677185702,,,,,,33.0
3,108,13,22,1677185702,,,,,,35.0
4,20,6,24,1677185702,,,,,,30.0


## Rows - dropping rows with no weather data

In [24]:
# dropna() with no arguments removes every row that has a NaN in *any* column,
# which covers rows without weather data as well as rows missing any other value.
df = df.dropna()
df.head()

Unnamed: 0,station_num,available_bikes,available_stands,timestamp,weather_desc,temp,visibility,wind_speed,wind_deg,total_stands
336580,42,16,14,1678113903,light rain,278.81,10000.0,3.09,290.0,30.0
336581,30,0,20,1678113903,light rain,278.81,10000.0,3.09,290.0,20.0
336582,54,17,16,1678113903,light rain,278.81,10000.0,3.09,290.0,33.0
336583,108,14,21,1678113903,light rain,278.81,10000.0,3.09,290.0,35.0
336584,20,22,8,1678113903,light rain,278.81,10000.0,3.09,290.0,30.0


## Date and Time - changing Unix timestamp

In [25]:
# Decode the Unix-epoch seconds once, then keep only the calendar parts used as features.
# NOTE(review): no timezone conversion is applied here, so month/day/hour follow the
# raw epoch clock (UTC) — confirm that local time is not what the model needs.
stamp = pd.to_datetime(df["timestamp"], unit="s")

df = df.assign(
    month=stamp.dt.month,    # 1-12
    day=stamp.dt.dayofweek,  # 0-6 (Monday = 0)
    hour=stamp.dt.hour,      # 0-23
).drop(columns="timestamp")
df.head()

Unnamed: 0,station_num,available_bikes,available_stands,weather_desc,temp,visibility,wind_speed,wind_deg,total_stands,month,day,hour
336580,42,16,14,light rain,278.81,10000.0,3.09,290.0,30.0,3,0,14
336581,30,0,20,light rain,278.81,10000.0,3.09,290.0,20.0,3,0,14
336582,54,17,16,light rain,278.81,10000.0,3.09,290.0,33.0,3,0,14
336583,108,14,21,light rain,278.81,10000.0,3.09,290.0,35.0,3,0,14
336584,20,22,8,light rain,278.81,10000.0,3.09,290.0,30.0,3,0,14


## Reindexing columns

In [26]:
# Desired left-to-right layout: time parts, station info, weather, and the target last.
column_order = [
    "month", "day", "hour",
    "station_num", "total_stands", "available_stands",
    "temp", "weather_desc", "visibility", "wind_speed", "wind_deg",
    "available_bikes",
]
df = df.reindex(columns=column_order)
df.head()

Unnamed: 0,month,day,hour,station_num,total_stands,available_stands,temp,weather_desc,visibility,wind_speed,wind_deg,available_bikes
336580,3,0,14,42,30.0,14,278.81,light rain,10000.0,3.09,290.0,16
336581,3,0,14,30,20.0,20,278.81,light rain,10000.0,3.09,290.0,0
336582,3,0,14,54,33.0,16,278.81,light rain,10000.0,3.09,290.0,17
336583,3,0,14,108,35.0,21,278.81,light rain,10000.0,3.09,290.0,14
336584,3,0,14,20,30.0,8,278.81,light rain,10000.0,3.09,290.0,22


## Replacing weather_desc values with corresponding weather IDs

When I started scraping the weather data, I did not store the weather ID returned by each call to the OpenWeather API, preferring instead to pull the English description of the weather. It has since occurred to me that the weather IDs would have been far more useful for training the model. So, I have counted each unique weather description in our database, and below I replace every weather description string in the weather_desc feature of the data frame with its corresponding weather ID, as described here: https://openweathermap.org/weather-conditions

In [27]:
# OpenWeather description -> condition ID, grouped by ID family.
# Descriptions that are not listed here are left unchanged by replace().
weather_ids = {
    # 3xx drizzle
    "light intensity drizzle": 300,
    "light intensity drizzle rain": 310,
    # 5xx rain
    "light rain": 500,
    "moderate rain": 501,
    "heavy intensity rain": 502,
    "light intensity shower rain": 520,
    "shower rain": 521,
    # 6xx snow
    "light snow": 600,
    "snow": 601,
    "sleet": 611,
    # 7xx atmosphere
    "mist": 701,
    "fog": 741,
    # 800 clear / 80x clouds
    "clear sky": 800,
    "few clouds": 801,
    "scattered clouds": 802,
    "broken clouds": 803,
    "overcast clouds": 804,
}
df["weather_desc"] = df["weather_desc"].replace(weather_ids)
df.head()

Unnamed: 0,month,day,hour,station_num,total_stands,available_stands,temp,weather_desc,visibility,wind_speed,wind_deg,available_bikes
336580,3,0,14,42,30.0,14,278.81,500,10000.0,3.09,290.0,16
336581,3,0,14,30,20.0,20,278.81,500,10000.0,3.09,290.0,0
336582,3,0,14,54,33.0,16,278.81,500,10000.0,3.09,290.0,17
336583,3,0,14,108,35.0,21,278.81,500,10000.0,3.09,290.0,14
336584,3,0,14,20,30.0,8,278.81,500,10000.0,3.09,290.0,22


# Data separation to X and Y

## Y variable

In [28]:
# Target: the number of bikes available in each snapshot row.
y = df.loc[:, "available_bikes"]
y

336580     16
336581      0
336582     17
336583     14
336584     22
           ..
1281561     7
1281562     0
1281563     0
1281564    16
1281565    11
Name: available_bikes, Length: 787740, dtype: int64

## X variables

In [29]:
# Features: every column except the target.
# NOTE(review): available_stands stays in the feature set; together with total_stands it
# may nearly determine available_bikes (possible target leakage) — confirm this is intended.
x = df.drop(columns="available_bikes")
x

Unnamed: 0,month,day,hour,station_num,total_stands,available_stands,temp,weather_desc,visibility,wind_speed,wind_deg
336580,3,0,14,42,30.0,14,278.81,500,10000.0,3.09,290.0
336581,3,0,14,30,20.0,20,278.81,500,10000.0,3.09,290.0
336582,3,0,14,54,33.0,16,278.81,500,10000.0,3.09,290.0
336583,3,0,14,108,35.0,21,278.81,500,10000.0,3.09,290.0
336584,3,0,14,20,30.0,8,278.81,500,10000.0,3.09,290.0
...,...,...,...,...,...,...,...,...,...,...,...
1281561,4,1,8,39,20.0,13,282.00,803,10000.0,3.09,160.0
1281562,4,1,8,83,40.0,40,282.00,803,10000.0,3.09,160.0
1281563,4,1,8,92,40.0,40,282.00,803,10000.0,3.09,160.0
1281564,4,1,8,21,30.0,14,282.00,803,10000.0,3.09,160.0


# Splitting data into sets

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

# Model building

## Linear regression

In [31]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the training split.
# fit() returns the estimator itself, so ending the cell with `lr` shows the same object.
lr = LinearRegression().fit(x_train, y_train)
lr

TypeError: LinearRegression.fit() missing 2 required positional arguments: 'X' and 'y'