In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_json('data.json')

In [3]:
df.shape

(797565, 7)

In [4]:
df.head

<bound method NDFrame.head of         station_id  available_bikes  temp  rain  wind  day  hour
0                1               26     9     0    24    4     9
1                1               27    10     0    23    4    10
2                1               26    10     0    23    4    10
3                1               26    10     0    23    4    10
4                1               27    10     0    23    4    10
...            ...              ...   ...   ...   ...  ...   ...
797560         117                5     7     0    16    2     2
797561         117                5     7     0    16    2     2
797562         117                5     7     0    16    2     2
797563         117                5     7     0    16    2     2
797564         117                5     7     0    16    2     2

[797565 rows x 7 columns]>

In [5]:
def r_squared(y_val, pred_y_val):
    SST = np.sum(np.square(y_val - np.mean(y_val)))
    SSE = np.sum(np.square(y_val - pred_y_val))
    return 1 - (SSE / SST)

In [6]:
stand_dfs = []

In [7]:
for i in range(1,118):
    df_stand = df[df["station_id"] == i]
    stand_dfs.append(df_stand)
    

In [8]:
r = []
for i in range(len(stand_dfs)):
    if (stand_dfs[i].empty):
        continue
    df_current = stand_dfs[i]
    train_set = df_current.sample(frac=0.7, random_state=42)
    test_set = df_current.drop(train_set.index)
       
    model = LinearRegression()
    x = train_set[['day', 'hour', 'temp', 'rain', 'wind']]  # Independent variable(s)
    y = train_set['available_bikes']    # Dependent variable
    model.fit(x, y)
    
    test_x = train_set[['day', 'hour', 'temp', 'rain', 'wind']]
    test_y = train_set['available_bikes']
    predictions = model.predict(test_x)
    
    r.append(r_squared(test_y, predictions))
print("max: ", max(r) * 100)
print("min: ", min(r) * 100)
print("avg: ", np.mean(r) * 100)

max:  35.87997244797035
min:  0.28077439418029426
avg:  9.814594968744487


In [9]:
r = []
for i in range(len(stand_dfs)):
    if (stand_dfs[i].empty):
        continue
    df_current = stand_dfs[i]
    train_set = df_current.sample(frac=0.7, random_state=42)
    test_set = df_current.drop(train_set.index)
       
    model = RandomForestRegressor()
    x = train_set[['day', 'hour', 'temp', 'rain', 'wind']]  # Independent variable(s)
    y = train_set['available_bikes']    # Dependent variable
    model.fit(x, y)
    
    test_x = train_set[['day', 'hour', 'temp', 'rain', 'wind']]
    test_y = train_set['available_bikes']
    predictions = model.predict(test_x)
    
    r.append(r_squared(test_y, predictions))
print("max: ", max(r) * 100)
print("min: ", min(r) * 100)
print("avg: ", np.mean(r) * 100)

KeyboardInterrupt: 

In [35]:
r = []
for i in range(len(stand_dfs)):
    if (stand_dfs[i].empty):
        continue
    df_current = stand_dfs[i]
    
    train_set = df_current.sample(frac=0.7, random_state=42)
    test_set = df_current.drop(train_set.index)
       
    model = KNeighborsClassifier(n_neighbors=5)
    x = train_set[['day', 'hour', 'temp', 'rain', 'wind']]  # Independent variable(s)
    y = train_set['available_bikes']    # Dependent variable
    model.fit(x, y)
    
    test_x = train_set[['day', 'hour', 'temp', 'rain', 'wind']]
    test_y = train_set['available_bikes']
    predictions = model.predict(test_x)
    
    r.append(r_squared(test_y, predictions))
print("max: ", max(r) * 100)
print("min: ", min(r) * 100)
print("avg: ", np.mean(r) * 100)

max:  97.96395161827543
min:  70.77521816123121
avg:  93.14225101577793


In [36]:
models= []
for i in range(len(stand_dfs)):
    if (stand_dfs[i].empty):
        continue
    df_current = stand_dfs[i]
    model = RandomForestRegressor()
    x = df_current[['day', 'hour', 'temp', 'rain', 'wind']]  # Independent variable(s)
    y = df_current['available_bikes']    # Dependent variable
    model.fit(x, y)
    
    models.append(model)

In [None]:
i = 0
while i < len(stand_dfs):
    if (stand_dfs[i].empty):
        continue
    

In [38]:
folder_path = '../model_files'
for i, model in enumerate(models):
    filename = os.path.join(folder_path, f'model_{i}.pkl')
    with open(filename, 'wb') as file:
        pickle.dump(model, file)