In [1]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session.
# NOTE(review): this also hides library warnings emitted by later cells
# (deprecations, data-conversion notices) — consider narrowing the filter.
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the CSV and discard every row that contains a missing value.
data = pd.read_csv('cleaned_bike_data.csv').dropna()
data.head()

In [None]:
# Features: everything except 'Unnamed: 0' (presumably a leftover index
# column — verify against the CSV) and the target column 'age'.
x_data = data.drop(columns=['Unnamed: 0', 'age'])

# Target: 'age' reshaped into a single-column 2-D array.
age_values = data['age'].values
y_data = age_values.reshape(-1, 1)

In [None]:
# Sanity check: feature and target shapes, printed side by side.
print(*(part.shape for part in (x_data, y_data)))

In [None]:
# One-hot encode 'hour' so each distinct hour value gets its own indicator column.
data_binary_encoded = pd.get_dummies(data=x_data, columns=['hour'])
data_binary_encoded.head()

In [None]:
# Summary statistics for the 'end_lat' column.
data_binary_encoded.loc[:, 'end_lat'].describe()

In [None]:
# Summary statistics for the 'end_long' column.
data_binary_encoded.loc[:, 'end_long'].describe()

In [None]:
from sklearn.model_selection import train_test_split

# Split with the default proportions; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_binary_encoded,
    y_data,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit one scaler per array, using the training partition only.
X_scaler, y_scaler = (StandardScaler().fit(part) for part in (X_train, y_train))


In [None]:
# Apply the already-fitted scalers to both the training and the test partition.
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))
y_train_scaled, y_test_scaled = (y_scaler.transform(part) for part in (y_train, y_test))

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline linear regression on the standardized features and target.
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train_scaled)

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate on the test partition; both metrics are computed on the scaled target.
r2 = model.score(X_test_scaled, y_test_scaled)
predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# Training-set R2, for comparison with the test-set score.
model.score(X=X_train_scaled, y=y_train_scaled)

In [None]:
# Keep only the start/end coordinate columns; this frame is what KMeans is fitted on.
km_test = x_data.loc[:, ['start_lat','start_long','end_lat','end_long']]
km_test.head()

In [None]:
from sklearn.cluster import KMeans

# Sweep the number of trip clusters. For each count: cluster the start/end
# coordinates, replace the raw coordinates with a one-hot cluster label, refit
# the linear model, and record the test-set R2.
r2s = []
k_data = x_data.drop(['start_lat','start_long','end_lat','end_long'], axis=1)

for n in range(3,15):
    # Seed KMeans so the cluster labels, and therefore the reported scores,
    # are reproducible across runs (the train/test split below is seeded too).
    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(km_test)
    predicted_clusters = kmeans.predict(km_test)

    # Overwrite the label column each iteration, then one-hot encode it
    # together with 'hour'.
    k_data['trip_cluster'] = predicted_clusters
    k_data_encoded = pd.get_dummies(k_data, columns=['trip_cluster','hour'])

    X_train, X_test, y_train, y_test = train_test_split(k_data_encoded, y_data, random_state=42)

    # Scalers are fitted on the training partition only.
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)

    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)

    # Metrics are computed on the scaled target.
    predictions = model.predict(X_test_scaled)
    MSE = mean_squared_error(y_test_scaled, predictions)
    r2 = model.score(X_test_scaled, y_test_scaled)

    r2s.append(r2)
    print(f"{n} clusters yielded r2 = {r2}, MSE = {MSE}")
    
    

In [None]:
# Rebuild the feature frame with a fixed number of trip clusters (11).
tree_data = x_data.drop(['start_lat','start_long','end_lat','end_long'], axis=1)

# Seeded so the cluster labels are reproducible across runs.
kmeans = KMeans(n_clusters=11, random_state=42)
kmeans.fit(km_test)
predicted_clusters = kmeans.predict(km_test)

# Attach the labels to tree_data (built at the top of this cell) instead of
# mutating the k_data frame left over from the cluster-count sweep, so this
# cell no longer depends on that loop's leftover state.
tree_data['trip_cluster'] = predicted_clusters
k_data_encoded = pd.get_dummies(tree_data, columns=['trip_cluster','hour'])
k_data_encoded.head()



In [None]:
# Split the cluster-encoded features with the same seed as before, then
# standardize using scalers fitted on the training partition only.
X_train, X_test, y_train, y_test = train_test_split(
    k_data_encoded,
    y_data,
    random_state=42,
)

X_scaler, y_scaler = (StandardScaler().fit(part) for part in (X_train, y_train))

X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))
y_train_scaled, y_test_scaled = (y_scaler.transform(part) for part in (y_train, y_test))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model and its score are repeatable.
regr = RandomForestRegressor(n_estimators=300, random_state=42)
# The scaled target is an (n, 1) column vector; flatten it to the 1-D shape the
# estimator expects. Passing the column vector makes scikit-learn emit a
# DataConversionWarning, which the notebook's global warnings filter hides.
regr.fit(X_train_scaled, y_train_scaled.ravel())

In [None]:
# Test-set R2 of the random forest (on the scaled target).
regr.score(X=X_test_scaled, y=y_test_scaled)