In [38]:
import polars as pl
import numpy as np
import pandas as pd
path = '2021-2023/{}.parquet'

# Lazily scan one parquet file per year (2021-2023) and stack the scans
# into a single frame.
df = pl.concat(
    [pl.scan_parquet(path.format(year)) for year in range(2021, 2024)]
)

In [29]:
# Elapsed ride time as a duration expression (end minus start).
ride_duration = pl.col('ended_at') - pl.col('started_at')
# polars renamed `dt.seconds()` to `dt.total_seconds()` and later dropped the old
# name; use whichever accessor the installed polars version provides.
duration_in_seconds = getattr(ride_duration.dt, 'total_seconds', None) or ride_duration.dt.seconds

# Derive the duration and calendar features, then discard the identifier
# columns and every row that contains a null.
df = df.with_columns([
    duration_in_seconds().alias('durationseconds'),
    pl.col('started_at').dt.hour().alias('starthour'),
    pl.col('ended_at').dt.hour().alias('endhour'),
    pl.col('started_at').dt.month().alias('month'),
    pl.col('started_at').dt.year().alias('year'),
]).drop(['ride_id', 'start_station_id', 'end_station_id']).drop_nulls()

In [20]:
# df_rideabletype = df.filter(pl.col('rideable_type').is_in(['docked_bike', 'electric_bike', 'classic_bike']))

# map_dict = {
#     'electric_bike': 0,
#     'docked_bike': 1,
#     'classic_bike': 2,
# }

# df_rideabletype = df_rideabletype.with_columns(
#     pl.col('rideable_type').map_dict(map_dict).alias('rideable_type').cast(pl.Int32)
# )

In [30]:
# Materialise the lazy query, then draw a fixed-size random sample of rows.
# The fixed seed keeps the sample, and everything computed from it, the same
# on every run.
df_sample = df.collect().sample(n=100000, seed=1)

In [34]:
# Convert the polars sample to pandas. The name is rebound in place, so guard
# on the type: re-running this cell after the conversion is then a no-op
# instead of an AttributeError.
if isinstance(df_sample, pl.DataFrame):
    df_sample = df_sample.to_pandas()

In [37]:
# Display the first three rows of the pandas sample.
df_sample.head(3)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng,member_casual,durationseconds,starthour,endhour,month,year
0,classic_bike,2022-02-13 02:24:38,2022-02-13 02:29:32,Irving Ave & Jefferson St,40.705379,-73.925346,Irving Ave & Harman St,40.70108,-73.9179,member,294,2,2,2,2022
1,classic_bike,2023-05-26 15:12:07,2023-05-26 15:15:54,E 128 St & 3 Ave,40.805607,-73.934059,E 128 St & 3 Ave,40.80563,-73.934074,member,227,15,15,5,2023
2,electric_bike,2021-08-23 10:14:50,2021-08-23 10:15:35,St. Nicholas Ave & W 126 St,40.811432,-73.951881,St. Nicholas Ave & W 126 St,40.811432,-73.951881,casual,45,10,10,8,2021


In [42]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Encoders for the categorical variables.
le = LabelEncoder()
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old keyword
# was later removed; try the new spelling first and fall back on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Station-name columns. Despite the name, preprocess() label-encodes these too.
ohecols = ['start_station_name', 'end_station_name']
# Remaining columns that preprocess() label-encodes; each column is listed once.
lecols = ['rideable_type', 'member_casual', 'month', 'year', 'endhour', 'starthour', 'durationseconds']

def preprocess(df, encoders=None):
    """Return a copy of ``df`` with the configured columns label-encoded.

    Every column named in ``ohecols`` and ``lecols`` is replaced by the
    integer codes of a ``LabelEncoder`` fitted on that column. The input
    frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns listed in ``ohecols`` and ``lecols``.
    encoders : dict, optional
        When given, it is filled with ``{column: fitted LabelEncoder}`` so the
        caller can map codes back to the original labels with
        ``inverse_transform``.

    Returns
    -------
    pandas.DataFrame
        The encoded copy of ``df``.
    """
    df = df.copy()
    # dict.fromkeys removes repeated column names while keeping their order,
    # so no column is fitted and transformed more than once.
    for col in dict.fromkeys([*ohecols, *lecols]):
        # Use a fresh encoder per column: a single shared instance would only
        # remember the classes of the last column it was fitted on.
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        if encoders is not None:
            encoders[col] = encoder
    return df


In [47]:
# Encode the sample, then discard the raw timestamp columns.
preprocess_df = preprocess(df_sample).drop(columns=['started_at', 'ended_at'])

In [49]:
from sklearn.model_selection import train_test_split

# Target is the label-encoded bike type; the features are every other column.
y = preprocess_df['rideable_type']
X = preprocess_df.drop(columns='rideable_type')

# Hold out 20% of the rows for evaluation. Fixing random_state makes the
# split, and the accuracy computed from it, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [50]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 5, with a fixed seed.
rf = RandomForestClassifier(
    random_state=1,
    max_depth=5,
    n_estimators=100,
)
rf.fit(X=X_train, y=y_train)

In [51]:
# Predict the encoded bike type for the held-out rows.
y_pred = rf.predict(X_test)

In [53]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', accuracy)


Accuracy:  0.7755


In [54]:
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [57]:
# Export the first three trees of the fitted forest as Graphviz DOT source.
# NOTE(review): dot_data is not used within the lines visible here; the loop
# body may continue below.
for i in range(3):
    tree = rf.estimators_[i]
    # Only the top two levels are drawn; nodes are colour-filled, impurity is
    # hidden and sample counts are shown as proportions.
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,  
                               max_depth=2, 
                               impurity=False, 
                               proportion=True)