In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [83]:
# Load and prepare the dataset
df = pd.read_csv('mldata.csv')
df.dropna(inplace=True)

In [84]:
df.sample(20)

Unnamed: 0,trip_id,segment_bearing,angle,start_time,end_time,total_time,length,cell_index,cell_bearing,cell_angle,...,cell_connectivity,cell_bicycle_lane_density,cell_primary_density,cell_secondary_density,cell_other_density,cell_ped_density,cell_dead_end_density,segment_geom,angle_normalized,cell_angle_normalized
1390378,215774,0.950638,54.467535,2023-07-10 08:56:01.499496,2023-07-10 09:08:02.020033,00:12:00.520537,410.069564,89196d85eabffff,0.481909,27.61134,...,38.480448,0.000727,0.0,0.009164,0.024214,0.011243,6.8e-05,SRID=4326;LINESTRING(6.557900834514772 53.2188...,54.467535,27.61134
1452860,258786,0.676294,38.748819,2023-01-21 08:10:01.214409,2023-01-21 08:24:01.369691,00:14:00.155282,349.815311,89196d85e07ffff,0.562793,32.245656,...,32.669112,0.011539,0.0,0.010968,0.015126,0.017158,3.4e-05,SRID=4326;LINESTRING(6.554698789373306 53.2178...,38.748819,32.245656
111752,325007,5.965436,341.794334,2023-05-20 12:16:01.838379,2023-05-20 12:38:01.963031,00:22:00.124652,321.427294,89196d853abffff,0.663205,37.998867,...,38.171459,0.007956,0.002739,0.007692,0.025719,0.013489,6.8e-05,SRID=4326;LINESTRING(6.540290434102627 53.2328...,161.794334,37.998867
701466,279127,4.231613,242.453591,2023-05-02 23:26:01.931827,2023-05-02 23:38:01.439673,00:11:59.507846,288.64722,89196d85eabffff,0.481909,27.61134,...,38.480448,0.000727,0.0,0.009164,0.024214,0.011243,6.8e-05,SRID=4326;LINESTRING(6.561489318671504 53.2211...,62.453591,27.61134
692350,244262,4.264884,244.359866,2023-04-21 19:38:01.672075,2023-04-21 20:00:01.409848,00:21:59.737773,356.581113,89196d8ec87ffff,-1.803909,-103.356395,...,110.705144,0.003809,0.0,0.0,0.006763,0.001983,0.0,SRID=4326;LINESTRING(6.512127389754001 53.2359...,64.359866,76.643605
311674,280381,5.685455,325.752574,2023-01-22 13:48:01.666002,2023-01-22 14:10:01.09579,00:21:59.429788,304.132675,89196d858d3ffff,-1.816108,-104.055334,...,31.509571,0.00018,0.0,0.0,0.029761,0.017896,0.000125,SRID=4326;LINESTRING(6.572084667179786 53.2152...,145.752574,75.944666
801891,316111,3.32631,190.583535,2023-09-24 18:02:01.555333,2023-09-24 18:34:01.722865,00:32:00.167532,420.331167,89196d858c7ffff,-1.864672,-106.837859,...,30.932205,0.012811,0.0,0.009297,0.029281,0.002573,5.7e-05,SRID=4326;LINESTRING(6.572655690341701 53.2129...,10.583535,73.162141
1303134,220114,1.213772,69.543998,2023-08-02 20:52:01.707561,2023-08-02 21:16:01.288246,00:23:59.580685,247.797915,89196d8430bffff,0.858024,49.161132,...,85.31878,0.012633,0.004667,0.0,0.020519,0.010508,0.0,SRID=4326;LINESTRING(6.613680297835998 53.2296...,69.543998,49.161132
752764,204190,3.900808,223.499829,2023-09-04 10:34:01.759701,2023-09-04 10:48:01.977522,00:14:00.217821,180.49342,89196d85e07ffff,0.562793,32.245656,...,32.669112,0.011539,0.0,0.010968,0.015126,0.017158,3.4e-05,SRID=4326;LINESTRING(6.557627812324986 53.2200...,43.499829,32.245656
251797,272549,5.794254,331.986273,2023-03-19 12:48:01.383831,2023-03-19 13:04:01.833408,00:16:00.449577,306.643544,89196d85ecbffff,0.512097,29.340973,...,33.02117,0.014144,0.0,0.012766,0.018818,0.020899,3.4e-05,SRID=4326;LINESTRING(6.54497318168838 53.22647...,151.986273,29.340973


In [85]:
df['start_time'] = pd.to_datetime(df['start_time'])
df['hour'] = df['start_time'].dt.hour
df['dow'] = df['start_time'].dt.dayofweek

In [86]:
# Define input features and target features
input_features = ['hour', 'dow', 'cell_index', 'cell_angle_normalized', 'cell_density', 'cell_connectivity', 'cell_bicycle_lane_density', 'cell_primary_density', 'cell_secondary_density', 'cell_other_density', 'cell_ped_density', 'cell_dead_end_density']
target_features = ['angle_normalized', 'length']  # Add other route characteristics as needed

In [87]:
# Preprocessing
numeric_features = ['cell_angle_normalized', 'cell_density', 'cell_connectivity', 'cell_bicycle_lane_density', 'cell_primary_density', 'cell_secondary_density', 'cell_other_density', 'cell_ped_density', 'cell_dead_end_density']
categorical_features = ['hour', 'dow', 'cell_index']

# Fit the OneHotEncoder on the entire dataset
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(df[categorical_features])

# Combine the numeric and one-hot encoded categorical features
df_encoded = pd.concat([df[numeric_features], pd.DataFrame(ohe.transform(df[categorical_features]).toarray(), columns=ohe.get_feature_names_out(categorical_features))], axis=1)

In [88]:
# Initialize a dictionary to hold models
models = {}

In [93]:
for target in target_features:
    X = df_encoded
    y = df[target]

    valid_indices = ~X.isna().any(axis=1) & ~y.isna()
    X = X[valid_indices]
    y = y[valid_indices]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    models[target] = model


  X = X[valid_indices]
  X = X[valid_indices]


In [94]:
import itertools

# Create a DataFrame with all combinations of cell_index, hour, and dow
cell_indices = df['cell_index'].unique()
hours = range(24)
dows = range(7)
combinations = list(itertools.product(cell_indices, hours, dows))

# Create a DataFrame from the combinations
example_data = pd.DataFrame(combinations, columns=['cell_index', 'hour', 'dow'])

# Add the other features
for cell_index in cell_indices:
    cell_data = df[df['cell_index'] == cell_index].iloc[0]
    example_data.loc[example_data['cell_index'] == cell_index, numeric_features] = cell_data[numeric_features].values

# Encode the example data
example_data_encoded = pd.concat([example_data[numeric_features], pd.DataFrame(ohe.transform(example_data[categorical_features]).toarray(), columns=ohe.get_feature_names_out(categorical_features))], axis=1)

# Use models to predict route characteristics
predictions = {target: models[target].predict(example_data_encoded) for target in target_features}

# Add the predictions to the example_data DataFrame
for target, prediction in predictions.items():
    example_data[f'predicted_{target}'] = prediction

# Display the results
print(example_data)

            cell_index  hour  dow  cell_angle_normalized  cell_density   
0      89196d85e63ffff     0    0              60.991772      0.038068  \
1      89196d85e63ffff     0    1              60.991772      0.038068   
2      89196d85e63ffff     0    2              60.991772      0.038068   
3      89196d85e63ffff     0    3              60.991772      0.038068   
4      89196d85e63ffff     0    4              60.991772      0.038068   
...                ...   ...  ...                    ...           ...   
99955  89196d8edc3ffff    23    2               6.554114      0.032268   
99956  89196d8edc3ffff    23    3               6.554114      0.032268   
99957  89196d8edc3ffff    23    4               6.554114      0.032268   
99958  89196d8edc3ffff    23    5               6.554114      0.032268   
99959  89196d8edc3ffff    23    6               6.554114      0.032268   

       cell_connectivity  cell_bicycle_lane_density  cell_primary_density   
0              47.313567          

In [60]:
example_data.columns

Index(['cell_index', 'hour', 'dow', 'cell_angle_normalized', 'cell_density',
       'cell_connectivity', 'cell_bicycle_lane_density',
       'cell_primary_density', 'cell_secondary_density', 'cell_other_density',
       'cell_ped_density', 'cell_dead_end_density',
       'predicted_angle_normalized', 'predicted_length'],
      dtype='object')

In [95]:
ml_result = example_data[['cell_index', 'hour', 'dow', 'predicted_angle_normalized', 'predicted_length']]

In [None]:
# show only 2 decimal places

In [99]:
ml_result.sample(20)

Unnamed: 0,cell_index,hour,dow,predicted_angle_normalized,predicted_length,hour_str,dow_str,time_series
81460,89196d85963ffff,21,1,158.918746,246.364625,21,1,21_1
34414,89196d8e98fffff,20,2,122.544487,209.984449,20,2,20_2
19391,89196d85327ffff,10,1,122.224212,140.262415,10,1,10_1
2005,89196d85bc7ffff,22,3,39.404526,253.655229,22,3,22_3
77645,89196d8edd7ffff,4,1,40.064868,228.814692,4,1,4_1
61203,89196dba6d3ffff,7,2,111.37689,309.683097,7,2,7_2
92127,89196d8424fffff,9,0,52.755177,345.481278,9,0,9_0
971,89196d85c53ffff,18,5,66.25798,185.127407,18,5,18_5
5355,89196d85e33ffff,21,0,127.48336,223.255519,21,0,21_0
54918,89196d85c2fffff,21,3,111.462602,237.25057,21,3,21_3


In [97]:
ml_result.to_csv('ml_result.csv', index=False)

In [100]:
# Load the ml_result.csv file
ml_result = pd.read_csv('ml_result.csv')

# Define a function to convert dow and hour to a datetime
def convert_to_datetime(row):
    # Choose an arbitrary week (the first week of 2022 in this case)
    date = pd.Timestamp('2022-01-03')  # This is a Monday
    # Add the day of the week and the hour
    return date + pd.Timedelta(days=row['dow'], hours=row['hour'])

# Apply the function to each row
ml_result['datetime'] = ml_result.apply(convert_to_datetime, axis=1)

# Save the result
ml_result.to_csv('ml_result_with_datetime.csv', index=False)