In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Data loading: read the modeling dataset from CSV into `df`.
# Refer to the user_data_preprocessing.ipynb file for details on how user-level features are generated.
# The list-valued section_* columns arrive as strings and are parsed later by compute_argmax.

df = pd.read_csv('../data/modeling_w_duration.csv')

In [None]:
# Remove exact duplicate rows, if there are any, and rebind `df` to the de-duplicated frame.
df = df.drop_duplicates()

In [None]:
def compute_argmax(df: pd.DataFrame):
    """Attach the distance, duration and mode of each row's longest section.

    Every row is expected to carry three string-encoded Python lists in the
    columns ``section_distances``, ``section_durations`` and ``section_modes``.
    The position of the largest distance selects the entry taken from all
    three lists.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is only read; no columns are added to it.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index holding the input columns plus
        ``section_distance_argmax``, ``section_duration_argmax`` and
        ``section_mode_argmax``. Rows whose distance list is empty get NaN for
        the two numeric columns and None for the mode.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal. The strings are parsed with
        ``ast.literal_eval`` rather than ``eval`` so that file contents can
        never execute code.
    """
    argmax_columns = ['section_distance_argmax', 'section_duration_argmax', 'section_mode_argmax']
    records = []

    for _, row in df.iterrows():
        parsed_distances = ast.literal_eval(row['section_distances'])
        parsed_durations = ast.literal_eval(row['section_durations'])
        parsed_modes = ast.literal_eval(row['section_modes'])

        # Work on a plain dict so the caller's frame is never modified.
        record = row.to_dict()

        if len(parsed_distances) == 0:
            # np.argmax raises on an empty sequence; record "no dominant section" instead.
            record['section_distance_argmax'] = np.nan
            record['section_duration_argmax'] = np.nan
            record['section_mode_argmax'] = None
        else:
            argmax_ix = int(np.argmax(parsed_distances))
            record['section_distance_argmax'] = parsed_distances[argmax_ix]
            record['section_duration_argmax'] = parsed_durations[argmax_ix]
            record['section_mode_argmax'] = parsed_modes[argmax_ix]

        records.append(record)

    # Spell out the column order so an empty input still yields the expected columns.
    out_columns = list(df.columns) + [c for c in argmax_columns if c not in df.columns]
    return pd.DataFrame(records, columns=out_columns)

In [None]:
# Attach each row's longest-section features (distance, duration, mode) via compute_argmax.
# Nothing is plotted here; the mode-wise duration-vs-distance analysis further down uses this frame.
df_modded = compute_argmax(df)

In [None]:
# Inspect the column names of the frame returned by compute_argmax.
df_modded.columns

In [None]:
# Keep only the three argmax columns. The .copy() lets `analysis` be relabelled later
# (bus/train -> public_transport) without touching df_modded.
analysis = df_modded[['section_mode_argmax', 'section_duration_argmax', 'section_distance_argmax']].copy()

In [None]:
# Preview the first few rows of the analysis subset.
analysis.head()

In [None]:
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Combine bus and train into a single 'public_transport' mode.
analysis.loc[(analysis.section_mode_argmax == 'bus') | (analysis.section_mode_argmax == 'train'), ['section_mode_argmax']] = 'public_transport'

# Unit conversions applied before fitting.
# NOTE(review): presumably metres -> miles (approximate; the exact factor is 0.000621371)
# and seconds -> minutes -- confirm the units of the raw columns.
DISTANCE_SCALE = 0.00062
DURATION_DIVISOR = 60

# With test_size=0.2, six rows is the smallest group whose test split holds two rows;
# below that train_test_split either raises or R^2 on the test split is undefined.
MIN_SAMPLES = 6

# Fit one duration ~ distance linear model per mode and report train/test R^2.
for mode in analysis.section_mode_argmax.unique():
    # The >= 0 comparisons drop negative values and also NaNs.
    is_valid = (
        (analysis.section_mode_argmax == mode)
        & (analysis.section_duration_argmax >= 0)
        & (analysis.section_distance_argmax >= 0)
    )
    mode_df = analysis.loc[is_valid, ['section_duration_argmax', 'section_distance_argmax']]

    if len(mode_df) < MIN_SAMPLES:
        # Also covers a missing mode label, for which the equality filter matches no rows.
        print(mode, 'skipped: only', len(mode_df), 'valid rows')
        continue

    # Double-bracket selection keeps both arrays 2-D (n, 1), which is what sklearn expects.
    X = mode_df[['section_distance_argmax']].values * DISTANCE_SCALE
    Y = mode_df[['section_duration_argmax']].values / DURATION_DIVISOR

    X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=42, shuffle=True)

    regr = LinearRegression(fit_intercept=True)
    regr.fit(X_tr, Y_tr)

    y_tr_pred = regr.predict(X_tr)
    y_te_pred = regr.predict(X_te)

    train_r2 = r2_score(y_true=Y_tr.flatten(), y_pred=y_tr_pred.flatten())
    test_r2 = r2_score(y_true=Y_te.flatten(), y_pred=y_te_pred.flatten())

    print(mode, train_r2, test_r2)
    print('intercept: ', regr.intercept_[0], 'coeff: ', regr.coef_[0][0])


# Previous results (mode, train R^2, test R^2):
#
# walking 0.3843977052858275 0.3749466865077252
# bicycling 0.7396768569714562 0.735986721086616
# car 0.5839819702140339 0.5918942114399524
# no_sensed 0.8045590529878717 0.8059202285373765
# public_transport 0.44880904441364 0.6020723455289356

In [None]:
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score


# Combine bus and train into 'public transport'
analysis.loc[(analysis.section_mode_argmax == 'bus') | (analysis.section_mode_argmax == 'train'), ['section_mode_argmax']] = 'public_transport'

# With test_size=0.2, six rows is the smallest group whose test split holds two rows;
# below that train_test_split either raises or R^2 on the test split is undefined.
MIN_SAMPLES = 6

# For every mode: fit a straight line of duration against distance (raw units),
# plot the held-out samples with the fitted line, and report train/test R^2.
for mode in analysis.section_mode_argmax.unique():

    # The >= 0 comparisons drop negative values and also NaNs.
    is_valid = (
        (analysis.section_mode_argmax == mode)
        & (analysis.section_duration_argmax >= 0)
        & (analysis.section_distance_argmax >= 0)
    )
    mode_df = analysis.loc[is_valid, ['section_duration_argmax', 'section_distance_argmax']]

    if len(mode_df) < MIN_SAMPLES:
        # Also covers a missing mode label, for which the equality filter matches no rows.
        print(mode, 'skipped: only', len(mode_df), 'valid rows')
        continue

    # Single-bracket selection yields 1-D arrays, so no flattening is needed for np.polyfit.
    X = mode_df['section_distance_argmax'].values
    Y = mode_df['section_duration_argmax'].values

    X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=42, shuffle=True)

    # Degree-1 least-squares fit on the training split only.
    fitted_line = np.poly1d(np.polyfit(X_tr, Y_tr, 1))
    y_tr_pred = fitted_line(X_tr)
    y_te_pred = fitted_line(X_te)

    train_r2 = r2_score(y_true=Y_tr, y_pred=y_tr_pred)
    test_r2 = r2_score(y_true=Y_te, y_pred=y_te_pred)

    fig, ax = plt.subplots()
    # Plot the testing samples.
    sns.scatterplot(x=X_te, y=Y_te, ax=ax)
    # Plot the fitted line evaluated at the testing samples.
    sns.lineplot(x=X_te, y=y_te_pred, ax=ax)
    # Label each figure so the per-mode plots can be told apart when skimming the output.
    ax.set(
        title='{}: test samples and fitted line (test R^2 = {:.3f})'.format(mode, test_r2),
        xlabel='section_distance_argmax',
        ylabel='section_duration_argmax',
    )
    plt.show()
    # Release the figure; one is created per mode inside this loop.
    plt.close(fig)

    print(mode, train_r2, test_r2)