In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID

import matplotlib.pyplot as plt

import sys
sys.path.append('/Users/kshankar/e-mission/gis_branch_tests')

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.core.wrapper.user as ecwu

import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT
import get_EC
import helper_functions as hf

import sklearn.model_selection as skm
import sklearn.feature_selection as skfs
import sklearn.pipeline as skp
import sklearn.preprocessing as skpr

from sklearn.model_selection import KFold
import sklearn.linear_model as sklm
import sklearn.svm as sksvm

METERS_TO_MILES = 0.000621371 # 1 meter = 0.000621371 miles

df_EI = pd.read_csv(r'Public_Dashboard/auxiliary_files/energy_intensity.csv') # r stands for raw string, only matters if the path is on Windows

In [None]:
import emission.core.get_database as edb

# Program whose users are analyzed; 'all' selects every user in the uuid collection.
chosen_program = 'all'
all_user_list = []
programs_all = {}
# Group uuids by program. The program name is the part of user_email before the
# first underscore (this info is in the Stage_uuids collection of the database).
for u in edb.get_uuid_db().find():
    program = u["user_email"].split("_")[0]
    uuid = u["uuid"]
    programs_all.setdefault(program, []).append(uuid)
    all_user_list.append(uuid)

# Compare strings by value. `is not 'all'` tests object identity, which only works
# through string interning and emits a SyntaxWarning on Python 3.8+.
user_list = programs_all[chosen_program] if chosen_program != 'all' else all_user_list

In [None]:
# Map each user's uuid to the platform recorded in their profile.
os_map = {}

# Iterate over a snapshot: removing from user_list while looping over user_list
# itself would skip the user that follows each removed entry.
for u in list(user_list):
    profile = ecwu.User(u).getProfile()
    if 'curr_platform' in profile:
        os_map[u] = profile['curr_platform']
    else:
        print("Removed a user who had no OS information.")
        # Note: when user_list is programs_all[chosen_program] (same list object),
        # this removes u from programs_all[chosen_program] as well.
        user_list.remove(u)
        no_os_user = u


In [None]:
# Collect all trips in the program specified earlier
# Then expand user inputs.
# You could instead load the file that "place_all_trips_in_pkl.py" generates
# expanded_labeled_trips = hf.get_expanded_labeled_trips(user_list)

In [None]:
#expanded_labeled_trips.drop(['start_loc','end_loc'],axis = 1).columns

### What modes can we properly sense without substituting a "close enough" energy intensity?
drove alone, walk, bike, bus, train

In [None]:
# Base mode map for GIS. Not directly used in this notebook but nice to see.
gis_sensed_modes = {0 : 'no_sensed',    # UNKNOWN  #NOTE: this is important info to mention.
        1 : 'walking',    # WALKING
        2 : 'bicycling',    # BICYCLING
        3 : 'bus',        # BUS
        4 : 'train',      # TRAIN
        5 : 'car',        # CAR
        6 : 'air_or_hsr', # AIR_OR_HSR
        7 : 'subway',      # SUBWAY
        8 : 'train',      # TRAM
        9 : 'train',      # LIGHT_RAIL
}

# Get error related info
unit_dist_MCS_df = pd.read_csv("unit_distance_MCS.csv").set_index("moment")
#android_EI_moments_df = pd.read_csv("android_EI_moments.csv").set_index("mode")
#ios_EI_moments_df = pd.read_csv("ios_EI_moments.csv").set_index("mode")

# Dictionary of energy intensities in kWH/PMT
energy_dict = cm_handling.get_energy_dict(df_EI)
#%store -r energy_consumption_df # to save time

# sensed_car (maps via MODE_MAPPING_DICT) -> “Gas Car, sensed” in energy dict, 
# which is used for the ground truth car intensity in get_conditional_EI_expectation_and_variance(). 
# Then the sensed mode will show car, but the EI used will be based on a car with a 1.5 person load factor.
#drove_alone_EI = energy_dict["Gas Car, drove alone"]
#load_factor = 1#1.5
#energy_dict.update({"Gas Car, sensed": drove_alone_EI/load_factor})

In [None]:
# This dataframe was generated in place_all_trips_in_pkl.py
# NOTE(review): read_pickle can execute arbitrary code; only load a pickle that you
# generated yourself. The path is hard-coded to /tmp, so the file must exist there.
df = pd.read_pickle("/tmp/Sensing_sensitivity_analysis/expanded_labeled_trips.pickle")
# Work on a copy so that `df` keeps the trips exactly as loaded.
expanded_labeled_trips = df.copy()#df[df['program'] == 'vail'].copy()

In [None]:
expanded_labeled_trips = hf.drop_unwanted_trips(expanded_labeled_trips,drop_not_a_trip=False)

expanded_labeled_trips = hf.get_primary_modes(expanded_labeled_trips,energy_dict,MODE_MAPPING_DICT)

In [None]:
# Find the mode distance proportions for each program, plus one row for the full dataset.
# Rows are collected in a list and the frame is built once: DataFrame.append is
# deprecated (removed in pandas 2.0) and re-copies the frame on every call.
PROPORTION_COLUMNS = ['program','r', 'drove_alone_distance', 'shared_ride_distance','car_proportion', 'ebike_proportion', 'walk_proportion', 'drove_alone_proportion', 'shared_ride_proportion']
proportion_records = []
for program in expanded_labeled_trips['program'].unique():
    program_df = expanded_labeled_trips[expanded_labeled_trips['program'] == program].copy()
    proportions = hf.get_ratios_for_dataset(program_df)
    proportions.update({'program': program})
    # Store a copy so later updates to `proportions` cannot alter earlier rows.
    proportion_records.append(dict(proportions))

# Get the proportions for the full dataset
proportions = hf.get_ratios_for_dataset(expanded_labeled_trips)
proportions.update({'program': 'all'})
proportion_records.append(dict(proportions))

program_proportions = pd.DataFrame(proportion_records)
# Keep the documented columns first (in this order), followed by any extra keys
# that get_ratios_for_dataset returned.
ordered_columns = PROPORTION_COLUMNS + [c for c in program_proportions.columns if c not in PROPORTION_COLUMNS]
program_proportions = program_proportions.reindex(columns=ordered_columns).set_index("program")
#print(program_proportions.round(3).to_markdown())  # pip install tabulate
program_proportions

In [None]:
# Get the confusion matrices and then the EI moments from those.
# Both CSVs are indexed by their 'gt_mode' column.
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

# NOTE(review): r presumably corresponds to the 'r' column of program_proportions — confirm.
r = 1  # 0.91 for vail, 0.71 for pc.
car_load_factor = (r+1)/(r+0.5)
drove_alone_EI = energy_dict["Gas Car, drove alone"]
# Mutates energy_dict in place: the sensed-car intensity is the drove-alone intensity
# divided by the load factor. This must run before the EI moments are computed below.
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

# if you forget this step, the error for expected may be different,
# since you might be relying on a different saved version of the EI_moments_dataframe
android_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(android_confusion,energy_dict)
ios_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(ios_confusion,energy_dict)

# Energy consumption values for every trip in the full dataset.
energy_consumption_df = get_EC.compute_all_EC_values(expanded_labeled_trips,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df)
# Add the trip distance in miles (the conversion assumes `distance` is in meters).
energy_consumption_df['distance_miles'] = energy_consumption_df.distance*METERS_TO_MILES
# %store energy_consumption_df

In [None]:
# Recompute the sensed-car energy intensity with the same formula as the previous
# cell, then compute energy consumption for the '4c' program only.
# Note that this overwrites energy_dict["Gas Car, sensed"] for all later cells, and
# reuses the android/iOS EI moments that were computed earlier.
r = 1
car_EI_load_divider = (r+1)/(r+0.5)  # aka Michael's definition of load factor.
drove_alone_EI = energy_dict["Gas Car, drove alone"]
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_EI_load_divider})
program_df = expanded_labeled_trips[expanded_labeled_trips['program'] == '4c'].copy()


EC_4c = get_EC.compute_all_EC_values(program_df,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df)

In [None]:
program_df = energy_consumption_df[energy_consumption_df['program'] == '4c'].copy()

#### Most recent update:
May want to look at primary mode normalized by distance rather than value counts.

In [None]:
all_trip_modes = expanded_labeled_trips[['mode_confirm','section_modes','primary_mode']].copy()

# For each user-labeled mode, print the share of trips assigned to each sensed
# primary mode (fractions of the trip count, not of distance).
# TODO: normalize by distance rather than by value counts.
# The previous per-row iterrows() loop only rebound a variable and never produced
# a result, so it was removed.
for mode in ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike']:
    mode_df = all_trip_modes[all_trip_modes['mode_confirm'] == mode]

    print(mode)
    print(mode_df.primary_mode.value_counts(normalize=True))

In [None]:
# Drove-alone trips of the currently selected program (program_df).
drove_alone_4c_df = program_df.loc[program_df['mode_confirm'].eq('drove_alone')]
# Outliers on error_for_confusion, restricted to the columns used for inspection.
# NOTE(review): the (100, 15) arguments presumably select trips below the 15th percentile — confirm in hf.get_outliers.
drove_alone_outliers = hf.get_outliers(
    drove_alone_4c_df, 'error_for_confusion', 100, 15
)[[
    'distance', 'mode_confirm', 'section_modes', 'section_distances',
    'primary_mode', 'primary_length', 'error_for_confusion', 'error_for_prediction',
    'expected', 'predicted', 'user_labeled', 'os',
]]
#drove_alone_outliers

In [None]:
# Columns kept for inspecting the shared-ride outliers (same list for both tails).
shared_ride_outlier_columns = [
    'distance', 'distance_miles', 'mode_confirm', 'section_modes', 'section_distances',
    'primary_mode', 'primary_length', 'error_for_confusion', 'error_for_prediction',
    'expected', 'predicted', 'user_labeled', 'os',
]
shared_ride_4c_df = program_df.loc[program_df['mode_confirm'].eq('shared_ride')]
shared_ride_outliers_low = hf.get_outliers(shared_ride_4c_df, 'error_for_confusion', 100, 15)[shared_ride_outlier_columns]
shared_ride_outliers_high = hf.get_outliers(shared_ride_4c_df, 'error_for_confusion', 85, 0)[shared_ride_outlier_columns]

# Side-by-side histograms of the sensed primary mode for each tail.
fig, axs = plt.subplots(1, 2)
fig.set_figwidth(15)
shared_ride_outliers_high.primary_mode.hist(ax=axs[0])
axs[0].set_title("4c shared ride overestimates primary modes (above 85th percentile)")
shared_ride_outliers_low.primary_mode.hist(ax=axs[1])
axs[1].set_title("4c shared ride underestimates primary modes(below 15th percentile)")

# Most of the overestimates are car (blue).
# Most of the underestimates are walking, bicycling, and no sensed.

In [None]:
shared_ride_outliers_high.distance_miles.hist()

In [None]:
drove_alone_outliers.primary_mode.hist()
plt.title("4c drove alone outlier primary modes (below the 15th percentile)")

In [None]:
# Naming convention below: <user label>_<primary mode>
# Trips labeled drove_alone / shared_ride whose sensed primary mode is 'car'.
drove_alone_car = drove_alone_4c_df[drove_alone_4c_df.primary_mode == 'car']
shared_ride_car = shared_ride_4c_df[shared_ride_4c_df.primary_mode == 'car']

# Hard-coded energy intensities used only in the printed comparisons below.
# NOTE(review): these look like values copied from an earlier run; confirm they still
# match energy_dict / android_EI_moments_df before relying on the printed differences.
# EI_for_shared_ride is exactly half of EI_for_drove_alone.
EI_used_for_android_sensed_car = 1.189540
EI_used_for_android_walking = 0.010464
EI_for_drove_alone = 1.51517707
EI_for_shared_ride = 0.757588535
# Total distance (converted to miles) of the car-sensed trips; not used further in this cell.
drove_alone_car_distance = drove_alone_car.distance.sum()*METERS_TO_MILES
shared_ride_car_distance = shared_ride_car.distance.sum()*METERS_TO_MILES

# the outliers below 15% account for -2390 kWH
drove_alone_outlier_error = drove_alone_outliers.error_for_confusion.sum()
shared_ride_outliers_high_error = shared_ride_outliers_high.error_for_confusion.sum()
shared_ride_outliers_low_error = shared_ride_outliers_low.error_for_confusion.sum()

# the drove alone trips in 4c where the primary mode is car account for -1754 kWH of error.
print(f"Errors for drove alone and shared ride when we predict car: {drove_alone_car.error_for_confusion.sum():.2f}, {shared_ride_car.error_for_confusion.sum():.2f}")
print(f"Drove alone outlier errors sum: {drove_alone_outlier_error:.2f}")
print(f"Shared ride outlier error for upper outliers, lower outliers: {shared_ride_outliers_high_error:.2f}, {shared_ride_outliers_low_error:.2f}")

print("\nMost of the outlier error for drove alone is from walking.")
print(f"Difference between sensed walking and drove alone EI: {EI_used_for_android_walking - EI_for_drove_alone:.4f}")

print("\nMost of the overestimation outlier error for shared ride is from sensed car.")
print(f"Difference between sensed car and shared ride EI: {EI_used_for_android_sensed_car - EI_for_shared_ride:.4f}")

print("\nMost of the underestimation outlier error for shared ride is from no_sensed and walking.")
# Unlike the constants above, the no_sensed intensity is read from the computed android EI moments.
print(f"Difference between no_sensed and shared ride EI: {android_EI_moments_df['mean(EI)']['no_sensed'] - EI_for_shared_ride:.4f}")
print(f"Difference between sensed walking and shared ride EI: {EI_used_for_android_walking - EI_for_shared_ride:.4f}")
print("In either case, when we mispredict drove alone, we are guaranteed to have a higher error than for a similar shared ride trip.")


In [None]:
#%store -r energy_consumption_df

### Modeling the energy consumption percent error as a function of dataset characteristics
Make sure you've calculated program proportions and energy consumption for the full dataset first.
Before analysis, keep track of whether you dropped not a trips in the "helper_functions.drop_unwanted_trips()" call. 

### Plot energy consumption by user labeled mode

In [None]:
program_df = energy_consumption_df[energy_consumption_df['program'] == '4c'].copy()

hf.plot_energy_consumption_by_mode(program_df,'4c')
hf.plot_energy_consumption_by_mode(energy_consumption_df[energy_consumption_df['program'] == 'pc'].copy(),'nrelop')
hf.plot_energy_consumption_by_mode(energy_consumption_df,'all CEO')

In [None]:
# Error-for-confusion totals in the "pc" program for selected user-labeled modes,
# followed by the program-wide error, expected and user-labeled totals.
pc_trips = energy_consumption_df.query('program == "pc"')
(
    *(
        pc_trips.query(f'mode_confirm == "{label}"').error_for_confusion.sum()
        for label in ["train", "bus", "shared_ride", "e_car_shared_ride", "pilot_ebike", "drove_alone"]
    ),
    pc_trips.error_for_confusion.sum(),
    pc_trips.expected.sum(),
    pc_trips.user_labeled.sum(),
)

In [None]:
# Ten user-labeled modes with the largest absolute confusion error in 4c.
# Select the column before summing so non-numeric columns are never aggregated
# (summing them fails or warns on recent pandas and wastes work).
energy_consumption_df.query('program == "4c"').groupby("mode_confirm").error_for_confusion.sum().abs().sort_values().tail(n=10)

In [None]:
# Ten user-labeled modes with the largest absolute confusion error in pc.
# Select the column before summing so non-numeric columns are never aggregated
# (summing them fails or warns on recent pandas and wastes work).
energy_consumption_df.query('program == "pc"').groupby("mode_confirm").error_for_confusion.sum().abs().sort_values().tail(n=10)

In [None]:
hf.plot_energy_consumption_by_mode(energy_consumption_df[energy_consumption_df['program'] == 'pc'].query('mode_confirm == ["drove_alone", "shared_ride", "pilot_ebike", "not_a_trip", "walk"]').copy(),'nrelop')

In [None]:
# 4c e-bike trips: distance and confusion error per sensed primary mode.
# Columns are selected before sum() so non-numeric columns are never aggregated.
energy_consumption_df.query('program == "4c" & mode_confirm == "pilot_ebike"').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum().plot(kind="bar")

In [None]:
# 4c drove-alone trips: distance and confusion error per sensed primary mode.
# Columns are selected before sum() so non-numeric columns are never aggregated.
energy_consumption_df.query('program == "4c" & mode_confirm == "drove_alone"').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum().plot(kind="bar")

In [None]:
# 4c shared-ride trips: distance and confusion error per sensed primary mode.
# Columns are selected before sum() so non-numeric columns are never aggregated.
energy_consumption_df.query('program == "4c" & mode_confirm == "shared_ride"').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum().plot(kind="bar")

In [None]:
# 4c bus trips: distance and confusion error per sensed primary mode.
# Columns are selected before sum() so non-numeric columns are never aggregated.
energy_consumption_df.query('program == "4c" & mode_confirm == "bus"').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum().plot(kind="bar")

In [None]:
# Per-primary-mode totals for 4c car trips (shared ride or drove alone) and for 4c e-bike trips.
# Columns are selected before sum() so non-numeric columns are never aggregated.
(
    energy_consumption_df.query('program == "4c" & (mode_confirm == "shared_ride" | mode_confirm == "drove_alone")').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum(),
    energy_consumption_df.query('program == "4c" & mode_confirm == "pilot_ebike"').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum(),
)

In [None]:
# 4c car trips (shared ride or drove alone): distance and confusion error per sensed primary mode.
# Columns are selected before sum() so non-numeric columns are never aggregated.
energy_consumption_df.query('program == "4c" & (mode_confirm == "shared_ride" | mode_confirm == "drove_alone")').groupby('primary_mode')[["distance_miles", "error_for_confusion"]].sum().plot(kind="bar")

In [None]:
# User-labeled modes that contribute the largest errors in 4c.
big_error_modes = ['drove_alone', 'pilot_ebike', 'bus', 'shared_ride', 'taxi']
# Confusion error per sensed primary mode for those labels; the column is selected
# before sum() so non-numeric columns are never aggregated.
energy_consumption_df.query('program == "4c" & mode_confirm == @big_error_modes').groupby('primary_mode')[["error_for_confusion"]].sum().plot(kind="bar")

In [None]:
energy_consumption_df.query('program == "4c"').error_for_confusion.sum(), energy_consumption_df.query('program == "4c"').user_labeled.sum() 

### Splitting into separate sets to check for variations

We are now going to try and see if we can determine the source of the variation across programs.

Here are the features that we plan to try
- drove_alone_2_shared_ride
- no_sensed_ratio
- not_a_trip_ratio
- e_bike_ratio
- car_like_ratio
- car_like_as_not_car
- e_bike_as_car
- e_bike_as_not_bike
- car_to_non_car_motorized_user_label
- car_to_non_car_motorized_sensed
- mispredicted_as_walk
- mispredicted_as_car

First, let's take the data we have now and split it into 10 parts with shuffling to create 10 fake distributions and see what happens to the error calculations

In [None]:
# splitting without shuffling leads to some larger car to other ratios
kf = KFold(n_splits=10, shuffle=True, random_state=2)  # some splits might not have any ebike

# The naive KFold gives us 10 separate arrays with 4283 train and 476 test trips
# But what we really want is 9 training sets of trips (to simulate the 9 programs for training)
# and one test set of trips 
# Note that our features are for sets of trips, not individual trips
for train_index, test_index in kf.split(energy_consumption_df):
    print(len(train_index), len(test_index))

In [None]:
# There are multiple crossvalidation splitters in sklearn but all of them split into one training and one test set at a time
# if we want to split into k-1 training sets and one test set, we are going to have to do it ourselves
# New code suggestion from https://numpy.org/doc/stable/reference/random/generated/numpy.random.shuffle.html
def get_set_splits(n_rounds = 50, n_splits_per_round=10, seed=None, trip_index=None):
    """Create repeated random partitions of the trip index.

    In each round the index labels are shuffled and cut into
    ``n_splits_per_round`` nearly equal parts with ``np.array_split``, so every
    label appears exactly once per round.

    Parameters
    ----------
    n_rounds : int
        Number of independent shuffles.
    n_splits_per_round : int
        Number of parts each shuffled index is cut into.
    seed : int or None, optional
        Seed for ``numpy.random.default_rng``. ``None`` (the default) draws fresh
        entropy, so results differ between calls; pass an int for reproducible splits.
    trip_index : array-like or None, optional
        Index labels to split. Defaults to ``energy_consumption_df.index``.

    Returns
    -------
    numpy.ndarray
        1-D object array of length ``n_rounds * n_splits_per_round``; each element
        is a 1-D array of index labels for one split.
    """
    rng = np.random.default_rng(seed)
    if trip_index is None:
        trip_index = energy_consumption_df.index
    # Private copy so the caller's index is never shuffled in place.
    base_index = np.array(trip_index)

    all_splits = []
    for _ in range(n_rounds):
        shuffled = base_index.copy()
        rng.shuffle(shuffled)
        all_splits.extend(np.array_split(shuffled, n_splits_per_round))

    # Fill an object array element by element. np.array(list_of_arrays) raises on
    # ragged input in recent numpy, and when all splits have equal length it builds
    # a numeric 2-D array, which flatten() would turn into single labels.
    large_size_splits = np.empty(len(all_splits), dtype=object)
    for position, split in enumerate(all_splits):
        large_size_splits[position] = split
    print([len(s) for s in large_size_splits])
    return large_size_splits

In [None]:
energy_consumption_df.columns

### First round of splitting and plotting the splits

In [None]:
# Columns whose per-split totals are reported.
ERROR_COLS = ['error_for_confusion',
   'error_for_prediction', 'expected', 'predicted', 'user_labeled', 'distance_miles', 'distance', 'duration']
splits = get_set_splits(n_rounds = 50, n_splits_per_round=10)

# One record per split: the number of trips plus the sum of every ERROR_COLS column.
split_result_list = []
for s in splits:
    split_trips = energy_consumption_df.loc[s]
    column_sums = {e: split_trips[e].sum() for e in ERROR_COLS}
    split_result_list.append({'count': len(s), **column_sums})
split_results = pd.DataFrame(split_result_list)
split_results

In [None]:
split_results.plot(subplots=True, layout=(3,3), figsize=(12,6))

In [None]:
split_results['error_pct_for_confusion'] = (split_results.error_for_confusion / split_results.user_labeled ) * 100
split_results['error_pct_for_prediction'] = (split_results.error_for_prediction / split_results.user_labeled) * 100

In [None]:
split_results

In [None]:
split_results[["expected", "predicted", "user_labeled", "distance_miles", "error_pct_for_confusion", "error_pct_for_prediction"]].plot(subplots=True, layout=(3,2), figsize=(12,6))

In [None]:
# Expected and user-labeled energy against distance, on one shared axis.
# Two single-column scatter calls replace the list-valued x/y and the colour list
# hard-coded to 500 points per series, which broke when the number of splits changed.
ax = split_results.plot.scatter(x="distance_miles", y="expected", c="blue", label="expected")
split_results.plot.scatter(x="distance_miles", y="user_labeled", c="green", label="user_labeled", ax=ax)
ax.set_ylabel("energy consumption");

In [None]:
ax = split_results.plot.scatter(x="distance_miles", y="error_pct_for_confusion")

### Recomputing with the potential other factors

In [None]:
# Recomputing with the other potential factors
def get_split_results(splits, trips_df=None):
    """Aggregate energy totals and dataset-composition features for each split.

    Parameters
    ----------
    splits : iterable
        Each element is a collection of index labels; the trips of one split are
        selected with ``trips_df.loc[s]``.
    trips_df : pandas.DataFrame or None, optional
        Trip table to aggregate. Defaults to the notebook-level
        ``energy_consumption_df``. It must contain ``mode_confirm``,
        ``primary_mode`` and every column named in ``ERROR_COLS`` below.

    Returns
    -------
    pandas.DataFrame
        One row per split with: ``count``; the sum of each ``ERROR_COLS`` column;
        the distance-weighted composition ratios computed below; and
        ``error_pct_for_confusion`` / ``error_pct_for_prediction``, the summed
        errors as a percentage of the summed ``user_labeled`` value.

    Notes
    -----
    A ratio whose denominator is zero (for example a split with no e-bike trips)
    is ``inf`` or ``nan``, as before, but no longer emits a runtime warning.
    Such rows must be handled before fitting a model on the result.
    """
    if trips_df is None:
        trips_df = energy_consumption_df

    CAR_LIKE_MODES = ['drove_alone', 'shared_ride', 'taxi']
    NON_CAR_MOTORIZED_MODES = ['bus', 'free_shuttle', 'train']
    ERROR_COLS = ['error_for_confusion',
       'error_for_prediction', 'expected', 'predicted', 'user_labeled', 'distance_miles', 'distance', 'duration']

    def ratio(numerator, denominator):
        # IEEE division (x/0 -> inf, 0/0 -> nan) without warnings or ZeroDivisionError.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.float64(numerator) / np.float64(denominator)

    split_result_list = []
    for s in splits:
        curr_split_trips = trips_df.loc[s]
        label = curr_split_trips['mode_confirm']      # user-labeled mode
        sensed = curr_split_trips['primary_mode']     # sensed primary mode
        distance = curr_split_trips['distance']

        def dist(mask):
            # Total distance of the trips selected by a boolean mask.
            return distance[mask].sum()

        # Masks and totals that several ratios share.
        is_car_like = label.isin(CAR_LIKE_MODES)
        is_ebike = label == "pilot_ebike"
        sensed_car = sensed == "car"
        total_dist = distance.sum()
        car_like_dist = dist(is_car_like)
        ebike_dist = dist(is_ebike)

        curr_split_result = {'count': len(s)}
        for e in ERROR_COLS:
            curr_split_result[e] = curr_split_trips[e].sum()

        # Composition of the user labels.
        curr_split_result['drove_alone_2_shared_ride'] = ratio(dist(label == "drove_alone"), dist(label == "shared_ride"))
        curr_split_result['no_sensed_ratio'] = ratio(dist(sensed == "no_sensed"), total_dist)
        curr_split_result['car_like_ratio'] = ratio(car_like_dist, total_dist)
        curr_split_result['e_bike_ratio'] = ratio(ebike_dist, total_dist)
        curr_split_result['not_a_trip_ratio'] = ratio(dist(label == "not_a_trip"), total_dist)

        # How labeled car-like / e-bike distance was sensed.
        curr_split_result['car_like_as_not_car'] = ratio(dist(is_car_like & ~sensed_car), car_like_dist)
        curr_split_result['e_bike_as_car'] = ratio(dist(is_ebike & sensed_car), ebike_dist)
        curr_split_result['e_bike_as_not_car_bike'] = ratio(dist(is_ebike & ~sensed.isin(["car", "bicycling"])), ebike_dist)

        # Non-car motorized distance relative to car distance, by label and by sensed mode.
        curr_split_result['non_car_2_car_user_label'] = ratio(dist(label.isin(NON_CAR_MOTORIZED_MODES)), car_like_dist)
        curr_split_result['non_car_2_car_sensed'] = ratio(dist(sensed.isin(["bus", "train"])), dist(sensed_car))
        curr_split_result['mispredicted_as_walk'] = ratio(dist((label != "walk") & (sensed == "walking")), total_dist)
        curr_split_result['mispredicted_as_car'] = ratio(dist(~is_car_like & sensed_car), total_dist)

        split_result_list.append(curr_split_result)

    split_results = pd.DataFrame(split_result_list)
    split_results['error_pct_for_confusion'] = (split_results.error_for_confusion / split_results.user_labeled ) * 100
    split_results['error_pct_for_prediction'] = (split_results.error_for_prediction / split_results.user_labeled) * 100
    return split_results

split_results = get_split_results(splits)

In [None]:
split_results[["non_car_2_car_sensed", "non_car_2_car_user_label"]]

In [None]:
split_results[['drove_alone_2_shared_ride', 'no_sensed_ratio', 'not_a_trip_ratio', "e_bike_ratio", "mispredicted_as_walk", "mispredicted_as_car", "distance_miles", "error_pct_for_confusion", "non_car_2_car_user_label"]].plot(subplots=True, layout=(3,3), figsize=(12,6))

In [None]:
fig, ax2d = plt.subplots(nrows=3, ncols=4, figsize=(8,8), sharey=True)
fig.tight_layout(h_pad = 3)
axarray = ax2d.flatten()
IND_VAR = ['drove_alone_2_shared_ride', 'no_sensed_ratio', 'car_like_ratio', 'e_bike_ratio', 'not_a_trip_ratio',
           "car_like_as_not_car", "e_bike_as_car", "e_bike_as_not_car_bike", 
           "non_car_2_car_user_label", "mispredicted_as_walk", "mispredicted_as_car", 'distance_miles']
DEP_VAR = 'error_pct_for_confusion'
for iv, ax in zip(IND_VAR, axarray):
    split_results.plot(x=iv, y=DEP_VAR, ax=ax, kind="scatter")

### Feature estimation

In [None]:
estimators = pd.Series({"linear": sklm.LinearRegression(), "lasso": sklm.Lasso(), "ridge": sklm.Ridge(),
                       "bayesian": sklm.ARDRegression(), "sgd": sklm.SGDRegressor(),
                        "svm_linear": sksvm.SVR(kernel="linear"), "NuSVR": sksvm.NuSVR(kernel="linear")})
std_estimators = estimators.apply(lambda e: skp.Pipeline([("scale", skpr.StandardScaler()), ("reg", e)]))
selectors = std_estimators.apply(lambda e: skfs.RFECV(e, step=1, importance_getter="named_steps.reg.coef_"))
fitted_selectors = selectors.apply(lambda s: s.fit(split_results[IND_VAR], split_results[DEP_VAR]))

In [None]:
estimator_sel_features = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: np.array(IND_VAR)[fs.support_]), index=fitted_selectors.index); estimator_sel_features

In [None]:
IND_VAR, pd.Series(fitted_selectors.loc['linear'].estimator_['reg'].coef_).plot()

In [None]:
estimator_grid_scores = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: fs.grid_scores_), index=fitted_selectors.index); estimator_grid_scores

In [None]:
estimator_grid_scores.transpose().drop(columns=['sgd']).plot(kind="box", grid=True)

In [None]:
estimator_grid_scores.transpose().plot(kind="box", grid=True)

In [None]:
estimator_grid_scores.transpose().median()

In [None]:
# SelectPercentile's `percentile` is a percentage (0-100), not a fraction:
# 0.9 kept under 1% of the features; 90 keeps the top 90% by F-score.
np.array(IND_VAR)[skfs.SelectPercentile(skfs.f_regression, percentile=90).fit(split_results[IND_VAR], split_results[DEP_VAR]).get_support()]

In [None]:
# SelectPercentile's `percentile` is a percentage (0-100), not a fraction:
# 0.9 kept under 1% of the features; 90 keeps the top 90% by mutual information.
np.array(IND_VAR)[skfs.SelectPercentile(skfs.mutual_info_regression, percentile=90).fit(split_results[IND_VAR], split_results[DEP_VAR]).get_support()]

In [None]:
expanded_labeled_trips.mode_confirm.value_counts(normalize=True)

### Figuring out how to represent feature selection results across multiple runs

- each run may result in a separate set of selected features
- we first determine the support (aka included/not) values for each feature
- we convert them to numbers
- we add the numbers
- if we divide by the number of iterations, then the closeness to 1 will indicate how often the feature was selected

In [None]:
true_false_df = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: fs.support_), index=fitted_selectors.index, columns=IND_VAR); true_false_df

In [None]:
# Convert the True/False support flags to 1/0 so they can be summed across runs.
# astype(int) is the vectorized equivalent of applymap(int), which is deprecated in pandas 2.1+.
int_feature_df = true_false_df.astype(int); int_feature_df

In [None]:
int_feature_df + int_feature_df

### Figuring out how to represent scores across runs

- each run will result in 6 scores per estimator
- we can create a dataframe where the columns are the estimators and the rows are the scores
- for each run, we just append the scores to the dataframe and then reset the index

In [None]:
# Stack the scores of two runs with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([estimator_grid_scores.transpose(), estimator_grid_scores.transpose()], ignore_index=True)

### Figuring out how to represent coefficients across runs

For each run, we will presumably get the coefficients for only the last run. For the algorithms with stable results, this won't matter. We can have the rows as the algorithms and the columns as the features or vice versa, but we will need to have multiple rows, one for each run.

Let's have the columns be the features, and the rows be the algo, run combinations.

Some complications:
- the SVM coefficients are in a 2-D array, but we can flatten them
- we only get coefficients for the features that are meaningful?

```
drove_alone_2_shared_ride 	no_sensed_ratio 	not_a_trip_ratio 	e_bike_ratio 	mispredicted_as_walk 	mispredicted_as_car 	distance_miles

linear 	True 	True 	True 	False 	True 	True 	False
linear 	-1.765813e+01 	-5.202422e+01 	99.113221 	124.429262 	46.013267 	NaN

or 

sgd 	True 	False 	False 	False 	False 	False 	True
sgd 	-2.047909e+10 	-7.597053e+10 	NaN 	NaN 	NaN 	NaN

or

svm_linear 	True 	True 	True 	True 	False 	True 	False
svm_linear 	-1.449385e+01 	-6.055291e+00 	4.868629 	6.639475 	9.626487 	NaN
```

So we need to find the support, find the dicts by zipping and then create the dataframe

In [None]:
# fitted_selectors['linear'].estimator_.coef_, np.pad(fitted_selectors['linear'].estimator_.coef_.copy(), (0,len(IND_VAR) - len(fitted_selectors['linear'].estimator_.coef_)), 'constant')

In [None]:
fitted_selectors['svm_linear'].estimator_['reg'].coef_, np.pad(fitted_selectors['linear'].estimator_['reg'].coef_.copy(), (0,len(IND_VAR) - len(fitted_selectors['linear'].estimator_['reg'].coef_)), 'constant')

In [None]:
fitted_selectors.apply(lambda fs: fs.estimator_['reg'].coef_.flatten())

In [None]:
pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: fs.estimator_['reg'].coef_.flatten()), index=fitted_selectors.index)

In [None]:
list(zip(np.array(IND_VAR)[fitted_selectors['svm_linear'].support_],fitted_selectors['svm_linear'].estimator_['reg'].coef_.flatten()))

In [None]:
estimator_coef = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: dict(zip(np.array(IND_VAR)[fs.support_], fs.estimator_['reg'].coef_.flatten()))), index=fitted_selectors.index); estimator_coef

In [None]:
estimator_coef['run'] = [1] * len(estimator_coef); estimator_coef

In [None]:
estimator_coef.loc[['svm_linear', 'NuSVR']].plot(kind="box", figsize=(10,4))

## Figured it all out, starting the runs now

In [None]:
# Creating the splits first just in case we want to do something standard with them
# TODO: Set a random seed

split_reps = []
for repetition in range(10):
    splits = get_set_splits(n_rounds=50, n_splits_per_round=10)
    split_reps.append(splits)

In [None]:
# Sanity check: the first split of every repetition should differ from the first
# split of every other repetition. Start the inner loop at run + 1, since
# comparing a repetition with itself always reported a spurious match.
n_reps = len(split_reps)
for run in range(n_reps):
    for run_2 in range(run + 1, n_reps):
        print(f"Comparing {run} and {run_2}")
        if np.array_equal(split_reps[run][0], split_reps[run_2][0]):
            print(f"CHECK: split indices at index 0 for {run} and {run_2} are the same!")

In [None]:

# Run recursive feature elimination once per repetition of the splits and combine:
#   all_int_feature_df        - per estimator and feature, number of runs that selected it
#   all_estimator_grid_scores - cross-validation scores of every run, tagged with 'run'
#   all_estimator_coef        - coefficients of the selected features, tagged with 'run'
# Per-run frames are collected in lists and concatenated once at the end, because
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
all_int_feature_df = None
grid_score_frames = []
coef_frames = []

for run, splits in enumerate(split_reps):
    # Get the split
    print(f"Running run {run} with {len(splits)} size")
    split_results = get_split_results(splits)

    # Creating new estimators and selectors
    estimators = pd.Series({"linear": sklm.LinearRegression(), "ridge": sklm.Ridge(),
                "bayesian": sklm.ARDRegression(),
                "svm_linear": sksvm.SVR(kernel="linear")})
    print(f"Created new estimators {len(estimators)}")
    std_estimators = estimators.apply(lambda e: skp.Pipeline([("scale", skpr.StandardScaler()), ("reg", e)]))
    print(f"After combining with the pipeline {len(std_estimators)} ")
    selectors = std_estimators.apply(lambda e: skfs.RFECV(e, step=1, importance_getter="named_steps.reg.coef_"))
    print("Created selectors, about to fit them")

    # Fit the selectors
    fitted_selectors = selectors.apply(lambda s: s.fit(split_results[IND_VAR], split_results[DEP_VAR]))
    print("Finished fitting selectors, about to generate results")

    # Combine the feature selection: support flags as 0/1, summed across runs
    curr_true_false_df = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: fs.support_), index=fitted_selectors.index, columns=IND_VAR)
    curr_int_feature_df = curr_true_false_df.astype(int)
    all_int_feature_df = curr_int_feature_df if all_int_feature_df is None else all_int_feature_df + curr_int_feature_df

    # Combine the grid scores
    # NOTE(review): grid_scores_ was deprecated in scikit-learn 1.0 in favour of cv_results_ — confirm the installed version.
    curr_estimator_grid_scores = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: fs.grid_scores_), index=fitted_selectors.index).transpose()
    curr_estimator_grid_scores['run'] = run
    grid_score_frames.append(curr_estimator_grid_scores)

    # Combine the coefficients (only the selected features have one)
    curr_estimator_coef = pd.DataFrame.from_records(fitted_selectors.apply(lambda fs: dict(zip(np.array(IND_VAR)[fs.support_], fs.estimator_['reg'].coef_.flatten()))), index=fitted_selectors.index)
    curr_estimator_coef['run'] = run
    coef_frames.append(curr_estimator_coef)

# As before, the results stay None when there were no repetitions.
all_estimator_grid_scores = pd.concat(grid_score_frames) if grid_score_frames else None
all_estimator_coef = pd.concat(coef_frames) if coef_frames else None

In [None]:
all_int_feature_df

In [None]:
# Fraction of runs in which each feature was selected, averaged over the estimators.
# Divide by the actual number of repetitions rather than a hard-coded 10.
(all_int_feature_df/len(split_reps)).mean()

In [None]:
# Four views of how often each feature was selected across the runs.
# n_runs replaces the hard-coded 10 (and the 5 used for the 50% reference line),
# so the plots stay correct if the number of repetitions changes.
n_runs = len(split_reps)
fig, ax = plt.subplots(nrows=2, ncols=2, squeeze=True, figsize=(12,6))
all_int_feature_df.transpose().plot(kind="bar", ax=ax[0][0])
all_int_feature_df.plot(kind="box", ax=ax[0][1], rot=90)
(all_int_feature_df/n_runs).mean().plot(kind="bar", ax=ax[1][0])
all_int_feature_df.plot(kind="bar", ax=ax[1][1])
# Reference line at 50% of the runs.
ax[1][1].hlines(y=n_runs/2,xmin=-1,xmax=len(all_int_feature_df))
ax[1][1].legend(bbox_to_anchor=(1, 1))

### We get three sets of feature selections

- Only 10: drove_alone_2_shared_ride, car_like_ratio, car_like_as_not_car, e_bike_as_car, non_car_2_car_user_label, mispredicted_as_walk
- All over 50%: drop e_bike_as_not_car_bike, distance_miles
- Everything

In [None]:
# Discard the existing index and renumber the rows 0..n-1, modifying the frame in place.
all_estimator_grid_scores.reset_index(inplace=True, drop=True)

In [None]:
# Show the combined grid scores: one column per estimator plus the 'run' column.
all_estimator_grid_scores

In [None]:
# Box plot of the grid scores per estimator; the 'run' bookkeeping column is left out.
(
    all_estimator_grid_scores
    .drop(columns='run')
    .plot.box(grid=True)
)

In [None]:
# Show the coefficients collected for every estimator and run.
all_estimator_coef

In [None]:
# Only the rows labelled "svm_linear"; the list selector keeps the result a DataFrame.
all_estimator_coef.loc[["svm_linear"]]

In [None]:
# Spread of the svm_linear coefficients, leaving out three features and the 'run' column.
(
    all_estimator_coef
    .loc[["svm_linear"]]
    .drop(columns=["mispredicted_as_walk", "distance_miles", "non_car_2_car_user_label", "run"])
    .plot.box(rot=90, grid=True)
)

### Now that we know that the svm family of estimators works well, let's try some non-linear models as well

In [None]:
# Six candidate regressors: SVR and NuSVR, each with a linear, polynomial and RBF kernel.
estimators = pd.Series({
    f"{prefix}_{kernel}": regressor_cls(kernel=kernel)
    for prefix, regressor_cls in (("svm", sksvm.SVR), ("NuSVR", sksvm.NuSVR))
    for kernel in ("linear", "poly", "rbf")
})
# Put a StandardScaler in front of every regressor.
std_estimators = estimators.apply(
    lambda e: skp.Pipeline(steps=[("scale", skpr.StandardScaler()), ("reg", e)])
)
# Featurize the first split repetition; 80% of its rows train, the remaining rows test.
# NOTE(review): sample() is called without random_state, so the partition changes on every execution.
split_results = get_split_results(split_reps[0])
training_set = split_results.sample(frac=0.8)
test_set = split_results.drop(labels=training_set.index)
# Fit each pipeline on the training rows and score it on the held-out rows.
fitted_estimators = std_estimators.apply(
    lambda e: e.fit(training_set[IND_VAR], training_set[DEP_VAR])
)
scores = fitted_estimators.apply(
    lambda e: e.score(test_set[IND_VAR], test_set[DEP_VAR])
)
# coef = fitted_estimators[["svm_linear", "NuSVR_linear"]].apply(lambda e: e.coef_)

In [None]:
# One-row table of the held-out scores, one column per estimator.
# Built directly from the Series because DataFrame.append is deprecated (removed in pandas 2.0).
pd.DataFrame([scores])

In [None]:
# len(coef.loc["svm_linear"].flatten()), len(IND_VAR)

In [None]:
# Manual k-fold cross-validation of the SVR / NuSVR kernels on every split repetition.
fold5 = skm.KFold()  # default settings; the name reflects scikit-learn's default of 5 folds

# One Series of test scores per (run, fold). The frame is assembled once after the loop
# instead of being grown row by row with the deprecated DataFrame.append.
svm_kernel_score_rows = []

for run, split in enumerate(split_reps):
    print(f"About to run manual k-fold validation for run {run}")
    estimators = pd.Series({"svm_linear": sksvm.SVR(kernel="linear"), "svm_poly": sksvm.SVR(kernel="poly"), "svm_rbf": sksvm.SVR(kernel="rbf"),
                        "NuSVR_linear": sksvm.NuSVR(kernel="linear"), "NuSVR_poly": sksvm.NuSVR(kernel="poly"), "NuSVR_rbf": sksvm.NuSVR(kernel="rbf")})
    std_estimators = estimators.apply(lambda e: skp.Pipeline([("scale", skpr.StandardScaler()), ("reg", e)]))
    print(f"Finished creating estimators for run {run}, about to featurize")
    split_results = get_split_results(split)
    print(f"Finished featurizing for run {run}, about to run cross-validation")
    for train_index, test_index in fold5.split(split_results):
        print(f"Split features into training {len(train_index)} and test {len(test_index)}")
        # KFold.split yields positional row numbers, so select with .iloc; .loc would treat
        # them as index labels and is only correct when the index is exactly 0..n-1.
        training_set = split_results.iloc[train_index]
        test_set = split_results.iloc[test_index]
        fitted_estimators = std_estimators.apply(lambda e: e.fit(training_set[IND_VAR], training_set[DEP_VAR]))
        scores = fitted_estimators.apply(lambda e: e.score(test_set[IND_VAR], test_set[DEP_VAR]))
        svm_kernel_score_rows.append(scores)

# Rows are numbered 0..n-1 in (run, fold) order, as the earlier ignore_index=True appends produced.
all_svm_kernel_scores = pd.DataFrame(svm_kernel_score_rows).reset_index(drop=True)

In [None]:
# Preview the first rows of the per-fold scores (one column per estimator).
all_svm_kernel_scores.head()

In [None]:
# Distribution of the per-fold scores for every estimator.
all_svm_kernel_scores.plot.box(grid=True)

In [None]:
# Same box plot with the two polynomial-kernel columns left out.
(
    all_svm_kernel_scores
    .drop(columns=["NuSVR_poly", "svm_poly"])
    .plot.box(grid=True)
)

## Tune the final estimator and then apply it to the individual programs

I was going to apply it to the coefficients from this prior estimation, but it turns out that `GridSearchCV` makes it easier. It has a `best_estimator_` field and we can just apply the programs to the estimator.

In [None]:
# "Over 50%" feature set: a copy of IND_VAR without the two features dropped below.
IND_VAR_OVER_HALF = list(IND_VAR)
for dropped_feature in ("e_bike_as_not_car_bike", "distance_miles"):
    IND_VAR_OVER_HALF.remove(dropped_feature)

# "Only 10" feature set, listed explicitly.
IND_VAR_PERFECT_10 = [
    "drove_alone_2_shared_ride",
    "car_like_ratio",
    "car_like_as_not_car",
    "e_bike_as_car",
    "non_car_2_car_user_label",
    "mispredicted_as_walk",
]

In [None]:
# Grid-search kernel, C and gamma for a standardised SVR using every feature in IND_VAR.
parameters = dict(
    svm__kernel=('linear', 'rbf', 'sigmoid'),
    svm__C=[1, 10, 100],
    svm__gamma=('scale', 'auto'),
)
svr = skp.Pipeline(steps=[("scaler", skpr.StandardScaler()), ("svm", sksvm.SVR(kernel='linear'))])
clf = skm.GridSearchCV(estimator=svr, param_grid=parameters, verbose=2)
# Tune on the first split repetition only.
split_results = get_split_results(split_reps[0])
clf.fit(split_results[IND_VAR], split_results[DEP_VAR])

In [None]:
# Tabulate the grid-search results for the full feature set and preview them.
all_feature_check_results = pd.DataFrame(data=clf.cv_results_)
all_feature_check_results.head()

In [None]:
# Mean cross-validated test score for every parameter combination, labelled with the parameter dicts.
ax = all_feature_check_results["mean_test_score"].plot.bar()
ax.set_xticklabels(all_feature_check_results["params"].tolist())

In [None]:
# Keep only the parameter combinations whose mean test score is above -10.
all_feature_check_results[all_feature_check_results["mean_test_score"] > -10]

In [None]:
# Bar chart restricted to the combinations whose mean test score is at least -10.
ax = all_feature_check_results.loc[
    all_feature_check_results["mean_test_score"] >= -10, "mean_test_score"
].plot.bar()
ax.set_xticklabels(
    all_feature_check_results.loc[
        all_feature_check_results["mean_test_score"] >= -10, "params"
    ].tolist()
)

In [None]:
# Bar chart of the top-ranked parameter combination(s) for the full feature set.
ax = all_feature_check_results.loc[
    all_feature_check_results["rank_test_score"] == 1, "mean_test_score"
].plot.bar()
ax.set_xticklabels(
    all_feature_check_results.loc[
        all_feature_check_results["rank_test_score"] == 1, "params"
    ].tolist()
)

In [None]:
# Same grid search, restricted to the IND_VAR_OVER_HALF feature subset.
parameters = dict(
    svm__kernel=('linear', 'rbf', 'sigmoid'),
    svm__C=[1, 10, 100],
    svm__gamma=('scale', 'auto'),
)
svr = skp.Pipeline(steps=[("scaler", skpr.StandardScaler()), ("svm", sksvm.SVR(kernel='linear'))])
clf = skm.GridSearchCV(estimator=svr, param_grid=parameters, verbose=2)
# Tune on the first split repetition only.
split_results = get_split_results(split_reps[0])
clf.fit(split_results[IND_VAR_OVER_HALF], split_results[DEP_VAR])

In [None]:
# Tabulate the grid-search results for the over-half feature subset and preview them.
over_half_feature_check_results = pd.DataFrame(data=clf.cv_results_)
over_half_feature_check_results.head()

In [None]:
# Bar chart of the top-ranked parameter combination(s) for the over-half feature subset.
ax = over_half_feature_check_results.loc[
    over_half_feature_check_results["rank_test_score"] == 1, "mean_test_score"
].plot.bar()
ax.set_xticklabels(
    over_half_feature_check_results.loc[
        over_half_feature_check_results["rank_test_score"] == 1, "params"
    ].tolist()
)

In [None]:
# Grid search restricted to the IND_VAR_PERFECT_10 subset; note the C grid here is 1, 10, 30.
parameters = dict(
    svm__kernel=('linear', 'rbf', 'sigmoid'),
    svm__C=[1, 10, 30],
    svm__gamma=('scale', 'auto'),
)
svr = skp.Pipeline(steps=[("scaler", skpr.StandardScaler()), ("svm", sksvm.SVR(kernel='linear'))])
clf = skm.GridSearchCV(estimator=svr, param_grid=parameters, verbose=2)
# Tune on the first split repetition only.
split_results = get_split_results(split_reps[0])
clf.fit(split_results[IND_VAR_PERFECT_10], split_results[DEP_VAR])

In [None]:
# Tabulate the grid-search results for the perfect-10 feature subset and preview them.
perfect_10_feature_check_results = pd.DataFrame(data=clf.cv_results_)
perfect_10_feature_check_results.head()

In [None]:
# Bar chart of the top-ranked parameter combination(s) for the perfect-10 feature subset.
ax = perfect_10_feature_check_results.loc[
    perfect_10_feature_check_results["rank_test_score"] == 1, "mean_test_score"
].plot.bar()
ax.set_xticklabels(
    perfect_10_feature_check_results.loc[
        perfect_10_feature_check_results["rank_test_score"] == 1, "params"
    ].tolist()
)

In [None]:
# Coefficients of the SVR step of the best pipeline, labelled by feature name.
# NOTE(review): scikit-learn only exposes coef_ for a linear kernel -- confirm the winning kernel is linear.
pd.Series(
    clf.best_estimator_.named_steps['svm'].coef_.flatten(),
    index=IND_VAR_PERFECT_10,
)

## Final retraining, picking the split that resulted in the median and dropping unimportant features

In [None]:
# Locate the fold whose svm_linear score is the observed value nearest the median.
svm_linear_scores = all_svm_kernel_scores["svm_linear"]
print(svm_linear_scores.median(), svm_linear_scores.quantile(interpolation="nearest"))
# interpolation="nearest" returns an actual sample value, so the equality test below matches a row.
median_row = all_svm_kernel_scores[
    svm_linear_scores == svm_linear_scores.quantile(interpolation="nearest")
]
# If several folds share that score, the first matching row label is used.
median_index = median_row.index.tolist()[0]
print(median_index)

In [None]:
# Cut the fold-row numbers 0..n-1 into equal consecutive chunks, one per split repetition,
# and remember the position of the chunk that contains the median row.
split_reps_indices = np.split(
    np.arange(len(all_svm_kernel_scores)),
    len(split_reps),
)
for i, sri in enumerate(split_reps_indices):
    if median_index in sri:
        median_split_reps_index = i

print(median_split_reps_index)

In [None]:
# Final tuning pass: linear SVR on the over-half feature set, using the median split repetition.
IND_VAR_FINAL = IND_VAR_OVER_HALF  # same list object, not a copy
parameters = dict(
    svm__C=range(1, 20, 2),
    svm__gamma=('scale', 'auto'),
)
svr = skp.Pipeline(steps=[("scaler", skpr.StandardScaler()), ("svm", sksvm.SVR(kernel='linear'))])
clf = skm.GridSearchCV(estimator=svr, param_grid=parameters, verbose=2)
split_results = get_split_results(split_reps[median_split_reps_index])
clf.fit(split_results[IND_VAR_FINAL], split_results[DEP_VAR])

In [None]:
# Tabulate the results of this grid search and preview them.
second_check_results = pd.DataFrame(data=clf.cv_results_)
second_check_results.head()

In [None]:
# Mean test score for every parameter combination, labelled with the parameter dicts.
ax = second_check_results["mean_test_score"].plot.bar()
ax.set_xticklabels(second_check_results["params"].tolist())
ax

In [None]:
# Show the pipeline that the grid search selected as best.
clf.best_estimator_

## Now, let's create splits for the actual programs

In [None]:
# For every program, collect the index labels of its rows in energy_consumption_df.
program_specific_splits_series = (
    energy_consumption_df
    .groupby("program")
    .apply(lambda program_rows: program_rows.index)
)

In [None]:
# Featurize each real program as one "split", then label the resulting rows by program.
program_specific_split_results = get_split_results(program_specific_splits_series.values)
program_specific_split_results = program_specific_split_results.set_index(
    program_specific_splits_series.index
)

In [None]:
# Show the per-program feature rows.
program_specific_split_results

In [None]:
# One bar subplot per final feature plus the dependent variable, with one bar per program.
(
    program_specific_split_results[IND_VAR_FINAL + [DEP_VAR]]
    .plot.bar(subplots=True, layout=(3, 4), sharex=True, figsize=(12, 12))
)

In [None]:
# Store the tuned model's prediction for every program as a new column (adds or overwrites it in place).
program_specific_split_results["predicted_error_pct_for_confusion"] = clf.predict(program_specific_split_results[IND_VAR_FINAL])

In [None]:
# Actual versus predicted error percentage, side by side for every program.
(
    program_specific_split_results[["error_pct_for_confusion", "predicted_error_pct_for_confusion"]]
    .plot.bar()
)

In [None]:
# Score the tuned search object on the per-program rows. No `scoring` was passed to
# GridSearchCV, so this uses the estimator's own default score.
clf.score(program_specific_split_results[IND_VAR_FINAL], program_specific_split_results[DEP_VAR])

In [None]:
# Coefficients of the SVR step of the best pipeline, labelled by feature name.
pd.Series(
    clf.best_estimator_.named_steps['svm'].coef_.flatten(),
    index=IND_VAR_FINAL,
)

In [None]:
# Plot four of the split features, one subplot each in a 2x2 grid sharing the x axis.
split_results[["drove_alone_2_shared_ride", "no_sensed_ratio", "not_a_trip_ratio", "mispredicted_as_walk"]].plot(subplots=True, layout=(2,2), figsize=(6,6), sharex=True)

In [None]:
# Coefficient multiplied by feature value, per feature, for the "4c" program.
# coef_ lives on the 'svm' step of the fitted pipeline; the Pipeline object itself has no coef_.
# NOTE(review): the pipeline scales features before the SVR, while these products use the
# unscaled feature values -- confirm that is the intended reading.
pd.Series(clf.best_estimator_['svm'].coef_.flatten(), index=IND_VAR_FINAL) * program_specific_split_results[IND_VAR_FINAL].loc["4c"]

In [None]:
# Coefficient multiplied by feature value, per feature, for the "prepilot" program.
# coef_ lives on the 'svm' step of the fitted pipeline; the Pipeline object itself has no coef_.
# NOTE(review): the pipeline scales features before the SVR, while these products use the
# unscaled feature values -- confirm that is the intended reading.
pd.Series(clf.best_estimator_['svm'].coef_.flatten(), index=IND_VAR_FINAL) * program_specific_split_results[IND_VAR_FINAL].loc["prepilot"]

In [None]:
# Coefficient multiplied by feature value, per feature, for the "vail" program.
# coef_ lives on the 'svm' step of the fitted pipeline; the Pipeline object itself has no coef_.
# NOTE(review): the pipeline scales features before the SVR, while these products use the
# unscaled feature values -- confirm that is the intended reading.
pd.Series(clf.best_estimator_['svm'].coef_.flatten(), index=IND_VAR_FINAL) * program_specific_split_results[IND_VAR_FINAL].loc["vail"]

In [None]:
# The ten confirmed modes with the largest absolute summed error_for_confusion in the "4c" program.
(
    energy_consumption_df
    .query('program == "4c"')
    .groupby("mode_confirm")["error_for_confusion"]
    .sum()
    .abs()
    .sort_values()
    .tail(n=10)
)

In [None]:
# The ten confirmed modes with the largest absolute summed error_for_confusion in the "prepilot" program.
(
    energy_consumption_df
    .query('program == "prepilot"')
    .groupby("mode_confirm")["error_for_confusion"]
    .sum()
    .abs()
    .sort_values()
    .tail(n=10)
)

In [None]:
# Summed expected versus user-labeled values for three confirmed modes in the "4c" program.
ax = (
    energy_consumption_df
    .query('program == "4c" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby('mode_confirm')[["expected", "user_labeled"]]
    .sum()
    .plot(kind="bar")
)

In [None]:
# Summed expected versus user-labeled values for three confirmed modes in the "prepilot" program.
ax = (
    energy_consumption_df
    .query('program == "prepilot" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby('mode_confirm')[["expected", "user_labeled"]]
    .sum()
    .plot(kind="bar")
)

In [None]:
# The ten confirmed modes with the largest summed distance_miles in the "4c" program.
(
    energy_consumption_df
    .query('program == "4c"')
    .groupby("mode_confirm")['distance_miles']
    .sum()
    .sort_values()
    .tail(n=10)
)

In [None]:
# The ten confirmed modes with the largest summed distance_miles in the "prepilot" program.
(
    energy_consumption_df
    .query('program == "prepilot"')
    .groupby("mode_confirm")['distance_miles']
    .sum()
    .sort_values()
    .tail(n=10)
)

In [None]:
# "4c" program, three confirmed modes, broken down by (mode_confirm, primary_mode):
# first the summed expected / user-labeled values, then the summed error_for_confusion.
ax = (
    energy_consumption_df
    .query('program == "4c" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby(['mode_confirm', 'primary_mode'])[["expected", "user_labeled"]]
    .sum()
    .plot(kind="bar")
)
ax1 = (
    energy_consumption_df
    .query('program == "4c" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby(['mode_confirm', 'primary_mode'])[["error_for_confusion"]]
    .sum()
    .plot(kind="bar")
)
ax, ax1

In [None]:
# Total user-labeled value per program; only the prepilot total is displayed as the cell output.
user_label_ec_4c = energy_consumption_df.query('program == "4c"')["user_labeled"].sum()
user_label_ec_prepilot = energy_consumption_df.query('program == "prepilot"')["user_labeled"].sum()
user_label_ec_prepilot

In [None]:
# "4c" breakdown by (mode_confirm, primary_mode), expressed as a fraction of the 4c user-labeled total.
(
    energy_consumption_df
    .query('program == "4c" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby(['mode_confirm', 'primary_mode'])[["expected", "user_labeled", "error_for_confusion"]]
    .sum()
    / user_label_ec_4c
)

In [None]:
# "prepilot" breakdown by (mode_confirm, primary_mode), as a fraction of the prepilot user-labeled total.
(
    energy_consumption_df
    .query('program == "prepilot" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby(['mode_confirm', 'primary_mode'])[["expected", "user_labeled", "error_for_confusion"]]
    .sum()
    / user_label_ec_prepilot
)

In [None]:
# "prepilot" program: summed expected / user-labeled values by (mode_confirm, primary_mode).
ax = (
    energy_consumption_df
    .query('program == "prepilot" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby(['mode_confirm', 'primary_mode'])[["expected", "user_labeled"]]
    .sum()
    .plot(kind="bar")
)

In [None]:
# "vail" program: summed expected / user-labeled values by (mode_confirm, primary_mode).
ax = (
    energy_consumption_df
    .query('program == "vail" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")')
    .groupby(['mode_confirm', 'primary_mode'])[["expected", "user_labeled"]]
    .sum()
    .plot(kind="bar")
)

In [None]:
# "vail" breakdown by (mode_confirm, primary_mode), as a fraction of vail's own user-labeled
# total, so the fractions are relative to the same program as the numerator.
energy_consumption_df.query('program == "vail" & (mode_confirm == "pilot_ebike" | mode_confirm == "drove_alone" | mode_confirm == "shared_ride")').groupby(['mode_confirm', 'primary_mode']).sum()[["expected", "user_labeled", "error_for_confusion"]] / energy_consumption_df.query('program == "vail"').user_labeled.sum()

### Let's bootstrap by creating splits with re-shuffling

Before this, we have splits of 500 x 120, representing 500 fake programs with 120 trips per program.
But our actual program sizes are in the 1000s. So let's create programs of 6000 trips each, which is 50 times the size of the earlier fake programs.
If we reshuffle and re-generate 10 times, we will end up with 100 fake programs of 6000 trips each.

Let's try to work with that instead

In [None]:
# Preview the output of one call that asks for 10 splits.
get_set_splits(n_splits=10)

In [None]:
# Generate 50 independent rounds of 10 splits each.
# A comprehension avoids binding a notebook-level loop variable named `round`, which would
# shadow the builtin round() for every later cell.
# NOTE(review): the markdown above talks about re-generating 10 times; this runs 50 rounds -- confirm.
large_size_splits = [get_set_splits(n_splits=10) for _ in range(50)]

In [None]:
# Collapse the nested list of rounds into a single 1-D array.
# NOTE(review): presumably each element is then one split; this depends on what get_set_splits returns -- confirm.
large_size_splits = np.array(large_size_splits).flatten()

In [None]:
# Featurize the large splits; this rebinds the notebook-wide `split_results`.
split_results = get_split_results(large_size_splits)

In [None]:
# Plot eight features and the error percentage for the current split_results, one subplot each in a 3x3 grid.
split_results[["drove_alone_2_shared_ride", "no_sensed_ratio", "not_a_trip_ratio", "mispredicted_as_walk", 'misrepresented_car_like_as_no_sensed', 'non_car_2_car_user_label', 'e_bike_ratio', "car_like_ratio", 'error_pct_for_confusion']].plot(subplots=True, layout=(3,3), figsize=(12,12), sharex=True)

In [None]:
# Show the current list of independent variables.
IND_VAR

In [None]:
# Final feature set: IND_VAR plus two extra features, minus distance_miles.
IND_VAR_FINAL = [*IND_VAR, 'misrepresented_car_like_as_no_sensed', 'car_like_ratio']
IND_VAR_FINAL.remove('distance_miles')
parameters = dict(
    svm__C=range(1, 30, 2),
    svm__gamma=('scale', 'auto'),
)
svr = skp.Pipeline(steps=[("scaler", skpr.StandardScaler()), ("svm", sksvm.SVR(kernel='linear'))])
clf = skm.GridSearchCV(estimator=svr, param_grid=parameters, verbose=2)
# NOTE(review): this refits on split_reps[median_split_reps_index], not on the
# large_size_splits featurized a few cells earlier -- confirm that is intended.
split_results = get_split_results(split_reps[median_split_reps_index])
clf.fit(split_results[IND_VAR_FINAL], split_results[DEP_VAR])

In [None]:
# Show the parameters of the pipeline template that was passed to the grid search.
svr.get_params()

In [None]:
# Tabulate the results of this grid search and preview them.
third_check_results = pd.DataFrame(data=clf.cv_results_)
third_check_results.head()

In [None]:
# Mean test score for every parameter combination, labelled with the parameter dicts.
ax = third_check_results["mean_test_score"].plot.bar()
ax.set_xticklabels(third_check_results["params"].tolist())
ax

In [None]:
# Show the pipeline that the grid search selected as best.
clf.best_estimator_

In [None]:
# Coefficients of the SVR step of the best pipeline, labelled by feature name.
pd.Series(
    clf.best_estimator_.named_steps['svm'].coef_.flatten(),
    index=IND_VAR_FINAL,
)

In [None]:
# Re-featurize each real program as one "split" and label the resulting rows by program.
program_specific_split_results = get_split_results(program_specific_splits_series.values)
program_specific_split_results = program_specific_split_results.set_index(
    program_specific_splits_series.index
)

In [None]:
# Predictions of the best pipeline for every program row.
clf.best_estimator_.predict(program_specific_split_results[IND_VAR_FINAL])

In [None]:
# Store those predictions as a new column (adds or overwrites it in place).
program_specific_split_results["predicted_error_pct_for_confusion"] = clf.best_estimator_.predict(program_specific_split_results[IND_VAR_FINAL])

In [None]:
# Actual versus predicted error percentage, side by side for every program.
(
    program_specific_split_results[["error_pct_for_confusion", "predicted_error_pct_for_confusion"]]
    .plot.bar()
)

In [None]:
# Coefficient multiplied by feature value, per feature, for the "4c" program.
(
    pd.Series(clf.best_estimator_.named_steps['svm'].coef_.flatten(), index=IND_VAR_FINAL)
    * program_specific_split_results[IND_VAR_FINAL].loc["4c"]
)

In [None]:
# Coefficient multiplied by feature value, per feature, for the "vail" program.
(
    pd.Series(clf.best_estimator_.named_steps['svm'].coef_.flatten(), index=IND_VAR_FINAL)
    * program_specific_split_results[IND_VAR_FINAL].loc["vail"]
)

### Retrying after removing the `mispredicted_as_car` as well

In [None]:
# Feature set: IND_VAR plus two extra features, minus distance_miles and mispredicted_as_car.
IND_VAR_FINAL = [*IND_VAR, 'misrepresented_car_like_as_no_sensed', 'car_like_ratio']
for dropped_feature in ('distance_miles', 'mispredicted_as_car'):
    IND_VAR_FINAL.remove(dropped_feature)
parameters = dict(
    svm__C=range(7, 30, 2),
    svm__gamma=('scale', 'auto'),
)
svr = skp.Pipeline(steps=[("scaler", skpr.StandardScaler()), ("svm", sksvm.SVR(kernel='linear'))])
clf = skm.GridSearchCV(estimator=svr, param_grid=parameters, verbose=2)
# Refit on the median split repetition.
split_results = get_split_results(split_reps[median_split_reps_index])
clf.fit(split_results[IND_VAR_FINAL], split_results[DEP_VAR])

In [None]:
# Tabulate the results of this grid search and preview them.
fourth_check_results = pd.DataFrame(data=clf.cv_results_)
fourth_check_results.head()

In [None]:
# Mean test score for every parameter combination, labelled with the parameter dicts.
ax = fourth_check_results["mean_test_score"].plot.bar()
ax.set_xticklabels(fourth_check_results["params"].tolist())
ax

In [None]:
# Predictions of the best pipeline for every program row.
clf.best_estimator_.predict(program_specific_split_results[IND_VAR_FINAL])

In [None]:
# Overwrite the stored prediction column with the output of the newly tuned pipeline.
program_specific_split_results["predicted_error_pct_for_confusion"] = clf.best_estimator_.predict(program_specific_split_results[IND_VAR_FINAL])

In [None]:
# Actual versus predicted error percentage, side by side for every program.
(
    program_specific_split_results[["error_pct_for_confusion", "predicted_error_pct_for_confusion"]]
    .plot.bar()
)

In [None]:
# Coefficient multiplied by feature value, per feature, for the "4c" program.
(
    pd.Series(clf.best_estimator_.named_steps['svm'].coef_.flatten(), index=IND_VAR_FINAL)
    * program_specific_split_results[IND_VAR_FINAL].loc["4c"]
)