In [None]:
import seaborn as sns
from abc import ABC, abstractmethod
from typing import List
import ast

In [None]:
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from enum import Enum
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

%matplotlib inline

In [None]:
# Global experiment flags and variables.
SEED = 13210

# Raw `Replaced_mode` labels handled by TARGET_MAPPING below:
#   'No Travel', 'Free Shuttle', 'Other', 'Gas Car, drove alone',
#   'Regular Bike', 'Walk', 'Gas Car, with others', 'Bus', 'E-bike',
#   'Scooter share', 'Taxi/Uber/Lyft', 'Train', 'Bikeshare',
#   'Skate board', 'Not a Trip'

# Raw replaced-mode label -> coarse mode category (the keys of TARGETS).
TARGET_MAPPING = {
    'No Travel': 'no_trip',
    'Free Shuttle': 'transit',
    'Other': 'unknown',
    'Gas Car, drove alone': 'car',
    'Regular Bike': 'p_micro',
    'Walk': 'walk',
    # A car ride with other people is the shared-car class. It was previously mapped to
    # 's_micro', which left the 's_car' class of TARGETS without any source label.
    'Gas Car, with others': 's_car',
    'Bus': 'transit',
    'E-bike': 'p_micro',
    'Scooter share': 's_micro',
    'Taxi/Uber/Lyft': 'ridehail',
    'Train': 'transit',
    'Bikeshare': 's_micro',
    'Skate board': 'p_micro',
    'Not a Trip': 'no_trip'
}


# Coarse mode category -> integer class index (position in the list below).
TARGETS = {
    x: ix for (ix, x) in enumerate([
        'p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown'
    ])
}

# Survey "available modes" answer -> coarse mode category.
av_modes = {
    'Skateboard': 'p_micro', 
    'Walk/roll': 'walk', 
    'Shared bicycle or scooter': 's_micro', 
    'Taxi (regular taxi, Uber, Lyft, etc)': 'ridehail', 
    'Rental car (including Zipcar/ Car2Go)': 'car',
    'Bicycle': 'p_micro', 
    'Public transportation (bus, subway, light rail, etc.)': 'transit',
    'Get a ride from a friend or family member': 's_car',
    'None': 'no_trip', 
    'Prefer not to say': 'unknown'
}

# Seed every random number generator used in this notebook (stdlib, NumPy, PyTorch).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Display the category -> class-index mapping.
TARGETS

In [None]:
# Load the modeling table and the weather table from the sibling `data` directory.
# Paths are relative to the notebook's working directory.
data = pd.read_csv('../data/final_modeling_data_02142024.csv')
weather_df = pd.read_csv('../data/denver_weather_data.csv')

In [None]:
# Collapse the raw replaced-mode labels into coarse categories, encode those categories
# as integer class indices, and expose the result under the column name `target`.
data['Replaced_mode'] = data['Replaced_mode'].replace(TARGET_MAPPING).replace(TARGETS)
data = data.rename(columns={'Replaced_mode': 'target'})

In [None]:
# Create one indicator column per availability category, initialised to 0.
# NOTE(review): av_modes.values() contains 'p_micro' twice, so the column list has a
# duplicate name — confirm that the installed pandas version handles this as intended.
data[list(av_modes.values())] = 0

In [None]:
def encode_availability(x):
    """Set the availability indicator of every mode category listed in a row.

    `x.available_modes` is split on ';', each piece is stripped and looked up in
    `av_modes`, and the row entry named after each resulting category is set to 1.

    Args:
        x: a row exposing an `available_modes` string and item assignment by category name.

    Returns:
        The same row object, modified in place.

    Raises:
        KeyError: if a listed mode is not a key of `av_modes`.
    """
    reported = (token.strip() for token in x.available_modes.split(';'))

    # A set collapses answers that map to the same category.
    for category in {av_modes[label] for label in reported}:
        x[category] = 1

    return x


# Fill the availability indicators row by row, then discard the raw text column.
data = (
    data
    .apply(encode_availability, axis=1)
    .drop(columns=['available_modes'])
)

In [None]:
data['mark'] = 0

# The three section columns hold Python-literal strings; parse them into Python objects.
data['section_distances'] = data['section_distances'].apply(ast.literal_eval)
data['section_modes'] = data['section_modes'].apply(ast.literal_eval)
data['section_durations'] = data['section_durations'].apply(ast.literal_eval)

# mark = 1 only for rows whose three section lists are non-empty and equally long.
data['mark'] = data.apply(
    lambda row: int(
        len(row.section_distances) > 0
        and len(row.section_modes) > 0
        and len(row.section_durations) > 0
        and len(row.section_distances) == len(row.section_modes) == len(row.section_durations)
    ),
    axis=1
)

# Store the parsed lists as NumPy arrays (numeric columns as float64).
data['section_distances'] = data['section_distances'].apply(lambda values: np.array(values).astype(np.float64))
data['section_modes'] = data['section_modes'].apply(lambda values: np.array(values))
data['section_durations'] = data['section_durations'].apply(lambda values: np.array(values).astype(np.float64))

In [None]:
# Keep only the rows flagged as consistent, drop the helper flag and renumber the index.
data = data[data.mark == 1].drop(columns=['mark']).reset_index(drop=True)

In [None]:
class SectionScaler:
    """Per-mode z-score scaler for the section distance and duration sequences.

    `compute_stats` collects, for every section mode, all distances and durations
    found in a frame and stores `[mean, std]` per mode in `self.dist` / `self.dur`.
    `apply` then standardises every section value with the statistics of its mode.
    """

    def __init__(self):
        # mode -> [mean, std] of section durations / distances.
        self.dur = dict()
        self.dist = dict()

    def compute_stats(self, df):
        """Fit the per-mode statistics on `df`.

        Any previously computed statistics are replaced, so the method can be called
        repeatedly. NaN values are ignored for both the mean and the standard
        deviation.

        Args:
            df: frame with `section_modes`, `section_distances` and
                `section_durations` columns holding equally long sequences per row.
        """
        durations_by_mode = dict()
        distances_by_mode = dict()

        for _, row in df[['section_modes', 'section_distances', 'section_durations']].iterrows():
            for (mode, distance, duration) in zip(
                row['section_modes'], row['section_distances'], row['section_durations']
            ):
                durations_by_mode.setdefault(mode, []).append(duration)
                distances_by_mode.setdefault(mode, []).append(distance)

        # nanstd (not std) so that a single NaN does not turn the whole statistic into NaN,
        # mirroring the NaN handling of nanmean.
        self.dur = {
            mode: [np.nanmean(values), np.nanstd(values)]
            for mode, values in durations_by_mode.items()
        }
        self.dist = {
            mode: [np.nanmean(values), np.nanstd(values)]
            for mode, values in distances_by_mode.items()
        }

    @staticmethod
    def _scale(values, modes, stats):
        """Standardise `values` element-wise with the `[mean, std]` of the matching mode."""
        scaled = []
        for value, mode in zip(values, modes):
            mean, std = stats[mode]
            # A mode seen once (or with constant values) has std == 0; dividing by it would
            # produce inf/NaN, so fall back to centring only. `not std > 0` also covers NaN.
            if not std > 0:
                std = 1.0
            scaled.append((value - mean) / std)
        return scaled

    def apply(self, df):
        """Return a new frame with standardised `section_distances` / `section_durations`.

        Args:
            df: frame with the same section columns used in `compute_stats`.

        Returns:
            A new DataFrame built from the transformed rows (with a fresh default
            index); the two section columns hold Python lists of scaled values.

        Raises:
            KeyError: if a row contains a mode that was not seen by `compute_stats`.
        """
        rows = list()

        for _, x in df.iterrows():
            row = x.to_dict()
            modes = row['section_modes']

            row['section_distances'] = self._scale(row['section_distances'], modes, self.dist)
            row['section_durations'] = self._scale(row['section_durations'], modes, self.dur)

            rows.append(row)

        return pd.DataFrame(data=rows)

In [None]:
class SPLIT_TYPE(Enum):
    # Splitting strategies dispatched on by get_train_test_splits.
    INTRA_USER = 0
    INTER_USER = 1
    TARGET = 2
    MODE = 3
    INTER_USER_STATIC = 4
    

class SPLIT(Enum):
    # Identifies a data fold. Not referenced elsewhere in the visible part of the notebook.
    TRAIN = 0
    TEST = 1

def get_splits(count_df: pd.DataFrame, n:int, test_size=0.2):
    """Pick a set of ids whose summed counts land close to `n * test_size`.

    The target size is `int(n * test_size)` with a +/-5% tolerance band. Ids are
    considered in the row order of `count_df`; each id is either taken or skipped,
    and the search stops extending a selection as soon as its total falls inside
    the band. Among the feasible selections found, the one with the most ids wins.

    Args:
        count_df: frame with a `count` column and an id column. The id column is
            `index` when present, otherwise the first column of the frame.
        n: total number of rows the counts refer to.
        test_size: fraction of `n` to aim for.

    Returns:
        A list of distinct ids, or an empty list when no selection fits the band.
    """
    maxsize = int(n * test_size)

    # Allow a 10% tolerance (5% on either side of the target).
    max_threshold = int(maxsize * 1.05)
    min_threshold = int(maxsize * 0.95)

    print(f"{min_threshold}, {max_threshold}")

    id_column = 'index' if 'index' in count_df.columns else count_df.columns[0]
    all_ids = count_df[id_column].tolist()
    all_counts = [int(c) for c in count_df['count'].tolist()]
    n_ids = len(all_ids)

    # (ix, curr_size) -> best list of ids taken from position ix onwards, or None if infeasible.
    # The state must include the running size: the answer for a position depends on it.
    cache = dict()

    def _dp(ix, curr_size):
        if curr_size > max_threshold:
            return None

        if min_threshold <= curr_size <= max_threshold:
            return []

        # Checked after the thresholds so that a selection completed by the last id counts.
        if ix >= n_ids:
            return None

        key = (ix, curr_size)
        if key in cache:
            return cache[key]

        # two options - either pick the current id or skip it; both move on to the next id.
        taken = _dp(ix + 1, curr_size + all_counts[ix])
        skipped = _dp(ix + 1, curr_size)

        best = None
        if taken is not None:
            best = [all_ids[ix]] + taken

        if skipped is not None and (best is None or len(skipped) > len(best)):
            best = skipped

        cache[key] = best
        return best

    selection = _dp(0, 0)
    return selection if selection is not None else []


def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):
    """Split `data` into a train and a test frame using the strategy `how`.

    Args:
        data: frame with a `user_id` column; the TARGET strategy also needs a
            `target` column and the MODE strategy a `section_mode_argmax` column.
        how: a `SPLIT_TYPE` member. The default (the enum class itself) matches no
            strategy and therefore raises NotImplementedError.
        test_ratio: fraction of rows aimed at the test fold. Not used by
            INTER_USER_STATIC, whose membership is fixed by a hard-coded id list.
        shuffle: when True the rows are shuffled (seeded with SEED) before splitting.

    Returns:
        A `(train, test)` tuple of DataFrames.

    Raises:
        NotImplementedError: if `how` is not a handled `SPLIT_TYPE` member.
    """
    if shuffle:
        data = data.sample(data.shape[0], random_state=SEED).reset_index(drop=True, inplace=False)

    if how == SPLIT_TYPE.INTER_USER:
        # Make the split, ensuring that a user in one fold is not leaked into the other fold.
        # rename_axis pins the id column name to 'index' irrespective of the pandas version.
        counts = data.user_id.value_counts().rename_axis('index').reset_index(name='count')

        # Choose the users whose trips together make up roughly `test_ratio` of the rows.
        test_ids = get_splits(counts, data.shape[0], test_size=test_ratio)

        # Select by user membership. The train fold is the complement of the test fold;
        # matching user ids against row labels (as done previously) selects nothing useful.
        in_test = data.user_id.isin(test_ids)
        test_data = data.loc[in_test, :]
        train_data = data.loc[~in_test, :]
        
        return train_data, test_data
    
    elif how == SPLIT_TYPE.INTRA_USER:
        
        # There are certain users with only one observation. What do we do with those?
        # As per the mobilitynet modeling pipeline, we randomly assign them to either the
        # training or test set.
        
        value_counts = data.user_id.value_counts()
        single_count_ids = value_counts[value_counts == 1].index
        
        data_filtered = data.loc[~data.user_id.isin(single_count_ids), :].reset_index(drop=True)
        data_single_counts = data.loc[data.user_id.isin(single_count_ids), :].reset_index(drop=True)
        
        # Stratifying on user_id keeps every multi-trip user present in both folds.
        X_tr, X_te = train_test_split(
            data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id,
            random_state=SEED
        )
        
        data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))
        X_tr_merged = pd.concat(
            [X_tr, data_single_counts.loc[data_single_counts.assigned == 'train', :].drop(
                columns=['assigned'], inplace=False
            )],
            ignore_index=True, axis=0
        )
        
        X_te_merged = pd.concat(
            [X_te, data_single_counts.loc[data_single_counts.assigned == 'test', :].drop(
                columns=['assigned'], inplace=False
            )],
            ignore_index=True, axis=0
        )
        
        return X_tr_merged, X_te_merged
    
    elif how == SPLIT_TYPE.TARGET:
        
        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.target,
            random_state=SEED
        )
        
        return X_tr, X_te
    
    elif how == SPLIT_TYPE.MODE:
        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.section_mode_argmax,
            random_state=SEED
        )
        
        return X_tr, X_te
    
    elif how == SPLIT_TYPE.INTER_USER_STATIC:
        
        # Fixed list of training users; every other user goes to the test fold.
        train_ids = ['810be63d084746e3b7da9d943dd88e8c', 'bf774cbe6c3040b0a022278d36a23f19', '8a8332a53a1b4cdd9f3680434e91a6ef', 
                     '5ad862e79a6341f69f28c0096fe884da', '7f89656bd4a94d12ad8e5ad9f0afecaf', 'fbaa338d7cd7457c8cad4d0e60a44d18', 
                     '3b25446778824941a4c70ae5774f4c68', '28cb1dde85514bbabfd42145bdaf7e0a', '3aeb5494088542fdaf798532951aebb0', 
                     '531732fee3c24366a286d76eb534aebc', '950f4287bab5444aa0527cc23fb082b2', '737ef8494f26407b8b2a6b1b1dc631a4', 
                     'e06cf95717f448ecb81c440b1b2fe1ab', '7347df5e0ac94a109790b31ba2e8a02a', 'bd9cffc8dbf1402da479f9f148ec9e60', 
                     '2f3b66a5f98546d4b7691fba57fa640f', 'f289f7001bd94db0b33a7d2e1cd28b19', '19a043d1f2414dbcafcca44ea2bd1f19', 
                     '68788082836e4762b26ad0877643fdcf', '4e8b1b7f026c4384827f157225da13fa', '703a9cee8315441faff7eb63f2bfa93f', 
                     'add706b73839413da13344c355dde0bb', '47b5d57bd4354276bb6d2dcd1438901d', 'e4cfb2a8f600426897569985e234636e', 
                     '0154d71439284c34b865e5a417cd48af', '234f4f2366244fe682dccded2fa7cc4e', '0d0ae3a556414d138c52a6040a203d24', 
                     '44c10f66dec244d6b8644231d4a8fecb', '30e9b141d7894fbfaacecd2fa18929f9', '0eb313ab00e6469da78cc2d2e94660fb', 
                     'fc51d1258e4649ecbfb0e6ecdaeca454', 'a1954793b1454b2f8cf95917d7547169', '6656c04c6cba4c189fed805eaa529741', 
                     '6a0f3653b80a4c949e127d6504debb55', 'dfe5ca1bb0854b67a6ffccad9565d669', '8b1f3ba43de945bea79de6a81716ad04', 
                     'cde34edb8e3a4278a18e0adb062999e5', '6d96909e5ca442ccb5679d9cdf3c8f5b', 'a60a64d82d1c439a901b683b73a74d73', 
                     '60e6a6f6ed2e4e838f2bbed6a427028d', '88041eddad7542ea8c92b30e5c64e198', '1635c003b1f94a399ebebe21640ffced', 
                     '1581993b404a4b9c9ca6b0e0b8212316', 'b1aed24c863949bfbfa3a844ecf60593', '4b89612d7f1f4b368635c2bc48bd7993', 
                     'eb2e2a5211564a9290fcb06032f9b4af', '26767f9f3da54e93b692f8be6acdac43', '8a98e383a2d143e798fc23869694934a', 
                     'b346b83b9f7c4536b809d5f92074fdae', 'd929e7f8b7624d76bdb0ec9ada6cc650', '863e9c6c8ec048c4b7653f73d839c85b', 
                     'f50537eb104e4213908f1862c8160a3e', '4a9db5a9bac046a59403b44b883cc0ba', 'cded005d5fd14c64a5bba3f5c4fe8385', 
                     'c7ce889c796f4e2a8859fa2d7d5068fe', '405b221abe9e43bc86a57ca7fccf2227', '0b3e78fa91d84aa6a3203440143c8c16', 
                     'fbff5e08b7f24a94ab4b2d7371999ef7', 'e35e65107a34496db49fa5a0b41a1e9e', 'd5137ebd4f034dc193d216128bb7fc9a', 
                     '3f7f2e536ba9481e92f8379b796ad1d0', 'dc75e0b776214e1b9888f6abd042fd95', 'b41dd7d7c6d94fe6afe2fd26fa4ac0bd', 
                     'eec6936e1ac347ef9365881845ec74df', '8c7d261fe8284a42a777ffa6f380ba3b', '4baf8c8af7b7445e9067854065e3e612', 
                     'c6e4db31c18b4355b02a7dd97deca70b', 'f0db3b1999c2410ba5933103eca9212f', '487e20ab774742378198f94f5b5b0b43', 
                     'dc1ed4d71e3645d0993885398d5628ca', '8c3c63abb3ec4fc3a61e7bf316ee4efd', '15eb78dd6e104966ba6112589c29dc41', 
                     'c23768ccb817416eaf08be487b2e3643', 'ecd2ae17d5184807abd87a287115c299', '71f21d53b655463784f3a3c63c56707b', 
                     '2931e0a34319495bbb5898201a54feb5', '92bde0d0662f45ac864629f486cffe77', '42b3ee0bc02a481ab1a94644a8cd7a0d', 
                     '15aa4ba144a34b8b8079ed7e049d84df', '509b909390934e988eb120b58ed9bd8c', '14103cda12c94642974129989d39e50d', 
                     '8b0876430c2641bcaea954ea00520e64', 'baa4ff1573ae411183e10aeb17c71c53', '14fe8002bbdc4f97acbd1a00de241bf6', 
                     '1b7d6dfea8464bcab9321018b10ec9c9', '487ad897ba93404a8cbe5de7d1922691', '5182d93d69754d7ba06200cd1ac5980a', 
                     '91f3ca1c278247f79a806e49e9cc236f', 'e66e63b206784a559d977d4cb5f1ec34', '840297ae39484e26bfebe83ee30c5b3e', 
                     'c6807997194c4c528a8fa8c1f6ee1595', '802667b6371f45b29c7abb051244836a', 'b2bbe715b6a14fd19f751cae8adf6b4e', 
                     'feb1d940cd3647d1a101580c2a3b3f8c', '1b9883393ab344a69bc1a0fab192a94c', 'ac604b44fdca482fb753034cb55d1351', 
                     'f446bf3102ff4bd99ea1c98f7d2f7af0', 'c2c5d4b9a607487ea405a99c721079d4', '85ddd3c34c58407392953c47a32f5428', 
                     'd51de709f95045f8bacf473574b96ba5', '6373dfb8cb9b47e88e8f76adcfadde20', '313d003df34b4bd9823b3474fc93f9f9', 
                     '53e78583db87421f8decb529ba859ca4', '8fdc9b926a674a9ea07d91df2c5e06f2', '90480ac60a3d475a88fbdab0a003dd5d', 
                     '7559c3f880f341e898a402eba96a855d', '19a4c2cf718d40588eb96ac25a566353', 'f4427cccaa9442b48b42bedab5ab648e', 
                     'e192b8a00b6c422296851c93785deaf7', '355e25bdfc244c5e85d358e39432bd44', 'a0c3a7b410b24e18995f63369a31d123', 
                     '03a395b4d8614757bb8432b4984559b0', 'a2d48b05d5454d428c0841432c7467b6', '3d981e617b304afab0f21ce8aa6c9786', 
                     '2cd5668ac9054e2eb2c88bb4ed94bc6d', 'd7a732f4a8644bcbb8dedfc8be242fb2', '367eb90b929d4f6e9470d15c700d2e3f', 
                     'e049a7b2a6cb44259f907abbb44c5abc', 'a231added8674bef95092b32bc254ac8', 'e88a8f520dde445484c0a9395e1a0599',
                     'cba570ae38f341faa6257342727377b7', '97953af1b97d4e268c52e1e54dcf421a', 'd200a61757d84b1dab8fbac35ff52c28', 
                     'fc68a5bb0a7b4b6386b3f08a69ead36f', '4a8210aec25e443391efb924cc0e5f23', '903742c353ce42c3ad9ab039fc418816', 
                     '2114e2a75304475fad06ad201948fbad', 'ac917eae407c4deb96625dd0dc2f2ba9', '3dddfb70e7cd40f18a63478654182e9a', 
                     'd3735ba212dd4c768e1675dca7bdcb6f', '7abe572148864412a33979592fa985fb', 'd3dff742d07942ca805c2f72e49e12c5' 
                     ]
        
        X_tr = data.loc[data.user_id.isin(train_ids), :]
        X_te = data.loc[~data.user_id.isin(train_ids), :]
        
        return X_tr, X_te
    
    raise NotImplementedError("Unknown split type")

In [None]:
def drop_columns(df: pd.DataFrame):
    """Return a copy of `df` without the columns that the model does not use.

    The input frame is left untouched. Every listed column must be present in `df`;
    pandas raises KeyError otherwise.
    """
    unused_columns = [
        # Trip bookkeeping and label metadata.
        'raw_trip', 'start_ts', 'start_loc', 'start_place', 'end_place', 'cleaned_trip',
        'inferred_labels', 'inferred_trip', 'expectation', 'confidence_threshold',
        'expected_trip', 'user_input',
        # Local start / end date-time components.
        'start:year', 'start:month', 'start:day', 'start:hour',
        'start_local_dt_minute', 'start_local_dt_second',
        'start_local_dt_weekday', 'start_local_dt_timezone',
        'end:year', 'end:month', 'end:day', 'end:hour',
        'end_local_dt_minute', 'end_local_dt_second',
        'end_local_dt_weekday', 'end_local_dt_timezone',
        # Identifiers and survey bookkeeping.
        '_id', 'metadata_write_ts', 'additions', 'mode_confirm', 'purpose_confirm',
        'distance_miles', 'Mode_confirm', 'Trip_purpose', 'original_user_id',
        'program', 'opcode', 'Timestamp', 'birth_year',
        # One-hot demographic columns.
        'gender_Man',
        'gender_Man;Nonbinary/genderqueer/genderfluid',
        'gender_Nonbinary/genderqueer/genderfluid',
        'gender_Prefer not to say',
        'gender_Woman',
        'gender_Woman;Nonbinary/genderqueer/genderfluid',
        'has_multiple_jobs_No',
        'has_multiple_jobs_Prefer not to say',
        'has_multiple_jobs_Yes',
        "highest_education_Bachelor's degree",
        'highest_education_Graduate degree or professional degree',
        'highest_education_High school graduate or GED',
        'highest_education_Less than a high school graduate',
        'highest_education_Prefer not to say',
        'highest_education_Some college or associates degree',
        'primary_job_type_Full-time',
        'primary_job_type_Part-time',
        'primary_job_type_Prefer not to say',
        # Remaining trip attributes and coordinates.
        'is_overnight_trip', 'n_working_residents',
        'start_lat', 'start_lng', 'end_lat', 'end_lng',
        'source', 'end_ts', 'end_fmt_time', 'end_loc',
    ]

    return df.drop(columns=unused_columns, inplace=False)

In [None]:
# Remove the columns listed in drop_columns.
processed = drop_columns(data)

# Split by user with the hard-coded list of training user ids (INTER_USER_STATIC).
train_df, test_df = get_train_test_splits(data=processed, how=SPLIT_TYPE.INTER_USER_STATIC, shuffle=True)

# Fit the per-mode section statistics on the training fold only.
scaler = SectionScaler()
scaler.compute_stats(train_df)

In [None]:
# Inspect the fitted per-mode [mean, std] of section distances.
print(scaler.dist)

In [None]:
# Standardise both folds with the statistics fitted on the training fold.
train_df = scaler.apply(train_df)
test_df = scaler.apply(test_df)

In [None]:
# Row / column counts of the two folds.
train_df.shape, test_df.shape

In [None]:
# Peek at the scaled section distances.
train_df.section_distances.head()

In [None]:
class SectionEmbedding(nn.Module):
    """Linear encoder/decoder pair with dropout and a LeakyReLU activation.

    NOTE(review): `forward` contains only a docstring, so calling the module returns
    None; the layers created in `__init__` are not used yet.
    """

    def __init__(self, input_dim, emb_dim=32):
        super(SectionEmbedding, self).__init__()
        self.dpt = nn.Dropout(0.2)
        # input_dim -> emb_dim and back to input_dim.
        self.encoder = nn.Linear(input_dim, emb_dim)
        self.decoder = nn.Linear(emb_dim, input_dim)
        self.act = nn.LeakyReLU()
    
    def forward(self, x):
        '''
        Input will be a one-hot encoded matrix, where nrows=number of modes, ncols=input_dim
        dim = (B, N, D)

        Not implemented yet: the body is empty, so None is returned.
        '''

In [None]:
class ReplacedModeDataset(Dataset):
    """Dataset that groups the rows of a frame by the value of its `ix` column.

    One item is the set of rows sharing an `ix` value, returned as four pandas
    objects: sequence features, static features, weather features and the target.
    The column is always read with bracket access (`df['ix']`): attribute access
    (`df.ix`) resolves to the legacy `.ix` indexer on older pandas versions.
    """

    # Static features that do not vary with time.
    DEMOGRAPHIC_FEATURES = ['n_residence_members', 
        'primary_job_commute_time', 'income_category',
        'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles', 'age', 
        'p_micro', 'walk', 's_micro', 'ridehail', 'car', 'transit', 's_car', 'no_trip', 'unknown',
        'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes', 
        'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', 
        'primary_job_description_Education', 'primary_job_description_Food service', 
        'primary_job_description_Manufacturing, construction, maintenance, or farming', 
        'primary_job_description_Medical/healthcare', 'primary_job_description_Other', 
        'primary_job_description_Professional, managerial, or technical', 
        'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport', 
        'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid', 
        'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown', 
        'primary_job_commute_mode_WFH', 'duration', 'distance']

    SEQ_FEATURES = ['section_distances', 'section_durations', 'section_modes', 'mph']

    WEATHER_FEATURES = ['temperature_2m (°F)', 
        'relative_humidity_2m (%)', 'dew_point_2m (°F)', 'rain (inch)', 'snowfall (inch)', 
        'wind_speed_10m (mp/h)', 'wind_gusts_10m (mp/h)']

    def __init__(self, df: pd.DataFrame):
        self.data = df
    
    def __len__(self):
        """Number of distinct values in the `ix` column."""
        return len(self.data['ix'].unique())
    
    def __getitem__(self, ix):
        """Return `(sequence, static, weather, target)` for the rows whose `ix` equals `ix`.

        The first three elements are DataFrames restricted to the respective feature
        columns; the last is the `target` Series. All are empty when no row matches.
        """
        # Could be between 1 - 15.
        sequence = self.data.loc[self.data['ix'] == ix, :]
        
        return (
            sequence[self.SEQ_FEATURES], sequence[self.DEMOGRAPHIC_FEATURES], 
            sequence[self.WEATHER_FEATURES], sequence['target']
        )

In [None]:
# Build the dataset on the training fold and look at a single item.
dset = ReplacedModeDataset(train_df)

print(dset[20])

In [None]:
# NOTE(review): `CustomDataset` is not defined in the visible part of this notebook (only
# `ReplacedModeDataset` is) — confirm which dataset class is intended before running this cell.
train_dset = CustomDataset(train_df)
test_dset = CustomDataset(test_df)

In [None]:
def collate(batch):
    """Assemble a list of `(X, y)` samples into padded batch tensors.

    Each `X` is indexable: position 0 holds the mode sequence, position 1 the metric
    sequence and the last position the static feature tensor.

    Returns:
        `((padded_modes, padded_metrics, stacked_features), stacked_targets, lengths)`
        where the sequences are zero-padded to the longest one in the batch
        (batch-first) and `lengths` lists the original mode-sequence lengths.
    """
    inputs, targets = zip(*batch)

    mode_seqs, metric_seqs, static_feats, lengths = [], [], [], []
    for sample in inputs:
        mode_seqs.append(sample[0])
        metric_seqs.append(sample[1])
        static_feats.append(sample[-1])
        lengths.append(len(sample[0]))

    padded_inputs = (
        pad_sequence(mode_seqs, batch_first=True),
        pad_sequence(metric_seqs, batch_first=True),
        torch.stack(static_feats),
    )

    return padded_inputs, torch.stack(targets), lengths

In [None]:
# Batches of 16 built with the custom `collate`; both loaders shuffle and keep the last partial batch.
train_loader = DataLoader(train_dset, batch_size=16, collate_fn=collate, shuffle=True, drop_last=False)
test_loader = DataLoader(test_dset, batch_size=16, collate_fn=collate, shuffle=True, drop_last=False)

In [None]:
# Pull one batch from the training loader to inspect its structure.
(modes, metrics, features), sY1, lX = next(iter(train_loader))

In [None]:
# Sizes of the padded metric and mode tensors of the sampled batch.
metrics.size(), modes.size()

In [None]:
# Dropout probability read by DilatedBlock, SelfAttention and LSTMLayer when they are constructed.
# Set to 0 for no dropout.
DROPOUT = 0.

In [None]:
import math

class GELU_new(nn.Module):
    """
    Tanh-based GELU activation, taken from the OpenAI GPT-2 implementation.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # 0.5 * x * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) ))
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))


class DilatedBlock(nn.Module):
    """Bias-free feed-forward block: n_c -> 4*n_c -> n_c with GELU_new and dropout."""

    def __init__(self, n_c):
        super().__init__()

        expanded = 4 * n_c
        layers = [
            nn.Linear(n_c, expanded, bias=False),
            GELU_new(),
            nn.Linear(expanded, n_c, bias=False),
            nn.Dropout(DROPOUT),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        out = self.block(x)
        return out

    
class SelfAttention(nn.Module):
    """Single attention head over a batch of sequences (scaled dot-product, no mask)."""

    def __init__(self, n_features, head_size):
        super().__init__()
        # Bias-free projections from n_features to head_size for keys, queries and values.
        self.k = nn.Linear(n_features, head_size, bias=False)
        self.q = nn.Linear(n_features, head_size, bias=False)
        self.v = nn.Linear(n_features, head_size, bias=False)
        self.dpt = nn.Dropout(DROPOUT)
        # Scaling factor applied to the raw attention scores.
        self.sqrt_d = torch.sqrt(torch.tensor(head_size))

    def forward(self, x):
        keys, queries, values = self.k(x), self.q(x), self.v(x)

        # Scores are Q.K^T divided by sqrt(head_size).
        scores = torch.bmm(queries, keys.transpose(1, 2)) / self.sqrt_d

        # Normalise over the last dimension, then apply dropout to the attention weights.
        attention = self.dpt(torch.softmax(scores, dim=-1))

        # Weighted sum of the values: (B, F, F) x (B, F, head_size) -> (B, F, head_size).
        return torch.bmm(attention, values)
        

class MultiHeadAttention(nn.Module):
    """Runs `n_heads` SelfAttention heads in parallel and projects their concatenation.

    Each head has size `n_dim // n_heads`; the output projection is a bias-free
    `n_dim -> n_dim` linear layer.
    """

    def __init__(self, n_heads, n_dim):
        super().__init__()

        # e.g. 64 dims, 4 heads => 16 dims per head.
        per_head = n_dim // n_heads
        self.heads = nn.ModuleList(SelfAttention(n_dim, per_head) for _ in range(n_heads))
        self.proj = nn.Linear(n_dim, n_dim, bias=False)

    def forward(self, x):
        # x is (B, seq, n_dim); head outputs are joined along the feature dimension.
        head_outputs = [head(x) for head in self.heads]
        joined = torch.cat(head_outputs, dim=-1)
        return self.proj(joined)


class Block(nn.Module):
    """Pre-norm residual block: 4-head attention followed by the feed-forward DilatedBlock."""

    def __init__(self, n_c):
        super().__init__()

        self.sa = MultiHeadAttention(n_heads=4, n_dim=n_c)
        self.dilated = DilatedBlock(n_c)
        self.ln1 = nn.LayerNorm(n_c)
        self.ln2 = nn.LayerNorm(n_c)

    def forward(self, x):
        # Each sub-layer sees a layer-normalised input and is added back to its input.
        attended = x + self.sa(self.ln1(x))
        return attended + self.dilated(self.ln2(attended))
    

class LSTMLayer(nn.Module):
    """Embeds section modes, appends them to the section metrics and runs a bidirectional LSTM.

    The mode embedding table has 7 entries of size 16, with index 0 as padding.
    `output_size` is accepted but not used by this layer.
    """

    def __init__(
        self, input_size: int, hidden_size: int, 
        output_size: int, n_lstm_layers: int = 1
    ):
        super().__init__()

        mode_emb_dim = 16

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(7, mode_emb_dim, padding_idx=0)
        self.dpt = nn.Dropout(DROPOUT)

        # The LSTM consumes the metrics concatenated with the mode embedding.
        self.lstm = nn.LSTM(
            input_size=input_size + mode_emb_dim,
            hidden_size=hidden_size,
            num_layers=n_lstm_layers,
            batch_first=True,
            bidirectional=True,
            bias=False,
        )

    def forward(self, modes, x, lengths):
        lstm_input = torch.cat([x, self.embedding(modes)], dim=-1)

        # Pack so that the LSTM skips the padded time steps, then pad the outputs again.
        packed_in = pack_padded_sequence(lstm_input, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed_in)
        padded_out, _ = pad_packed_sequence(packed_out, batch_first=True)

        return self.dpt(padded_out)


class Model(nn.Module):
    """LSTM encoder followed by two stacks of attention blocks and a linear classifier.

    The first stack works on the LSTM output (width `2 * hidden_size`); the second
    on that output concatenated with the static features (width
    `2 * hidden_size + n_features`). The number of blocks per stack is read from
    the required keyword arguments `l1_blocks` and `l2_blocks`.
    """

    def __init__(
        self, input_size: int, hidden_size: int, output_size: int, 
        n_features: int, n_lstm_layers: int = 1, **kwargs
    ):
        super().__init__()

        seq_dim = 2 * hidden_size
        fused_dim = seq_dim + n_features

        self.lstm = LSTMLayer(input_size, hidden_size, output_size, n_lstm_layers)

        self.block_l1 = nn.ModuleList(Block(seq_dim) for _ in range(kwargs['l1_blocks']))
        self.block_l2 = nn.ModuleList(Block(fused_dim) for _ in range(kwargs['l2_blocks']))
        self.final_proj = nn.Linear(fused_dim, output_size, bias=True)

    def forward(self, modes, x, features, lengths):
        batch_size = x.size(0)

        # (B, seq, hidden*2)
        hidden = self.lstm(modes, x, lengths)
        for blk in self.block_l1:
            hidden = blk(hidden)

        # Repeat the static features along the sequence axis and append them to every step.
        tiled = features.unsqueeze(1).expand(batch_size, hidden.size(1), -1)
        fused = torch.cat([hidden, tiled], dim=-1)

        for blk in self.block_l2:
            fused = blk(fused)

        # Average over the sequence axis, then project to the output size.
        pooled = fused.mean(dim=1)
        return self.final_proj(pooled)

In [None]:
import torch.nn.init as init

def init_weights(module):
    """Initialise the parameters of `module` in place according to its layer type.

    Intended to be passed to `nn.Module.apply`. Layer types not listed below are
    left untouched.

    - Embedding: N(0, 1), with the padding row (if any) zeroed.
    - LayerNorm: weight 1, bias 0.
    - BatchNorm1d: weight N(1, 0.02), bias 0.
    - Linear: Xavier-normal weight, N(0, 1) bias when present.
    - LSTM: orthogonal for parameters with 2+ dimensions, N(0, 1) otherwise.
    """
    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=1.0)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
    elif isinstance(module, nn.BatchNorm1d):
        # This branch used the undefined name `m`, which raised NameError for any BatchNorm1d.
        init.normal_(module.weight.data, mean=1, std=0.02)
        init.constant_(module.bias.data, 0)
    elif isinstance(module, nn.Linear):
        init.xavier_normal_(module.weight.data)
        if module.bias is not None:
            init.normal_(module.bias.data)
    elif isinstance(module, nn.LSTM):
        for param in module.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)

In [None]:
# Build the network and initialise its parameters with init_weights.
# NOTE(review): `num_classes` is not defined in the visible part of this notebook — confirm it
# is set before this cell runs.
# NOTE(review): input_size=3 and n_features=40 must match the tensors produced by the dataset /
# collate — verify against the dataset in use.
model = Model(
    n_lstm_layers=3,
    input_size=3,
    hidden_size=32, 
    output_size=num_classes,
    n_features=40,
    l1_blocks=4,
    l2_blocks=4
)

model = model.apply(init_weights)

print(model)

In [None]:
# Total number of parameters in the model.
print(sum(p.numel() for p in model.parameters()))

In [None]:
# Inverse-frequency class weights: n_rows / (n_distinct_labels * count_per_label).
# NOTE(review): this reads a `chosen` column, whereas the visible preprocessing stores the
# encoded label under `target` — confirm that `chosen` exists in train_df.
weights = train_df.shape[0]/(np.bincount(train_df.chosen.values) * len(np.unique(train_df.chosen)))

print(weights)

In [None]:
# Adam with a fixed initial learning rate, and cross-entropy weighted by the class weights above.
INIT_LR = 1e-3
optimizer = optim.Adam(model.parameters(), lr=INIT_LR)
criterion = nn.CrossEntropyLoss(weight=torch.Tensor(weights))

In [None]:
class Trainer:
    """Placeholder for a training wrapper; none of its methods does anything yet."""

    def __init__(self, model, tr_loader, te_loader):
        # Arguments are currently ignored.
        pass
    
    def set_optim_params(self, **kwargs):
        # Not implemented.
        pass
    
    def set_criterion(self, **kwargs):
        # Not implemented.
        pass

In [None]:
def train(epoch, model, loader, opt, criterion, val_ix):
    """Run one pass over `loader`, training on most batches and validating on the rest.

    Batches whose position in the loader is contained in `val_ix` are evaluated
    without gradient tracking; all other batches are used for an optimisation step.

    Args:
        epoch: current epoch number (not used inside the function).
        model: network called as `model(modes, metrics, features, lengths)`.
        loader: iterable yielding `((modes, metrics, features), y, lengths)`.
        opt: optimizer that is zeroed and stepped for every training batch.
        criterion: loss taking `(predictions, targets)`, both viewed as `(-1, num_classes)`.
        val_ix: container of batch positions to treat as validation batches.

    Returns:
        `(train_losses, val_losses)`: per-batch loss values of this pass.
    """
    print("\tBeginning training.")
    
    n_batches = len(loader)
    # At least 1 so that the modulo below cannot divide by zero for short loaders.
    print_every = max(1, n_batches // 5)
    
    train_losses, val_losses = [], []
    
    for ix, (X, y, lengths) in enumerate(loader):
        
        # Unpack X.
        modes, metrics, features = X
        # Cast y to appropriate type.
        y = y.float()
        
        if ix in val_ix:
            model.eval()
            with torch.no_grad():
                y_pred = model(modes, metrics.float(), features.float(), lengths)
                loss = criterion(y_pred.view(-1, num_classes), y.view(-1, num_classes))
                val_losses.append(loss.item())
        else:
            model.train()
            
            opt.zero_grad()

            y_pred = model(modes, metrics.float(), features.float(), lengths)
            loss = criterion(y_pred.view(-1, num_classes), y.view(-1, num_classes))
            train_losses.append(loss.item())

            loss.backward()

            # Step the optimizer that was passed in (and zeroed above), not a notebook global.
            opt.step()
            
        if ix and ix % print_every == 0:
            print(
                f"\t-> Train loss: {np.nanmean(train_losses)}\n\t-> Val loss: {np.nanmean(val_losses)}"
            )
            print('\t'+20*'*')

    print(50*'-')
    return train_losses, val_losses

In [None]:
def evaluate(model, loader, criterion):
    """Compute the loss of `model` on every batch of `loader` without updating it.

    The model is switched to eval mode and gradient tracking is disabled for the
    whole pass. A running average is printed roughly five times per pass.

    Args:
        model: network called as `model(modes, metrics, features, lengths)`.
        loader: iterable yielding `((modes, metrics, features), y, lengths)`.
        criterion: loss taking `(predictions, targets)`, both viewed as `(-1, num_classes)`.

    Returns:
        List with the loss value of every batch.
    """
    print("\tBeginning evaluation.")
    
    model.eval()
    
    # At least 1 so that the modulo below cannot divide by zero for short loaders.
    print_every = max(1, len(loader) // 5)
    
    losses = []
    
    # No gradients are needed for evaluation; skipping them avoids building the autograd graph.
    with torch.no_grad():
        for ix, (X, y, lengths) in enumerate(loader):
            
            modes, metrics, features = X

            y_pred = model(modes, metrics.float(), features.float(), lengths)
            y = y.float()
            
            loss = criterion(y_pred.view(-1, num_classes), y.view(-1, num_classes))

            losses.append(loss.item())
            
            if ix and ix % print_every == 0:
                print(f"\t -> Average loss: {np.nanmean(losses)}")

    return losses
In [None]:
from sklearn.metrics import f1_score


def _batch_labels(model, loader):
    """Return one `(predicted, true)` pair of 1-D class-index arrays per batch of `loader`.

    Reads the notebook-level global `num_classes`.
    """
    pairs = []
    with torch.no_grad():
        for X, y, lengths in loader:
            modes, metrics, features = X

            y_pred = model(modes, metrics.float(), features.float(), lengths).view(-1, num_classes)
            y = y.float().view(-1, num_classes)

            # Softmax is monotonic, so the argmax of the raw scores already
            # identifies the predicted class.
            preds = torch.argmax(y_pred, dim=-1).numpy().ravel()
            true = torch.argmax(y.long(), dim=-1).numpy().ravel()
            pairs.append((preds, true))
    return pairs


def _weighted_f1(pairs):
    """Weighted F1 over the concatenation of `(predicted, true)` label-array pairs."""
    if pairs:
        preds = np.concatenate([p for p, _ in pairs])
        true = np.concatenate([t for _, t in pairs])
    else:
        preds, true = np.array([]), np.array([])
    return f1_score(y_true=true, y_pred=preds, average='weighted')


def evaluate_f1(model, tr_loader, val_ix, te_loader=None):
    """Compute weighted F1 on the train/validation batches and optionally on a test loader.

    Args:
        model: Module called as `model(modes, metrics, features, lengths)`.
        tr_loader: Iterable yielding `((modes, metrics, features), y, lengths)`.
            Batches whose index is in `val_ix` count towards the validation
            score, all others towards the train score.
        val_ix: Container of batch indices held out for validation.
        te_loader: Optional iterable with the same batch layout; when omitted
            (or when it yields no batches) the test score is `None`.

    Returns:
        Tuple `(tr_f1, val_f1, te_f1)`; `te_f1` is `None` without test data.

    Note:
        Reads the notebook-level global `num_classes` and leaves the model in
        eval mode.
    """
    model.eval()
    print("\tEvaluating F1...")

    # Collect per-batch arrays in lists and concatenate once, instead of
    # re-allocating a growing array with `np.append` on every batch.
    tr_pairs, val_pairs = [], []
    for ix, pair in enumerate(_batch_labels(model, tr_loader)):
        if ix in val_ix:
            val_pairs.append(pair)
        else:
            tr_pairs.append(pair)

    tr_f1 = _weighted_f1(tr_pairs)
    val_f1 = _weighted_f1(val_pairs)
    print(f"\t -> Train F1: {tr_f1}, Val F1: {val_f1}")

    # `is None` rather than truthiness: `not loader` calls `__len__`, which
    # raises for loaders that have no length.
    if te_loader is None:
        return tr_f1, val_f1, None

    te_pairs = _batch_labels(model, te_loader)
    if not te_pairs:
        # An empty test loader leaves nothing to score.
        return tr_f1, val_f1, None

    te_f1 = _weighted_f1(te_pairs)
    print(f"\t -> Test F1: {te_f1}")

    return tr_f1, val_f1, te_f1

In [None]:
# Epoch budget and step-wise learning-rate decay schedule.
num_epochs, num_decays = 18, 6
decay = 0.9
decay_at = num_epochs // num_decays  # number of epochs between two LR decays
eval_every = 3

# Hold out a fixed fraction of the training batches (by batch index) for validation.
val_split = 0.2
n_batches = len(train_loader)
n_val_batches = int(val_split * n_batches)
val_batches = np.random.choice(n_batches, size=n_val_batches, replace=False)

# Preview the learning rate that applies during each decay stage.
for power in range(num_decays):
    stage_start, stage_end = decay_at * power, decay_at * (power + 1)
    stage_lr = INIT_LR * decay**power
    print(f"{stage_start} - {stage_end} :: {stage_lr:.5f}")

In [None]:
# We'd like to start at a loss of at most -ln(1/9) ~ 2.20

# Per-batch losses accumulated over all epochs.
tr_losses, val_losses = list(), list()
save_at_best_loss = True
best_loss = np.inf
model_name = "../models/LSTM_{epoch}_{loss}.pt"
# `delta` counts consecutive epochs without a validation-loss improvement;
# training stops as soon as it exceeds `patience`.
patience, delta = 2, 0

for epoch_ix in range(1, num_epochs+1):
    print(f"Epoch {epoch_ix}:")
    tr_loss, val_loss = train(epoch_ix, model, train_loader, optimizer, criterion, val_batches)

    tr_losses.extend(tr_loss)
    val_losses.extend(val_loss)

    # NaN when the epoch produced no validation batches; NaN never compares
    # as an improvement below.
    mean_val_loss = np.nanmean(val_loss) if val_loss else np.nan

    if epoch_ix % eval_every == 0:
        tr_f1, val_f1, _ = evaluate_f1(model, train_loader, val_batches)

    # Track improvement independently of checkpointing: with
    # `save_at_best_loss = False` the best loss must still be updated,
    # otherwise every epoch counts as "no improvement" and training always
    # stops after `patience + 1` epochs.
    if mean_val_loss < best_loss:
        best_loss = mean_val_loss

        # Reset delta.
        delta = 0

        if save_at_best_loss:
            loss_str = str(best_loss).replace(".", "_")
            torch.save(model.state_dict(), model_name.format(epoch=str(epoch_ix), loss=loss_str))
            print("\tSaved model checkpoint.")
    else:
        # Increase delta by 1.
        delta += 1
        print(f"\tLoss did not decrease. Status is now {delta}/{patience}")

    # Tolerate for `patience` epochs.
    if delta > patience:
        # Stop training.
        print(f"\tStopping early after epoch {epoch_ix}.")
        break

    if epoch_ix % decay_at == 0:
        optimizer.param_groups[0]['lr'] *= decay
        print(f"\tLearning rate is now: {optimizer.param_groups[0]['lr']:.5f}")

    print(50*'-')

In [None]:
# Single loss pass over the test loader with the current `model`.
evaluate(model=model, loader=test_loader, criterion=criterion)

In [None]:
# Final weighted F1 on the train, validation and test splits.
final_tr_f1, final_val_f1, te_f1 = evaluate_f1(
    model=model,
    tr_loader=train_loader,
    val_ix=val_batches,
    te_loader=test_loader,
)

In [None]:
# fig, ax = plt.subplots(figsize=(10, 6))
# ax.plot(tr_losses, 'r-')
# ax.plot(val_losses, 'b-')
# ax.set_title('Training and Validation losses')
# plt.legend(['Training loss', 'Validation loss'], loc='best')
# plt.tight_layout()
# plt.show()

## Benchmarking



epochs = 30

```
LR scheme:
0 - 5 :: 0.00070
5 - 10 :: 0.00067
10 - 15 :: 0.00063
15 - 20 :: 0.00060
20 - 25 :: 0.00057
25 - 30 :: 0.00054
```

```python
model = Model(
    n_lstm_layers=1,
    input_size=3,
    hidden_size=16, 
    output_size=9,
    n_features=40,
    l1_blocks=6,
    l2_blocks=6
)
```

\# params: ~450k

mode_embedding = 4

Best stats:
	 -> Train F1: 0.7047532574096045
	 -> Test F1: 0.6560129685481192

<hr />

epochs = 40

Same LR scheme as above.

```python
model = Model(
    n_lstm_layers=3,
    input_size=3,
    hidden_size=32, 
    output_size=9,
    n_features=40,
    l1_blocks=4,
    l2_blocks=4
)
```

\# params: 770k

mode_embedding = 4

Best stats:
	 -> Train F1: 0.7365035440256072
	 -> Test F1: 0.6610215030981759
     
 <hr />