In [1]:
import seaborn as sns
from abc import ABC, abstractmethod
from typing import List
import ast

In [2]:
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from enum import Enum
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

%matplotlib inline

In [3]:
# Global experiment flags and variables.
SEED = 19348
TARGETS = ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']

# Set the Numpy seed too.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd481a0e970>

In [4]:
# lstm = nn.LSTM(12, 64, batch_first=True, bidirectional=True, bias=False, num_layers=3)
# inp = torch.FloatTensor(8, 3, 12).fill_(0.)
# out, _ = lstm(inp)

# print(out.size())

In [5]:
class SPLIT_TYPE(Enum):
    INTRA_USER = 0
    INTER_USER = 1
    TARGET = 2
    MIXED = 3
    

class SPLIT(Enum):
    TRAIN = 0
    TEST = 1


def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):

    n_users = list(data.user_id.unique())
    n = data.shape[0]
    
    if shuffle:
        data = data.sample(data.shape[0], random_state=SEED).reset_index(drop=True, inplace=False)
        
    # There are certain users with only one observation. What do we do with those?
    # As per the mobilitynet modeling pipeline, we randomly assign them to either the
    # training or test set.

    value_counts = data.user_id.value_counts()
    single_count_ids = value_counts[value_counts == 1].index

    data_filtered = data.loc[~data.user_id.isin(single_count_ids), :].reset_index(drop=True)
    data_single_counts = data.loc[data.user_id.isin(single_count_ids), :].reset_index(drop=True)

    X_tr, X_te = train_test_split(
        data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id,
        random_state=SEED
    )

    data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))
    X_tr_merged = pd.concat(
        [X_tr, data_single_counts.loc[data_single_counts.assigned == 'train', :].drop(
            columns=['assigned'], inplace=False
        )],
        ignore_index=True, axis=0
    )

    X_te_merged = pd.concat(
        [X_te, data_single_counts.loc[data_single_counts.assigned == 'test', :].drop(
            columns=['assigned'], inplace=False
        )],
        ignore_index=True, axis=0
    )

    return X_tr_merged, X_te_merged

In [6]:
# RM_weather contains more samples because data is not processed.
# _Fix contains lesser samples, but is cleaned using heuristics.

data = pd.read_csv('../data/FULL_preprocessed_data_RM_weather.csv')

In [7]:
class DataPipeline(ABC):
    
    def __init__(self, df: pd.DataFrame):
        self.df = df
    
    # Establish hooks.
    @abstractmethod
    def remove_negative_sections(self):
        raise NotImplementedError()
    
    @abstractmethod
    def get_mode_outliers(self):
        raise NotImplementedError()
    
    @abstractmethod
    def drop_outliers(self, outliers: List):
        raise NotImplementedError()
        
    @abstractmethod
    def compute_mph_feature(self):
        raise NotImplementedError()
    
    def run_pipeline(self):
        self.remove_negative_sections()
        outlier_ix = self.get_mode_outliers()
        self.drop_outliers(list(outlier_ix))
        return self.compute_mph_feature()

In [8]:
class SectionDataPipeline(DataPipeline):
    def __init__(self, df: pd.DataFrame):
        super().__init__(df)
    
    def remove_negative_sections(self):
        f_rows = list()
        for _, row in self.df[['section_durations', 'section_distances', 'section_modes']].iterrows():

            dist = np.array(ast.literal_eval(row['section_distances']))
            dur = np.array(ast.literal_eval(row['section_durations']))
            modes = np.array(ast.literal_eval(row['section_modes']))

            assert len(dist) == len(dur) == len(modes)

            mask = np.logical_and(dist > 0, dur > 0)

            f_dist, f_dur, f_modes = dist[mask], dur[mask], modes[mask]

            assert len(f_dist) == len(f_dur) == len(f_modes)

            f_rows.append({
                # scale to miles.
                'distances': f_dist * 0.00062,
                # Scale to hours.
                'durations': f_dur / 3600.,
                'modes': f_modes
            })

        parsed = pd.DataFrame(f_rows, index=data.index)

        self.df.drop(columns=['section_durations', 'section_distances', 'section_modes'], inplace=True)
        self.df = pd.concat([self.df, parsed], axis=1)
    
    def get_mode_outliers(self):
        
        def compute_outliers(mode: str):
            x = self.df[self.df.modes.str.contains(mode, regex=False)]
            outlier_ix = []
            dist, dur = np.array([]), np.array([])

            # First, iterate to compute the 99th percentile for the mode.
            for row_ix, row in x.iterrows():
                ix = np.where(row['modes'] == mode)[0]
                dist = np.append(dist, row['distances'][ix])
                dur = np.append(dur, row['durations'][ix])

            dist_99p = np.percentile(dist, 99)
            dur_99p = np.percentile(dur, 99)

            print(f"99th Percentile for {mode}:\n\tdistance = {dist_99p}\n\tduration = {dur_99p}")

            # Iterate again.
            for row_ix, row in x.iterrows():
                ix = np.where(row['modes'] == mode)[0]
                row_distances = row['distances'][ix]
                row_durations = row['durations'][ix]

                if np.any(np.logical_or(row_distances > dist_99p, row_durations > dur_99p)):
                    outlier_ix.append(row_ix)

            print("Number of candidate rows for deletion: ", len(outlier_ix))

            return outlier_ix
        
        all_bad_ix = set()
        for mode in ['walking', 'bicycling', 'train', 'bus', 'car']:
            bad_mode_ix = compute_outliers(mode)
            all_bad_ix = all_bad_ix.union(set(bad_mode_ix))
            print(50*'=')
        
        print(f"After analysis, {len(all_bad_ix)} total unique rows will be dropped.")
        
        return all_bad_ix
    
    def drop_outliers(self, outliers: List):
        self.df.drop(index=outliers, inplace=True)
    
    def compute_mph_feature(self):
        self.df['mph'] = self.df['distances']/self.df['durations']
        return self.df

In [9]:
pipeline = SectionDataPipeline(data)
cleaned = pipeline.run_pipeline()

99th Percentile for walking:
	distance = 4.305297486707881
	duration = 2.9548623706234824
Number of candidate rows for deletion:  765
99th Percentile for bicycling:
	distance = 8.82048257219703
	duration = 0.9153121545202207
Number of candidate rows for deletion:  228
99th Percentile for train:
	distance = 16.629460194644828
	duration = 0.7250753228180938
Number of candidate rows for deletion:  2
99th Percentile for bus:
	distance = 7.551944729816364
	duration = 0.7316025587360053
Number of candidate rows for deletion:  12
99th Percentile for car:
	distance = 40.548478579524726
	duration = 1.2145738347887987
Number of candidate rows for deletion:  721
After analysis, 1718 total unique rows will be dropped.


In [10]:
def preprocess(df: pd.DataFrame):
    
    # Offset by -1 for torch.
    df.chosen -= 1
    
#     df['start_fmt_time'] = pd.to_datetime(df['start_fmt_time'], utc=True)
#     df['start_fmt_time'] = df['start_fmt_time'].dt.tz_convert('America/Denver')
    
    df.rename(
        columns={'start_local_dt_weekday': 'start:DOW', 'end_local_dt_weekday': 'end:DOW'},
        inplace=True
    )

    df.n_working_residents = df.n_working_residents.apply(lambda x: 0 if x < 0 else x)

    # Fix some age preprocessing issues.
    df.age = df.age.apply(lambda x: x if x < 100 else 2024-x)
    
    return df

In [11]:
def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict):
    
    X_features = ['section_distance_argmax', 'age']
    
    if 'mph' in df.columns:
        X_features += ['mph']
    
    if dset == SPLIT.TRAIN and model_dict is None:
        model_dict = dict()
    
    if dset == SPLIT.TEST and model_dict is None:
        raise AttributeError("Expected model dict for testing.")
    
    if dset == SPLIT.TRAIN:
        for section_mode in df.section_mode_argmax.unique():
            section_data = df.loc[df.section_mode_argmax == section_mode, :]
            if section_mode not in model_dict:
                model_dict[section_mode] = dict()

                model = LinearRegression(fit_intercept=True)

                X = section_data[
                    X_features
                ]
                Y = section_data[['section_duration_argmax']]

                model.fit(X, Y.values.ravel())

                r2 = r2_score(y_pred=model.predict(X), y_true=Y.values.ravel())
                print(f"Train R2 for {section_mode}: {r2}")

                model_dict[section_mode]['model'] = model
                
    elif dset == SPLIT.TEST:
        for section_mode in df.section_mode_argmax.unique():
            section_data = df.loc[df.section_mode_argmax == section_mode, :]
            X = section_data[
                X_features
            ]
            
            Y = section_data[['section_duration_argmax']]
            
            y_pred = model_dict[section_mode]['model'].predict(X)
            r2 = r2_score(y_pred=y_pred, y_true=Y.values.ravel())
            print(f"Test R2 for {section_mode}: {r2}")
    
    # Create the new columns for the duration.
    df[TARGETS] = 0
    df['temp'] = 0
    
    for section in df.section_mode_argmax.unique():
        X_section = df.loc[df.section_mode_argmax == section, X_features]
        
        # broadcast to all columns.
        df.loc[df.section_mode_argmax == section, 'temp'] = model_dict[section]['model'].predict(X_section)
    
    for c in TARGETS:
        df[c] = df['av_' + c] * df['temp']
    
    df.drop(columns=['temp'], inplace=True)
    
    df.rename(columns=dict([(x, 'tt_'+x) for x in TARGETS]), inplace=True)
    
    # return model_dict, result_df
    return model_dict, df

In [12]:
def drop_columns(df: pd.DataFrame):
        u_time_features = [
            'start:year', 'start:month', 'start:day',
            'start:hour', 'end_fmt_time', 'end:year',
            'end:month', 'end:day', 'end:hour', 'end:n_days_in_month', 
            'start:sin_DOM', 'start:sin_MOY', 'start:cos_MOY', 'start:cos_DOM',
            'end:sin_DOM', 'end:sin_MOY', 'end:cos_DOM', 'end:cos_MOY', 'start:n_days_in_month',
            'start_local_dt_weekday', 'end_local_dt_weekday', 'start_fmt_time'
        ]

        u_user_features = [
            '_id', 'original_user_id', 'gender', 'birth_year', 'user_id'
        ]

        u_trip_features = [
            'cleaned_trip', 'Mode_confirm', 'available_modes', 'duration', 'start_loc',
            'end_loc', 'section_locations_argmax', 'section_coordinates_argmax',
            'start_lat', 'start_lng', 'end_lat', 'end_lng', 'section_duration_argmax',
            'section_distance_argmax', 'section_mode_argmax'
        ]

        # Drop section_mode_argmax and available_modes.
        return df.drop(
            columns=u_time_features + u_user_features + u_trip_features, 
            inplace=False
        )

In [13]:
train_df, test_df = get_train_test_splits(data=cleaned, how=SPLIT_TYPE.INTRA_USER, shuffle=True)

In [14]:
train_df, test_df = preprocess(drop_columns(train_df)), preprocess(drop_columns(test_df))

In [15]:
(train_df.columns == test_df.columns).all()

True

In [16]:
from sklearn.preprocessing import StandardScaler

def normalize_data(
    df: pd.DataFrame, split: SPLIT, section_scalers: dict = None, scaler: StandardScaler = None
):
    # We start by normalizing the temporal features first. This has to be done carefully.
    
    if split == SPLIT.TRAIN:
    
        mode_dict = dict()

        for _, row in df[['modes', 'durations', 'distances', 'mph']].iterrows():

            for (mode, duration, distance, mph) in zip(
                row['modes'], row['durations'], row['distances'], row['mph']
            ):
                if mode not in mode_dict:
                    mode_dict[mode] = {
                        'duration': np.array([duration]), 
                        'distance': np.array([distance]),
                        'mph': np.array([mph])
                    }
                else:
                    mode_dict[mode] = {
                        'duration': np.append(mode_dict[mode]['duration'], duration),
                        'distance': np.append(mode_dict[mode]['distance'], distance),
                        'mph': np.append(mode_dict[mode]['mph'], mph)
                    }

        section_scalers = dict()

        for mode in mode_dict.keys():
            # Fit, but don't transform. Instead, get the mean and scale.
            section_scalers[mode] = {'duration': dict(), 'distance': dict(), 'mph': dict()}
            
            section_scalers[mode]['duration']['mean'] = mode_dict[mode]['duration'].mean()
            section_scalers[mode]['duration']['std'] = mode_dict[mode]['duration'].std()
            
            section_scalers[mode]['distance']['mean'] = mode_dict[mode]['distance'].mean()
            section_scalers[mode]['distance']['std'] = mode_dict[mode]['distance'].std()
            
            section_scalers[mode]['mph']['mean'] = mode_dict[mode]['mph'].mean()
            section_scalers[mode]['mph']['std'] = mode_dict[mode]['mph'].std()
    
    # Normalize the temporal features.
    rows = []
    for _, row in df.iterrows():
        row_dict = row.to_dict()
        for ix, mode in enumerate(row_dict['modes']):
            row_dict['durations'][ix] = (
                row_dict['durations'][ix] - section_scalers[mode]['duration']['mean']
            )/section_scalers[mode]['duration']['std']
            
            row_dict['distances'][ix] = (
                row_dict['distances'][ix] - section_scalers[mode]['distance']['mean']
            )/section_scalers[mode]['distance']['std']
            
            row_dict['mph'][ix] = (
                row_dict['mph'][ix] - section_scalers[mode]['mph']['mean']
            )/section_scalers[mode]['mph']['std']
        
        rows.append(row_dict)
        
    df = pd.DataFrame(rows)
    
    # -----------------------------------------
    # Now, we start with the regular features.
    # -----------------------------------------
    
    required_features = [
        'income_category', 'n_motor_vehicles', 'n_residence_members', 'n_residents_u18',
        'n_residents_with_license', 'distance_miles', 'age', 'n_working_residents', 
        'start:sin_HOD', 'start:cos_HOD', 'end:sin_HOD', 'end:cos_HOD', 
        'temperature_2m (°F)', 'relative_humidity_2m (%)', 'dew_point_2m (°F)', 
        'rain (inch)', 'snowfall (inch)', 'cloud_cover (%)', 'wind_speed_10m (mp/h)',
        'wind_gusts_10m (mp/h)', 'cost_p_micro', 'cost_no_trip', 'cost_s_car', 
        'cost_transit', 'cost_car', 'cost_s_micro', 'cost_ridehail', 'cost_walk', 'cost_unknown',
       ]
    
    if split == SPLIT.TRAIN:
        scaler = StandardScaler()
    
        t_df = pd.DataFrame(
            scaler.fit_transform(df[required_features]),
            columns=required_features,
            index=df.index
        )
    else:
        t_df = pd.DataFrame(
            scaler.transform(df[required_features]),
            columns=required_features,
            index=df.index
        )
    
    df.drop(columns=required_features, inplace=True)
    
    return pd.concat([df, t_df], axis=1), section_scalers, scaler

In [17]:
train_df, section_scaler, scaler = normalize_data(train_df, SPLIT.TRAIN)
test_df, _, _ = normalize_data(test_df, SPLIT.TEST, section_scaler, scaler)

In [18]:
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [19]:
print(f"Final features for training:\n{list(train_df.columns)}")
display(train_df.head())

Final features for training:
['is_student', 'is_overnight_trip', 'is_male', 'mph', 'chosen', 'av_no_trip', 'av_s_car', 'av_p_micro', 'av_walk', 'av_s_micro', 'av_transit', 'av_ridehail', 'av_car', 'av_unknown', 'distances', 'durations', 'modes', 'income_category', 'n_motor_vehicles', 'n_residence_members', 'n_residents_u18', 'n_residents_with_license', 'distance_miles', 'age', 'n_working_residents', 'start:sin_HOD', 'start:cos_HOD', 'end:sin_HOD', 'end:cos_HOD', 'temperature_2m (°F)', 'relative_humidity_2m (%)', 'dew_point_2m (°F)', 'rain (inch)', 'snowfall (inch)', 'cloud_cover (%)', 'wind_speed_10m (mp/h)', 'wind_gusts_10m (mp/h)', 'cost_p_micro', 'cost_no_trip', 'cost_s_car', 'cost_transit', 'cost_car', 'cost_s_micro', 'cost_ridehail', 'cost_walk', 'cost_unknown']


Unnamed: 0,is_student,is_overnight_trip,is_male,mph,chosen,av_no_trip,av_s_car,av_p_micro,av_walk,av_s_micro,...,wind_gusts_10m (mp/h),cost_p_micro,cost_no_trip,cost_s_car,cost_transit,cost_car,cost_s_micro,cost_ridehail,cost_walk,cost_unknown
0,0,0.0,1,[-0.6226942218711282],8,0,0,0,1,0,...,-1.118155,0.0,0.0,-0.614415,-0.286713,-0.550431,-0.431234,-0.592465,0.0,0.0
1,0,0.0,1,[-0.32572174462565845],8,0,1,1,0,0,...,-0.598757,0.0,0.0,0.102862,-0.529027,0.167956,-0.431234,0.128516,0.0,0.0
2,0,0.0,1,[1.2722796444295434],1,0,1,1,1,0,...,0.046555,0.0,0.0,-0.409306,-0.304397,-0.550431,-0.431234,-0.386296,0.0,0.0
3,0,0.0,1,[1.785796573896156],2,0,0,1,0,0,...,1.305702,0.0,0.0,-0.614415,-0.529027,-0.550431,-0.431234,-0.592465,0.0,0.0
4,1,0.0,0,"[-0.12709085963426053, -0.8967034453081592]",1,0,1,1,1,0,...,-0.457104,0.0,0.0,-0.177887,-0.529027,-0.113227,-0.431234,-0.153682,0.0,0.0


In [20]:
class CustomDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.seq_df = df[['modes', 'durations', 'distances', 'mph']]
        self.chosen = df['chosen']
        self.features = df.drop(columns=[
            'modes', 'durations', 'distances', 'mph', 'chosen', 'distance_miles'
        ])
        
        # Start from 1 so that 0 becomes padding.
        self.modes_dict = {
            m: ix+1 for (ix, m) in enumerate(['walking', 'car', 'train', 'bus', 'bicycling', 'no_sensed'])
        }
    
    def __len__(self):
        return self.seq_df.shape[0]
    
    def __getitem__(self, ix):
        
        t_modes = list()
        t_metrics = list()
        
        chosen = self.chosen[ix]
        seq = self.seq_df.loc[ix, :]
        
        # OHE the target.
        y = F.one_hot(torch.tensor(chosen), num_classes=9)
        
        # Parse the sequence.
        for seq_ix in range(len(seq['modes'])):
            mode_ix = self.modes_dict[seq['modes'][seq_ix]]
            
            t_modes.append(torch.tensor(mode_ix))
            
            # distances, durations.
            seq_vector = torch.cat(
                [
                    torch.tensor([seq['durations'][seq_ix]]), 
                    torch.tensor([seq['distances'][seq_ix]]), 
                    torch.tensor([seq['mph'][seq_ix]])
                ], dim=-1
            )
            
            t_metrics.append(seq_vector)
        
        # Flat feature vector.
        features = torch.tensor(self.features.loc[ix, :].values)
        
        return [torch.stack(t_modes).long(), torch.stack(t_metrics), features], y

In [21]:
train_dset = CustomDataset(train_df)
test_dset = CustomDataset(test_df)

In [22]:
def collate(batch):
    X, y = zip(*batch)
    
    seq_modes = [x[0] for x in X]
    seq_metrics = [x[1] for x in X]
    features = [x[-1] for x in X]

    padded_seq = pad_sequence([s for s in seq_modes], batch_first=True)
    padded_metrics = pad_sequence([m for m in seq_metrics], batch_first=True)
    lengths = [len(seq) for seq in seq_modes]
    stacked_features = torch.stack(features)

    return (padded_seq, padded_metrics, stacked_features), torch.stack(y), lengths

In [23]:
train_loader = DataLoader(train_dset, batch_size=16, collate_fn=collate, shuffle=True, drop_last=False)
test_loader = DataLoader(test_dset, batch_size=8, collate_fn=collate, shuffle=False, drop_last=False)

In [24]:
(modes, metrics, features), sY1, lX = next(iter(train_loader))

In [25]:
metrics.size(), modes.size()

(torch.Size([16, 6, 3]), torch.Size([16, 6]))

In [26]:
# Set to 0 for no dropout.
DROPOUT = 0.

In [27]:
import math

def new_gelu(x):
    """
    Taken from OpenAI GPT-2 implementation.
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


class DilatedBlock(nn.Module):
    def __init__(self, n_c):
        super(DilatedBlock, self).__init__()
        
        self.block = nn.Sequential(
            nn.Linear(n_c, 4*n_c, bias=False),
            nn.ELU(),
            nn.Linear(4*n_c, n_c, bias=False),
            nn.Dropout(DROPOUT)
        )
    
    def forward(self, x):
        return self.block(x)

    
class SelfAttention(nn.Module):
    def __init__(self, n_features, head_size):
        super(SelfAttention, self).__init__()
        # in: (B, F, 64)
        self.k = nn.Linear(n_features, head_size, bias=False)
        self.q = nn.Linear(n_features, head_size, bias=False)
        self.v = nn.Linear(n_features, head_size, bias=False)
        self.dpt = nn.Dropout(DROPOUT)
        self.sqrt_d = torch.sqrt(torch.tensor(head_size))
        
    def forward(self, x):
        k = self.k(x)
        q = self.q(x)
        v = self.v(x)
        
        # Q.K.t
        dot = torch.bmm(q, k.permute(0, 2, 1))
        
        # normalize dot product.
        dot /= self.sqrt_d
        
        # softmax over -1 dim.
        softmax = self.dpt(torch.softmax(dot, dim=-1))
        
        # dot with values. (B, F, F) * (B, F, x) = (B, F, x)
        return torch.bmm(softmax, v)
        

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, n_dim):
        super(MultiHeadAttention, self).__init__()
        
        # 64 dims, 4 heads => 16 dims per head.
        head_size = n_dim//n_heads
        self.heads = nn.ModuleList([SelfAttention(n_dim, head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_dim, n_dim, bias=False)
    
    def forward(self, x):
        # x is (B, seq, n_dim)
        cat = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.proj(cat)


class Block(nn.Module):
    def __init__(self, n_c):
        super(Block, self).__init__()
        
        self.sa = MultiHeadAttention(n_heads=4, n_dim=n_c)
        self.dilated = DilatedBlock(n_c)
        self.ln1 = nn.LayerNorm(n_c)
        self.ln2 = nn.LayerNorm(n_c)
        
        
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.dilated(self.ln2(x))
        return x
    

class LSTMLayer(nn.Module):
    def __init__(
        self, input_size: int, hidden_size: int, 
        output_size: int, n_lstm_layers: int = 1
    ):
        super(LSTMLayer, self).__init__()
        
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(7, 4, padding_idx=0)
        self.dpt = nn.Dropout(DROPOUT)
        
        self.lstm = nn.LSTM(
            input_size=input_size + 4,
            hidden_size=hidden_size,
            bias=False,
            bidirectional=True,
            batch_first=True,
            num_layers=n_lstm_layers
        )
    
    def forward(self, modes, x, lengths):
        mode_emb = self.embedding(modes)
        x = torch.cat([x, mode_emb], dim=-1)
        
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(out, batch_first=True)
        
        return self.dpt(unpacked)


class Model(nn.Module):
    def __init__(
        self, input_size: int, hidden_size: int, output_size: int, 
        n_features: int, n_lstm_layers: int = 1, **kwargs
    ):
        super(Model, self).__init__()
        
        block1_ip_dim = hidden_size*2
        block2_ip_dim = (hidden_size*2) + n_features
        
        self.lstm = LSTMLayer(
            input_size, hidden_size, 
            output_size, n_lstm_layers
        )
        
        self.block_l1 = nn.ModuleList([Block(block1_ip_dim) for _ in range(kwargs['l1_blocks'])])
        self.block_l2 = nn.ModuleList([Block(block2_ip_dim) for _ in range(kwargs['l2_blocks'])])
        self.final_proj = nn.Linear(block2_ip_dim, output_size, bias=True)
    
    def forward(self, modes, x, features, lengths):
        
        b = x.size(0)
        
        # Out = (B, seq, hidden*2)
        lstm_out = self.lstm(modes, x, lengths)
        
        # Pass the raw output through the blocks.
        for module in self.block_l1:
            lstm_out = module(lstm_out)
        
        features_rshp = features.unsqueeze(1).expand(b, lstm_out.size(1), -1)
        
        # Out = (B, seq, n+40)
        cat = torch.cat([lstm_out, features_rshp], dim=-1)
        
        for module in self.block_l2:
            cat = module(cat)
        
        # (8, 3, 104) -> (B, 104)
        # flattened = cat.view(b, -1)
        
        # proj = self.runtime_ffw(flattened.size(-1), 64)(flattened)
        proj = cat.mean(dim=1)
        
        return self.final_proj(proj)

In [28]:
import torch.nn.init as init

def init_weights(module):
    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=1.0)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
    elif isinstance(module, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(module, nn.Linear):
        init.xavier_normal_(module.weight.data)
        if module.bias is not None:
            init.normal_(module.bias.data)
    elif isinstance(module, nn.LSTM):
        for param in module.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)

In [38]:
model = Model(
    n_lstm_layers=1,
    input_size=3,
    hidden_size=16, 
    output_size=9,
    n_features=40,
    l1_blocks=6,
    l2_blocks=6
)

model = model.apply(init_weights)

print(model)

Model(
  (lstm): LSTMLayer(
    (embedding): Embedding(7, 4, padding_idx=0)
    (dpt): Dropout(p=0.0, inplace=False)
    (lstm): LSTM(7, 16, bias=False, batch_first=True, bidirectional=True)
  )
  (block_l1): ModuleList(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0): SelfAttention(
            (k): Linear(in_features=32, out_features=8, bias=False)
            (q): Linear(in_features=32, out_features=8, bias=False)
            (v): Linear(in_features=32, out_features=8, bias=False)
            (dpt): Dropout(p=0.0, inplace=False)
          )
          (1): SelfAttention(
            (k): Linear(in_features=32, out_features=8, bias=False)
            (q): Linear(in_features=32, out_features=8, bias=False)
            (v): Linear(in_features=32, out_features=8, bias=False)
            (dpt): Dropout(p=0.0, inplace=False)
          )
          (2): SelfAttention(
            (k): Linear(in_features=32, out_features=8, bias=False)
            (q

In [39]:
print(sum(p.numel() for p in model.parameters()))

453101


In [40]:
weights = train_df.shape[0]/(np.bincount(train_df.chosen.values) * len(np.unique(train_df.chosen)))

print(weights)

[ 1.31637209  0.45785206  0.97595891  2.59617829  0.35062587 72.2209596
  4.71122642  1.08826104  1.53996715]


In [41]:
INIT_LR = 7e-4
optimizer = optim.Adam(model.parameters(), lr=INIT_LR)
criterion = nn.CrossEntropyLoss(weight=torch.Tensor(weights))

In [42]:
# sX, sY, sL = next(iter(train_loader))

# print(sX.size())

In [43]:
def train(epoch, model, loader, opt, criterion):
    
    print("\tBeginning training.")
    
    model.train()
    
    print_every = len(loader)//5
    
    losses = []
    
    for ix, (X, y, lengths) in enumerate(loader):
        
        modes, metrics, features = X
        y = y.float()

        opt.zero_grad()

        y_pred = model(modes, metrics.float(), features.float(), lengths)

        loss = criterion(y_pred.view(-1, 9), y.view(-1, 9))

        losses.append(loss.item())
        
        if ix and ix % print_every == 0:
            print(f"\t-> Average loss: {np.nanmean(losses)}")

        loss.backward()

        optimizer.step()

    print(50*'-')

In [44]:
def evaluate(epoch, model, loader, criterion):
    
    print("\tBeginning evaluation.")
    
    model.eval()
    
    print_every = len(loader)//5
    
    losses = []
    
    for ix, (X, y, lengths) in enumerate(loader):
        
        modes, metrics, features = X

        y_pred = model(modes, metrics.float(), features.float(), lengths)
        y = y.float()
        
        loss = criterion(y_pred.view(-1, 9), y.view(-1, 9))

        losses.append(loss.item())
        
        if ix and ix % print_every == 0:
            print(f"\t -> Average loss: {np.nanmean(losses)}")

In [45]:
from sklearn.metrics import f1_score


def evaluate_f1(model, tr_loader, te_loader):
    tr_preds, te_preds = np.array([]), np.array([])
    tr_gt, te_gt = np.array([]), np.array([])
    
    model.eval()
    print("\tEvaluating F1...")
    
    with torch.no_grad():
        for ix, (X, y, lengths) in enumerate(tr_loader):
        
            modes, metrics, features = X

            y_pred = model(modes, metrics.float(), features.float(), lengths).view(-1, 9)
            y = y.float().view(-1, 9)

            preds = torch.argmax(F.softmax(y_pred, dim=-1), dim=-1).numpy().ravel()
            true = torch.argmax(y.long(), dim=-1).numpy().ravel()

            tr_preds = np.append(tr_preds, preds)
            tr_gt = np.append(tr_gt, true)
            
        tr_f1 = f1_score(y_true=tr_gt, y_pred=tr_preds, average='weighted')
        print(f"\t -> Train F1: {tr_f1}")
        
        for ix, (X, y, lengths) in enumerate(te_loader):
        
            modes, metrics, features = X

            y_pred = model(modes, metrics.float(), features.float(), lengths).view(-1, 9)
            y = y.float().view(-1, 9)
            
            preds = torch.argmax(F.softmax(y_pred, dim=-1), dim=-1).numpy().ravel()
            true = torch.argmax(y.long(), dim=-1).numpy().ravel()

            te_preds = np.append(te_preds, preds)
            te_gt = np.append(te_gt, true)
        
        te_f1 = f1_score(y_true=te_gt, y_pred=te_preds, average='weighted')
        print(f"\t -> Test F1: {te_f1}")

In [46]:
# Other training hyperparameters.
num_epochs = 30
num_decays = 6
decay_at = num_epochs // num_decays
decay = 0.95

# Just checking what LRs should be after decaying.
for power in range(num_decays):
    print(f"{decay_at * power} - {decay_at * (power + 1)} :: {INIT_LR * decay**power:.5f}")

0 - 5 :: 0.00070
5 - 10 :: 0.00067
10 - 15 :: 0.00063
15 - 20 :: 0.00060
20 - 25 :: 0.00057
25 - 30 :: 0.00054


In [None]:
# We'd like to start at a loss of at most -ln(1/9) ~ 2.19

for epoch_ix in range(1, num_epochs+1):
    print(f"Epoch {epoch_ix}:")
    train(epoch_ix, model, train_loader, optimizer, criterion)
    evaluate(epoch_ix, model, test_loader, criterion)
    
    if epoch_ix < 5 or epoch_ix % 5 == 0:
        # Evaluate every 5 epochs.
        evaluate_f1(model, train_loader, test_loader)
    
    if epoch_ix % decay_at == 0:
        optimizer.param_groups[0]['lr'] *= decay
        print(f"Learning rate is now: {optimizer.param_groups[0]['lr']:.5f}")
    
    print(40*'~')

Epoch 1:
	Beginning training.
	-> Average loss: 3.4292205003553264
	-> Average loss: 3.0203611136815667
	-> Average loss: 2.9269992429190226
	-> Average loss: 2.7440315224998524
--------------------------------------------------
	Beginning evaluation.
	 -> Average loss: 1.9845308899546468
	 -> Average loss: 2.0912061837169675
	 -> Average loss: 2.0278757847845554
	 -> Average loss: 2.079901454872481
	 -> Average loss: 2.119112247215273
	Evaluating F1...
	 -> Train F1: 0.17357571210719355
	 -> Test F1: 0.16929205554584628
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2:
	Beginning training.
	-> Average loss: 2.2447872489024805
	-> Average loss: 2.179984362536923
	-> Average loss: 2.184303961117914
	-> Average loss: 2.1579395363231675
--------------------------------------------------
	Beginning evaluation.
	 -> Average loss: 1.8604771541483576
	 -> Average loss: 1.96936972224629
	 -> Average loss: 1.9072258116594
	 -> Average loss: 1.9617513231750632
	 -> Average loss: 1.99158173786165

	 -> Average loss: 1.2420701925267479
	 -> Average loss: 1.2695545183091919
	 -> Average loss: 1.273952293582052
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 17:
	Beginning training.
	-> Average loss: 1.0339172488734043
	-> Average loss: 1.0751683378473684
	-> Average loss: 1.1208586298338965
	-> Average loss: 1.1269230102115642
--------------------------------------------------
	Beginning evaluation.
	 -> Average loss: 1.248624034475681
	 -> Average loss: 1.2845101706423125
	 -> Average loss: 1.2307171229161877
	 -> Average loss: 1.2223140466163174
	 -> Average loss: 1.2198367260838676
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 18:
	Beginning training.
	-> Average loss: 1.0498548433933843
	-> Average loss: 1.0665208559345316
	-> Average loss: 1.0762758616007386
	-> Average loss: 1.099983853102772
--------------------------------------------------
	Beginning evaluation.
	 -> Average loss: 1.3901189185221101
	 -> Average loss: 1.4441073030024976
	 -> Average loss: 1.37345056035625

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/rkulhall/miniconda3/envs/pytorch/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_13888/654011310.py", line 5, in <module>
    train(epoch_ix, model, train_loader, optimizer, criterion)
  File "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_13888/2806942638.py", line 18, in train
    y_pred = model(modes, metrics.float(), features.float(), lengths)
  File "/Users/rkulhall/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_13888/515180498.py", line 143, in forward
    lstm_out = module(lstm_out)
  File "/Users/rkulhall/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/rkulhall/miniconda3/envs/pytorch/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_13888/654011310.py", line 5, in <module>
    train(epoch_ix, model, train_loader, optimizer, criterion)
  File "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_13888/2806942638.py", line 18, in train
    y_pred = model(modes, metrics.float(), features.float(), lengths)
  File "/Users/rkulhall/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_13888/515180498.py", line 143, in forward
    lstm_out = module(lstm_out)
  File "/Users/rkulhall/miniconda3/envs/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, 

## Benchmarking

```

model = Model(
    n_lstm_layers=1,
    input_size=3,
    hidden_size=16, 
    output_size=9,
    n_features=40,
    l1_blocks=3,
    l2_blocks=3
)

emb_dim = 16

Best stats:
-> Train F1: 0.6327794050521978
-> Test F1: 0.6208816430930885
```

<hr />