In [None]:
%load_ext autoreload
%autoreload 2

from context import DATADIR, RAWDATADIR
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Read the raw table and key it by the 'Filename' column.
data = pd.read_csv(RAWDATADIR / 'public.csv').set_index('Filename')

# Rows with a non-null 'North' value go to `train`; rows where it is missing go to `test`.
north_known = data['North'].notna()
train = data[north_known]
test = data[~north_known]

# Generate image features

In [None]:
from feature_extraction import ImageFeatureExtractor

# Set to True to recompute the features even when a cached parquet file exists.
FORCE_RUN = False

# Single definition of the cache location, used for the check, the read and the write.
feats_path = DATADIR / 'feats/feats.parquet'

if feats_path.exists() and not FORCE_RUN:
    # Cache hit: reuse the previously stored features.
    feats = pd.read_parquet(feats_path)

else:
    # Create the cache directory *before* the row-wise extraction so the
    # save step cannot fail on a missing folder after the work is done.
    feats_path.parent.mkdir(parents=True, exist_ok=True)

    feat_extractor = ImageFeatureExtractor()
    # Applies the extractor to every row of `data`, with a tqdm progress bar.
    feats = data.progress_apply(feat_extractor.get_features_from_row, axis=1)

    feats.to_parquet(feats_path)

feats

# Determine order of clips

In [None]:
from order_images import order_from_df

# Set to True to recompute the ordering even when cached parquet files exist.
FORCE_RUN = False

processed_dir = DATADIR / 'processed'
train_ordered_path = processed_dir / 'train_ordered.parquet'
test_ordered_path = processed_dir / 'test_ordered.parquet'

# Both files are read on a cache hit, so both must be present; checking only
# the train file would crash on the second read if the test file is missing.
cache_complete = train_ordered_path.exists() and test_ordered_path.exists()

if cache_complete and not FORCE_RUN:
    train_ordered = pd.read_parquet(train_ordered_path)
    test_ordered = pd.read_parquet(test_ordered_path)

else:
    # Make sure the output folder exists before doing the work.
    processed_dir.mkdir(parents=True, exist_ok=True)

    train_ordered = order_from_df(train, plot=False)
    test_ordered = order_from_df(test, plot=False)

    train_ordered.to_parquet(train_ordered_path)
    test_ordered.to_parquet(test_ordered_path)

# Add lag/lead features

In [None]:
from feature_extraction import add_lags

# Value forwarded to add_lags as its `n_lags` argument.
n_lags = 7

# Join the image features onto each ordered frame (index-based DataFrame.join)
# and pass the result through add_lags; train and test are handled identically.
train_feats, test_feats = (
    add_lags(ordered.join(feats), n_lags=n_lags)
    for ordered in (train_ordered, test_ordered)
)

test_feats.columns

# Train model

In [None]:
from helpers import build_model, get_Xy_cols
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from IPython.display import display

import joblib
from datetime import datetime

# Set to False to skip cross-validation and only fit the final model.
PERFORM_CV = True

# The predictor/target column lists depend only on train_feats, so resolve
# them once instead of on every fold and again before the final fit.
predictors, targets = get_Xy_cols(train_feats)

if PERFORM_CV:
    # Folds are grouped by 'sequence' so a sequence never spans train and test.
    cv = GroupKFold(n_splits=5)

    cv_scores = []
    oof_preds = []

    for i, (train_idx, test_idx) in enumerate(cv.split(train_feats, groups=train_feats['sequence'])):
        print(f'Fitting fold {i}...')
        model = build_model()

        # Slice positionally once; predictors and targets are then taken from
        # the same rows, which stays aligned even if index labels repeat.
        fold_train = train_feats.iloc[train_idx]
        fold_test = train_feats.iloc[test_idx]

        # Check for leaks between train/test sequences
        sequence_leak = set(fold_train['sequence']).intersection(set(fold_test['sequence']))
        assert len(sequence_leak) == 0, f'Sequence leakeage found in train/test sets: {sequence_leak}'

        X_train, X_test = fold_train[predictors].copy(), fold_test[predictors].copy()
        y_train, y_test = fold_train[targets], fold_test[targets]

        model.fit(X_train, y_train)

        # Predict once and reuse the result for both the OOF frame and the score.
        y_pred = model.predict(X_test)
        # NOTE(review): column labels are hard-coded; presumably `targets` is
        # ['North', 'East'] in this order - confirm against get_Xy_cols.
        oof_preds.append(pd.DataFrame(y_pred, index=X_test.index, columns=['North', 'East']))

        # RMSE per output column, then averaged. Written without the `squared`
        # keyword, which newer scikit-learn releases no longer accept.
        fold_rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values')).mean()
        cv_scores.append(fold_rmse)
        print(cv_scores)

    print(f'CV{len(cv_scores)}=', cv_scores)
    print(np.mean(cv_scores), '±', np.std(cv_scores))

# Final model: refit on the full training set.
model = build_model()
model.fit(train_feats[predictors], train_feats[targets])

# Persist under a timestamped name; create the folder first so the dump
# cannot fail on a missing directory.
model_dir = DATADIR / 'models'
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_dir / datetime.now().strftime("%Y%m%d_%H%M%S_model.joblib"))

# Predict for submission

In [None]:
from helpers import generate_submission_files

# Set to True to regenerate submission files that already exist.
FORCE_RUN = False

# Sorted so models are processed in a deterministic (filename) order.
model_paths = sorted((DATADIR / 'models').glob('*.joblib'))
if not model_paths:
    raise FileNotFoundError(f'No models found in {DATADIR / "models"}. Have any models been fitted yet?')

print(f'Found {len(model_paths)} in models dir. Performing predictions...')

# Make sure the output folder exists; to_csv does not create missing directories.
subs_dir = DATADIR / 'subs'
subs_dir.mkdir(parents=True, exist_ok=True)

for mp in model_paths:
    # Submission name = first two underscore-separated parts of the model
    # file stem, followed by '_sub.csv'.
    sub_path = subs_dir / ('_'.join(mp.stem.split('_')[:2]) + '_sub.csv')

    # Skip models that already have a submission unless a rerun is forced.
    if sub_path.exists() and not FORCE_RUN:
        continue

    print(f'Generating prediction file for model {mp}')
    # `predictors` is the column list defined in the training cell.
    preds = generate_submission_files(mp, test_feats[predictors])
    preds.to_csv(sub_path)
