# Single Instance infer

In [20]:
import pandas as pd
import seaborn as sns
import numpy as np

import glob
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KDTree
import tqdm
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.decomposition import PCA
import os
from skimage.io import imread
import cv2
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
test_columns = ['region', 'month', 'Visibility', 'Wind speed (gust)',
       'Surface pressure', 'Orography', 'Temperature',
       'Plant canopy surface water',
       'Water equivalent of accumulated snow depth (deprecated)', 'Snow cover',
       'Snow depth', 'Percent frozen precipitation', 'Precipitation rate',
       'Categorical snow', 'Categorical ice pellets',
       'Categorical freezing rain', 'Categorical rain', 'Surface roughness',
       'Frictional velocity', 'Sensible heat net flux', 'Latent heat net flux',
       'Ground heat flux', 'Vegetation Type',
       'Convective available potential energy', 'Convective inhibition',
       'Downward short-wave radiation flux',
       'Downward long-wave radiation flux', 'Upward short-wave radiation flux',
       'Upward long-wave radiation flux', 'Visible Beam Downward Solar Flux',
       'Visible Diffuse Downward Solar Flux',
       'Planetary boundary layer height', 'Land-sea mask',
       'Sea ice area fraction', 'Lightning', 'Vegetation', 'unknown',
       'Leaf Area Index', 'Cloud Forcing Net Solar Flux', 'R', 'G', 'B',
       'GMax', 'GMin', 'G_R', 'G_B', 'R_B', 'GMax_B', 'GMin_B']

In [3]:
def pseudo_round(x):
    if x < 1.65:
        return 1
    elif x < 2.55:
        return 2
    elif x < 3.5:
        return 3
    elif x < 4.5:
        return 4
    else:
        return 5

In [5]:
meta = pd.read_csv('../inputs/metadata.csv')
test = pd.read_csv('../inputs/submission_format.csv')
train = pd.read_csv('../inputs/train_labels.csv')

test = test.merge(meta, on='uid', how='inner')

In [52]:
with open('../outputs/weights/kdt.bin', 'rb') as fp:
    kdt, tdf = pickle.load(fp)

lgb_models = []
weights = glob.glob('../outputs/weights/model_*.bin')
for w in weights:
    loaded_model = lgb.Booster(model_file=w)
    lgb_models.append(loaded_model)

In [21]:
row = test.sample(1).iloc[0]

In [22]:
row

uid                nttf
region             west
severity              1
latitude        35.1245
longitude      -120.569
date         2021-04-20
split              test
Name: 3422, dtype: object

In [14]:
exist_nrrr = {x.split('.')[0]: x for x in os.listdir('../downloaded_data/nrrr/')}

In [72]:
def predict_row(row):
    knn = False
    if row.region in ['south', 'west']:
        knn = True
    else:
        nrrr_df_name = exist_nrrr.get(row['date'])
        if nrrr_df_name:
            nrrr_df = pd.read_csv('../downloaded_data/nrrr/' + nrrr_df_name, index_col=0)
            if row.uid in set(nrrr_df.id.unique()):
                nrrr_row = nrrr_df[nrrr_df.id == row.uid]
            else:
                knn = True
        else:
            knn = True

    # try load water data
    if not knn:
        water_stats = []
        im_file = glob.glob(f'../downloaded_data/Sentinel/{row.uid}*')
        water_file = glob.glob(f'../downloaded_data/Sentinel/W_{row.uid}*')

        if len(im_file) > 0 and len(water_file) > 0:
            wm = imread(water_file[0])
            img = imread(im_file[0])
            water_scaled = np.stack([cv2.resize(wm, (img.shape[1], img.shape[0]))] * 3, -1) == 6
            if water_scaled.sum() == 0:
                water_stats.append({
                    'uid': row.uid
                })
            else:
                interested = {}
                interested['R'] = img[:, :, 0][water_scaled[:, :, 0]].mean()
                interested['G'] = img[:, :, 1][water_scaled[:, :, 1]].mean()
                interested['B'] = img[:, :, 2][water_scaled[:, :, 2]].mean()
                interested['GMax'] = np.percentile(img[:, :, 1][water_scaled[:, :, 1]], 95)
                interested['GMin'] = np.percentile(img[:, :, 1][water_scaled[:, :, 1]], 5)


                interested['G_R'] = interested['G'] / interested['R']
                interested['G_B'] = interested['G'] / interested['B']
                interested['R_B'] = interested['R'] / interested['B']
                interested['GMax_B'] = interested['GMax'] / interested['B']
                interested['GMin_B'] = interested['GMin'] / interested['B']
                interested['uid'] = row.uid
                water_stats.append(interested)
        else:
            water_stats.append({
                'uid': row.uid
            })
        ws = pd.DataFrame(water_stats)

        row_input = pd.DataFrame(row).T.merge(
            nrrr_row.merge(ws, left_on='id', right_on='uid', how='left'), on='uid', how='left')

        row_input['month'] = int(row.date.split('-')[1])
        for e in set(test_columns) - set(row_input.columns):
            row_input[e] = np.nan

        row_input = row_input[test_columns]

        row_input['region'] = row_input['region'].map({
            'midwest': 0,
            'south': 1,
            'northeast': 2,
            'west': 3
        })


        preds = []
        for loaded_model in lgb_models:
            preds.append(loaded_model.predict(row_input)[0])

        out = pseudo_round(np.mean(preds))

    else:
        row_df = pd.DataFrame(row).T

        distance, matches = kdt.query(row_df[['latitude', 'longitude']].values, k=100, return_distance=True)
        pred = []
        for i, x in enumerate(matches):
            pred.append((tdf.iloc[x].severity * (1 / distance[i])).sum() / (1 / distance[i]).sum())
        out = np.round(pred[0])
        
    return out

In [73]:
preds = []
for i, row in tqdm.tqdm(test.iterrows()):
    preds.append(predict_row(row))

6510it [24:22,  4.45it/s]


In [74]:
test_sample = test.copy()

In [75]:
test_sample['severity'] = preds

In [76]:
test_sample.severity.value_counts(0)

2.0    2273
4.0    2222
1.0    1504
3.0     511
Name: severity, dtype: int64

In [77]:
st = pd.read_csv('../../../data2/tick_reproduce/outputs/submission.csv')

In [78]:
st.severity.value_counts()

2    2288
4    2222
1    1498
3     502
Name: severity, dtype: int64

In [62]:
test_sample.head(20)

Unnamed: 0,uid,region,severity,latitude,longitude,date,split
0,aabn,west,4.0,36.5597,-121.51,2016-08-31,test
1,aair,west,4.0,33.0426,-117.076,2014-11-01,test
2,aajw,northeast,2.0,40.703968,-80.29305,2015-08-26,test
3,aalr,midwest,2.0,38.9725,-94.67293,2019-08-26,test
4,aalw,west,4.0,34.279,-118.905,2018-01-08,test
5,aamp,west,2.0,40.43393,-105.0334,2017-07-28,test
6,aapj,west,4.0,33.8892,-117.562,2018-01-09,test
7,aaqf,northeast,1.0,39.820323,-79.921575,2013-08-17,test
8,aauy,south,2.0,36.313,-79.114978,2020-02-26,test
9,aava,northeast,1.0,40.82201,-77.46996,2018-04-11,test


In [51]:
test_sample

Unnamed: 0,uid,region,severity,latitude,longitude,date,split
0,aabn,west,2.0,36.559700,-121.510000,2016-08-31,test
1,aair,west,2.0,33.042600,-117.076000,2014-11-01,test
2,aajw,northeast,2.0,40.703968,-80.293050,2015-08-26,test
3,aalr,midwest,2.0,38.972500,-94.672930,2019-08-26,test
4,aalw,west,2.0,34.279000,-118.905000,2018-01-08,test
...,...,...,...,...,...,...,...
95,akao,west,2.0,35.426500,-118.832000,2019-10-15,test
96,akbk,midwest,2.0,38.516134,-95.706970,2017-10-02,test
97,akgs,midwest,1.0,47.606167,-86.818833,2019-08-25,test
98,akhe,south,2.0,39.309992,-80.034025,2017-07-26,test


In [31]:
out

2.0

In [17]:
ws

Unnamed: 0,R,G,B,GMax,GMin,G_R,G_B,R_B,GMax_B,GMin_B,uid
0,25.748876,39.408174,49.739286,49.0,34.0,1.530481,0.792295,0.517677,0.985137,0.683564,dcud


In [104]:
pd.DataFrame(row).T

Unnamed: 0,uid,region,severity,latitude,longitude,date,split
5182,uqxg,northeast,1,41.80497,-76.65939,2019-06-18,test


In [105]:
row_input = pd.DataFrame(row).T.merge(
    nrrr_row.merge(ws, left_on='id', right_on='uid', how='left'), on='uid', how='left')

In [109]:
row_input['month'] = int(row.date.split('-')[1])

In [111]:
for e in set(test_columns) - set(row_input.columns):
    row_input[e] = np.nan

In [112]:
row_input = row_input[test_columns]

In [116]:
row_input['region'] = row_input['region'].map({
    'midwest': 0,
    'south': 1,
    'northeast': 2,
    'west': 3
})


In [118]:
preds = []
weights = glob.glob('../outputs/weights/model_*.bin')
for w in weights:
    loaded_model = lgb.Booster(model_file=w)
    preds.append(loaded_model.predict(row_input)[0])

In [121]:
out = pseudo_round(np.mean(preds))

In [122]:
out

2

In [117]:
loaded_model.predict(row_input)

array([2.20572979])

In [89]:
ws

Unnamed: 0,R,G,B,GMax,GMin,G_R,G_B,R_B,GMax_B,GMin_B,uid
0,79.625637,104.737358,83.825951,185.5,61.0,1.315372,1.249462,0.949892,2.212919,0.727698,uqxg


In [87]:
interested

{'R': 79.62563700509604,
 'G': 104.7373578988632,
 'B': 83.82595060760487,
 'GMax': 185.5,
 'GMin': 61.0,
 'G_R': 1.3153723076923078,
 'G_B': 1.2494622147399925,
 'R_B': 0.9498924429479985,
 'GMax_B': 2.2129185372240925,
 'GMin_B': 0.7276982790871679,
 'uid': 'uqxg'}

In [79]:
water_file

['../downloaded_data/Sentinel/W_uqxg_2019-06-15_S2A_MSIL2A_20190615T154911_R054_T18TUM_20201006T022848.png']

In [None]:
        wm = imread(row.files)
        img = imread(row.path)
        water_scaled = np.stack([cv2.resize(wm, (img.shape[1], img.shape[0]))] * 3, -1) == 6
        if water_scaled.sum() == 0:
            r.append(np.nan)
            g.append(np.nan)
            b.append(np.nan)
            gmax.append(np.nan)
            gmin.append(np.nan)
        else:
            r.append(img[:, :, 0][water_scaled[:, :, 0]].mean())
            g.append(img[:, :, 1][water_scaled[:, :, 1]].mean())
            b.append(img[:, :, 2][water_scaled[:, :, 2]].mean())
            gmax.append(np.percentile(img[:, :, 1][water_scaled[:, :, 1]], 95))
            gmin.append(np.percentile(img[:, :, 1][water_scaled[:, :, 1]], 5))


In [42]:
nrrr_row

Unnamed: 0,id,Visibility,Wind speed (gust),Lightning,Surface pressure,Orography,Temperature,Water equivalent of accumulated snow depth (deprecated),Snow cover,Snow depth,...,Categorical ice pellets,Categorical freezing rain,Categorical rain,Vegetation Type,Convective available potential energy,Convective inhibition,Downward short-wave radiation flux,Planetary boundary layer height,Land-sea mask,Sea ice area fraction
7,yhws,9800.0,3.524016,0.0,97760.0,276.151489,291.998169,0.0,0.0,0.0,...,0.0,0.0,0.0,12.0,230.0,0.0,0.0,0.0,1.0,0.0
