In [1]:
# https://www.kaggle.com/code/mmakhyanov/classification-of-the-events?scriptVersionId=125828590&cellId=44

import numpy as np
import pandas as pd
import polars as pl
import math

from pathlib import Path


# Root directory of the competition dataset.
PATH_INPUT = Path("/kaggle/input/icecube-neutrinos-in-deep-ice")

# To work on a single training batch instead, swap in the commented lines:
#train_batch_id = 538
#print('Training batch', train_batch_id)
#batch_path = "train/batch_" + str(train_batch_id)+ ".parquet"
batch_path = "test/batch_*.parquet"

# Despite the name, `train_batch` lazily scans every *test* batch matched by
# the glob pattern above.
train_batch = pl.scan_parquet(PATH_INPUT / batch_path).lazy()
#df_train_meta = pl.scan_parquet(PATH_INPUT / "train_meta.parquet").lazy()

# Sensor geometry table (its x / y / z columns are used below).
# sensor_id is cast to Int16 -- presumably to match the dtype of sensor_id in
# the batch files so the join lines up; TODO confirm.
df_sensor_geometry = (
    pl.scan_csv(PATH_INPUT / 'sensor_geometry.csv')
    .with_columns(pl.col('sensor_id').cast(pl.Int16))
    .lazy()
)


# Sensors whose horizontal distance sqrt(x^2 + y^2) exceeds this threshold are
# treated as lying on the side of the detector.
DISTANCE = 470

sides = (
    df_sensor_geometry
    .with_columns([
        ((pl.col('x') ** 2 + pl.col('y') ** 2) ** 0.5).alias('xy_distance')
    ])
    .filter(pl.col('xy_distance') > DISTANCE)
    .collect()
)

# Manual correction: side sensors inside the box 400 < x < 500, 0 < y < 200
# are removed again. The rationale is not recorded here -- presumably tuned
# by hand against the detector outline; TODO confirm.
delete_points = sides.filter(
    (pl.col('x') > 400) & (pl.col('x') < 500)
    & (pl.col('y') < 200) & (pl.col('y') > 0)
)

# Manual correction: sensors inside the box 100 < x < 180, 350 < y < 420 are
# added although they do not pass the distance cut.
add_points = (
    df_sensor_geometry
    .filter(
        (pl.col('x') > 100) & (pl.col('x') < 180)
        & (pl.col('y') > 350) & (pl.col('y') < 420)
    )
    .collect()
)

# Reduce each frame to a plain Python set of sensor ids.
delete_set = set(delete_points.select(pl.col('sensor_id')).to_pandas()['sensor_id'])
add_set = set(add_points.select(pl.col('sensor_id')).to_pandas()['sensor_id'])
sides_set = set(sides.select(pl.col('sensor_id')).to_pandas()['sensor_id'])
# Final side set = distance cut, minus the exclusions, plus the additions.
sides_set = sides_set.difference(delete_set).union(add_set)


# Vertical cuts: sensors with z below Z_BOTTOM form the bottom layer and
# sensors with z above Z_TOP form the top layer.
Z_BOTTOM = -500
Z_TOP = 450

# Top layer: matching geometry rows, then the set of their sensor ids.
top_sensors = df_sensor_geometry.filter(pl.col('z') > Z_TOP).collect()
top_set = set(top_sensors.select(pl.col('sensor_id')).to_pandas()['sensor_id'])

# Bottom layer: same treatment.
bottom_sensors = df_sensor_geometry.filter(pl.col('z') < Z_BOTTOM).collect()
bottom_set = set(bottom_sensors.select(pl.col('sensor_id')).to_pandas()['sensor_id'])


def add_sides(dataf, account_for_aux):
    """Flag, per event, where its first listed sensor sits in the detector.

    The frame is grouped by ``event_id`` and the first ``sensor_id`` of each
    group is kept. "First" means first in the frame's row order; no time sort
    is applied here, so this is presumably the earliest pulse only if the rows
    are already time-ordered -- TODO confirm.

    Args:
        dataf: polars frame with ``event_id`` and ``sensor_id`` columns (and
            ``auxiliary`` when ``account_for_aux`` is truthy).
        account_for_aux: when truthy, rows with ``auxiliary`` set are dropped
            before the first sensor is taken. Any falsy value keeps all rows
            (previously a non-bool flag made the function return ``None``).

    Returns:
        One row per event with ``event_id``, ``sensor_id`` and the boolean
        columns ``side``, ``top`` and ``bottom``, i.e. membership of that
        sensor in the module-level ``sides_set``, ``top_set`` and
        ``bottom_set``.
    """
    if account_for_aux:
        # `== False` is deliberate: this builds a polars expression, it is not
        # a Python boolean comparison.
        dataf = dataf.filter(pl.col('auxiliary') == False)

    first_sensor = dataf.groupby(['event_id']).agg([
        pl.col('sensor_id').first()
    ])
    return first_sensor.with_columns([
        pl.col('sensor_id').is_in(list(sides_set)).alias('side'),
        pl.col('sensor_id').is_in(list(top_set)).alias('top'),
        pl.col('sensor_id').is_in(list(bottom_set)).alias('bottom')
    ])

def join_tables(dataf, data_geometry):
    """Attach the geometry columns to ``dataf`` by joining on ``sensor_id``.

    Both arguments only need a compatible ``join`` method; the join type is
    whatever ``dataf.join`` uses by default.
    """
    joined = dataf.join(data_geometry, on='sensor_id')
    return joined

def time_rank(dataf, account_for_aux):
    """Keep only the two earliest pulses of every event.

    Pulses are ranked by ``time`` within each ``event_id`` using ordinal
    ranking (ties receive distinct consecutive ranks), and only ranks 1 and 2
    are kept.

    Args:
        dataf: polars frame with ``event_id`` and ``time`` columns (and
            ``auxiliary`` when ``account_for_aux`` is truthy).
        account_for_aux: when truthy, rows with ``auxiliary`` set are dropped
            before ranking; otherwise every row is ranked.

    Returns:
        The filtered frame with an added ``time_rank`` column. Rows are only
        filtered, not explicitly re-sorted.
    """
    if account_for_aux:
        # `== False` is deliberate: this builds a polars expression.
        dataf = dataf.filter(pl.col('auxiliary') == False)

    ranked = dataf.with_columns([
        pl.col('time').rank('ordinal').over('event_id').alias('time_rank')
    ])
    return ranked.filter(pl.col('time_rank').is_in([1, 2]))
    
def add_direction(dataf):
    """Classify each event by the z-difference between its last and first row.

    In this notebook the input has already been reduced by ``time_rank`` to at
    most two rows per event. ``head(1)`` / ``tail(1)`` follow the row order
    inside each group, which is assumed to be time order -- TODO confirm. An
    event with a single row gets ``direction == 0``.

    Returns:
        One row per event, sorted by ``event_id``, with the list columns
        ``first`` / ``second``, the scalar ``direction`` (second z minus
        first z) and the boolean columns ``upgoing`` (> 0), ``horizontal``
        (== 0) and ``downgoing`` (< 0).
    """
    # First and last z value of every event, each as a one-element list.
    endpoints = dataf.groupby('event_id').agg([
        pl.col('z').head(1).alias('first'),
        pl.col('z').tail(1).alias('second')
    ])

    # Unwrap the one-element lists and take the difference.
    with_delta = endpoints.with_columns([
        (pl.col('second').arr.explode() - pl.col('first').arr.explode()).alias('direction')
    ])

    # Sign of the difference -> three mutually exclusive flags.
    flagged = with_delta.with_columns([
        (pl.col('direction') > 0).alias('upgoing'),
        (pl.col('direction') == 0).alias('horizontal'),
        (pl.col('direction') < 0).alias('downgoing')
    ])

    return flagged.select(pl.col('*').sort_by('event_id'))


def join_two_features(dataf, df_train_batch, account_for_aux):
    """Join the per-event side/top/bottom flags onto ``dataf`` by ``event_id``.

    The flags are computed from ``df_train_batch`` with ``add_sides`` using
    the same ``account_for_aux`` setting.
    """
    side_flags = add_sides(df_train_batch, account_for_aux)
    return dataf.join(side_flags, on='event_id')

def classification_feature(dataf, account_for_aux):
    """Combine the per-event flags into one normalised difficulty score.

    The raw score is the weighted sum
    ``0.25*horizontal + 0.5*downgoing + 0.75*side + 1.0*top + 0.25*bottom``
    (``upgoing`` does not contribute), divided by the maximum raw score found
    in the frame. NOTE(review): if no event has any flag set, the maximum is
    0 and the division is 0/0.

    Args:
        dataf: polars frame with ``event_id`` and the boolean columns
            ``horizontal``, ``downgoing``, ``side``, ``top`` and ``bottom``.
        account_for_aux: only selects the output column name:
            ``hard_to_reconstruct_aux_on`` when truthy, otherwise
            ``hard_to_reconstruct_aux_off`` (previously a non-bool flag made
            the function return ``None``).

    Returns:
        Frame with ``event_id`` and the normalised score column.
    """
    column = 'hard_to_reconstruct_aux_on' if account_for_aux else 'hard_to_reconstruct_aux_off'

    raw_score = (
        pl.col('horizontal') * 0.25 + pl.col('downgoing') * 0.5 + pl.col('side') * 0.75 + pl.col('top') + pl.col('bottom') * 0.25
    )

    return dataf.with_columns([
        raw_score.alias(column)
    ]).select([
        pl.col('event_id'),
        pl.col(column) / pl.col(column).max()
    ])

# Feature variant 1: auxiliary pulses are NOT filtered out.
account_for_aux = False
temp_2 = (
    train_batch
    .pipe(join_tables, df_sensor_geometry)
    .pipe(time_rank, account_for_aux)
    .pipe(add_direction)
    .pipe(join_two_features, train_batch, account_for_aux)
    .pipe(classification_feature, account_for_aux)
)

# Feature variant 2: auxiliary pulses are filtered out first.
account_for_aux = True
temp_3 = (
    train_batch
    .pipe(join_tables, df_sensor_geometry)
    .pipe(time_rank, account_for_aux)
    .pipe(add_direction)
    .pipe(join_two_features, train_batch, account_for_aux)
    .pipe(classification_feature, account_for_aux)
)

# Left join keeps every event of the aux-off variant.
# NOTE(review): an event whose pulses are all auxiliary would be absent from
# temp_3 and end up with a null `hard_to_reconstruct_aux_on` -- confirm the
# downstream model tolerates that.
df_classification = (
    temp_2
    .join(temp_3, on='event_id', how='left')
    .collect()
    .to_pandas()
)

In [2]:
!mkdir classification

In [3]:
# Persist the classification features to disk (without the pandas index) so
# they can be read back later in the notebook after the namespace is cleared.
df_classification.to_parquet("classification/out.parquet", index=False)

In [4]:
# Release memory before the inference notebooks run: delete every global whose
# name does not start with an underscore. The key list is snapshotted first
# because the namespace is mutated while looping. Names are removed straight
# from the globals() dict instead of exec-ing a string-built `del` statement.
for name in list(globals()):
    if not name.startswith('_'):
        del globals()[name]
# Imported only now: an import made before the sweep would have been deleted
# by it.
import gc
gc.collect()

63

In [5]:
# Inference notebooks to execute, in run order.
nbs = [
    f"/kaggle/input/icecube-notebooks/{stem}.ipynb"
    for stem in (
        "graphnet-baseline-submission",
        "early-sharing-prize-dynedge-1-046",
        "icecube-inference-run1679802608",
        "icecube-inference-run1680668395",
        "icecube-inference-run1680866125",
        "tensorflow-lstm-model-inference",
    )
]

In [6]:
"""
for nb in nbs:
    import pathlib
    nb = pathlib.Path(nb)
    eval_batches = pathlib.Path("/kaggle/input/icecube-neutrinos-in-deep-ice/test").glob("*.parquet")
    n = nb.name.split(".")[0]
    %run -i {str(nb)}
    !mkdir {n}
    for f in pathlib.Path("./").glob("*"):
        if f.is_file():
            if f.name.endswith(".parquet"):
                !mv {f} {n}
            else:
                !rm -rf {f}
    %reset -f
"""
    
# Run every inference notebook in turn and collect the parquet files it leaves
# in the working directory into a folder named after the notebook.
# Statement order matters here: `%reset -f` at the end of the body clears the
# whole interactive namespace on every iteration. The loop still advances
# because the `for` statement holds its own iterator over the list.
for nb in nbs:
    print(nb)
    # Re-imported on each pass because the previous iteration's %reset removed it.
    import pathlib
    nb = pathlib.Path(nb)
    # Not used in this cell; presumably read by the executed notebook through
    # the shared namespace -- TODO confirm.
    eval_batches = pathlib.Path("/kaggle/input/icecube-neutrinos-in-deep-ice/test").glob("*.parquet")
    # Notebook file name up to the first dot; used as the output directory name.
    n = nb.name.split(".")[0]
    # -i: execute the notebook inside this kernel's namespace.
    %run -i {str(nb)}
    !mkdir {n}
    # Tidy the working directory: regular files ending in ".parquet" are moved
    # into the notebook's folder, every other regular file is deleted, and
    # directories are left alone.
    for f in pathlib.Path("./").glob("*"):
        if f.is_file():
            if f.name.endswith(".parquet"):
                !mv {f} {n}
            else:
                !rm -rf {f}
    # Drop everything the notebook defined before the next one starts.
    %reset -f

/kaggle/input/icecube-notebooks/graphnet-baseline-submission.ipynb
rm: cannot remove 'software': No such file or directory
Processing ./software/dependencies/torch-1.11.0+cu115-cp37-cp37m-linux_x86_64.whl
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.13.0
    Uninstalling torch-1.13.0:
      Successfully uninstalled torch-1.13.0
Successfully installed torch-1.11.0+cu115
[0mProcessing ./software/dependencies/torch_cluster-1.6.0-cp37-cp37m-linux_x86_64.whl
Installing collected packages: torch-cluster
Successfully installed torch-cluster-1.6.0
[0mProcessing ./software/dependencies/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
[0mProcessing ./software/dependencies/torch_sparse-0.6.13-cp37-cp37m-linux_x86_64.whl
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.13
[0mProcessing ./softwar

  warn(f"Failed to load image Python extension: {e}")


rm: cannot remove '/kaggle/working/test_database.db': No such file or directory


0it [00:00, ?it/s]

/kaggle/working/test_database.db


1it [00:00,  5.30it/s]


/kaggle/working/test_database.db
Conversion Complete!. Database available at
 /kaggle/working/test_database.db


Predicting: 0it [00:00, ?it/s]

__notebook__.ipynb  input_data	out.parquet  submission.csv
classification	    logs	software     test_database.db
__notebook__.ipynb  classification  out.parquet  submission.csv


Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times



/kaggle/input/icecube-notebooks/early-sharing-prize-dynedge-1-046.ipynb
rm: cannot remove 'software': No such file or directory
Processing ./software/dependencies/torch-1.11.0+cu115-cp37-cp37m-linux_x86_64.whl
torch is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0mProcessing ./software/dependencies/torch_cluster-1.6.0-cp37-cp37m-linux_x86_64.whl
torch-cluster is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0mProcessing ./software/dependencies/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl
torch-scatter is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0mProcessing ./software/dependencies/torch_sparse-0.6.13-cp37-cp37m-linux_x86_64.whl
torch-sparse is already installed with the same version as the provided wheel. Use --force-reinstall to fo

0it [00:00, ?it/s]

/kaggle/input/icecube-notebooks/icecube-inference-run1680668395.ipynb


0it [00:00, ?it/s]

/kaggle/input/icecube-notebooks/icecube-inference-run1680866125.ipynb


0it [00:00, ?it/s]

/kaggle/input/icecube-notebooks/tensorflow-lstm-model-inference.ipynb

==== Model Parameters
Bin Numbers: 24
Maximum Pulse Count: 96
Features Count: 6
time valid length: 6199.700247193777 ns
[0.         0.26179939 0.52359878 0.78539816 1.04719755 1.30899694
 1.57079633 1.83259571 2.0943951  2.35619449 2.61799388 2.87979327
 3.14159265 3.40339204 3.66519143 3.92699082 4.1887902  4.45058959
 4.71238898 4.97418837 5.23598776 5.49778714 5.75958653 6.02138592
 6.28318531]
[0.         0.41113786 0.58568554 0.72273425 0.84106867 0.94796974
 1.04719755 1.1410209  1.23095942 1.31811607 1.40334825 1.48736624
 1.57079633 1.65422641 1.73824441 1.82347658 1.91063324 2.00057176
 2.0943951  2.19362291 2.30052398 2.41885841 2.55590711 2.73045479
 3.14159265]
[661]


In [7]:
# Sanity check: list the working directory after the inference loop.
!ls

__notebook__.ipynb		   icecube-inference-run1679802608
__pycache__			   icecube-inference-run1680668395
classification			   icecube-inference-run1680866125
early-sharing-prize-dynedge-1-046  software
graphnet-baseline-submission	   tensorflow-lstm-model-inference


In [8]:
# Release memory before the ensembling stage: delete every global whose name
# does not start with an underscore. The key list is snapshotted first because
# the namespace is mutated while looping. Names are removed straight from the
# globals() dict instead of exec-ing a string-built `del` statement.
for name in list(globals()):
    if not name.startswith('_'):
        del globals()[name]
# Imported only now: an import made before the sweep would have been deleted
# by it.
import gc
gc.collect()

42

In [9]:
import gc
import pathlib
import pickle

import numpy as np
import pandas as pd

In [10]:
def convert_to_3d(df: pd.DataFrame) -> pd.DataFrame:
    """Add cartesian direction columns derived from the angular columns.

    Reads ``azimuth`` and ``zenith`` and writes ``x``, ``y`` and ``z`` as
    ``x = cos(azimuth) * sin(zenith)``, ``y = sin(azimuth) * sin(zenith)``
    and ``z = cos(zenith)``.

    The frame is modified in place and the same object is returned.
    """
    azimuth, zenith = df['azimuth'], df['zenith']
    sin_zenith = np.sin(zenith)
    df['x'] = np.cos(azimuth) * sin_zenith
    df['y'] = np.sin(azimuth) * sin_zenith
    df['z'] = np.cos(zenith)
    return df

def xyz2azzen(x, y, z):
    """Convert cartesian direction components to azimuth and zenith (radians).

    Adapted from https://www.kaggle.com/code/rasmusrse/graphnet-baseline-submission

    Args:
        x, y, z: Components of the direction vector(s), as scalars or NumPy
            arrays of matching shape. The vectors need not be normalised.

    Returns:
        Tuple ``(azimuth, zenith)``. Azimuth is ``arctan2(y, x)`` with
        negative angles shifted up by ``2*pi``; zenith is ``arccos(z / r)``
        with ``r`` the vector length. A zero-length vector gives a NaN zenith.
    """
    r = np.sqrt(x**2 + y**2 + z**2)
    # Clip so that rounding can never push |z / r| past 1, which would make
    # arccos return NaN for an otherwise valid vector.
    zenith = np.arccos(np.clip(z / r, -1.0, 1.0))
    azimuth = np.arctan2(y, x)
    # arctan2 yields angles in [-pi, pi]; shift the negative ones up by 2*pi.
    # This is done arithmetically rather than by masked assignment so that
    # scalar inputs work too (NumPy scalars do not support item assignment).
    azimuth = azimuth + 2 * np.pi * (azimuth < 0)
    return azimuth, zenith

In [11]:
# Model outputs to ensemble. The order here is the order in which their
# columns are merged into the wide frame.
nbs = [
    f"/kaggle/input/icecube-notebooks/{stem}.ipynb"
    for stem in (
        "graphnet-baseline-submission",
        "icecube-inference-run1679802608",
        "icecube-inference-run1680668395",
        "icecube-inference-run1680866125",
        "early-sharing-prize-dynedge-1-046",
        "tensorflow-lstm-model-inference",
    )
]

In [12]:
# Build one wide frame keyed by event_id that holds every model's predicted
# direction as a unit vector (columns x_<name>, y_<name>, z_<name>).
df = None
for nb in nbs:
    # Notebook file name up to the first dot == directory holding its output.
    n = nb.split("/")[-1].split(".")[0]
    _df = pd.read_parquet(n + "/out.parquet")

    # Outputs without an "x" column go through the angle -> vector conversion.
    if "x" not in _df.columns:
        _df = convert_to_3d(_df)

    x_col, y_col, z_col = f"x_{n}", f"y_{n}", f"z_{n}"
    _df = _df.rename(columns={"x": x_col, "y": y_col, "z": z_col})

    # norm
    c = np.sqrt(_df[x_col]**2+_df[y_col]**2+_df[z_col]**2)
    for col in (x_col, y_col, z_col):
        _df[col] /= c

    # A model that reports a concentration parameter also contributes "sigma".
    if "direction_kappa" in _df.columns:
        _df["sigma"] = 1/np.sqrt(_df["direction_kappa"])

    # NOTE(review): if more than one output carries "sigma", the merge below
    # will suffix the clashing columns -- confirm only one model provides it.
    keep_cols = ["event_id", x_col, y_col, z_col]
    if "sigma" in _df.columns:
        keep_cols.append("sigma")

    if df is None:
        df = _df[keep_cols]
    else:
        df = df.merge(_df[keep_cols], on="event_id", how="left")

    del _df
    gc.collect()

In [13]:
# Reload the classification features written earlier and attach them to the
# model predictions. The merge is an inner join on event_id, so an event
# missing from either side is dropped.
df_classification = pd.read_parquet("classification/out.parquet")
df = pd.merge(df, df_classification, on="event_id", how="inner")

In [14]:
# The features now live inside `df`; drop the separate frame and collect.
del df_classification
gc.collect()

42

In [15]:
# Move event_id into the index so that only feature columns remain.
df = df.set_index("event_id")

In [16]:
# Load the pre-fitted ensembling model.
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load pickles from a source you trust.
with open("/kaggle/input/notebook32a05380f3/clf.pickle", "rb") as f:
    clf = pickle.load(f)

In [17]:
# Show the feature names stored on the loaded model; the same attribute is
# used to select the input columns for prediction.
print(clf.feature_names_in_)

['x_graphnet-baseline-submission' 'y_graphnet-baseline-submission'
 'z_graphnet-baseline-submission' 'sigma'
 'x_tensorflow-lstm-model-inference' 'y_tensorflow-lstm-model-inference'
 'z_tensorflow-lstm-model-inference' 'x_icecube-inference-run1679802608'
 'y_icecube-inference-run1679802608' 'z_icecube-inference-run1679802608'
 'x_icecube-inference-run1680668395' 'y_icecube-inference-run1680668395'
 'z_icecube-inference-run1680668395' 'x_icecube-inference-run1680866125'
 'y_icecube-inference-run1680866125' 'z_icecube-inference-run1680866125'
 'x_early-sharing-prize-dynedge-1-046'
 'y_early-sharing-prize-dynedge-1-046'
 'z_early-sharing-prize-dynedge-1-046' 'hard_to_reconstruct_aux_off'
 'hard_to_reconstruct_aux_on']


In [18]:
# Select the input columns by the names stored on the model, then predict.
# The result is indexed below as pred[:, 0..2], i.e. treated as x / y / z
# components -- presumably one direction vector per event; TODO confirm.
pred = clf.predict(df[clf.feature_names_in_])

In [19]:
# Convert the predicted vectors to angles, stacked as two columns
# (azimuth, zenith) per row.
df[["azimuth", "zenith"]] = np.stack(
    xyz2azzen(pred[:, 0], pred[:, 1], pred[:, 2]),
    axis=1,
)

# Keep only the submission columns, ordered by event_id, and write the file.
df = (
    df.reset_index()[["event_id", "azimuth", "zenith"]]
    .sort_values("event_id")
)
df.to_csv("submission.csv", index=False)

In [20]:
# Display the submission frame (event_id, azimuth, zenith) as a final check.
df

Unnamed: 0,event_id,azimuth,zenith
0,2092,0.313222,1.692695
1,7344,3.474258,2.560791
2,9482,4.582993,1.536461


In [21]:
#sub = pd.read_parquet("/kaggle/input/icecube-neutrinos-in-deep-ice/sample_submission.parquet")
#sub = sub.drop(columns=["azimuth", "zenith"])

In [22]:
#sub = sub.merge(df[["event_id", "azimuth", "zenith"]], how="left", on="event_id")

In [23]:
#sub = sub.sort_values("event_id")

In [24]:
#sub

In [25]:
#sub.to_csv("submission.csv", index = False)