In [1]:
#!pip install kaggle
#!pip install --upgrade pandas
#!pip install lightgbm

In [2]:
import os
import pathlib
import zipfile
import glob

import pandas as pd
import numpy as np

In [3]:
# Make sure the dataset and output folders exist; already-present folders are fine.
os.makedirs("./predict-volcanic-eruptions-ingv-oe", exist_ok=True)
os.makedirs("./output", exist_ok=True)

In [4]:
# Optional: fetch the competition archive with the Kaggle CLI.
# Credentials are redacted on purpose -- supply them through the environment or
# ~/.kaggle/kaggle.json and never store real values in the notebook.
# os.environ['KAGGLE_USERNAME'] = "<your-kaggle-username>"
# os.environ['KAGGLE_KEY'] = "<your-kaggle-api-key>"

#!kaggle competitions download -c predict-volcanic-eruptions-ingv-oe

In [5]:
# Unpack the competition archive only when sample_submission.csv is not there yet;
# that file serves as the marker of an earlier extraction.
if not os.path.exists("./predict-volcanic-eruptions-ingv-oe/sample_submission.csv"):
    # NOTE(review): absolute, machine-specific location of the downloaded archive.
    zip_path = "/home/ec2-user/SageMaker/predict-volcanic-eruptions-ingv-oe.zip"

    with zipfile.ZipFile(zip_path, mode="r") as f:
        f.extractall(path="./predict-volcanic-eruptions-ingv-oe")

In [6]:
# Load the two top-level CSVs from the dataset folder. The loops further down
# iterate over the `segment_id` column of each of these frames.
train = pd.read_csv("./predict-volcanic-eruptions-ingv-oe/train.csv")
sample_submission = pd.read_csv("./predict-volcanic-eruptions-ingv-oe/sample_submission.csv")

Train data preprocessing

In [7]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the order in which training fragments are scanned reproducible across runs.
train_frags = sorted(glob.glob("./predict-volcanic-eruptions-ingv-oe/train/*"))

In [8]:
def build_features(signal, ts, sensor_id):
    """Summarise one sensor signal as a single-row frame of statistics.

    Parameters
    ----------
    signal : pandas.Series
        Numeric samples of one sensor. The callers in this notebook pass the
        sensor column with missing values already replaced by 0.
    ts : hashable
        Label of the single row in the result (callers pass the segment id).
    sensor_id : str
        Prefix for every generated column name, e.g. ``"sensor_1"``.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``ts`` and 23 float64 columns named
        ``f"{sensor_id}_<feature>"``, in this order: sum, mean, std, var, max,
        min, skew, mad, kurtosis, quantile99/95/85/75/55/45/25/15/05/01,
        fft_real_mean, fft_real_std, fft_real_max, fft_real_min.

    Notes
    -----
    ``std``/``var`` come from the pandas Series methods, while the ``fft_real_*``
    statistics come from the numpy array methods; the two libraries use
    different default degrees of freedom for the standard deviation.
    """
    # Only the real part of the discrete Fourier transform is summarised.
    f_real = np.real(np.fft.fft(signal))

    # All quantiles are computed in a single call instead of ten separate ones.
    quantile_percents = (99, 95, 85, 75, 55, 45, 25, 15, 5, 1)
    quantile_values = np.quantile(signal, [p / 100 for p in quantile_percents])

    # Collect everything in an ordered dict first: growing a DataFrame one cell
    # at a time re-allocates the frame on every assignment.
    features = {
        f'{sensor_id}_sum': signal.sum(),
        f'{sensor_id}_mean': signal.mean(),
        f'{sensor_id}_std': signal.std(),
        f'{sensor_id}_var': signal.var(),
        f'{sensor_id}_max': signal.max(),
        f'{sensor_id}_min': signal.min(),
        f'{sensor_id}_skew': signal.skew(),
        # Median absolute deviation around the median.
        f'{sensor_id}_mad': np.abs(signal - signal.median()).median(),
        f'{sensor_id}_kurtosis': signal.kurtosis(),
    }
    for percent, value in zip(quantile_percents, quantile_values):
        features[f'{sensor_id}_quantile{percent:02d}'] = value
    features[f'{sensor_id}_fft_real_mean'] = f_real.mean()
    features[f'{sensor_id}_fft_real_std'] = f_real.std()
    features[f'{sensor_id}_fft_real_max'] = f_real.max()
    features[f'{sensor_id}_fft_real_min'] = f_real.min()

    # float64 is forced so integer-valued signals produce the same column dtype
    # as the previous cell-by-cell construction did.
    return pd.DataFrame(features, index=[ts], dtype='float64')

In [9]:
# Scan every training fragment once and record how much of each sensor column
# is missing. Skipped entirely when the preprocessed training CSV already exists.
if not pathlib.Path('./output/preprocessed_train.csv').exists():
    sensors = set()          # distinct column counts seen across fragments
    observations = set()     # distinct row counts seen across fragments
    nan_columns = list()     # one entry per (fragment, fully-empty column)
    missed_groups = list()   # per affected fragment: its fully-empty columns
    for_df = list()          # rows: [segment id, any-column-empty flag, *missing fractions]

    for j, item in enumerate(train_frags):
        # The segment id is the file name without its extension; Path.stem is
        # independent of the path separator and of dots elsewhere in the path.
        name = int(pathlib.Path(item).stem)
        frag = pd.read_csv(item)

        # Compute the null mask once and derive both statistics from it.
        null_mask = frag.isnull()
        missed_percents = (null_mask.sum() / len(frag)).tolist()
        missed_group = [col for col in frag.columns if null_mask[col].all()]

        at_least_one_missed = 1 if missed_group else 0
        nan_columns.extend(missed_group)
        if missed_group:
            missed_groups.append(missed_group)

        sensors.add(len(frag.columns))
        observations.add(len(frag))
        for_df.append([name, at_least_one_missed] + missed_percents)

        # Lightweight progress indicator.
        if j % 500 == 0:
            print(j)

0
500
1000
1500
2000
2500
3000
3500
4000


In [10]:
if not pathlib.Path('./output/preprocessed_train.csv').exists():
    # Turn the collected rows into a frame: segment id, the "some sensor is
    # completely missing" flag, then one missing-fraction column per sensor 1..10.
    for_df = pd.DataFrame(
        for_df,
        columns=['segment_id', 'has_missed_sensors']
        + [f'missed_percent_sensor{n}' for n in range(1, 11)],
    )

    print(for_df)

      segment_id  has_missed_sensors  missed_percent_sensor1  \
0     1000015382                   0                     0.0   
1     1000554676                   0                     0.0   
2     1000745424                   0                     0.0   
3     1001461087                   0                     0.0   
4     1001732002                   0                     0.0   
...          ...                 ...                     ...   
4426   998393594                   1                     0.0   
4427   998625201                   1                     0.0   
4428   998818197                   0                     0.0   
4429   998818893                   0                     0.0   
4430   999761287                   1                     0.0   

      missed_percent_sensor2  missed_percent_sensor3  missed_percent_sensor4  \
0                   0.002567                     0.0                     0.0   
1                   0.000000                     0.0                   

In [11]:
if not pathlib.Path('./output/preprocessed_train.csv').exists():
    # Attach the missing-data statistics to the training table. No key is
    # given, so pandas joins on the columns the two frames have in common.
    train = train.merge(for_df)
    print(train)

      segment_id  time_to_eruption  has_missed_sensors  \
0     1136037770          12262005                   1   
1     1969647810          32739612                   0   
2     1895879680          14965999                   1   
3     2068207140          26469720                   0   
4      192955606          31072429                   1   
...          ...               ...                 ...   
4426   873340274          15695097                   0   
4427  1297437712          35659379                   1   
4428   694853998          31206935                   0   
4429  1886987043           9598270                   1   
4430  1100632800          20128938                   0   

      missed_percent_sensor1  missed_percent_sensor2  missed_percent_sensor3  \
0                        0.0                     0.0                     0.0   
1                        0.0                     0.0                     0.0   
2                        0.0                     0.0           

In [12]:
if not pathlib.Path('./output/preprocessed_train.csv').exists():
    # One feature row per training segment: the per-sensor frames returned by
    # build_features are placed side by side, then all rows are stacked.
    train_set = list()
    for j, seg in enumerate(train.segment_id):
        signals = pd.read_csv(f'./predict-volcanic-eruptions-ingv-oe/train/{seg}.csv')
        if j%500 == 0:
            print(j)
        # Missing readings are replaced by 0 before the statistics are computed.
        train_row = pd.concat(
            [
                build_features(signals[sensor_id].fillna(0), seg, sensor_id)
                for sensor_id in (f'sensor_{i+1}' for i in range(0, 10))
            ],
            axis=1,
        )
        train_set.append(train_row)

    train_set = pd.concat(train_set)

0
500
1000
1500
2000
2500
3000
3500
4000


In [13]:
if not pathlib.Path('./output/preprocessed_train.csv').exists():
    # The index carries the segment ids used as row labels; turn it into a
    # regular column and join the training table on it.
    train_set = (
        train_set
        .reset_index()
        .rename(columns={'index': 'segment_id'})
        .merge(train, on='segment_id')
    )
    print(train_set)

      segment_id  sensor_1_sum  sensor_1_mean  sensor_1_std   sensor_1_var  \
0     1136037770      -96621.0      -1.610323    303.096099   91867.245099   
1     1969647810       85569.0       1.426126    438.360560  192159.980413   
2     1895879680      150278.0       2.504592    241.575415   58358.681050   
3     2068207140      129950.0       2.165797    221.967825   49269.715377   
4      192955606        4429.0       0.073815    261.695935   68484.762635   
...          ...           ...            ...           ...            ...   
4426   873340274       54405.0       0.906735    613.427301  376293.054135   
4427  1297437712      476221.0       7.936884    649.089618  421317.332733   
4428   694853998       85261.0       1.420993    110.942596   12308.259562   
4429  1886987043       54350.0       0.905818    478.852581  229299.794013   
4430  1100632800     -248575.0      -4.142848    272.632730   74328.605478   

      sensor_1_max  sensor_1_min  sensor_1_skew  sensor_1_mad  

In [14]:
# Persist the engineered training table. The existence of this file is what
# every guarded training cell above checks to decide whether to run.
if not pathlib.Path('./output/preprocessed_train.csv').exists():
    train_set.to_csv('./output/preprocessed_train.csv', encoding='utf-8', index=False)

In [15]:
# Read the cached table back and show it, then drop the name again.
train_set = pd.read_csv('./output/preprocessed_train.csv')
print(train_set)
del train_set

      segment_id  sensor_1_sum  sensor_1_mean  sensor_1_std   sensor_1_var  \
0     1136037770      -96621.0      -1.610323    303.096099   91867.245099   
1     1969647810       85569.0       1.426126    438.360560  192159.980413   
2     1895879680      150278.0       2.504592    241.575415   58358.681050   
3     2068207140      129950.0       2.165797    221.967825   49269.715377   
4      192955606        4429.0       0.073815    261.695935   68484.762635   
...          ...           ...            ...           ...            ...   
4426   873340274       54405.0       0.906735    613.427301  376293.054135   
4427  1297437712      476221.0       7.936884    649.089618  421317.332733   
4428   694853998       85261.0       1.420993    110.942596   12308.259562   
4429  1886987043       54350.0       0.905818    478.852581  229299.794013   
4430  1100632800     -248575.0      -4.142848    272.632730   74328.605478   

      sensor_1_max  sensor_1_min  sensor_1_skew  sensor_1_mad  

Test data preprocessing

In [16]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the order in which test fragments are scanned reproducible across runs.
test_frags = sorted(glob.glob("./predict-volcanic-eruptions-ingv-oe/test/*"))

In [17]:
# Scan every test fragment once and record how much of each sensor column is
# missing. Skipped entirely when the preprocessed test CSV already exists.
if not pathlib.Path('./output/preprocessed_test.csv').exists():
    sensors = set()          # distinct column counts seen across fragments
    observations = set()     # distinct row counts seen across fragments
    nan_columns = list()     # one entry per (fragment, fully-empty column)
    missed_groups = list()   # per affected fragment: its fully-empty columns
    for_test_df = list()     # rows: [segment id, any-column-empty flag, *missing fractions]

    for j, item in enumerate(test_frags):
        # The segment id is the file name without its extension; Path.stem is
        # independent of the path separator and of dots elsewhere in the path.
        name = int(pathlib.Path(item).stem)
        frag = pd.read_csv(item)

        # Compute the null mask once and derive both statistics from it.
        null_mask = frag.isnull()
        missed_percents = (null_mask.sum() / len(frag)).tolist()
        missed_group = [col for col in frag.columns if null_mask[col].all()]

        at_least_one_missed = 1 if missed_group else 0
        nan_columns.extend(missed_group)
        if missed_group:
            missed_groups.append(missed_group)

        sensors.add(len(frag.columns))
        observations.add(len(frag))
        for_test_df.append([name, at_least_one_missed] + missed_percents)

        # Lightweight progress indicator.
        if j % 500 == 0:
            print(j)

In [18]:
if not pathlib.Path('./output/preprocessed_test.csv').exists():
    # Turn the collected rows into a frame: segment id, the "some sensor is
    # completely missing" flag, then one missing-fraction column per sensor 1..10.
    for_test_df = pd.DataFrame(
        for_test_df,
        columns=['segment_id', 'has_missed_sensors']
        + [f'missed_percent_sensor{n}' for n in range(1, 11)],
    )

    print(for_test_df)

In [19]:
if not pathlib.Path('./output/preprocessed_test.csv').exists():
    # One feature row per segment listed in the sample submission: the
    # per-sensor frames from build_features go side by side, rows are stacked.
    test_set = list()
    for j, seg in enumerate(sample_submission.segment_id):
        signals = pd.read_csv(f'./predict-volcanic-eruptions-ingv-oe/test/{seg}.csv')
        if j%500 == 0:
            print(j)
        # Missing readings are replaced by 0 before the statistics are computed.
        test_row = pd.concat(
            [
                build_features(signals[sensor_id].fillna(0), seg, sensor_id)
                for sensor_id in (f'sensor_{i+1}' for i in range(0, 10))
            ],
            axis=1,
        )
        test_set.append(test_row)
    test_set = pd.concat(test_set)

In [20]:
if not pathlib.Path('./output/preprocessed_test.csv').exists():
    # Expose the segment-id index as a column and join the missing-data
    # statistics on it (pandas' default join type is used, not a left join).
    test_set = (
        test_set
        .reset_index()
        .rename(columns={'index': 'segment_id'})
        .merge(for_test_df, on='segment_id')
    )
    # The saved test table holds feature columns only, so the id is dropped.
    test = test_set.drop(columns=['segment_id'])
    print(test)

In [21]:
# Persist the engineered test table. The existence of this file is what every
# guarded test cell above checks to decide whether to run.
if not pathlib.Path('./output/preprocessed_test.csv').exists():
    test.to_csv('./output/preprocessed_test.csv', encoding='utf-8', index=False)

In [22]:
# Read the cached table back and show it, then drop the name again.
test = pd.read_csv('./output/preprocessed_test.csv')
print(test)
del test

      sensor_1_sum  sensor_1_mean  sensor_1_std  sensor_1_var  sensor_1_max  \
0         267753.0       4.462476    306.174474  9.374281e+04        1347.0   
1          50287.0       0.838103    335.591100  1.126214e+05        1214.0   
2          47695.0       0.794903    352.172248  1.240253e+05        4259.0   
3         117225.0       1.953717    259.030122  6.709660e+04        1220.0   
4        -239079.0      -3.984584    254.306426  6.467176e+04         972.0   
...            ...            ...           ...           ...           ...   
4515    -8140616.0    -135.674672  29486.243321  8.694385e+08       32767.0   
4516     -159989.0      -2.666439    294.261065  8.658957e+04        1032.0   
4517       71538.0       1.192280    371.072752  1.376950e+05        2641.0   
4518      193117.0       3.218563    346.656999  1.201711e+05        1494.0   
4519     -162941.0      -2.715638    277.119322  7.679512e+04        1122.0   

      sensor_1_min  sensor_1_skew  sensor_1_mad  se

In [23]:
# Final cell: its output shows that execution reached the end of the notebook.
print("Done!")

Done!
