# 2 Data wrangling<a id='2_Data_wrangling'></a>

### 2.1 Importing Necessary Modules and Data

In [2]:
import pandas as pd
import os.path
from os import path

import warnings
# Silence every warning category globally. Note that this also hides pandas
# deprecation/future warnings raised by the cells below.
warnings.filterwarnings("ignore")

### 2.2 Creating Features

Our data consists of readings from 10 sensors placed around volcanoes. These sensor readings have been normalized and separated into individual CSV files based on the volcano they were recording. The target data is provided in a separate CSV file that links each segment ID (the name of the file our features come from) to the time to next eruption. Due to the size of our data (14 GB) and the way it is organized, we will create new features that consolidate as much of the data as we can into a smaller space and into a form that is better suited as input for our models.

The raw data cannot be attached to this project, but it can be found at https://www.kaggle.com/c/predict-volcanic-eruptions-ingv-oe/data?select=train.csv
To rebuild the features, place train.csv and the train folder in the predict-volcanic-eruptions folder; most of the project, however, focuses on the attached Data.csv.

In [3]:
# Load the target table from train.csv; it is merged with the engineered features on segment_id.
targetData = pd.read_csv(filepath_or_buffer='predict-volcanic-eruptions/train.csv')

In [8]:
def _sensorFeatures(sensorData):
    """Summarise one sensor's readings as a dict of feature suffix -> value.

    Parameters
    ----------
    sensorData : pandas.Series
        The readings of a single sensor column from one segment file.

    Returns
    -------
    dict
        Maps each feature suffix (e.g. '_mean', '_95percentile') to its value.
        Insertion order defines the column order of the output table.
    """
    # (suffix, quantile level) pairs. Keeping label and level side by side
    # guarantees a percentile can never be stored under the wrong column name.
    percentileLevels = (
        ('_01percentile', 0.01), ('_05percentile', 0.05),
        ('_10percentile', 0.10), ('_20percentile', 0.20),
        ('_25percentile', 0.25), ('_35percentile', 0.35),
        ('_45percentile', 0.45), ('_65percentile', 0.65),
        ('_75percentile', 0.75), ('_80percentile', 0.80),
        ('_90percentile', 0.90), ('_95percentile', 0.95),
        ('_99percentile', 0.99),
    )

    meanValue = sensorData.mean()
    maxValue = sensorData.max()
    minValue = sensorData.min()
    previousValue = sensorData.shift(1)
    nextValue = sensorData.shift(-1)

    features = {'_mean': meanValue, '_median': sensorData.median()}
    for suffix, level in percentileLevels:
        features[suffix] = sensorData.quantile(level)

    features['_std'] = sensorData.std()
    features['_var'] = sensorData.var()
    features['_max'] = maxValue
    features['_min'] = minValue
    features['_range'] = maxValue - minValue
    features['_kurtosis'] = sensorData.kurtosis()
    features['_skew'] = sensorData.skew()
    features['_sum'] = sensorData.sum()
    # Mean absolute deviation around the mean, computed by hand because
    # Series.mad()/DataFrame.mad() no longer exist in pandas >= 2.0.
    features['_meanAbsDev'] = (sensorData - meanValue).abs().mean()
    # A local minimum/maximum is a reading strictly below/above both neighbours.
    # Comparisons against NaN (including the shifted-in edges) are False.
    features['_localMinCount'] = ((previousValue > sensorData) & (nextValue > sensorData)).sum()
    features['_localMaxCount'] = ((previousValue < sensorData) & (nextValue < sensorData)).sum()
    features['_isNullCount'] = sensorData.isna().sum()
    return features


# Build the consolidated feature table only when no cached copy exists.
# NOTE: a Data.csv produced by an earlier version of this cell has the
# 95th/99th percentile column labels swapped; delete it to trigger a rebuild.
if not path.exists('predict-volcanic-eruptions/Data.csv'):
    trainDir = 'predict-volcanic-eruptions/train/'
    records = []

    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for fileName in sorted(os.listdir(trainDir)):
        # Only '<segment_id>.csv' entries are segment files. partition() also
        # copes with names that contain no dot at all (e.g. sub-directories).
        segmentID, _, extension = fileName.partition('.')
        if extension != 'csv':
            continue

        rawData = pd.read_csv(trainDir + fileName).astype('float32')

        # One output row per segment: its id followed by every sensor's features.
        record = {'segment_id': segmentID}
        for sensorName in rawData.columns:
            for suffix, value in _sensorFeatures(rawData[sensorName]).items():
                record[sensorName + suffix] = value
        records.append(record)

    if not records:
        # Fail loudly instead of caching an empty Data.csv.
        raise FileNotFoundError('No .csv segment files found in ' + trainDir)

    # Build the frame once from all records rather than growing it row by row.
    featureData = pd.DataFrame(records)
    featureData['segment_id'] = featureData['segment_id'].astype('int64')

    # Attach the target (time_to_eruption) to each segment's features.
    fullData = featureData.merge(targetData, on='segment_id')

    fullData.to_csv('predict-volcanic-eruptions/Data.csv', index=False)

We have created our own features consisting of basic aggregate functions such as mean, median and range, as well as others such as percentiles at many different levels, the number of peaks (local minima and maxima) in each file, and the count of null readings each sensor may have. After turning all our data into features, we have 272 columns: 1 segment ID, 270 features (27 for each of the 10 sensors) and 1 target, with 4431 rows of data. This is good because that's how many files were provided.

In [9]:
# Read the consolidated feature/target table back from disk and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='predict-volcanic-eruptions/Data.csv')
data.head(n=5)

Unnamed: 0,segment_id,sensor_1_mean,sensor_1_median,sensor_1_01percentile,sensor_1_05percentile,sensor_1_10percentile,sensor_1_20percentile,sensor_1_25percentile,sensor_1_35percentile,sensor_1_45percentile,...,sensor_10_min,sensor_10_range,sensor_10_kurtosis,sensor_10_skew,sensor_10_sum,sensor_10_meanAbsDev,sensor_10_localMinCount,sensor_10_localMaxCount,sensor_10_isNullCount,time_to_eruption
0,1000015382,0.382244,0.0,-277.0,-174.0,-130.0,-83.0,-66.0,-38.0,-12.0,...,-2961.0,6140.0,14.978788,0.058227,53806.0,163.679382,8368,8380,0,16258654
1,1000554676,-3.82812,0.0,-1252.0,-878.0,-686.0,-446.0,-356.0,-206.0,-67.0,...,-4329.0,8771.0,0.160791,0.004739,-445008.0,835.125977,2727,2725,0,6347792
2,1000745424,8.291928,0.0,-1392.0,-989.0,-765.0,-497.0,-400.0,-230.0,-74.0,...,-5040.0,10270.0,0.193508,-0.02548,-89519.0,972.49646,2347,2358,0,5120693
3,1001461087,2.071582,0.0,-1017.0,-645.0,-485.0,-310.0,-246.0,-141.0,-46.0,...,-4634.0,10422.0,2.73242,-0.051502,-82408.0,548.506348,4988,4995,0,10393161
4,1001732002,0.904102,0.0,-702.0,-465.0,-358.0,-233.0,-187.0,-104.0,-33.0,...,-3909.0,8483.0,1.135692,0.375558,1922895.0,691.087891,2808,2825,0,20549733


In [10]:
# Summary statistics for every numeric column; a count below the row total signals missing values.
data.describe()

Unnamed: 0,segment_id,sensor_1_mean,sensor_1_median,sensor_1_01percentile,sensor_1_05percentile,sensor_1_10percentile,sensor_1_20percentile,sensor_1_25percentile,sensor_1_35percentile,sensor_1_45percentile,...,sensor_10_min,sensor_10_range,sensor_10_kurtosis,sensor_10_skew,sensor_10_sum,sensor_10_meanAbsDev,sensor_10_localMinCount,sensor_10_localMaxCount,sensor_10_isNullCount,time_to_eruption
count,4431.0,4407.0,4407.0,4407.0,4407.0,4407.0,4407.0,4407.0,4407.0,4407.0,...,4407.0,4407.0,4407.0,4407.0,4431.0,4407.0,4431.0,4431.0,4431.0,4431.0
mean,1074694000.0,-0.527495,0.0,-1587.266751,-1055.159519,-807.139959,-520.793601,-415.341275,-235.818698,-75.961232,...,-5276.923304,10533.836397,2.614465,-0.001172,-19701.77,821.564603,4713.465809,4713.215753,427.888287,22848910.0
std,616196600.0,17.960202,0.0,3494.844624,2529.406416,2015.253541,1301.810573,1035.677944,588.758236,189.79118,...,5208.049716,10341.172094,8.776947,0.122823,2005872.0,1470.17691,3045.326741,3045.458203,4560.298459,13484390.0
min,513181.0,-595.469238,0.0,-32767.0,-32767.0,-32767.0,-21859.0,-17308.0,-9715.0,-3040.0,...,-32767.0,2050.0,-1.402664,-2.05786,-64194820.0,137.147858,0.0,0.0,0.0,6250.0
25%,552793400.0,-1.794853,0.0,-1307.5,-885.0,-676.0,-437.0,-347.5,-197.0,-64.0,...,-5169.5,6296.5,0.30438,-0.035839,-198759.0,441.303497,2656.0,2652.0,0.0,11270160.0
50%,1066153000.0,0.02815,0.0,-882.0,-580.0,-443.0,-287.0,-229.0,-130.0,-42.0,...,-4126.0,8271.0,0.795795,-0.001301,5861.0,558.057434,3349.0,3344.0,0.0,22465590.0
75%,1606350000.0,1.840611,0.0,-675.0,-441.0,-335.0,-217.0,-173.0,-98.0,-31.0,...,-3117.0,10267.0,2.358021,0.03372,227012.5,757.297546,5911.5,5910.5,0.0,34343560.0
max,2146939000.0,341.21994,0.0,-232.0,-152.0,-114.0,-71.0,-57.0,-32.0,-9.0,...,-1004.0,65534.0,234.33754,1.775161,61340240.0,20721.322266,17408.0,17407.0,60001.0,49046090.0


In [11]:
# Count the missing values in each column.
print(data.isnull().sum())

segment_id                  0
sensor_1_mean              24
sensor_1_median            24
sensor_1_01percentile      24
sensor_1_05percentile      24
                           ..
sensor_10_meanAbsDev       24
sensor_10_localMinCount     0
sensor_10_localMaxCount     0
sensor_10_isNullCount       0
time_to_eruption            0
Length: 272, dtype: int64


Taking a quick look, we can see that we have missing data, which could be due to sensors missing data in our original set. One thing we can do right away: all of our median values appear to be 0, which makes sense since the original data has already been normalized. So it should be safe to fill in 0 for all of the medians that are missing.

In [19]:
# Median columns are those whose underscore-separated name contains the token 'median'.
medianColumns = [name for name in data.columns if 'median' in name.split('_')]

# Assign the filled columns back to the frame explicitly. Calling
# fillna(..., inplace=True) on a column selection such as data[name] acts on a
# temporary object and leaves `data` unchanged under pandas Copy-on-Write.
if medianColumns:
    data[medianColumns] = data[medianColumns].fillna(0)

print(data.isna().sum())

segment_id                  0
sensor_1_mean              24
sensor_1_median             0
sensor_1_01percentile      24
sensor_1_05percentile      24
                           ..
sensor_10_meanAbsDev       24
sensor_10_localMinCount     0
sensor_10_localMaxCount     0
sensor_10_isNullCount       0
time_to_eruption            0
Length: 272, dtype: int64
