In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

## Useful information and variable descriptions can be found here:
https://wiki.helsinki.fi/pages/viewpage.action?pageId=243959901

https://smear.avaa.csc.fi/

https://iopscience.iop.org/article/10.1088/1748-9326/aadf3c

### Load data

In [3]:
# Read the training set and the hidden test set from the raw-data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data_raw/npf_train.csv", "../data_raw/npf_test_hidden.csv")
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train_df.describe()

Unnamed: 0,id,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
count,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0,...,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0
mean,229.5,380.996307,3.509202,381.014152,3.29557,381.906526,4.290148,380.913345,3.094905,193.28792,...,6.311782,1.652786,6.898868,1.987094,10.961236,7.845917,0.451261,0.391348,0.00305,0.000659
std,132.357471,10.897068,3.433711,10.878318,3.192464,10.482752,4.290241,10.900973,2.918037,126.058537,...,9.680534,0.972092,9.74443,1.172087,6.633754,5.042478,0.319151,0.289645,0.002257,0.000572
min,1.0,356.526871,0.053968,356.796486,0.04899,359.795102,0.152663,356.841429,0.037417,3.719434,...,-23.288234,0.0667,-22.659083,0.069879,0.29578,0.174813,0.005346,0.003261,0.000227,2.7e-05
25%,115.25,373.001701,1.085523,373.016258,1.016582,374.129292,1.222415,372.92142,1.004862,71.773357,...,-1.298557,0.83348,-0.661579,0.888506,4.937496,2.890366,0.137491,0.103036,0.0015,0.000275
50%,229.5,380.102616,2.384334,380.101303,2.246497,380.973091,2.714247,380.113959,2.137779,197.939217,...,7.320084,1.696258,7.91716,2.027125,11.702413,8.390743,0.438305,0.391399,0.002475,0.000491
75%,343.75,388.480984,4.956735,388.505367,4.645716,389.003091,6.286882,388.296775,4.376801,304.162015,...,13.964999,2.359977,14.549644,2.878753,16.72771,12.289556,0.700794,0.623115,0.004176,0.000829
max,458.0,411.34597,20.96063,411.360294,19.351218,411.014925,27.656008,411.348676,17.384845,449.450545,...,25.684185,5.124718,25.897075,5.422549,22.560576,16.692785,1.242857,1.074115,0.019094,0.003658


In [5]:
# Count how many training rows carry each `class4` label to see the class balance.
Counter(train_df["class4"])

Counter({'nonevent': 229, 'Ib': 83, 'II': 117, 'Ia': 29})

In [6]:
# Same summary for the hidden test set. In the recorded output `date` and `class4`
# have a count of 0, i.e. they contain no values in this file.
test_df.describe()

Unnamed: 0,id,date,class4,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
count,965.0,0.0,0.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,...,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0
mean,941.0,,,382.203461,3.199941,382.216331,3.022803,383.137855,4.051119,382.120952,...,5.178399,1.577155,5.740758,1.862095,10.348864,7.28727,0.418032,0.358171,0.003052,0.000685
std,278.715805,,,11.237627,3.207766,11.188357,2.981282,10.870552,4.287413,11.182388,...,9.964494,0.995852,10.101577,1.181897,6.620209,5.007745,0.311692,0.282794,0.00219,0.00068
min,459.0,,,359.240488,0.094845,359.096905,0.088273,360.350743,0.111492,358.245902,...,-23.900675,0.033329,-24.874583,0.031017,0.295937,0.121475,0.004642,0.002685,0.000243,2.3e-05
25%,700.0,,,373.913351,0.850122,374.040341,0.834333,375.273509,0.967263,374.018,...,-2.387549,0.693572,-1.820163,0.760917,3.991261,2.371487,0.11485,0.079184,0.001514,0.00027
50%,941.0,,,381.100276,2.097653,381.132113,1.968488,382.04,2.392523,380.931587,...,6.359223,1.559908,7.014207,1.900331,10.477116,7.251167,0.381297,0.327694,0.002452,0.00051
75%,1182.0,,,389.117589,4.479012,389.027135,4.216838,389.758506,5.783256,388.944095,...,13.212069,2.340095,13.860287,2.84515,16.354165,11.765043,0.673152,0.598445,0.003953,0.000848
max,1423.0,,,421.511176,22.82228,421.057843,19.881541,422.6278,40.366704,419.92451,...,27.110436,5.145668,27.938861,5.131955,22.597583,16.830501,1.197271,1.055615,0.015837,0.006277


Check whether both dataframes have the same columns

In [7]:
# Columns present in exactly one of the two frames; an empty set means the schemas match.
set(train_df.columns) ^ set(test_df.columns)

set()

In [8]:
# Full list of column names, in file order.
train_df.columns.tolist()

['id',
 'date',
 'class4',
 'partlybad',
 'CO2168.mean',
 'CO2168.std',
 'CO2336.mean',
 'CO2336.std',
 'CO242.mean',
 'CO242.std',
 'CO2504.mean',
 'CO2504.std',
 'Glob.mean',
 'Glob.std',
 'H2O168.mean',
 'H2O168.std',
 'H2O336.mean',
 'H2O336.std',
 'H2O42.mean',
 'H2O42.std',
 'H2O504.mean',
 'H2O504.std',
 'H2O672.mean',
 'H2O672.std',
 'H2O84.mean',
 'H2O84.std',
 'NET.mean',
 'NET.std',
 'NO168.mean',
 'NO168.std',
 'NO336.mean',
 'NO336.std',
 'NO42.mean',
 'NO42.std',
 'NO504.mean',
 'NO504.std',
 'NO672.mean',
 'NO672.std',
 'NO84.mean',
 'NO84.std',
 'NOx168.mean',
 'NOx168.std',
 'NOx336.mean',
 'NOx336.std',
 'NOx42.mean',
 'NOx42.std',
 'NOx504.mean',
 'NOx504.std',
 'NOx672.mean',
 'NOx672.std',
 'NOx84.mean',
 'NOx84.std',
 'O3168.mean',
 'O3168.std',
 'O342.mean',
 'O342.std',
 'O3504.mean',
 'O3504.std',
 'O3672.mean',
 'O3672.std',
 'O384.mean',
 'O384.std',
 'Pamb0.mean',
 'Pamb0.std',
 'PAR.mean',
 'PAR.std',
 'PTG.mean',
 'PTG.std',
 'RGlob.mean',
 'RGlob.std',
 '

### For non-class related exploration, we combine test and training data

In [9]:
# Stack training and test rows into one frame for label-independent exploration.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported replacement and, like `append`, keeps each
# frame's original index labels by default.
train_test_df = pd.concat([train_df, test_df])
train_test_df.shape

(1423, 104)

In [10]:
# Summary statistics of the numeric columns over the combined train + test rows.
train_test_df.describe()

Unnamed: 0,id,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,...,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,712.0,381.814932,3.299478,381.829404,3.110594,382.741545,4.128052,381.732277,2.916157,186.030856,...,5.543184,1.601497,6.113502,1.902326,10.54596,7.467073,0.428727,0.368849,0.003051,0.000676
std,410.929029,11.13962,3.284156,11.099921,3.052384,10.758982,4.28827,11.103114,2.788741,126.050826,...,9.884792,0.988557,9.999256,1.179784,6.62842,5.023965,0.314384,0.285337,0.002211,0.000647
min,1.0,356.526871,0.053968,356.796486,0.04899,359.795102,0.111492,356.841429,0.037417,3.478842,...,-23.900675,0.033329,-24.874583,0.031017,0.29578,0.121475,0.004642,0.002685,0.000227,2.3e-05
25%,356.5,373.550747,0.91721,373.650491,0.90957,374.957122,1.046471,373.566072,0.862104,63.05127,...,-1.823624,0.72967,-1.202457,0.796233,4.183164,2.512909,0.123863,0.085873,0.00151,0.000271
50%,712.0,380.819263,2.169836,380.839841,2.07253,381.731643,2.546452,380.662737,1.955902,184.53366,...,6.626599,1.616088,7.392308,1.959201,10.827529,7.554723,0.402127,0.345111,0.002456,0.0005
75%,1067.5,388.93666,4.627821,388.917618,4.325401,389.497341,5.921885,388.898661,4.068152,297.760148,...,13.510007,2.357361,14.15594,2.858339,16.462902,11.948484,0.681238,0.607423,0.004004,0.000841
max,1423.0,421.511176,22.82228,421.057843,19.881541,422.6278,40.366704,419.92451,17.985023,449.450545,...,27.110436,5.145668,27.938861,5.422549,22.597583,16.830501,1.242857,1.074115,0.019094,0.006277


### Observations

* The `std` row computed by pandas' `describe()` differs from the values stored in the dataset's own `*.std` columns.

#### The documentation states that "Code -999 is used to indicate missing data." Let's investigate

In [11]:
# Show every row in which at least one column equals the -999 missing-data code.
# `axis` is passed by keyword: pandas 2.0 made the arguments of `DataFrame.any`
# keyword-only, so the positional form `.any(1)` raises a TypeError there.
train_test_df[train_test_df.eq(-999).any(axis=1)]

Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std


### From which time span do we have the data?

In [12]:
# Earliest and latest date in the training set. The recorded values look like
# ISO-formatted strings (YYYY-MM-DD), for which string order is chronological.
train_dates = train_df["date"]
min(train_dates), max(train_dates)

('2000-01-01', '2011-08-22')

The documentation states "Note! Between 1.1.1996 - 27.2.2007 a molybdenum converter was used to convert
NO2 --> NO. This method also converts other reactive nitrogen oxide species (e.g. PAN) to NO and thus the NOx signal should
be considered as an upper estimate for NO+NO2 concentration. Starting 1.3.2007 an NO2 specific photolytic converter
(Blue Light Converter, Droplet Measurement Technologies, Boulder, CO, USA) was taken into use removing this interference
from the NOx signal."

### Correlations of variables and classes

In [13]:
# Pairwise correlations (pandas' default Pearson method) between the measurement columns.
# After dropping the bookkeeping columns the string-valued `class4` label is
# still in the frame. Older pandas silently ignored non-numeric columns in
# `corr()`, but pandas >= 2.0 raises on them, so numeric/bool columns are
# selected explicitly. This works on every pandas version.
(
    train_df
    .drop(columns=['id', 'partlybad', 'date'])
    .select_dtypes(include=['number', 'bool'])
    .corr()
)

Unnamed: 0,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,Glob.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
CO2168.mean,1.000000,-0.371986,0.999712,-0.371666,0.993856,-0.390283,0.998910,-0.361283,-0.435391,-0.473683,...,-0.605413,-0.334337,-0.605420,-0.349839,-0.457761,-0.481379,-0.457928,-0.472477,-0.101836,-0.094851
CO2168.std,-0.371986,1.000000,-0.381345,0.995680,-0.309975,0.949938,-0.395271,0.982188,0.231499,0.326488,...,0.657999,0.231093,0.651299,0.242213,0.308528,0.370380,0.402900,0.435813,0.339732,0.301349
CO2336.mean,0.999712,-0.381345,1.000000,-0.380038,0.992314,-0.400475,0.999671,-0.369085,-0.431139,-0.469831,...,-0.606690,-0.330811,-0.606360,-0.347749,-0.453938,-0.478239,-0.454890,-0.470099,-0.106241,-0.098116
CO2336.std,-0.371666,0.995680,-0.380038,1.000000,-0.311252,0.937549,-0.393088,0.990299,0.228482,0.323010,...,0.657472,0.237584,0.651183,0.241119,0.305582,0.366545,0.401436,0.433594,0.342769,0.305395
CO242.mean,0.993856,-0.309975,0.992314,-0.311252,1.000000,-0.308914,0.990206,-0.302951,-0.430814,-0.463592,...,-0.565764,-0.333168,-0.567263,-0.342203,-0.447410,-0.466891,-0.437610,-0.449936,-0.079033,-0.079144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UV_A.std,-0.481379,0.370380,-0.478239,0.366545,-0.466891,0.410055,-0.477272,0.348281,0.946880,0.989021,...,0.715570,0.765915,0.721835,0.818792,0.979265,1.000000,0.959919,0.970096,0.131432,0.219691
UV_B.mean,-0.457928,0.402900,-0.454890,0.401436,-0.437610,0.447477,-0.454433,0.383682,0.931324,0.940441,...,0.757144,0.750153,0.761630,0.807038,0.962875,0.959919,1.000000,0.989872,0.170800,0.223744
UV_B.std,-0.472477,0.435813,-0.470099,0.433594,-0.449936,0.479739,-0.469986,0.417208,0.902556,0.943893,...,0.780484,0.735562,0.784677,0.787374,0.945881,0.970096,0.989872,1.000000,0.183513,0.246996
CS.mean,-0.101836,0.339732,-0.106241,0.342769,-0.079033,0.321115,-0.111163,0.350140,0.066414,0.090151,...,0.384233,0.163052,0.380446,0.169666,0.111065,0.131432,0.170800,0.183513,1.000000,0.489783


In [18]:
# Build the modelling frame: drop the bookkeeping columns and add a binary target.
clean_df = train_df.drop(columns=['id', 'partlybad', 'date'])

# class2 is 0 where class4 == 'nonevent' and 1 for every other label.
# A vectorised comparison gives the same values as a per-row lambda.
clean_df['class2'] = (train_df['class4'] != 'nonevent').astype('int64')

# Y is the binary target; X holds every remaining column except the two labels.
Y = clean_df['class2']
X = clean_df.drop(columns=['class2', 'class4'])

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold
# (LogisticRegression, SelectKBest and chi2 are imported but not used in this cell.)

# Remove low-variance features; the cut-off is p * (1 - p) with p = 0.8.
variance_cutoff = .8 * (1 - .8)
feature_selection = VarianceThreshold(threshold=variance_cutoff)
X_temp = feature_selection.fit_transform(X)
print(X_temp.shape)

# Fit a linear SVM and report its mean accuracy on the same rows it was
# trained on, so this is a training score rather than a held-out estimate.
SVM = svm.LinearSVC(dual=False)
SVM.fit(X_temp, Y)
SVM.score(X_temp, Y)

(458, 80)


0.8908296943231441

In [66]:
# Load the hidden test set and drop the same non-feature columns that were
# removed from the training frame (the test file has no `class2` column).
# NOTE(review): this reads from "../data/" while the earlier load cell reads the
# same file name from "../data_raw/" -- confirm which location is intended.
X_test = (
    pd.read_csv("../data/npf_test_hidden.csv")
    .drop(columns=['id', 'partlybad', 'date', 'class4'])
)

# Apply the selector that was fitted on the training features. Calling
# fit_transform here would refit it on the test data, which can select a
# different set of columns than the one the SVM was trained on.
X_test = feature_selection.transform(X_test)

SVM.predict(X_test)

KeyError: "['class2'] not found in axis"