# Exploratory Data Analysis

## Information and Conventions
https://zrp.github.io/challenges/data-science/

By convention:
 * labels = 1 will be considered **POSITIVE** cases
 * labels = 0 will be considered **NEGATIVE** cases.

## Packages

In [40]:
import pandas as pd
import plotly.express as plotly
from ast import literal_eval
import csv

## Functions

## Get raw dataset

In [41]:
# Load the challenge dataset straight from the public repository and preview
# the first rows.
DATA_URL = 'https://raw.githubusercontent.com/zrp/challenges/master/data-science/data.csv'
df_raw = pd.read_csv(DATA_URL)
df_raw.head()

Unnamed: 0,read0,read1,read2,read3,read4,read5,read6,read7,read8,read9,start_timestamp,end_timestamp,inference
0,"[-0.29, -0.07]","[0.24, -0.62]","[-0.29, 0.14]","[0.1, 0.14]","[0.41, -0.45]","[0.41, 0.22]","[0.36, 0.03]","[0.37, 0.42]","[0.19, 0.42]","[0.19, 0.01]",1665656955,1665656967,1
1,"[-0.25, -0.04]","[0.04, 0.19]","[0.11, 0.19]","[-0.23, 0.07]","[-0.23, -0.38]","[-0.48, -0.37]","[-0.42, -0.33]","[-0.23, -0.33]","[-0.23, -0.2]","[-0.12, -0.31]",1665656968,1665656980,1
2,"[-0.59, -0.27]","[-0.42, -0.27]","[-0.42, -0.34]","[-0.35, -0.37]","[-0.39, -0.36]","[-0.47, -0.36]","[-0.12, -0.39]","[-0.12, -0.06]","[-0.01, -0.05]","[-0.11, 0.46]",1665656982,1665656994,1
3,"[-0.2, -0.15]","[-0.44, -0.31]","[-0.65, -0.31]","[-0.7, -0.17]","[-0.7, -0.15]","[-0.63, -0.12]","[-0.56, -0.13]","[-0.46, -0.13]","[-0.46, -0.18]","[-0.52, -0.15]",1665656914,1665656926,1
4,"[-0.35, -0.19]","[-0.61, -0.19]","[-0.61, -0.15]","[-0.61, -0.13]","[-0.59, -0.15]","[-0.56, -0.15]","[0.44, 0.27]","[0.44, 0.15]","[-0.59, -0.25]","[-0.56, -0.22]",1665656928,1665656940,1


In [42]:
# Inspect one raw cell to see how the read columns were parsed by read_csv.
first_raw_read = df_raw.iloc[0]['read0']
print('Value:', first_raw_read, '\n', 'Type:', type(first_raw_read))

Value: [-0.29, -0.07] 
 Type: <class 'str'>


Observing the raw dataset, we can see that the columns read0, read1, ..., read9 are strings. We need to convert them to lists of floats to be able to use them in our analysis.

In [43]:
# Parse the stringified cells of the first ten columns (read0 ... read9) into
# real Python lists.
#
# `DataFrame.iteritems()` is deprecated and was removed in pandas 2.0, and
# assigning columns back into a slice of `df_raw` triggers SettingWithCopy
# behaviour. Applying the parser column by column builds a brand-new frame
# instead, so `df_raw` is never touched and the cell works on old and new
# pandas alike.
df_reads = df_raw.iloc[:, :10].apply(lambda column: column.map(literal_eval))
df_reads

  for name, column in df_reads.iteritems():


Unnamed: 0,read0,read1,read2,read3,read4,read5,read6,read7,read8,read9
0,"[-0.29, -0.07]","[0.24, -0.62]","[-0.29, 0.14]","[0.1, 0.14]","[0.41, -0.45]","[0.41, 0.22]","[0.36, 0.03]","[0.37, 0.42]","[0.19, 0.42]","[0.19, 0.01]"
1,"[-0.25, -0.04]","[0.04, 0.19]","[0.11, 0.19]","[-0.23, 0.07]","[-0.23, -0.38]","[-0.48, -0.37]","[-0.42, -0.33]","[-0.23, -0.33]","[-0.23, -0.2]","[-0.12, -0.31]"
2,"[-0.59, -0.27]","[-0.42, -0.27]","[-0.42, -0.34]","[-0.35, -0.37]","[-0.39, -0.36]","[-0.47, -0.36]","[-0.12, -0.39]","[-0.12, -0.06]","[-0.01, -0.05]","[-0.11, 0.46]"
3,"[-0.2, -0.15]","[-0.44, -0.31]","[-0.65, -0.31]","[-0.7, -0.17]","[-0.7, -0.15]","[-0.63, -0.12]","[-0.56, -0.13]","[-0.46, -0.13]","[-0.46, -0.18]","[-0.52, -0.15]"
4,"[-0.35, -0.19]","[-0.61, -0.19]","[-0.61, -0.15]","[-0.61, -0.13]","[-0.59, -0.15]","[-0.56, -0.15]","[0.44, 0.27]","[0.44, 0.15]","[-0.59, -0.25]","[-0.56, -0.22]"
...,...,...,...,...,...,...,...,...,...,...
3510,"[0.08, -1.27]","[0.07, -1.27]","[0.07, -1.27]","[0.08, -1.27]","[0.08, -1.27]","[0.07, -1.27]","[0.08, -1.27]","[0.1, -1.27]","[0.08, -1.27]","[0.09, -1.27]"
3511,"[0.02, -1.27]","[0.25, -1.27]","[0.18, -1.27]","[0.17, -1.27]","[0.19, -1.27]","[0.18, -1.27]","[0.18, -1.27]","[0.06, -1.14]","[-1.18, -0.71]","[-0.35, -1.27]"
3512,"[-0.04, -1.27]","[0.03, -1.27]","[0.02, -1.27]","[0.02, -1.27]","[0, -1.27]","[0.02, -1.27]","[0.01, -1.27]","[0.01, -1.27]","[0.01, -1.27]","[-0.8, -1.07]"
3513,"[-0.09, -1.18]","[0.14, -1.27]","[0.16, -1.27]","[0.15, -1.27]","[0.14, -1.27]","[0.16, -1.27]","[0.15, -1.27]","[0.15, -1.27]","[0.15, -1.27]","[0.15, -1.27]"


In [44]:
# Confirm the conversion: the cell should now be a list and its elements floats.
first_read = df_reads.iloc[0]['read0']
first_element = first_read[0]
print(f"Value: {first_read} \n"
      f"Type: {type(first_read)} \n"
      f"\n"
      f"Element value: {first_element} \n"
      f"Element type: {type(first_element)}")

Value: [-0.29, -0.07] 
Type: <class 'list'> 

Element value: -0.29 
Element type: <class 'float'>


Now let's bring in the remaining columns of df_raw and create a few more.

## Preparing df_min_max
Dataset containing the values of t_end and t_start outside lists

The motivation for creating this dataset is to analyze each value (min or max) individually, because keeping these values inside a list makes the analysis more difficult.

In [45]:
# Unpack every list cell so each list element gets its own row; rows that come
# from the same original record keep that record's index label.
df_start_end = df_reads.apply(lambda column: column.explode())
# Persist the exploded table for later use.
df_start_end.to_csv('../assets/df_start_end.csv')
df_start_end

Unnamed: 0,read0,read1,read2,read3,read4,read5,read6,read7,read8,read9
0,-0.29,0.24,-0.29,0.1,0.41,0.41,0.36,0.37,0.19,0.19
0,-0.07,-0.62,0.14,0.14,-0.45,0.22,0.03,0.42,0.42,0.01
1,-0.25,0.04,0.11,-0.23,-0.23,-0.48,-0.42,-0.23,-0.23,-0.12
1,-0.04,0.19,0.19,0.07,-0.38,-0.37,-0.33,-0.33,-0.2,-0.31
2,-0.59,-0.42,-0.42,-0.35,-0.39,-0.47,-0.12,-0.12,-0.01,-0.11
...,...,...,...,...,...,...,...,...,...,...
3512,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.07
3513,-0.09,0.14,0.16,0.15,0.14,0.16,0.15,0.15,0.15,0.15
3513,-1.18,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27,-1.27
3514,0.15,0.15,0.14,0.15,0.14,0.13,0.1,0.09,0.07,0.04


## Preparing df_diff
Dataset containing in each cell the difference between the values of t_end and t_start

The motivation to create this dataset is to create features that can be input to a machine learning model. Lists cannot be used as input. The difference between these values can be a promising feature to get good results for our models.

In [46]:
# Build the feature table: for every read cell, subtract the first list element
# from the second one (apply returns a new frame, so df_reads is left intact).
df_diff = df_reads.apply(
    lambda column: column.apply(lambda pair: pair[1] - pair[0])
)
# Rename the columns: read0 -> diff_read0
df_diff = df_diff.rename(columns=lambda label: label.replace('read', 'diff_read'))
# Row-wise mean over the ten difference columns.
df_diff['mean'] = df_diff.mean(axis=1)

# Carry over the timestamps from the raw data and derive the window length.
for stamp in ('start_timestamp', 'end_timestamp'):
    df_diff[stamp] = df_raw[stamp]
# NOTE: the column name 'diff_timestemp' (sic) is written to the CSV below, so
# its spelling is kept as-is for anything that reads that file.
df_diff['diff_timestemp'] = df_diff['end_timestamp'] - df_diff['start_timestamp']
# Target label.
df_diff['inference'] = df_raw['inference']

df_diff.to_csv('../assets/df_diff.csv')
df_diff

Unnamed: 0,diff_read0,diff_read1,diff_read2,diff_read3,diff_read4,diff_read5,diff_read6,diff_read7,diff_read8,diff_read9,mean,start_timestamp,end_timestamp,diff_timestemp,inference
0,0.22,-0.86,0.43,0.04,-0.86,-0.19,-0.33,0.05,0.23,-0.18,-0.145,1665656955,1665656967,12,1
1,0.21,0.15,0.08,0.30,-0.15,0.11,0.09,-0.10,0.03,-0.19,0.053,1665656968,1665656980,12,1
2,0.32,0.15,0.08,-0.02,0.03,0.11,-0.27,0.06,-0.04,0.57,0.099,1665656982,1665656994,12,1
3,0.05,0.13,0.34,0.53,0.55,0.51,0.43,0.33,0.28,0.37,0.352,1665656914,1665656926,12,1
4,0.16,0.42,0.46,0.48,0.44,0.41,-0.17,-0.29,0.34,0.34,0.259,1665656928,1665656940,12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3510,-1.35,-1.34,-1.34,-1.35,-1.35,-1.34,-1.35,-1.37,-1.35,-1.36,-1.350,1674811711,1674811723,12,0
3511,-1.29,-1.52,-1.45,-1.44,-1.46,-1.45,-1.45,-1.20,0.47,-0.92,-1.171,1674811725,1674811737,12,1
3512,-1.23,-1.30,-1.29,-1.29,-1.27,-1.29,-1.28,-1.28,-1.28,-0.27,-1.178,1674811654,1674811667,13,1
3513,-1.09,-1.41,-1.43,-1.42,-1.41,-1.43,-1.42,-1.42,-1.42,-1.42,-1.387,1674811669,1674811681,12,1
