# EDA

**Import Libraries**

In [None]:
import warnings
# Silence every warning for the rest of the notebook session (this also hides
# deprecation notices raised by the plotting and analysis libraries).
warnings.filterwarnings('ignore')

In [None]:
from matplotlib import pyplot as plt
import json
import pandas as pd
import numpy as np
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import stumpy
from kando import kando_client

**Functions**

In [None]:
def create_heat_map(data):
    """
    Plot a heatmap of the pairwise correlation between the features.

    Only the part of the matrix strictly above the diagonal is drawn; the
    lower triangle and the diagonal are masked out.

    :param data: data set (pandas DataFrame) whose columns are the features
    """

    # Compute the correlation matrix once and reuse it for the mask and plot.
    corr = data.corr()

    plt.figure(figsize=(10, 10))

    # Builtin ``bool`` instead of ``np.bool``: the alias was removed from NumPy.
    mask = np.tril(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr,
                annot=True,
                fmt=".2f",
                mask=mask,
                square=True,
                linecolor='white',
                linewidths=1)

    # With many features the tick labels overlap unless they are rotated.
    if data.shape[1] > 10:
        plt.xticks(rotation=55)
        plt.yticks(rotation=0)

    plt.title('\nFeatures Correlation', fontsize=18)

In [None]:
def connect():
    """
    Create a Kando API client using the credentials stored in ../key.json
    (the file must provide the 'key' and 'secret' entries).
    :return: client
    """

    with open('../key.json') as key_file:
        credentials = json.load(key_file)

    return kando_client.client(
        "https://kando-staging.herokuapp.com",
        credentials['key'],
        credentials['secret'],
    )

In [None]:
def find_similars(col, profiles=None):
    """
    Find every subsequence whose nearest neighbour is a global minimum of the
    matrix profile.

    :param col: chosen column; its profile is looked up under the key
        ``f'mp_{col}'``
    :param profiles: optional mapping of ``'mp_<col>'`` -> 2-D array whose
        column 0 is compared against its minimum and whose column 1 is matched
        against the positions of that minimum. Defaults to the notebook-level
        ``mps`` dictionary, which keeps existing ``find_similars(col)`` calls
        working.
    :return: list of indices of similar value in the matrix profile, grouped
        by minimum position in ascending order
    """

    if profiles is None:
        # Fall back to the notebook global filled by the stumpy.stump cell.
        profiles = mps

    # Look the profile up once instead of re-indexing the dict on every use.
    profile = profiles[f'mp_{col}']
    distances = profile[:, 0]
    neighbours = profile[:, 1]

    # Positions at which the profile reaches its global minimum.
    min_indices = np.argwhere(distances == distances.min()).flatten()

    similars = []
    for index in min_indices:
        # Every position whose recorded neighbour is this minimum position.
        similars.extend(np.argwhere(neighbours == index).flatten())
    return similars

**Connect to API**

In [None]:
# Open the API session used by the data-loading cells below.
client = connect()

**Load data**

In [None]:
# Fetch everything the API returns for monitoring point 1378.
# NOTE(review): `start=2020` may be a year or an epoch timestamp - confirm
# against the kando client documentation.
df_Train = client.get_all(point_id=1378, start=2020)

In [None]:
# Inspect the raw API response.
df_Train

In [None]:
# Keep only the 'samplings' entry of the response and show how many it holds.
samples = df_Train['samplings']
len(samples)

In [None]:
# One row per sampling: the keys of `samples` become the row labels.
df = pd.DataFrame.from_dict(samples, orient='index')
# Remove the sampling stored under the key '1483506780'.
df.drop(['1483506780'], inplace=True)
# DataFrame.drop returns a new frame unless inplace=True; the original call
# discarded that result, so the three columns were never actually removed.
df = df.drop(['visit', 'Battery', 'Signal'], axis=1)
# Keep the cell output: show the cleaned frame.
df

In [None]:
# The original scaled every value by 1e9 before calling pd.to_datetime (which
# reads bare numbers as nanoseconds), i.e. 'DateTime' holds epoch seconds.
# Passing unit='s' does the same conversion in one vectorised call.
df['DateTime'] = pd.to_datetime(df['DateTime'], unit='s')
# Use the timestamps as the index so the plots below get a time axis.
df = df.set_index('DateTime')

In [None]:
# Preview the first 100 rows.
df.head(100)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Maps a column name to the human-readable label used in plot titles.
data_dic = {
    # Fixed spelling and removed the trailing space, which produced a double
    # space in titles such as "<label> OVER TIME".
    'PI': 'Pollution_Index',
    'EC': 'Electrical_Conductivity',
    'PH': 'pH',
    'WL': 'Water_Level',
    'ORP': 'Oxidation_reduction_potential',
    'TEMPERATURE': 'Temperature',
    'COD': 'Chemical_Oxygen_Demand',
    'TSS': 'Total_suspended_solids',
    'FLOW': 'Flow',
    'Battery': 'Battery_Level',
    'Signal': 'Signal',
    'MS': 'MS',
    'gaps': 'gaps',
}

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of distinct values per column (constant columns show up as 1).
df.nunique()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# List the available column names.
df.columns

**Plot**

In [None]:
# Columns plotted, correlated and decomposed in the cells below.
cols_to_plot = ['PI', 'EC', 'PH', 'ORP', 'TEMPERATURE']

In [None]:
# One full-width line chart per selected column, titled with its readable label.
for col_ in cols_to_plot:
    fig, ax = plt.subplots(figsize=(16, 6))
    df[col_].plot(ax=ax)
    ax.set_title(data_dic[col_] + ' OVER TIME', fontsize=18)
    ax.set_xlabel('Time')
    ax.set_ylabel(col_)
    plt.show()

In [None]:
# Correlation heatmap restricted to the selected columns.
create_heat_map(df[cols_to_plot])

In [None]:
# Make sure the index is a DatetimeIndex before the decomposition below.
# NOTE(review): the index was already built from pd.to_datetime values in an
# earlier cell, so this looks redundant - confirm before removing.
df.index = pd.to_datetime(df.index)

In [None]:
# Plot the trend, residual and seasonal components of every selected column.
# NOTE(review): period=15 is taken over from the original code - confirm that
# it matches the intended seasonality for this sampling rate.
for col_ in cols_to_plot:
    # Guard only the decomposition: it is the call expected to reject a
    # column (e.g. one with missing values). A bare `except` around the whole
    # body also hid plotting errors and swallowed KeyboardInterrupt.
    try:
        decomposed = seasonal_decompose(df[col_], period=15)
    except (ValueError, TypeError) as err:
        # Report the real reason instead of always blaming NaN values.
        print(f'skipping {col_}: seasonal decomposition failed ({err})')
        continue

    plt.figure(figsize=(16, 6))
    decomposed.trend.plot()
    decomposed.resid.plot()
    decomposed.seasonal.plot()
    plt.legend()
    plt.title(str(col_) + ' OVER TIME', fontsize=18)
    plt.xlabel('Time')
    plt.ylabel(str(col_))
    plt.show()

In [None]:
# Load the second data set from sorek.csv, using its first column as the index.
BH_data = pd.read_csv('sorek.csv', index_col=0)

In [None]:
# Preview the first rows.
BH_data.head()

In [None]:
# Column dtypes and non-null counts.
BH_data.info()

In [None]:
# Number of distinct values per column.
BH_data.nunique()

In [None]:
# Print the dtype of every column, one per line, in column order.
for col in BH_data.columns:
    column_dtype = BH_data[col].dtype
    print(column_dtype)

In [None]:
# Keep the float64 columns that hold more than one distinct value
# (constant columns carry nothing worth plotting).
cols_to_plot = [
    name for name, column in BH_data.items()
    if column.dtype == 'float64' and column.nunique() > 1
]

In [None]:
# Show which columns passed the filter.
cols_to_plot

In [None]:
# One full-width line chart per selected column.
for col_ in cols_to_plot:
    plt.figure(figsize=(16, 6))
    BH_data[col_].plot()
    # cols_to_plot now holds every varying float column of the CSV, which may
    # include names missing from data_dic; fall back to the raw column name
    # instead of raising KeyError.
    plt.title(data_dic.get(col_, col_) + ' OVER TIME', fontsize=18)
    plt.xlabel('Time')
    plt.ylabel(str(col_))
    plt.show()

In [None]:
# Correlation heatmap of the selected columns.
create_heat_map(BH_data[cols_to_plot])

In [None]:
# Plot the trend, residual and seasonal components of every selected column.
# NOTE(review): period=15 is taken over from the original code - confirm that
# it matches the intended seasonality for this sampling rate.
for col_ in cols_to_plot:
    # Guard only the decomposition: it is the call expected to reject a
    # column (e.g. one with missing values). A bare `except` around the whole
    # body also hid plotting errors and swallowed KeyboardInterrupt.
    try:
        decomposed = seasonal_decompose(BH_data[col_], period=15)
    except (ValueError, TypeError) as err:
        # Report the real reason instead of always blaming NaN values.
        print(f'skipping {col_}: seasonal decomposition failed ({err})')
        continue

    plt.figure(figsize=(16, 6))
    decomposed.trend.plot()
    decomposed.resid.plot()
    decomposed.seasonal.plot()
    plt.legend()
    plt.title(str(col_) + ' OVER TIME', fontsize=18)
    plt.xlabel('Time')
    plt.ylabel(str(col_))
    plt.show()

In [None]:
# Look at 20 randomly drawn PI readings.
BH_data['PI'].sample(n=20)

In [None]:
# Columns for which a matrix profile is computed and motifs are searched.
motif_detection_columns = ['PI', 'EC', 'PH', 'ORP', 'TEMPERATURE']

In [None]:
# Preview the data used for motif detection.
BH_data[motif_detection_columns]

In [None]:
# Subsequence (window) length shared by all matrix-profile cells; the plot
# titles below refer to it as a "1 day" window.
m = 96
# Matrix profile of each motif column, stored under the key 'mp_<column>'.
mps = dict()
for col_ in motif_detection_columns:
    profile_key = f'mp_{col_}'
    mps[profile_key] = stumpy.stump(BH_data[col_], m=m)

In [None]:
# For every motif column draw the raw signal and, below it, column 0 of its
# matrix profile (labelled as the distance to the nearest neighbour).
for col_ in motif_detection_columns:
    plt.figure(figsize=(15, 5))
    BH_data[col_].plot()
    plt.title(data_dic[col_] + ' OVER TIME', fontsize=20)
    plt.xlabel('Time')
    plt.ylabel(f'{col_} level', fontsize=20)
    plt.show()

    plt.figure(figsize=(15, 5))
    # `color=` instead of `C=`: property names are case-sensitive, so the
    # upper-case keyword is rejected by current Matplotlib releases.
    plt.plot(mps[f'mp_{col_}'][:, 0], color='g')
    plt.title(data_dic[col_] + ' 1 day window matrix profile', fontsize=20)
    plt.xlabel('index')
    plt.ylabel('Distance to \n nearest neighbor', fontsize=18)
    plt.show()
    # Visual separator between columns in the notebook output.
    print('\n\n\n')


In [None]:
# For every motif column overlay the best-matching pair of windows: the
# window at the global minimum of the matrix profile and its nearest
# neighbour.
for col_ in motif_detection_columns:
    plt.figure(figsize=(15, 5))
    profile = mps[f'mp_{col_}']
    matrix_profile = profile[:, 0]

    plt.title(
        data_dic[f'{col_}'] +
        f' most similar 1 day motif \n distance between neighbors {round(matrix_profile.min(),2)}',
        fontsize=20)

    # First position at which the profile reaches its minimum (computed once).
    min_index = np.argwhere(
        matrix_profile == matrix_profile.min()).flatten()[0]
    # Read the partner from the neighbour-index column (column 1). The
    # original took the *second* position holding the minimum value, which
    # raises IndexError whenever the minimum is unique.
    similar_index = int(profile[min_index, 1])

    series = BH_data[f'{col_}']
    # Profile positions are positional offsets, hence .iloc.
    motif_1 = series.iloc[min_index:min_index + m]
    motif_1.plot(label=f'[{min_index}:{min_index+m}]')
    motif_2 = series.iloc[similar_index:similar_index + m]
    motif_2.plot(label=f'[{similar_index}:{similar_index+m}]')
    plt.legend()
    plt.show()

In [None]:
# For every motif column overlay all windows returned by find_similars.
for col_ in motif_detection_columns:
    # Select this column's profile *before* building the title; the original
    # assigned it afterwards, so the title showed the minimum of the
    # previously processed column.
    matrix_profile = mps[f'mp_{col_}'][:, 0]

    plt.figure(figsize=(15, 5))
    plt.title(
        data_dic[f'{col_}'] +
        f' most similar 1 day motif \n distance between neighbors {round(matrix_profile.min(),2)}',
        fontsize=20)

    similars = find_similars(col_)
    for similar in similars:
        # Profile positions are positional offsets, hence .iloc.
        motif = BH_data[f'{col_}'].iloc[similar:similar + m]
        motif.plot(label=f'[{similar}:{similar+m}]')
    plt.legend()
    plt.show()

In [None]:
# Reminder of the columns used for motif detection.
motif_detection_columns

In [None]:
# Preview two of the columns fed to the multi-dimensional profile below.
BH_data[['EC', 'PH']]

In [None]:
# Multi-dimensional matrix profile over the four sensor columns. Use the
# shared window length `m` instead of a second hard-coded 96, so the window
# stays consistent with the slicing done around the result further down.
multi_dimensional = stumpy.mstump(
    BH_data[['EC', 'PH', 'ORP', 'TEMPERATURE']], m)

In [None]:
# Shape of the first array returned by mstump.
multi_dimensional[0].shape

In [None]:
# Plot column 0 of the first array returned by mstump.
# NOTE(review): confirm against the installed stumpy version which axis of
# this array indexes dimensions and which indexes positions.
plt.figure(figsize=(15, 5))
plt.plot(multi_dimensional[0][:,0])

In [None]:
# Positions at which column 0 of the multi-dimensional profile reaches its
# MINIMUM (despite the variable name); the first such position is displayed.
maxi = np.argwhere(multi_dimensional[0][:,0] == multi_dimensional[0][:,0].min())
maxi[0][0]

In [None]:
# Plot each column in a window of 2*m samples on either side of the first
# position stored in `maxi`.
window_center = maxi[0][0]
# Clamp at 0: a negative start would be read as an offset from the END of the
# frame and silently select the wrong (usually empty) range.
window_start = max(window_center - 2 * m, 0)
window_stop = window_center + 2 * m
for col in ['PI', 'EC', 'PH', 'ORP', 'TEMPERATURE']:
    plt.figure(figsize=(15, 5))
    # Row positions are positional offsets, hence .iloc.
    plt.plot(BH_data[[col]].iloc[window_start:window_stop], label=f'{col}')
    plt.xticks([])
    plt.legend()
    plt.show()

In [None]:
# Overlay every window returned by find_similars for `col_`.
# `col_` is whatever value the last executed loop left behind.
plt.figure(figsize=(15, 5))
plt.title('max', fontsize=20)
matrix_profile = mps['mp_' + col_][:, 0]
similars = find_similars(col_)
for similar in similars:
    window_end = similar + m
    motif = BH_data[col_][similar:window_end]
    motif.plot(label=f'[{similar}:{similar+m}]')
plt.legend()
plt.show()