# Data investigation

### Loading the relevant library

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from datetime import datetime

## Data cleaning

### Loading the data

In [3]:
# read data
part1 = pd.read_csv('../Data/Copy of '
                    '20200326_propulsionlab_caru_data_part1.csv', 
                    delimiter=',')
part2 = pd.read_csv('../Data/Copy of '
                    '20200326_propulsionlab_caru_data_part2.csv', 
                    delimiter=',')
part3 = pd.read_csv('../Data/Copy of '
                    '20200326_propulsionlab_caru_data_part3.csv', 
                    delimiter=',')
part4 = pd.read_csv('../Data/Copy of '
                    '20200326_propulsionlab_caru_data_part4.csv', 
                    delimiter=',')
part5 = pd.read_csv('../Data/Copy of '
                    '20200326_propulsionlab_caru_data_part5.csv', 
                    delimiter=',')
print('data loaded')

data loaded


In [4]:
df_raw = pd.concat([part1, part2, part3, part4, part5], ignore_index= True)

In [5]:
df_raw.columns

Index(['device', 'tenant', 'ts_date', 'STM-TCS3472_c', 'light', 'temperature',
       'humidity', 'co2'],
      dtype='object')

### Backup: keep df_raw, work on df_features

In [6]:
df_features = df_raw.copy()

### Drop the column STM-TCS3472_c

In [7]:
df_features = df_features.drop(['STM-TCS3472_c'],axis=1)

In [8]:
df_features.columns

Index(['device', 'tenant', 'ts_date', 'light', 'temperature', 'humidity',
       'co2'],
      dtype='object')

### Change "ts_date" to datetime format

In [9]:
df_features['ts_date'] = pd.to_datetime(df_features['ts_date'])

### Create a "day" column

In [10]:
df_features.insert(2, "day", df_features["ts_date"].dt.date)
# df_features["day"] = pd.to_datetime(df_features["day"])

### Create a day_night column

In [11]:
df_features['day_night'] = 'D'

### Check columns

In [12]:
df_features.columns

Index(['device', 'tenant', 'day', 'ts_date', 'light', 'temperature',
       'humidity', 'co2', 'day_night'],
      dtype='object')

In [13]:
df_features.head(3)

Unnamed: 0,device,tenant,day,ts_date,light,temperature,humidity,co2,day_night
0,device01,tenant01,2019-05-01,2019-05-01 12:00:17.591,119.0,21.108163,30.977341,683.169922,D
1,device01,tenant01,2019-05-01,2019-05-01 12:00:37.639,117.0,21.108163,31.08873,684.252136,D
2,device01,tenant01,2019-05-01,2019-05-01 12:00:57.686,118.0,21.121515,30.978867,685.897095,D


### Backup: keep df_features, work on df

In [14]:
df = df_features.copy()

In [15]:
print(sorted(df['device'].unique()))

['device01', 'device02', 'device03', 'device04', 'device05', 'device06', 'device07', 'device08', 'device09', 'device10', 'device11', 'device12', 'device13', 'device14', 'device15', 'device16', 'device17', 'device18', 'device19', 'device20', 'device21', 'device22', 'device23', 'device24', 'device25', 'device26', 'device27', 'device28', 'device29', 'device30', 'device31', 'device32', 'device33', 'device34', 'device35', 'device36', 'device37', 'device38', 'device39', 'device40', 'device41', 'device42', 'device43', 'device44', 'device45', 'device46', 'device47', 'device48', 'device49', 'device50', 'device51']


In [16]:
df.dtypes

device                 object
tenant                 object
day                    object
ts_date        datetime64[ns]
light                 float64
temperature           float64
humidity              float64
co2                   float64
day_night              object
dtype: object

### Work on plotting Device 2 only

In [17]:
df['ts_date'].max()

Timestamp('2020-03-26 11:59:59.906000')

## Data plotting
### Plot for all the data

### Plot for each day

### Plot for device, parameter and day

In [30]:
device_list = ['device02', 'device05', 'device08', 'device11', 'device14', 'device17',
               'device20', 'device23', 'device26', 'device29', 'device32', 'device35',
               'device38', 'device41', 'device44', 'device47', 'device50']

parameters = ['light', 'temperature', 'humidity', 'co2']

# days_instances = ['2019-12-04', '2019-12-05', '2019-12-06', '2019-12-07', '2019-12-08',
#                   '2019-12-09', '2019-12-10']
# days_instances = ['2019-06-04', '2019-06-05', '2019-06-06', '2019-06-07', '2019-06-08',
#                   '2019-06-09', '2019-06-10']
days_instances = ['2019-07-29', '2019-07-30', '2019-07-31', '2019-08-01', '2019-08-02',
                  '2019-08-03', '2019-08-04']

In [19]:
# This plotting function works !

def plot_scatter_flex(device, col, days):
    name = str(col) + '_'+ str(device) + '_' + str(days)

    fig, ax = plt.subplots(1,1, figsize=(7, 5));
    ax.scatter('ts_date', col, data = df_small, s = 2); 
#     ax.plot(df_small.index.time,df_small[col]); # for scatter uncomment prev line
    last = df_small.shape[0] - 1
    ax.set_xlim(left = df_small.iloc[0, 3], right = df_small.iloc[last, 3])
    ax.tick_params(axis = 'x', labelrotation=45)
    ax.set_title(name);
    ax.set_ylabel(col)

    folder = '/Users/guillaume/Documents/DS2020/Caru/caru/Fig/'
    filename = folder + name
    plt.savefig(filename, bbox_inches = "tight")
    plt.close()

#some ideas on ticks
# ticklabels = df_small.index.strftime("%H:%M:%S")
# ticklabels

In [31]:
# Looping the graph

print('Starting')
df = df_features.copy()
print('df_uploaded')

for device in device_list:
    print(device)
    df_dev = df.loc[df['device'] == device]
    for days in days_instances:
        date =  pd.to_datetime(days)
        df_small = df_dev[df_dev['day'] == date]
        if df_small.shape[0]>0:
            print('Found data')
            for col in parameters:
                plot_scatter_flex(device, col, days)

print('Looping completed')

Starting
df_uploaded
device02
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device05
device08
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device11
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device14
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device17
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device20
device23
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device26
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device29
device32
device35
device38
device41
device44
device47
device50
Looping completed


In [26]:
# Looping the graph - This abort after the first but WORKS

print('Starting')
df = df_features.copy()
print('df_uploaded')

for device in device_list:
    print(device)
    df = df.loc[df['device'] == device]
    for days in days_instances:
        date =  pd.to_datetime(days)
        df_small = df[df['day'] == date]
        if df_small.shape[0]>0:
            print('Found data')
            for col in parameters:
                plot_scatter_flex(device, col, days)

print('Looping completed')

Starting
df_uploaded
device08
Found data
Found data
Found data
Found data
Found data
Found data
Found data
device11
device14
device17
device20
device23
device26
device29
device32
device35
device38
device41
device44
device47
device50
Looping completed


### Flexible plotting function

### Meeting note - Badru 2020 03 30

Do data clustering. Cluster activity day. Identify 

Set threshold for sensitivity of activity detection. Designed to be used by nurses, caretaker.

Unsupervised algorithm that can detect patterns in the data.

What do they expect: dashboard, graph segments, web app.