# Mice sleep
This Jupyter notebook demonstrates parts of the implementation described in the project report.

- The total **execution time** is ca. **10 min** on a 2015 MacBook Pro.


## Python Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from tqdm.notebook import tqdm
from tqdm.keras import TqdmCallback

import experiments.lib.features as features
import experiments.lib.breeds as breeds
import experiments.lib.plots as plots
import experiments.lib.models as models

## Data import
We directly add the artificial features `time`, `day` and `hours` to our dataset. They better reflect the fact that the measurements were taken every 4 seconds over a timespan of 4 days.

In [4]:
# Location of the recording analysed in this notebook (relative path).
data_folder = 'data/csv/'
file = '10101.smo.csv'

# Load the recording and let the project helper attach its time features
# in one step (see features.load_features / features.add_times).
df = features.add_times(features.load_features(data_folder + file))

def to_hours(tick):
    """Convert a measurement index into elapsed hours.

    The notebook states that one measurement is taken every 4 seconds,
    which makes 900 measurements per hour.
    """
    ticks_per_hour = 900.0
    return tick / ticks_per_hour

# Derive the `hours` column from the row index
# (assumes the index is the running measurement counter -- confirm).
df = df.assign(hours=df.index.map(to_hours))

## Data exploration

### Size of single mouse data set
A datapoint is taken every 4 seconds over 4 days, which adds up to a total of 86400 datapoints.

In [None]:
# Number of rows (one per measurement) in the loaded recording.
len(df)

### Given features
The `rawState` column can take the values `['n', 'r', 'w', 1, 2, 3, 5, 6]`, and the `state` column can take the values `['n', 'r', 'w']`.

In [None]:
# Show the first five rows as a quick look at the available columns.
df.head(n=5)

### State
The mouse is in one of three states: 'w' = awake, 'r' = REM sleep and 'n' = non-REM sleep. Depicted below are the number of measurements in each state and the relation between `EEGv`, `EMGv` and the state for the first day.

In [None]:
# Number of measurements recorded for each value of `state`.
state_counts = df.groupby('state').size()
print(state_counts)

# Project plotting helper, called for day 0 with log=True.
plots.plot_df(data=df, day=0, log=True)


### Distribution of sleep phases over 4 days

Note that the mouse was put under stress during the third day.

In [None]:
# Project plotting helper applied to the full recording; presumably draws the
# distribution of sleep phases over the four days -- confirm in
# experiments.lib.plots.
plots.density(df)

### Frequency domain

In [None]:
# Names of the 401 frequency-bin columns (bin0 ... bin400).
bins = [f"bin{i}" for i in range(401)]

# Select the bin columns *before* taking the first row, so the row keeps the
# dtype of the bin columns instead of becoming an object Series that mixes in
# every other column of the frame.
first_spectrum = df[bins].iloc[0]

fig, ax = plt.subplots()
first_spectrum.plot(ax=ax)

# remap x axis from bin position (0..400) to 0..100 Hz
ax.set_xticks(np.arange(0, 401, 50))
ax.set_xticklabels(np.arange(0, 100.1, 12.5))

# Label the figure so it can be read on its own; the trailing semicolon
# keeps the return value of the last call out of the cell output.
ax.set_xlabel("Frequency [Hz]")
ax.set_ylabel("Bin value")
ax.set_title("EEG frequency spectrum of the first measurement");


## Feature engineering
- [ ] Check how many spectral roll-off features we need

### A: Exploiting the EEG frequency spectrum


In [None]:
# 1) Spectral flatness measure
df = features.spectral_flatness(df)

# 2) Spectral roll-off (0.5 is presumably the roll-off fraction -- confirm
#    in features.spectral_rolloff)
df = features.spectral_rolloff(df, 0.5)

# 3) Spectral centroid
df = features.spectral_centroid(df)

# 4) Spectral entropy
df = features.spectral_entropy(df)

# Drop the raw bins in a single call. Dropping them one column at a time
# builds a new DataFrame on each of the 401 iterations; one call with the
# full list removes the same columns and still raises if any is missing.
df = df.drop(columns=[f"bin{i}" for i in range(401)])

### B: Adding non-linearity

In [None]:
# Add log features
df = features.log_features(df, ['EEGv', 'EMGv'])

# Add polynomial features
df = features.expand_features_poly(df, 3, ['EEGv', 'EMGv'])

### C: Aggregating measurements

In [None]:
# Arguments for the windowed mean/variance helper; [10] is presumably the
# list of window lengths in measurements -- confirm in
# features.add_mean_variance_feature_windows.
window_sizes = [10]
window_columns = ['EEGv', 'EMGv']
df = features.add_mean_variance_feature_windows(df, window_sizes, window_columns)

### D: Eliminating outliers

In [None]:
# Quantile-based outlier removal on the two signal columns.
outlier_columns = ['EEGv', 'EMGv']
outlier_threshold = 0.95
df = features.remove_outliers_quantile(df, outlier_columns, threshold=outlier_threshold)

### E: Rebalancing

In [None]:
# Rebalance the label distribution with the project helper.
# NOTE(review): this runs before the train/test split further down. If
# rebalance_labels oversamples by duplicating rows, copies of one measurement
# could end up in both train and test -- confirm how the helper rebalances,
# and whether it is seeded.
df = features.rebalance_labels(df)

### F: Normalization and standardization

In [None]:
# drop unwanted features
df = df.drop(['rawState', "temp"], axis=1)

# Split into train and test set and standardize
x_train, x_test, y_train, y_test, le = features.split_encode_scale_data(df, False, 0.3, 13, True)

## Model fitting


### Neural Network

In [None]:
# Build the baseline network from the number of columns in x_train and the
# number of entries in le.classes_.
n_inputs = x_train.shape[1]
n_classes = le.classes_.size
model = models.nn_baseline_model(n_inputs, n_classes, lr=1e-4)

# Train the model. Keras' own logging is switched off (verbose=0) and
# progress is reported through the tqdm callback instead.
fit_options = dict(
    validation_data=(x_test, y_test),
    epochs=150,
    batch_size=64,
    verbose=0,
    callbacks=[TqdmCallback(verbose=1)],
)
history = model.fit(x=x_train, y=y_train, **fit_options)

# Plot the confusion matrix
plots.plot_confusion(
    model=model,
    x_test=x_test,
    y_test=y_test,
    le=le,
    cat_matrix=True,
    normalize='true',
)

### Random forests

In [None]:
# Fit a random forest with a fixed random_state; fit() returns the estimator
# itself, so construction and training can be chained.
rfc = RandomForestClassifier(random_state=13).fit(x_train, y_train)
y_predict = rfc.predict(x_test)

# Plot the confusion matrix for the fitted forest.
plots.plot_confusion(
    model=rfc,
    x_test=x_test,
    y_test=y_test,
    le=le,
    cat_matrix=True,
    normalize='true',
)