# The purpose of this notebook is to build a machine learning model to predict whether or not a compression took place

In [None]:
import pandas as pd
import numpy as np

# Utilities file
import utility as util

from collections import OrderedDict
from itertools import product

# modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    auc,
    brier_score_loss,
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from scipy import stats

In [None]:
# Read data/labeled_comp.csv (path relative to the working directory) into `df`,
# the DataFrame the cells below select columns from.
df = pd.read_csv(filepath_or_buffer='data/labeled_comp.csv')

In [None]:
# Preview the first five rows as a quick check of the loaded columns.
df.head(n=5)

## When modeling, it's important to think about how the model will be used in a production setting. Predictions will need to be fast. Therefore, it is best to use the existing sensor readings as features rather than engineering new ones.

In [None]:
# Select features and target.
# The features are the four acceleration columns read straight from `df`;
# the target is the `is_compression` label column.
FEATURE_COLUMNS = [
    'Acceleration x (m/s^2)',
    'Acceleration y (m/s^2)',
    'Acceleration z (m/s^2)',
    'Absolute acceleration (m/s^2)',
]
TARGET_COLUMN = 'is_compression'

X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# Hold out 20% of the rows for testing.
# stratify=y keeps the class proportions of `is_compression` the same in the
# train and test splits, so the test metrics are computed on a class mix that
# matches the data the model was trained on.
# NOTE(review): rows are split at random. If consecutive rows come from the same
# recording, near-identical neighbouring samples can end up in both splits and
# inflate test scores -- consider a grouped or time-based split; confirm against
# how the data was collected.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Initialize the model.
model = RandomForestClassifier(
    n_estimators=200,          # number of trees in the forest
    max_depth=10,              # cap on the depth of each tree
    min_samples_leaf=1,
    class_weight='balanced',   # weight classes inversely to their frequency
    random_state=11,           # fixed seed so the fitted forest is reproducible
    n_jobs=-1,                 # use all available cores
    verbose=1,                 # same as the previous `True`; logs fit/predict progress
)

# Train the model
model.fit(X_train, y_train)

# Make hard class predictions on the held-out split
y_pred = model.predict(X_test)

### Model Metrics

In [None]:
# Overfitting check: score both the training and the held-out split so their
# metrics can be compared side by side.
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ -- presumably the positive `is_compression` class; confirm.
rf_train_prob, rf_test_prob = (
    model.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

### Get Model Stats:
# Turn test probabilities into boolean predictions at a 0.5 cut-off
rf_test_pred = np.greater(rf_test_prob, 0.5)

# Hand the labels, probabilities and thresholded test predictions to the
# project helper (defined in utility.py, not visible from this notebook).
util.model_stats(
    y_train,
    rf_train_prob,
    y_test,
    rf_test_prob,
    rf_test_pred,
)