In [None]:
!pip install pyathena

!pip install pyarrow
!pip install s3fs

In [None]:
import pyarrow.parquet as pq
import s3fs
import pandas as pd
# S3 filesystem handle; it is passed to pyarrow when the Parquet dataset is read.
fs = s3fs.S3FileSystem()

In [None]:
# S3 location of the year=2021 prefix of the curated dataset.
CURATED_2021_URI = 's3://datalake-curated-datasets-907317471167-us-east-1-pjkrtzr/year=2021'

# Read everything under that prefix into an Arrow table.
dataset = pq.ParquetDataset(CURATED_2021_URI, filesystem=fs)
table = dataset.read()

# Convert to pandas and order the rows by unit, then by cycle within each unit.
df = table.to_pandas().sort_values(by=['unit_number', 'cycle'])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().T

In [None]:
# Largest observed cycle for every (filename, unit_number) pair, one bar each.
max_cycle_per_engine = df.groupby(['filename', 'unit_number']).cycle.max()

fig, ax = plt.subplots(figsize=(10, 20))
_ = max_cycle_per_engine.plot.barh(ax=ax)

# Reference line at the mean of exactly the values drawn as bars. Grouping by
# unit_number alone would merge rows from different files that share a unit
# number, so the line would no longer describe the plotted bars.
_ = ax.axvline(x=max_cycle_per_engine.mean())
_ = ax.set_xlabel('max cycle')

In [None]:
# Regression joint plot of op_1 against failure_cycle on a 1000-row sample.
# A fixed random_state keeps the sampled rows (and the figure) identical across runs.
sns.jointplot(x='op_1', y='failure_cycle', data=df.sample(1000, random_state=42), kind='reg')

In [None]:
# Regression joint plot of op_2 against failure_cycle on a 1000-row sample.
# A fixed random_state keeps the sampled rows (and the figure) identical across runs.
sns.jointplot(x='op_2', y='failure_cycle', data=df.sample(1000, random_state=42), kind='reg')

In [None]:
# Joint plot (default kind) of op_3 against failure_cycle on a 10000-row sample.
# A fixed random_state keeps the sampled rows (and the figure) identical across runs.
sns.jointplot(x='op_3', y='failure_cycle', data=df.sample(10000, random_state=42))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Re-index by (filename, unit_number, cycle) and sort the resulting MultiIndex.
index_levels = ['filename', 'unit_number', 'cycle']
ddf = df.set_index(index_levels).sort_index()

In [None]:
# Rows belonging to train_FD001.txt. A one-element list keeps all index levels.
# The former second list entry was a literal Ellipsis, which is not a filename
# label; recent pandas raises KeyError for missing labels in list indexers.
ddf.loc[['train_FD001.txt']]

In [None]:
# One subplot per sensor (1..21) for train_FD001.txt, overlaying the trace of
# every unit with a low alpha.
source_file = 'train_FD001.txt'

# Select with a one-element list so the 'filename' level is kept for unstack().
# The former Ellipsis entry in the list was not a filename label; recent pandas
# raises KeyError for missing labels in list indexers. The selection does not
# depend on the sensor, so it is done once outside the loop.
file_rows = ddf.loc[[source_file]]

fig, axes = plt.subplots(7, 3, figsize=(30, 40))
axes = axes.ravel()
for i, a in zip(range(1, 22), axes):
    column = 'sensor_measurement_' + str(i)
    # Unstacking levels 0 and 1 (filename, unit_number) leaves cycle as the row
    # axis, so each plotted line is one unit.
    _ = a.plot(file_rows[column].unstack(level=[0, 1]).values, alpha=.05)
    a.set_title(column)
    a.set_xlabel('cycle')

In [None]:
# One subplot per sensor (1..21) for train_FD002.txt, overlaying the trace of
# every unit with a low alpha.
source_file = 'train_FD002.txt'

# Select with a one-element list so the 'filename' level is kept for unstack().
# The former Ellipsis entry in the list was not a filename label; recent pandas
# raises KeyError for missing labels in list indexers. The selection does not
# depend on the sensor, so it is done once outside the loop.
file_rows = ddf.loc[[source_file]]

fig, axes = plt.subplots(7, 3, figsize=(30, 40))
axes = axes.ravel()
for i, a in zip(range(1, 22), axes):
    column = 'sensor_measurement_' + str(i)
    # Unstacking levels 0 and 1 (filename, unit_number) leaves cycle as the row
    # axis, so each plotted line is one unit.
    _ = a.plot(file_rows[column].unstack(level=[0, 1]).values, alpha=.05)
    a.set_title(column)
    a.set_xlabel('cycle')

In [None]:
# One subplot per sensor (1..21) for train_FD003.txt, overlaying the trace of
# every unit with a low alpha.
source_file = 'train_FD003.txt'

# Select with a one-element list so the 'filename' level is kept for unstack().
# The former Ellipsis entry in the list was not a filename label; recent pandas
# raises KeyError for missing labels in list indexers. The selection does not
# depend on the sensor, so it is done once outside the loop.
file_rows = ddf.loc[[source_file]]

fig, axes = plt.subplots(7, 3, figsize=(30, 40))
axes = axes.ravel()
for i, a in zip(range(1, 22), axes):
    column = 'sensor_measurement_' + str(i)
    # Unstacking levels 0 and 1 (filename, unit_number) leaves cycle as the row
    # axis, so each plotted line is one unit.
    _ = a.plot(file_rows[column].unstack(level=[0, 1]).values, alpha=.05)
    a.set_title(column)
    a.set_xlabel('cycle')

# The training data for FD002 and FD004 have 6 different operational settings, and we can see how this affects the measurements here

In [None]:
# One subplot per sensor (1..21) for train_FD004.txt, overlaying the trace of
# every unit with a low alpha.
source_file = 'train_FD004.txt'

# Select with a one-element list so the 'filename' level is kept for unstack().
# The former Ellipsis entry in the list was not a filename label; recent pandas
# raises KeyError for missing labels in list indexers. The selection does not
# depend on the sensor, so it is done once outside the loop.
file_rows = ddf.loc[[source_file]]

fig, axes = plt.subplots(7, 3, figsize=(30, 40))
axes = axes.ravel()
for i, a in zip(range(1, 22), axes):
    column = 'sensor_measurement_' + str(i)
    # Unstacking levels 0 and 1 (filename, unit_number) leaves cycle as the row
    # axis, so each plotted line is one unit.
    _ = a.plot(file_rows[column].unstack(level=[0, 1]).values, alpha=.05)
    a.set_title(column)
    a.set_xlabel('cycle')

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs: the cycle counter, the three operating-setting columns and the
# 21 sensor columns, in that order.
operating_columns = ['cycle', 'op_1', 'op_2', 'op_3']
sensor_columns = ['sensor_measurement_' + str(n) for n in range(1, 22)]
features = operating_columns + sensor_columns

In [None]:
import numpy as np

In [None]:
# Row masks for the split: rows whose unit_number is a multiple of 3 form the
# test set and all other rows form the training set.
unit_mod_3 = df['unit_number'] % 3
is_test = unit_mod_3 == 0
is_train = unit_mod_3 != 0

In [None]:
# Feature matrices for the two splits.
x_train = df.loc[is_train, features]
x_test = df.loc[is_test, features]

# Regression target (failure_cycle) for the same rows.
y_train = df.loc[is_train, 'failure_cycle']
y_test = df.loc[is_test, 'failure_cycle']

In [None]:
# Random-forest baseline with 40 trees, using all available cores.
# random_state is fixed so the fitted model and the reported scores are
# reproducible from one run to the next.
cls = RandomForestRegressor(n_jobs=-1, n_estimators=40, random_state=42)

cls = cls.fit(x_train, y_train)

# Score of the fitted regressor on the held-out rows (R^2 for scikit-learn regressors).
cls.score(x_test, y_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error of the random forest on the held-out rows.
# The square root is taken explicitly because the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, cls.predict(x_test)) ** 0.5

In [None]:
from sklearn.dummy import DummyRegressor

In [None]:
# Baseline to compare against: DummyRegressor with default settings, which
# ignores the features and predicts the mean of the training target.
dummy = DummyRegressor()
dummy = dummy.fit(x_train, y_train)

# Root-mean-squared error of the baseline on the held-out rows. The square root
# is taken explicitly because the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, dummy.predict(x_test)) ** 0.5

# An ensemble-based method outperforms a naive mean prediction by ~50%
* Next steps: apply XGBoost; gradient boosting generally outperforms random forest when tuned appropriately.
* The approach above validates the potential value before we commit to building a SageMaker model, i.e. if there weren't a margin over our "dummy" model, then building a SageMaker model probably wouldn't be fruitful.