<a href="https://colab.research.google.com/github/coding-dojo-data-science/week-10-lecture-2-pca/blob/main/11_7_23_PCA_MotorData_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Machine Failure

The task is to predict the condition of a motor based on the features provided.

This is ultimately a supervised learning task, but we will add PCA to our pre-processing step.

The goal here is to get the highest overall accuracy.

In [1]:
import pandas as pd
from seaborn import heatmap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [2]:
# Read the raw sensor measurements from the CSV file into a DataFrame
url = 'Data/Sensorless_RAW (1).csv'
df = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49
0,-3.0146e-07,8.2603e-06,-1.2e-05,-2e-06,-1.4386e-06,-2.1e-05,0.031718,0.03171,0.031721,-0.032963,...,-0.63308,2.9646,8.1198,-1.4961,-1.4961,-1.4961,-1.4996,-1.4996,-1.4996,1
1,2.9132e-06,-5.2477e-06,3e-06,-6e-06,2.7789e-06,-4e-06,0.030804,0.03081,0.030806,-0.03352,...,-0.59314,7.6252,6.169,-1.4967,-1.4967,-1.4967,-1.5005,-1.5005,-1.5005,1
2,-2.9517e-06,-3.184e-06,-1.6e-05,-1e-06,-1.5753e-06,1.7e-05,0.032877,0.03288,0.032896,-0.029834,...,-0.63252,2.7784,5.3017,-1.4983,-1.4983,-1.4982,-1.4985,-1.4985,-1.4985,1
3,-1.3226e-06,8.8201e-06,-1.6e-05,-5e-06,-7.2829e-07,4e-06,0.02941,0.029401,0.029417,-0.030156,...,-0.62289,6.5534,6.2606,-1.4963,-1.4963,-1.4963,-1.4975,-1.4975,-1.4976,1
4,-6.8366e-08,5.6663e-07,-2.6e-05,-6e-06,-7.9406e-07,1.3e-05,0.030119,0.030119,0.030145,-0.031393,...,-0.6301,4.5155,9.5231,-1.4958,-1.4958,-1.4958,-1.4959,-1.4959,-1.4959,1


In [3]:
# List the distinct class labels found in the target column
df.loc[:, 'Column49'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

In [4]:
# Count rows that are exact copies of an earlier row
df.duplicated(keep='first').sum()

0

In [5]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58509 entries, 0 to 58508
Data columns (total 49 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column1   58509 non-null  float64
 1   Column2   58509 non-null  float64
 2   Column3   58509 non-null  float64
 3   Column4   58509 non-null  float64
 4   Column5   58509 non-null  float64
 5   Column6   58509 non-null  float64
 6   Column7   58509 non-null  float64
 7   Column8   58509 non-null  float64
 8   Column9   58509 non-null  float64
 9   Column10  58509 non-null  float64
 10  Column11  58509 non-null  float64
 11  Column12  58509 non-null  float64
 12  Column13  58509 non-null  float64
 13  Column14  58509 non-null  float64
 14  Column15  58509 non-null  float64
 15  Column16  58509 non-null  float64
 16  Column17  58509 non-null  float64
 17  Column18  58509 non-null  float64
 18  Column19  58509 non-null  float64
 19  Column20  58509 non-null  float64
 20  Column21  58509 non-null  fl

In [6]:
# Count the null entries in every column
df.isnull().sum()

Column1     0
Column2     0
Column3     0
Column4     0
Column5     0
Column6     0
Column7     0
Column8     0
Column9     0
Column10    0
Column11    0
Column12    0
Column13    0
Column14    0
Column15    0
Column16    0
Column17    0
Column18    0
Column19    0
Column20    0
Column21    0
Column22    0
Column23    0
Column24    0
Column25    0
Column26    0
Column27    0
Column28    0
Column29    0
Column30    0
Column31    0
Column32    0
Column33    0
Column34    0
Column35    0
Column36    0
Column37    0
Column38    0
Column39    0
Column40    0
Column41    0
Column42    0
Column43    0
Column44    0
Column45    0
Column46    0
Column47    0
Column48    0
Column49    0
dtype: int64

In [7]:
# Column49 (the last column) holds integer class labels, so this is a
# classification task. Count the rows per class to check the balance.
df.loc[:, 'Column49'].value_counts()

1     5319
2     5319
3     5319
4     5319
5     5319
6     5319
7     5319
8     5319
9     5319
10    5319
11    5319
Name: Column49, dtype: int64

In [8]:
# Column49 is the target; every other column is a feature.
y = df.loc[:, 'Column49']
X = df.drop('Column49', axis=1)


In [9]:
# Hold out a test set (default split size); the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [10]:
# Summary statistics of the training features. The columns differ in scale by
# several orders of magnitude, which is why they are standardized before PCA/KNN.
X_train.describe()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column39,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48
count,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,...,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0,43881.0
mean,-3e-06,1.427072e-06,1.01335e-07,-1e-06,1.282729e-06,-1.960034e-07,0.00197,0.001969,0.001969,-0.011824,...,8.383915,-0.425689,7.290038,8.291361,-1.500878,-1.500903,-1.500795,-1.497776,-1.497798,-1.497691
std,7.2e-05,5.340336e-05,0.0002370628,6.2e-05,5.728528e-05,0.0002242847,0.036346,0.036344,0.036349,0.066168,...,6.758397,24.069049,12.658098,6.599908,0.003652,0.003662,0.003628,0.00318,0.003183,0.003193
min,-0.013721,-0.0039561,-0.01358,-0.012787,-0.0083559,-0.0097413,-0.13989,-0.13594,-0.13086,-0.21864,...,0.52218,-0.891,-0.59471,0.32066,-1.5255,-1.5262,-1.5237,-1.5199,-1.52,-1.5195
25%,-7e-06,-1.4477e-05,-7.3416e-05,-5e-06,-1.4679e-05,-7.3785e-05,-0.019621,-0.019642,-0.019648,-0.032054,...,4.4571,-0.7153,1.4502,4.4472,-1.5033,-1.5033,-1.5032,-1.4996,-1.4996,-1.4996
50%,-3e-06,8.4022e-07,2.1358e-07,-1e-06,7.5154e-07,-2.7189e-07,0.013213,0.013214,0.013239,-0.015604,...,6.5654,-0.66134,3.2934,6.4803,-1.5003,-1.5003,-1.5002,-1.4981,-1.4981,-1.498
75%,2e-06,1.8646e-05,7.4131e-05,4e-06,1.8971e-05,7.1703e-05,0.024703,0.024706,0.024715,0.020565,...,9.9434,-0.57447,8.286,9.8763,-1.4982,-1.4982,-1.4982,-1.4962,-1.4963,-1.4962
max,0.005784,0.0045253,0.0052377,0.000622,0.00049053,0.0023956,0.069125,0.06913,0.069131,0.35258,...,125.49,3670.8,889.93,153.15,-1.4576,-1.4561,-1.4555,-1.3372,-1.3372,-1.3371


# 1. Create a processing pipeline

Create a pipeline that preprocesses the data and applies PCA.  The resulting principal components should retain 95% of the variance of the original features.

In [11]:
# PCA given a float in (0, 1) keeps just enough components to explain
# that fraction of the variance -- here 95%.
pca = PCA(n_components=0.95)
# Standardize the features first so that no column dominates the components.
scale = StandardScaler()
# Chain both steps: scale, then reduce.
pipe = make_pipeline(scale, pca)
# Fit on the training data and report the shape of the reduced feature matrix.
pipe.fit_transform(X_train).shape

(43881, 20)

# Create a KNN classification model **WITHOUT** PCA and check the accuracy


In [26]:
%%time
# Create and fit a KNN model WITHOUT PCA.
knn_nopca = make_pipeline(scale, KNeighborsClassifier(n_neighbors=1))

knn_nopca.fit(X_train, y_train)

CPU times: total: 31.2 ms
Wall time: 57.2 ms


In [27]:
%%time
# Predict test-set labels with the no-PCA model. The cell is timed so the
# prediction cost can be compared with the PCA model; the classification
# report itself is printed in the next cell.
pred_nopca = knn_nopca.predict(X_test)

CPU times: total: 5.55 s
Wall time: 1.38 s


In [28]:
# Per-class precision / recall / F1 and overall accuracy for the no-PCA model
print(classification_report(y_true=y_test, y_pred=pred_nopca))

              precision    recall  f1-score   support

           1       0.76      0.74      0.75      1345
           2       0.78      0.75      0.76      1367
           3       0.80      0.81      0.81      1304
           4       0.83      0.82      0.82      1327
           5       0.66      0.67      0.67      1317
           6       0.64      0.69      0.67      1324
           7       0.99      0.99      0.99      1379
           8       0.74      0.74      0.74      1300
           9       0.82      0.80      0.81      1254
          10       0.77      0.78      0.78      1341
          11       1.00      1.00      1.00      1370

    accuracy                           0.80     14628
   macro avg       0.80      0.80      0.80     14628
weighted avg       0.80      0.80      0.80     14628



# Create a KNN classification model **WITH** PCA and check the accuracy

In [29]:
%%time
# Create and fit a KNN model WITH PCA.
# `pipe` (scaler + PCA) is nested as the first step of this pipeline, so its
# parameters are addressed with a `pipeline__` prefix,
# e.g. `pipeline__pca__n_components`.
knn_pca = make_pipeline(pipe, KNeighborsClassifier(n_neighbors=1))

knn_pca.fit(X_train, y_train)

CPU times: total: 141 ms
Wall time: 131 ms


In [30]:
%%time
# Predict test-set labels with the PCA model. The cell is timed so the
# prediction cost can be compared with the no-PCA model; the classification
# report itself is printed in the next cell.
preds_pca = knn_pca.predict(X_test)

CPU times: total: 4.48 s
Wall time: 1.17 s


In [32]:
# Per-class precision / recall / F1 and overall accuracy for the PCA model
print(classification_report(y_true=y_test, y_pred=preds_pca))

              precision    recall  f1-score   support

           1       0.77      0.74      0.75      1345
           2       0.78      0.76      0.77      1367
           3       0.83      0.83      0.83      1304
           4       0.85      0.84      0.84      1327
           5       0.69      0.69      0.69      1317
           6       0.64      0.73      0.68      1324
           7       0.99      0.99      0.99      1379
           8       0.77      0.76      0.76      1300
           9       0.84      0.81      0.82      1254
          10       0.78      0.79      0.79      1341
          11       1.00      1.00      1.00      1370

    accuracy                           0.81     14628
   macro avg       0.81      0.81      0.81     14628
weighted avg       0.81      0.81      0.81     14628



# Discuss:

1. What were the benefits of using PCA for this dataset?
     PCA both reduced the prediction time and slightly increased the accuracy (0.81 vs. 0.80).
2. What were the drawbacks?
    Probably overfitting, and the loss of some variability (about 5% of the original variance is discarded).
3. Which model took longer to train?  
    with PCA
  - Which took longer to predict?  
    without PCA
  - Why might this be?  
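    Training took longer with PCA because there is one more step to fit in the pipeline: the PCA itself. Prediction was faster with PCA because KNN computes distances over 20 components instead of 48 features.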
  - What does this tell you about using PCA in a pipeline?
    That PCA adds time to the training process, in exchange for faster predictions afterwards.

## Bonus! 

This is just a default model, but you could try a gridsearch or other classifier algorithms to reduce overfitting and see if your group can get a better result!

#### Hint
You can use gridsearch to change the number of components that the PCA is keeping, too!  However, remember not to search over too many hyperparameter values or your search will take too long!

In [33]:
# List every parameter name of the nested pipeline; these are the keys a
# grid search's param_grid has to use.
knn_pca.get_params()

{'memory': None,
 'steps': [('pipeline',
   Pipeline(steps=[('standardscaler', StandardScaler()),
                   ('pca', PCA(n_components=0.95))])),
  ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=1))],
 'verbose': False,
 'pipeline': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('pca', PCA(n_components=0.95))]),
 'kneighborsclassifier': KNeighborsClassifier(n_neighbors=1),
 'pipeline__memory': None,
 'pipeline__steps': [('standardscaler', StandardScaler()),
  ('pca', PCA(n_components=0.95))],
 'pipeline__verbose': False,
 'pipeline__standardscaler': StandardScaler(),
 'pipeline__pca': PCA(n_components=0.95),
 'pipeline__standardscaler__copy': True,
 'pipeline__standardscaler__with_mean': True,
 'pipeline__standardscaler__with_std': True,
 'pipeline__pca__copy': True,
 'pipeline__pca__iterated_power': 'auto',
 'pipeline__pca__n_components': 0.95,
 'pipeline__pca__n_oversamples': 10,
 'pipeline__pca__power_iteration_normalizer': 'auto',
 'pipeli

In [34]:
%%time
params = {'pipeline__pca__n_components': [1, .95, .9, .85, .8],
          'kneighborsclassifier__n_neighbors': list(range(1, 11, 1))
         }

knn_gridsearch = GridSearchCV(knn_pca, param_grid=params, n_jobs=-1, cv=5)

knn_gridsearch.fit(X_train, y_train)

CPU times: total: 2.06 s
Wall time: 3min 25s


In [35]:
# Parameter combination that achieved the best mean cross-validated score
knn_gridsearch.best_params_

{'kneighborsclassifier__n_neighbors': 9, 'pipeline__pca__n_components': 0.95}

In [36]:
# Keep the best pipeline found by the search. No `refit` argument was passed to
# GridSearchCV, so its default applies (presumably refit on all of X_train -- see the GridSearchCV docs).
best_knn = knn_gridsearch.best_estimator_

In [38]:
%%time
# Predict test-set labels with the tuned pipeline (timed for comparison
# with the untuned models).
pred_gs = best_knn.predict(X_test)

# Peek at the first five predicted class labels
pred_gs[:5]

CPU times: total: 7.92 s
Wall time: 2.32 s


array([ 3, 11,  9,  8,  7], dtype=int64)

In [39]:
# Per-class precision / recall / F1 and overall accuracy for the tuned model
print(classification_report(y_true=y_test, y_pred=pred_gs))

              precision    recall  f1-score   support

           1       0.78      0.79      0.78      1345
           2       0.79      0.82      0.80      1367
           3       0.82      0.89      0.85      1304
           4       0.86      0.90      0.88      1327
           5       0.75      0.71      0.73      1317
           6       0.69      0.75      0.72      1324
           7       1.00      0.99      0.99      1379
           8       0.83      0.75      0.79      1300
           9       0.88      0.81      0.85      1254
          10       0.83      0.79      0.81      1341
          11       1.00      1.00      1.00      1370

    accuracy                           0.84     14628
   macro avg       0.84      0.84      0.84     14628
weighted avg       0.84      0.84      0.84     14628

