In [8]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from collections import Counter
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [9]:
# Read the CSV file into a Pandas DataFrame.
# The path is relative to the notebook's working directory, so the kernel must be
# started from the folder that sits next to `Resources/`.
df = pd.read_csv(
    Path('../Resources/ml_inspection_data/InspectionList3.csv')
)

# Review the DataFrame
df

FileNotFoundError: [Errno 2] No such file or directory: '..\\Resources\\ml_inspection_data\\InspectionList3.csv'

### 2. Separate the features `X` from the target `y`

In [3]:
# Separate the features, X, from the target variable, y.
# X is every column of df except 'risk score'; y is that column on its own.
# NOTE(review): the feature preview shows an 'Unnamed: 0' column that looks like a
# saved row index — confirm whether it should really be a model input.
X = df.drop(columns=['risk score'])
y = df['risk score']

In [4]:
# Preview the first rows of the features data (before any encoding)
X.head()

Unnamed: 0,Seg_ID,UP_MH,DS_MH,Dia_Inch,Shape,Material,Length_Ft,MWL,DS,defects,Video Quality,Jurisdiction,Inspection_cycle
0,1.0,118.2,117.0,30,Circular,RCP,222.0,55,0.12,SAV,Good,Dulles Aiport,1
1,1.5,118.2,117.0,30,Circular,RCP,222.0,55,0.12,SAV,Good,Dulles Aiport,1
2,2.0,117.0,116.0,30,Circular,RCP,248.0,25,0.2,SAV,Good,Dulles Aiport,9
3,2.5,117.0,116.0,30,Circular,RCP,248.0,25,0.2,SAV,Good,Dulles Aiport,9
4,3.0,116.0,115.8,30,Circular,RCP,209.0,25,0.0,SAV,Good,Dulles Aiport,9


In [5]:
# Count how many rows fall into each risk score to check the class balance
y.value_counts()

risk score
2    170
3    150
4     69
1     64
5     44
Name: count, dtype: int64

In [6]:
# Preview the first ten entries for the target variable
y[:10]

0    5
1    5
2    2
3    2
4    2
5    2
6    2
7    2
8    3
9    3
Name: risk score, dtype: int64

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [7]:
# Encode the categorical variables using get_dummies.
# The encoded frame is assigned back to X, replacing the un-encoded features.
X = pd.get_dummies(X)

In [8]:
# Review the first rows of the features data after encoding
X.head()

Unnamed: 0,Seg_ID,UP_MH,DS_MH,Dia_Inch,Length_Ft,MWL,DS,Inspection_cycle,Shape_Arched,Shape_C,...,Video Quality_Good,Video Quality_Poor,Jurisdiction_DC,Jurisdiction_Difficult Run,Jurisdiction_Dulles Aiport,Jurisdiction_Loudoun,"Jurisdiction_MD, CB, PKWY",Jurisdiction_Sugarland run,"Jurisdiction_VA, Great Falls",Jurisdiction_Watts&muddy
0,1.0,118.2,117.0,30,222.0,55,0.12,1,False,False,...,True,False,False,False,True,False,False,False,False,False
1,1.5,118.2,117.0,30,222.0,55,0.12,1,False,False,...,True,False,False,False,True,False,False,False,False,False
2,2.0,117.0,116.0,30,248.0,25,0.2,9,False,False,...,True,False,False,False,True,False,False,False,False,False
3,2.5,117.0,116.0,30,248.0,25,0.2,9,False,False,...,True,False,False,False,True,False,False,False,False,False
4,3.0,116.0,115.8,30,209.0,25,0.0,9,False,False,...,True,False,False,False,True,False,False,False,False,False


### 4. Separate the data into training and testing subsets.

In [9]:
# Split the dataset into training and testing subsets using train_test_split.
# random_state pins the shuffle so the same rows land in each subset on every run.
# NOTE(review): the risk-score classes are imbalanced (see the value counts) and the
# split is not stratified — consider stratify=y; confirm before changing, since it
# alters every downstream metric.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

### 5. Scale the data using `StandardScaler`

In [10]:
# Create the StandardScaler and fit it on the training rows only, so the scaling
# parameters are learned without looking at the test subset.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and the testing data, in that order
X_train_scaled, X_test_scaled = (
    X_scaler.transform(subset) for subset in (X_train, X_test)
)

### 6. Instantiate a K-Nearest Neighbors classifier instance.

In [11]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 7
knn = KNeighborsClassifier(n_neighbors=7)

### 7. Fit the model using the training data.

In [12]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

### 8. Make predictions using the testing data.

In [13]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

### 9. Generate the classification report for the test data.

In [14]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.78      0.93      0.85        15
           2       0.60      0.60      0.60        45
           3       0.59      0.81      0.68        36
           4       0.58      0.37      0.45        19
           5       1.00      0.10      0.18        10

    accuracy                           0.62       125
   macro avg       0.71      0.56      0.55       125
weighted avg       0.65      0.62      0.60       125



In [15]:
# Oversample the training data with SMOTE so that every risk score ends up with the
# same number of training rows. Only the scaled training subset is resampled; the
# test subset is left as-is so the evaluation still reflects the original data.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_scaled, y_train)

# Show the per-class row counts after resampling
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({1: 125, 2: 125, 3: 125, 5: 125, 4: 125})


In [19]:
# Refit KNN on the SMOTE-resampled training data and evaluate on the test subset.
# NOTE(review): n_neighbors is 3 here but 7 for the earlier model, so the two
# classification reports differ in both resampling and k — confirm this is intended.
# This cell rebinds `knn` and `y_pred`; the confusion-matrix cell reads `y_pred`.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_res, y_res)
y_pred = knn.predict(X_test_scaled)

# Print the classification report comparing the testing data to the new predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15
           2       0.73      0.78      0.75        45
           3       0.72      0.86      0.78        36
           4       0.83      0.53      0.65        19
           5       0.86      0.60      0.71        10

    accuracy                           0.78       125
   macro avg       0.83      0.75      0.78       125
weighted avg       0.79      0.78      0.77       125



In [17]:
# Calculating the confusion matrix
# Rows are the actual risk scores and columns are the predicted ones. The label list
# is the sorted union of the labels seen in y_test and y_pred, so the row/column
# captions always match the matrix dimensions.
labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in labels],
    columns=[f"Predicted {label}" for label in labels],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)
print(f"confusion Matrix: \n {cm}")
print(f"Accuracy: {acc_score}")

# Display the labelled version of the same matrix
cm_df

confusion Matrix: 
 [[15  0  0  0  0]
 [ 0 35  9  1  0]
 [ 0  5 31  0  0]
 [ 0  7  1 10  1]
 [ 0  1  2  1  6]]
Accuracy: 0.776
