In [1]:
# Import modules
from pathlib import Path
from warnings import simplefilter

# Ignore FutureWarning messages (kept ahead of the third-party imports)
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the inspection list, relative to this notebook
csv_path = Path('../Resources/InspectionList1.csv')

# Load the CSV contents into a Pandas DataFrame
df = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check
df.head()

Unnamed: 0,Seg_ID,UP_MH,DS_MH,"Dia., Inch",Shape,Material,"Length, Ft","MWL, %","DS, % depth",defects,Video Quality,Location,Risk,Inspection Raning
0,1,36206.0,36207.0,42,Circular,DIP,200,95,0%,1,0,DC,1,3
1,2,36206.0,36207.0,60,Circular,DIP,200,95,0%,1,0,DC,1,3
2,3,36206.0,36207.0,60,Circular,DIP,200,95,0%,1,0,DC,1,1
3,4,36204.0,838.0,102,Circular,zzz,651,80,0%,4,1,DC,4,9
4,5,8197.0,36206.0,96,Circular,RCP,950,75,0%,1,1,DC,1,8


### 2. Separate the features `X` from the target `y`

In [4]:
# Separate the features, X, from the target variable, y.
# The target column name is spelled exactly as it appears in the CSV header.
# NOTE(review): Seg_ID, UP_MH and DS_MH look like identifiers — confirm they
# should stay in the feature set.
target_col = 'Inspection Raning'
X = df.drop(columns=target_col)
y = df[target_col]

In [5]:
# Show the first five rows of the feature columns
X.head(n=5)

Unnamed: 0,Seg_ID,UP_MH,DS_MH,"Dia., Inch",Shape,Material,"Length, Ft","MWL, %","DS, % depth",defects,Video Quality,Location,Risk
0,1,36206.0,36207.0,42,Circular,DIP,200,95,0%,1,0,DC,1
1,2,36206.0,36207.0,60,Circular,DIP,200,95,0%,1,0,DC,1
2,3,36206.0,36207.0,60,Circular,DIP,200,95,0%,1,0,DC,1
3,4,36204.0,838.0,102,Circular,zzz,651,80,0%,4,1,DC,4
4,5,8197.0,36206.0,96,Circular,RCP,950,75,0%,1,1,DC,1


In [6]:
# Show the first five values of the target variable
y.head()

0    3
1    3
2    1
3    9
4    8
Name: Inspection Raning, dtype: int64

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [8]:
# One-hot encode the non-numeric feature columns with get_dummies.
# NOTE(review): the encoded frame shows columns such as 'DS, % depth_8%', so the
# percentage column is being treated as categorical text — confirm that is intended.
X = pd.get_dummies(data=X)

In [9]:
# Show the first five rows of the encoded features
X.head(n=5)

Unnamed: 0,Seg_ID,UP_MH,DS_MH,"Dia., Inch","Length, Ft","MWL, %",defects,Video Quality,Risk,Shape_Arched,...,"DS, % depth_8%","DS, % depth_9%",Location_DC,Location_Difficult Run,Location_Dulles Aiport,Location_Loudoun,"Location_MD, CB, PKWY",Location_Sugarland run,"Location_VA, Great Falls",Location_Watts&muddy
0,1,36206.0,36207.0,42,200,95,1,0,1,False,...,False,False,True,False,False,False,False,False,False,False
1,2,36206.0,36207.0,60,200,95,1,0,1,False,...,False,False,True,False,False,False,False,False,False,False
2,3,36206.0,36207.0,60,200,95,1,0,1,False,...,False,False,True,False,False,False,False,False,False,False
3,4,36204.0,838.0,102,651,80,4,1,4,False,...,False,False,True,False,False,False,False,False,False,False
4,5,8197.0,36206.0,96,950,75,1,1,1,False,...,False,False,True,False,False,False,False,False,False,False


### 4. Separate the data into training and testing subsets.

In [10]:
# Split features and target into training and testing subsets.
# random_state is fixed so the same split is produced on every run.
splits = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = splits

### 5. Scale the data using `StandardScaler`

In [12]:
# Create the scaler and fit it on the training split only,
# so the testing data does not influence the fitted statistics
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### 6. Instantiate a K-Nearest Neighbors classifier instance.

In [13]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 3.
# KNeighborsClassifier is imported with the other modules in the notebook's
# top import cell, so a fresh "Restart & Run All" shows every dependency up front.
knn = KNeighborsClassifier(n_neighbors=3)

### 7. Fit the model using the training data.

In [14]:
# Fit the classifier on the scaled training features and their labels
knn.fit(X=X_train_scaled, y=y_train)

### 8. Make predictions using the testing data.

In [15]:
# Predict a label for every row of the scaled testing features
y_pred = knn.predict(X=X_test_scaled)

### 9. Generate the classification report for the test data.

In [17]:
# Build the classification report (true test labels vs. model predictions) and print it
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           1       0.14      0.50      0.22         2
           2       0.50      0.33      0.40         6
           3       1.00      0.50      0.67         8
           4       0.50      0.75      0.60         4
           5       0.50      0.67      0.57         3
           6       0.92      0.80      0.86        15
           7       0.75      0.86      0.80         7
           8       1.00      0.67      0.80         3
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         6

    accuracy                           0.71        55
   macro avg       0.73      0.71      0.69        55
weighted avg       0.80      0.71      0.73        55

