# Pipeline

In [2]:
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

## Concrete Strength Dataset

In [7]:
# Read the concrete-strength table from disk and preview its first five rows.
concrete = pd.read_csv(
    filepath_or_buffer="./Cases/Concrete Strength/Concrete_Data.csv"
)
concrete.head(n=5)

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [8]:
# 'Strength' is the regression target; every remaining column is a feature.
y = concrete['Strength']
X = concrete.drop(columns='Strength')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

### MinMaxScaler method

In [10]:
# Manual workflow: scale first, then fit the regressor on the scaled frames.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split.
scaler = MinMaxScaler().set_output(transform='pandas')
X_trn_scl = scaler.fit_transform(X_train)
X_tst_scl = scaler.transform(X_test)

# `fit` returns the estimator itself, so construction and fitting can be chained.
knn = KNeighborsRegressor(n_neighbors=6).fit(X_trn_scl, y_train)
y_pred = knn.predict(X_tst_scl)

# R^2 on the held-out split (displayed as the cell output).
r2_score(y_true=y_test, y_pred=y_pred)


0.6257731313470496

### Creating a Pipeline

In [13]:
# Same scaler + 6-NN regressor as the manual cell, but chained in a Pipeline:
# calling fit/predict on the pipeline runs the scaling step and the regressor
# in sequence, so no intermediate scaled frames are needed.
scaler = MinMaxScaler()
knn = KNeighborsRegressor(n_neighbors=6)
pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)])

y_pred = pipe.fit(X_train, y_train).predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.6257731313470496

In [15]:
# Sweep the number of neighbours from 1 to 10, refitting a fresh
# MinMaxScaler + KNN pipeline for each value and recording its test-set R^2.
#
# NOTE(review): the best k is chosen using the same test split that is used
# to report the score, so the reported "best" R^2 is likely optimistic --
# consider a validation split or cross-validation.
n_values = np.arange(1, 11)
accuracy = []  # despite the name, this holds R^2 scores (one per k)
for i in n_values:
    scaler, knn = MinMaxScaler(), KNeighborsRegressor(n_neighbors=i)
    pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)])
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    accuracy.append(r2_score(y_test, y_pred))

# Position of the highest score; the same index selects the matching k.
i_max = np.argmax(accuracy)
print("Best n_value = ", n_values[i_max])
print("Best r2 score = ", accuracy[i_max])

Best n_value =  3
Best r2 score =  0.7070339604544171


### Inference using the pipeline

In [16]:
# Final model: refit on the full labelled data (train + test rows together)
# and score the unlabelled rows from the separate test file.
#
# NOTE(review): this cell uses StandardScaler with n_neighbors=4, whereas the
# search cell evaluated MinMaxScaler pipelines -- confirm this choice of
# scaler and k is intentional.
scaler = StandardScaler()
knn = KNeighborsRegressor(n_neighbors=4)
pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)]).fit(X, y)

tst = pd.read_csv(filepath_or_buffer="./Cases/Concrete Strength/testConcrete.csv")
predictions = pipe.predict(tst)
predictions

array([55.945 , 43.8025, 31.2975, 51.44  , 50.69  , 36.5325, 52.3675,
       38.92  , 53.04  , 60.335 , 26.0875, 69.5   , 52.8075, 40.5675])