<a href="https://colab.research.google.com/github/dkanh6/Machine_Learning_Examples/blob/main/sklearn_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit Learn

In [70]:
# !pip install --upgrade scikit-learn
import sklearn as sk
import pandas as pd
import numpy as np

# Read the California housing train/test CSVs from the relative sample_data/ directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'sample_data/california_housing_train.csv',
        'sample_data/california_housing_test.csv',
    )
)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [71]:
# Convert each frame to a NumPy array once (the original converted every frame
# twice), then split it: all columns but the last are features, and the last
# column (median_house_value) is the regression target.
train_arr = train_df.to_numpy()
test_arr = test_df.to_numpy()

X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

# Display the shapes as a quick check of the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((17000, 8), (17000,), (3000, 8), (3000,))

In [72]:
# StandardScaler (rule of thumb: use when a feature can be negative); MinMaxScaler (use when a feature is all positive; rescales it to between 0 and 1)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from copy import deepcopy

# Standardize the first two feature columns (longitude, latitude); fitted on the
# training features only.
std_scaler = StandardScaler()
std_scaler.fit(X_train[:, :2])

# Min-max scale every remaining feature column; also fitted on the training
# features only.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train[:, 2:])

# create master transformer (which will be a combination of both of those scalers)
def preprocessor(X):
  """Scale a feature matrix with the module-level fitted scalers.

  Columns 0-1 are passed through ``std_scaler`` and columns 2 onward through
  ``min_max_scaler``. The input is never modified.

  Args:
    X: 2-D numeric array-like with the same column layout as the data the
      scalers were fitted on.

  Returns:
    A new float ndarray with the same shape as ``X`` holding the scaled values.
  """
  # Cast to float up front: copying the input and assigning into it would keep
  # an integer dtype and silently truncate the scaled values.
  X = np.asarray(X, dtype=float)
  return np.hstack([
      std_scaler.transform(X[:, :2]),
      min_max_scaler.transform(X[:, 2:]),
  ])


In [73]:
# Sanity check: apply the scaling function to the test features and display the result.
preprocessor(X_test)


array([[-1.24077729e+00,  8.16354338e-01,  5.09803922e-01, ...,
         4.29944785e-02,  9.94902154e-02,  4.21276948e-01],
       [ 6.29446690e-01, -6.38768279e-01,  8.23529412e-01, ...,
         2.25903192e-02,  4.53872718e-02,  2.13728087e-01],
       [ 8.73822623e-01, -8.63353120e-01,  5.09803922e-01, ...,
         4.15090109e-02,  8.12366387e-02,  3.65063930e-01],
       ...,
       [-6.87702626e-02,  3.15717296e-01,  1.76470588e-01, ...,
         1.93391070e-02,  3.60138135e-02,  1.23418987e-01],
       [ 1.21794383e+00, -7.13629892e-01,  7.64705882e-01, ...,
         1.20519073e-03,  2.13780628e-03,  1.91093916e-01],
       [-3.38594150e-02, -5.63906665e-01,  8.03921569e-01, ...,
         2.10207685e-02,  4.25916790e-02,  5.55916470e-01]])

In [74]:
# Wrap the plain scaling function in a scikit-learn transformer object so it
# can be used as a Pipeline step.
preprocess_transformer = FunctionTransformer(func=preprocessor)
preprocess_transformer

In [75]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Baseline model: scale the features, then fit a linear regression.
p1 = Pipeline(steps=[
    ('Scaler', preprocess_transformer),
    ('Linear Regression', LinearRegression()),
])

In [76]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
  """Fit ``p`` on the training split and print its mean absolute error on both splits.

  Args:
    p: Estimator or pipeline exposing ``fit(X, y)`` and ``predict(X)``; it is
      fitted in place.
    X_train: Training features. Defaults to the notebook's ``X_train`` as it
      was when this function was defined.
    y_train: Training targets. Defaults to the notebook's ``y_train``.
    X_test: Test features. Defaults to the notebook's ``X_test``.
    y_test: Test targets. Defaults to the notebook's ``y_test``.

  Returns:
    None. The two errors are printed.
  """
  p.fit(X_train, y_train)
  train_preds = p.predict(X_train)
  test_preds = p.predict(X_test)
  # scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the
  # reported values are unchanged by using the conventional order.
  print('Training error: '+str(mean_absolute_error(y_train, train_preds)))
  print('Test error:     '+str(mean_absolute_error(y_test, test_preds)))



In [79]:
# Fit the linear-regression pipeline and print its train/test mean absolute error.
fit_and_print(p1)

Trainnig error: 50795.857117863714
Test error:     50352.228257942894


In [81]:
from sklearn.neighbors import KNeighborsRegressor as KNR
# k-nearest-neighbours regressor (7 neighbours) behind the same scaling step.
p2 = Pipeline(steps=[
    ('Scaler', preprocess_transformer),
    ('KNN Regression', KNR(n_neighbors=7)),
])

In [82]:
# Fit the KNN pipeline and print its train/test mean absolute error.
fit_and_print(p2)

Trainnig error: 30045.80900840336
Test error:     35865.41276190476


In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR
# Random forest (10 trees, max depth 7) behind the same scaling step.
# random_state pins the forest's random sampling so that re-running the
# notebook reproduces the same scores.
p3 = Pipeline([
    ('Scaler', preprocess_transformer),
    ('Random Forest', RFR(n_estimators=10, max_depth=7, random_state=42)),
])

In [None]:
# Fit the random-forest pipeline and print its train/test mean absolute error.
fit_and_print(p3)