<a href="https://colab.research.google.com/github/denmj/Tensorflow_learning/blob/master/ML_linreg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
from sklearn.datasets import fetch_california_housing

from sklearn.pipeline import Pipeline

#Pre processing 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV



In [0]:
def missing_zero_values_table(df):
    zero_val = (df == 0.00).astype(int).sum(axis=0)
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
    mz_table = mz_table.rename(
        columns={0: 'Zero Values', 1: 'Missing Values', 2: '% of Total Values'})
    mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
    mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
    mz_table['Data Type'] = df.dtypes
    mz_table = mz_table[
        mz_table.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"
                                                                                                   "There are " + str(
        mz_table.shape[0]) +
          " columns that have missing values.")
    #         mz_table.to_excel('D:/sampledata/missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
    return mz_table

In [3]:
data = fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


In [0]:
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target


In [5]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [0]:
  pline = Pipeline([('std_scaler', StandardScaler())])
  X_scaled = pline.fit_transform(X)

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.3, random_state=44)

In [0]:
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


In [0]:
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor()
ridge_reg = Ridge()
lasso_reg  = Lasso()
sgd_reg = SGDRegressor(max_iter=10000)
svr = SVR()
rnd_frst = RandomForestRegressor()

In [29]:
print("X_train size: {}, y_train size: {}, X_val size: {}, y_val size: {}".format(X_train.shape, y_train.shape, X_val.shape, y_val.shape))

X_train size: (14448, 8), y_train size: (14448,), X_val size: (6192, 8), y_val size: (6192,)


In [30]:
lin_reg.fit(X_train, y_train)
tree_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)
sgd_reg.fit(X_train, y_train)
svr.fit(X_train, y_train)
rnd_frst.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [0]:
from sklearn.metrics import mean_squared_error

In [0]:
linreg_pred = lin_reg.predict(X_train)

In [24]:
lin_mse = mean_squared_error(y_train, linreg_pred)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.7239607017498086

In [0]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)

tree_rmse = np.sqrt(-scores)

In [32]:
tree_rmse

array([0.71204681, 0.75042202, 0.74326833, 0.72497184, 0.7265866 ,
       0.73149309, 0.74036179, 0.72562857, 0.7637631 , 0.73997071])

In [0]:
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, 
               {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}]

In [0]:
grid_search = GridSearchCV(rnd_frst, param_grid, cv=5, scoring="r2")

In [35]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [36]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [37]:
grid_search.cv_results_

{'mean_fit_time': array([0.08636279, 0.2830792 , 0.85203099, 0.1418364 , 0.49186192,
        1.43434677, 0.20923948, 0.6820951 , 2.07653966, 0.27900229,
        0.88510594, 2.62061906, 0.13969293, 0.43777823, 0.17637763,
        0.61270375, 0.23535008, 0.77076879]),
 'mean_score_time': array([0.00409336, 0.01193585, 0.03071952, 0.00405698, 0.0114562 ,
        0.03089423, 0.00484939, 0.01189442, 0.03282857, 0.00476775,
        0.01121626, 0.0307796 , 0.00484843, 0.01380925, 0.00469079,
        0.01349907, 0.00526509, 0.01340828]),
 'mean_test_score': array([0.71945995, 0.77757336, 0.79715911, 0.71392217, 0.77826379,
        0.80060189, 0.71603227, 0.77777379, 0.79538874, 0.722441  ,
        0.77656874, 0.7943879 , 0.72676545, 0.79492834, 0.74859823,
        0.79204714, 0.72668149, 0.79226286]),
 'param_bootstrap': masked_array(data=[--, --, --, --, --, --, --, --, --, --, --, --, False,
                    False, False, False, False, False],
              mask=[ True,  True,  True,  Tru

In [39]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(mean_score), params)

0.8482098473649493 {'max_features': 2, 'n_estimators': 3}
0.8818012037294121 {'max_features': 2, 'n_estimators': 10}
0.8928376708853599 {'max_features': 2, 'n_estimators': 30}
0.8449391489927538 {'max_features': 4, 'n_estimators': 3}
0.8821926033165359 {'max_features': 4, 'n_estimators': 10}
0.8947635947092659 {'max_features': 4, 'n_estimators': 30}
0.8461869003455584 {'max_features': 6, 'n_estimators': 3}
0.8819148444061705 {'max_features': 6, 'n_estimators': 10}
0.8918456918792005 {'max_features': 6, 'n_estimators': 30}
0.8499652943880118 {'max_features': 8, 'n_estimators': 3}
0.881231377760168 {'max_features': 8, 'n_estimators': 10}
0.8912844103033116 {'max_features': 8, 'n_estimators': 30}
0.852505393433261 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.8915875409885883 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.8652157128876203 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.8899703014190471 {'bootstrap': False, 'max_features': 3, 'n_e