<div id="job_title"> 
    <h1  style="color:Green; font-family:Baskerville"><b>This is a notebook for referencing everyday tools for ML engineering</b></h1>
</div>

<div id="job_title"> 
    <h2  style="font-family:Baskerville"><b>All dependencies</b></h2>
</div>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
import plotly.graph_objects as go

<div id="job_title"> 
    <h2  style="font-family:Baskerville"><b>Data preprocessing</b></h2>
</div>

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Reading CSV files</h2>
</div>

In [2]:
import pandas as pd
# Location of the Boston house-price CSV, relative to the notebook's working directory.
boston_csv_path = '../input/the-boston-houseprice-data/boston.csv'
# Load the raw table into a DataFrame.
df1 = pd.read_csv(boston_csv_path)

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Removing NA values</h2>
</div>

In [3]:
# Discard every row that contains at least one missing value.
df1 = df1.dropna(axis=0, how='any')

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Separating training and test data</h2>
</div>

In [4]:
# Feature matrix: a new frame holding every column of df1 except the target 'MEDV'.
X = df1.drop(columns='MEDV')
# Target vector: an independent copy of the 'MEDV' column.
y = df1['MEDV'].copy()

In [5]:
from sklearn.model_selection import train_test_split
  
# Stage 1: hold out 20% of all rows as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Stage 2: carve the validation set out of the remaining 80%.
# 0.25 x 0.8 = 0.2, so the overall proportions are 60% train / 20% val / 20% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=1
)

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Standardizing data</h2>
</div>

In [6]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training split only,
# so no statistics leak in from the validation or test data.
sc = StandardScaler()
sc.fit(X_train)

# Apply the same fitted scaling to all three splits.
X_train, X_test, X_val = (
    sc.transform(split) for split in (X_train, X_test, X_val)
)

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Principal component analysis</h2>
</div>

In [7]:
from sklearn.decomposition import PCA
  
# A float n_components together with svd_solver='full' keeps as many leading
# components as are needed to explain 90% of the training-set variance.
pca = PCA(svd_solver='full', n_components=0.9)

# Fit the projection on the training split and project it in one step ...
X_train = pca.fit_transform(X_train)
# ... then reuse the fitted projection, unchanged, on the held-out splits.
X_test, X_val = pca.transform(X_test), pca.transform(X_val)

<div id="job_title"> 
    <h2  style="font-family:Baskerville"><b>Model training</b></h2>
</div>

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Function to calculate model results</h2>
</div>

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def calculate_results(y_true, y_pred):
    """Compute a set of regression metrics for one model's predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned element-wise with ``y_true``.

    Returns:
        dict: Metric label -> value, in this order: MAE, MSE, RMSE, R^2 and
        the natural logarithm of the RMSE.

    Notes:
        The dictionary keys are kept unchanged so existing result tables keep
        their row labels, but two of them are looser than they read:

        * "Adjusted R^2 Score" holds the plain ``r2_score``; no adjustment for
          the number of features is applied.
        * "Root Mean Squared Log Error" holds ``log(RMSE)``, not an error
          computed on log-transformed targets.
    """
    model_mae = mean_absolute_error(y_true, y_pred)
    # The MSE is computed once and reused for the RMSE and log(RMSE) below.
    model_mse = mean_squared_error(y_true, y_pred)
    model_rmse = np.sqrt(model_mse)
    model_r2 = r2_score(y_true, y_pred)
    model_rmsle = np.log(model_rmse)

    model_results = {"Mean Absolute Error (MAE)": model_mae,
                     "Mean Squared Error (MSE)": model_mse,
                     "Root Mean Squared Error (RMSE)": model_rmse,
                     "Adjusted R^2 Score": model_r2,
                     "Root Mean Squared Log Error": model_rmsle}
    return model_results

<div id="job_title"> 
    <h2  style="font-family:Baskerville">Example models</h2>
</div>

<div id="job_title"> 
    <h3  style="font-family:Baskerville">Random Forest</h3>
</div>

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so a fresh "Restart & Run All" reproduces the same metrics;
# 42 matches the seed used by the other models in this notebook.
rf = RandomForestRegressor(random_state=42)
model = rf.fit(X_train, y_train)
y_pred = model.predict(X_val)
rf_results = calculate_results(y_val, y_pred)

# Metric table: one row per metric, one column per model. This cell creates it;
# the later model cells add their own column.
results = pd.Series(rf_results, name='Random Forest').to_frame()

# Prediction table: the true validation targets plus one column per model.
# The index is reset to 0..n-1 so it lines up positionally with the
# prediction arrays returned by the models.
predictions = y_val.reset_index(drop=True).rename('Validation').to_frame()
predictions['Random Forest'] = y_pred

<div id="job_title"> 
    <h3  style="font-family:Baskerville">Gradient Boosting</h3>
</div>

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters for the gradient-boosting regressor, gathered in one place:
# many shallow trees at a low learning rate, fitted with the Huber loss.
gbr_params = dict(
    n_estimators=6000,
    learning_rate=0.01,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
gbr = GradientBoostingRegressor(**gbr_params)

# Fit on the training split and score on the validation split.
model = gbr.fit(X_train, y_train)
y_pred = model.predict(X_val)
gbr_results = calculate_results(y_val, y_pred)

# Add this model's column to the shared metric table (aligned on metric label).
results['Gradient Boosting'] = pd.Series(gbr_results)

# Add this model's column to the shared prediction table.
data = {'Gradient Boosting': y_pred}
predictions['Gradient Boosting'] = pd.DataFrame(data)

<div id="job_title"> 
    <h3  style="font-family:Baskerville">LGBM</h3>
</div>

In [11]:
from lightgbm import LGBMRegressor

# Hyper-parameters for the LightGBM regressor, gathered in one place.
lgbm_params = dict(
    objective='regression',
    num_leaves=6,
    learning_rate=0.01,
    n_estimators=7000,
    max_bin=200,
    bagging_fraction=0.8,
    bagging_freq=4,
    bagging_seed=8,
    feature_fraction=0.2,
    feature_fraction_seed=8,
    min_sum_hessian_in_leaf=11,
    verbose=-1,
    random_state=42,
)
# NOTE: the estimator is bound to the name `lightgbm`, which is also the name
# of the package that LGBMRegressor is imported from.
lightgbm = LGBMRegressor(**lgbm_params)

# Fit on the training split and score on the validation split.
lightgbm.fit(X_train, y_train)
y_pred = lightgbm.predict(X_val)
lightgbm_results = calculate_results(y_val, y_pred)

# Add this model's column to the shared metric table (aligned on metric label).
results['LGBM'] = pd.Series(lightgbm_results)

# Add this model's column to the shared prediction table.
data = {'LGBM': y_pred}
predictions['LGBM'] = pd.DataFrame(data)



<div id="job_title"> 
    <h2  style="font-family:Baskerville"><b>Visualizing results</b></h2>
</div>

In [12]:
# Display the metric table: one row per metric, one column per trained model.
results

Unnamed: 0,Random Forest,Gradient Boosting,LGBM
Mean Absolute Error (MAE),2.609109,2.546833,3.49622
Mean Squared Error (MSE),13.813117,12.287712,20.629317
Root Mean Squared Error (RMSE),3.7166,3.505383,4.541951
Adjusted R^2 Score,0.813873,0.834427,0.722027
Root Mean Squared Log Error,1.312809,1.2543,1.513357


<div id="job_title"> 
    <h2  style="font-family:Baskerville">Plotly scatter plot</h2>
</div>

In [13]:
import plotly.graph_objects as go
%matplotlib inline

# Predicted-vs-actual scatter for every model on the validation split.
# `predictions` holds the true validation targets in its 'Validation' column
# and one row-aligned column of predictions per model, so each marker pairs a
# prediction with the true value of the same sample. The predictions were made
# on X_val, so they must be compared with the validation targets, not y_test.
actual = predictions['Validation']

fig = go.Figure()
fig.update_layout(title_text="Models predictions",
                  title_font_size=30,
                  title_x=0.5,
                  xaxis_title="Actual value (validation set)",
                  yaxis_title="Predicted value")

# One marker trace per model column; the 'Validation' column is the x-axis.
for model_name in predictions.columns.drop('Validation'):
    fig.add_trace(go.Scatter(x=actual, y=predictions[model_name],
                             mode='markers',
                             name=model_name))

# Identity line: a perfect model would place every marker on it.
ideal = [actual.min(), actual.max()]
fig.add_trace(go.Scatter(x=ideal, y=ideal,
                         mode='lines',
                         name='Expected results'))

fig.show()