In [None]:
#Import your Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
%matplotlib inline

In [None]:
# %%timeit -n 1
# Load the dataset into a DataFrame. Start with data.csv, then try Life Expectancy, then any dataset you choose.
# To load a hosted dataset instead, pass its URL to read_csv, for instance (GitHub):
#   https://raw.githubusercontent.com/fenago/introml/main/Life%20Expectancy%20Data.csv
# NOTE(review): later cells reference `life_expectancy_`, `msrp` and `x5_latitude`, which presumably come from
# different datasets — confirm the column names used downstream match the file loaded here.
df = pd.read_csv('data.csv')

In [None]:
# Number of rows in the dataset
len(df)

In [None]:
# Summary statistics of the dataset
df.describe()

In [None]:
# (rows, columns)
df.shape

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# The text columns are excluded explicitly: recent pandas versions raise on
# non-numeric data in corr() instead of silently dropping those columns.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Basic data cleaning: normalise column names and text values to lower_snake_case
lowercase_names = df.columns.str.lower()
df.columns = lowercase_names.str.replace(' ', '_')  # A: column headers

# B: names of every column stored with the generic 'object' dtype
is_object_dtype = df.dtypes == 'object'
string_columns = list(df.dtypes[is_object_dtype].index)

# C: apply the same lower-case / underscore normalisation to the values
for col in string_columns:
    lowered_values = df[col].str.lower()
    df[col] = lowered_values.str.replace(' ', '_')

In [None]:
# First rows of the cleaned frame
df.head()

In [None]:
# Same first rows, transposed so each column becomes a row (easier to read for wide frames)
df.head().T

In [None]:
# Replace with your target variable --- df.YOUR_TARGET_VARIABLE
# Also replace your X label and the title so they describe the plotted column
plt.figure(figsize=(6, 4))

sns.histplot(df.life_expectancy_, bins=40, color='black', alpha=1)
plt.ylabel('Frequency')
plt.xlabel('life_expectancy_')
# The title now names the plotted column (it previously said "prices")
plt.title('Distribution of life_expectancy_')

plt.show()

In [None]:
# This may not be needed for your dataset but explore with different features
# The plot is specific to datasets with an `msrp` (price) column, so it is
# skipped instead of raising when that column is absent.
if 'msrp' in df.columns:
    plt.figure(figsize=(6, 4))

    # Only prices below 100,000 are plotted
    sns.histplot(df.msrp[df.msrp < 100000], bins=40, color='black', alpha=1)
    plt.ylabel('Frequency')
    plt.xlabel('Price')
    plt.title('Distribution of prices')

    plt.show()
else:
    print("No 'msrp' column in this dataset - skipping the price histogram")

In [None]:
# This may not be needed for your dataset but explore with different features
# The plot is specific to datasets with an `msrp` (price) column, so it is
# skipped instead of raising when that column is absent.
if 'msrp' in df.columns:
    # log1p computes log(1 + x), which is defined for zero prices as well
    log_price = np.log1p(df.msrp)

    plt.figure(figsize=(6, 4))

    sns.histplot(log_price, bins=40, color='black', alpha=1)
    plt.ylabel('Frequency')
    plt.xlabel('Log(Price + 1)')
    plt.title('Distribution of prices after log transformation')

    plt.show()
else:
    print("No 'msrp' column in this dataset - skipping the log-price histogram")

In [None]:
# Check for nulls --- you do NOT want nulls when you train
# isnull().sum() gives the number of missing values in each column
df.isnull().sum()

In [None]:
# First rows again, transposed (one line per column)
df.head().T

In [None]:
#delete columns --- this may or may NOT be needed.  As before - skip if you don't need it
# You will encounter times where you will want to delete columns.  This is how you do that.
# errors='ignore' skips any listed column that is not present, so the cell neither fails on a
# dataset without these columns nor raises a KeyError when it is run a second time.
df = df.drop(['x5_latitude', 'x6_longitude', 'x1_transaction_date'], axis=1, errors='ignore')
df

In [None]:
# NOTE: everything between the triple quotes is a string literal, so none of it is executed.
# It keeps an alternative 60/20/20 train/validation/test split for reference.
'''
# Split the data into test, train, validation sets... 60/20/20
from sklearn.model_selection import train_test_split
# This gives the 80/20 train test split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
# This splits df_train_full again so it is 60/20/20
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)
len(df_train), len(df_val), len(df_test)
# Replace nulls with 0's - these are pandas dataframes
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)
df_test = df_test.fillna(0)
len(df_train),len(df_val),len(df_test)
'''

In [None]:
# Hold out 20% of the rows as a test set (80/20 split); random_state pins the shuffle
from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)

# Replace nulls with 0's in both partitions (fillna returns new pandas dataframes)
# NOTE(review): this also zero-fills the target column if it contains nulls — confirm that is intended
df_train_full, df_test = df_train_full.fillna(0), df_test.fillna(0)

# Row counts of the two partitions
len(df_train_full), len(df_test)

In [None]:
# Separate the target from the features for both partitions.
# Replace 'life_expectancy_' with your own target variable!!!
# pop() removes the column from the frame and returns it; .values turns it into a numpy ndarray
y_train = df_train_full.pop('life_expectancy_').values
y_test = df_test.pop('life_expectancy_').values


In [None]:
# Convert these data frames into a LIST of DICTIONARIES (each element in the list is a dictionary (the record))
# orient='records' produces one {column: value} dictionary per row
dict_train = df_train_full.to_dict(orient='records')
dict_test = df_test.to_dict(orient='records')

In [None]:
# Convert the LIST OF DICTIONARIES into a Feature Matrix (does all of the encoding)
from sklearn.feature_extraction import DictVectorizer
 
# sparse=False asks the vectorizer for dense arrays rather than sparse matrices
dv = DictVectorizer(sparse=False)
 
# Fit the vectorizer on the training records only, then reuse that fitted mapping for the test records
X_train = dv.fit_transform(dict_train)
X_test = dv.transform(dict_test)
features = dv.get_feature_names_out()  #Features as they exist in the Vectorized Dictionary (this is an ndarray)

In [None]:
# %%timeit -n 1
# if you uncomment %%timeit it will not put lr into memory
# Fit the model - this will take some time and will burn CPU (not MEMORY)
# fit() is chained onto the constructor, so `lr` is bound to the value fit() returns
lr = LinearRegression().fit(X_train, y_train)

In [None]:
# These are the model properties.  You can call all of these
def get_properties(model):
    """Return the names of the instance attributes of `model` that end with an underscore.

    The names are read from `model.__dict__` and returned in that dictionary's order.
    """
    trailing_underscore_names = []
    for attribute_name in model.__dict__:
        if attribute_name.endswith('_'):
            trailing_underscore_names.append(attribute_name)
    return trailing_underscore_names
# Inspect the fitted linear regression. (`dt` was never defined in this notebook; the model fitted above is `lr`.)
get_properties(lr)

In [None]:
# Show the types of the feature matrix, the feature names and the coefficients.
# They are combined into one tuple because a cell only displays its last expression.
type(X_train), type(dv.get_feature_names_out()), type(lr.coef_)

In [None]:
# NOTE: this bare expression is not the last one in the cell, so its value is not displayed
lr.coef_[0]
print('%.3f' % lr.intercept_, '(Intercept)') 
# Evaluate the coefficients to learn what the model thinks is important in the predictions.
# Each coefficient is printed (3 decimals) next to the feature name at the same position.
for i,j in zip(features, lr.coef_): print('%.3f' % j, i)

In [None]:
# %%timeit -n 1
# Predict on the held-out test features and line the predictions up against the true targets
y_pred = lr.predict(X_test)
df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_results


In [None]:
from sklearn import metrics

# Compute each error once, then report it; RMSE is the square root of the MSE
mae_value = metrics.mean_absolute_error(y_test, y_pred)
mse_value = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae_value)
print('Mean Squared Error:', mse_value)
print('Root Mean Squared Error:', np.sqrt(mse_value))

In [None]:
# View the coefficients
# Raw (unformatted) intercept followed by the full coefficient array
print(lr.intercept_)
print(lr.coef_)

In [None]:
# Predict on the test set again (stored under a separate name, pred_y)
pred_y = lr.predict(X_test)
# Compare the first 10 predictions (rounded to whole numbers) with the first 10 true values
print("The first 10 prediction {}".format(pred_y[:10].round(0)))
print("The real first 10 labels {}".format(y_test[:10]))

# Mean squared error of these predictions against the test targets
mse = metrics.mean_squared_error(y_test, pred_y)
print("Mean Squared Error {}".format(mse))

In [None]:
# Type of the object returned by head(1)
type(df_train_full.head(1))

In [None]:
# Use double brackets around the iloc to force it to return a pandas dataframe and not a series
# Then you can convert any record into a dictionary
# Here: the row at position 21
df_train_full.iloc[[21]]

In [None]:
# How to convert any pandas row into a dictionary... needed for predictions
# iloc[[213]] selects the row at position 213; to_dict('records') yields a one-element list; [0] unwraps it
df_train_full.iloc[[213]].to_dict('records')[0]

In [None]:
# How to convert any pandas row into a dictionary... needed for predictions
# NOTE: head(21) selects the first 21 rows, and [0] then picks the first of those records (the row at position 0)
df_train_full.head(21).to_dict('records')[0]

In [None]:
#car = df_train.head(1).to_dict('records')[0]
# Take the row at position 213 of the training frame as the example record to predict...
item = df_train_full.iloc[[213]].to_dict('records')[0]
# ...and the training target at the same position (indexing with a list keeps it as an array)
actual = y_train[[213]]

In [None]:
# The item to be predicted is passed in.
def model_prediction(item, dv, model):
    """Predict a single record.

    `item` is wrapped in a one-element list and passed to `dv.transform`; the result
    is given to `model.predict`, and the first element of that prediction is returned.
    """
    feature_matrix = dv.transform([item])
    return model.predict(feature_matrix)[0]

In [None]:
# Predict the example record with the fitted vectorizer and the linear regression
model_prediction(item,dv,lr)

In [None]:
# The true target value for the same record, for comparison with the prediction
actual

In [None]:
# Parameters the linear regression estimator was configured with
lr.get_params()

In [None]:
#Algorithm Test Harness for Regression Algorithms

In [None]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

In [None]:
from time import time

from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Candidate models for the comparison harness.
# The tree-based / ensemble models involve randomness, so they are seeded with the same value used
# for the train/test split (11); without a seed their scores differ from one run to the next.
regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(random_state=11),
    ExtraTreesRegressor(random_state=11),
    RandomForestRegressor(random_state=11),
    DecisionTreeRegressor(random_state=11),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [None]:
# Fit, time and score each candidate regressor on the same train/test split.
# `head` caps how many entries of `regressors` are evaluated (the slice is simply shorter if the list is)
head = 10
for model in regressors[:head]:
    # Wall-clock time spent fitting
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    # Wall-clock time spent predicting on the test set
    start = time()
    # NOTE: y_pred is overwritten on every iteration, so after the loop it holds the
    # predictions of the last model that was evaluated
    y_pred = model.predict(X_test)
    predict_time = time()-start    
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print()

## R2 Score with Regression Models
The R2 score is a very important metric that is used to evaluate the performance of a regression-based machine learning model. It is pronounced "R squared" and is also known as the coefficient of determination. It measures the proportion of the variance in the target variable that is explained by the model's predictions.
### Simply put, R2 summarizes how close the predictions made by the model are to the actual values in the dataset, compared with always predicting the mean.
The R2 scores of the models trained here range from .91 to .98 (notice that LR does not converge, so its score is unknown).
### If the value of the R2 score is 1, the model's predictions match the data perfectly; if its value is 0, the model does no better than always predicting the mean of the target, so it will perform badly on an unseen dataset (a negative value means it does even worse). This also implies that the closer the R2 score is to 1, the better the model fits the data.
# In summary - look for the models with the highest R2 values

In [None]:
# Assuming gradient boosting (GradientBoostingRegressor) is the best model - let's find the best hyperparameters

# If you want to find the best parameters for a model - do a grid search over several hyperparameters (or a random search)
# A. Do model.get_params()    to get all of the existing hyperparameters for the model
# B. Create a dictionary with different hyperparameter options
# C. Run the GridSearch and it will find the best parameters
# D. Be patient because this will take a long time.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for GradientBoostingRegressor.
# Only option names accepted by current scikit-learn are listed:
#   loss 'lad'                 -> 'absolute_error'
#   criterion 'mse' / 'mae'    -> 'squared_error' ('mae' no longer exists as a criterion)
#   max_features 'auto'        -> None (use all features)
parameters = { 'loss' : ['squared_error', 'absolute_error', 'huber', 'quantile'],
              'learning_rate' : (0.05,0.25,0.50,1),
              'criterion' : ['friedman_mse', 'squared_error'],
              'max_features' : [None, 'sqrt', 'log2']
             }

In [None]:
# This will take a very long time to execute (potentially hours) - skip it
# Cross-validates a GradientBoostingRegressor for every combination in `parameters`
grid = GridSearchCV(GradientBoostingRegressor(),parameters)
# NOTE: this rebinds `model`, the name also used as the loop variable in the regressor comparison cell
model = grid.fit(X_train,y_train)
# Best combination found, and the estimator configured with it
print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

In [None]:
# NOTE(review): this looks like the printed output of the grid-search cell pasted in as code — TODO confirm.
# As code, the cell only evaluates a dict literal and then constructs an (unfitted) estimator.
{'criterion': 'friedman_mse', 'learning_rate': 0.25, 'loss': 'lad', 'max_features': 'sqrt'} 

GradientBoostingRegressor(learning_rate=0.25, loss='lad', max_features='sqrt') 

In [None]:
# Distribution of the residuals (predicted - actual) on the test set.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement.
# NOTE(review): y_pred is whatever the most recently executed cell assigned to it — when the notebook is run
# top to bottom that is presumably the last model of the regressor comparison loop; confirm this is the
# model you mean to inspect.
residuals = y_pred - y_test
sns.histplot(residuals, kde=True, stat='density')
plt.xlabel('Prediction error (predicted - actual)')
plt.title('Distribution of residuals')
plt.show()

In [None]:
# If you skipped the grid search then this won't run... obviously
# Put the cross-validation results in a DataFrame indexed and sorted by rank_test_score
df_1 = pd.DataFrame(grid.cv_results_).set_index('rank_test_score').sort_index()
df_1.shape

In [None]:
#Hyperparameter Tuning 
# To get the best hyperparameters - call .get_params() on the model.  
# Then copy the parameters that you want to test into a dictionary list as you see below
# The GridSearchCV will give you the best parameters

In [None]:
from sklearn.linear_model import SGDRegressor
# Fit an SGDRegressor with its default settings on the training data
sgd = SGDRegressor().fit(X_train, y_train)

In [None]:
# List the hyperparameters of the SGD model (candidates for the search grid)
sgd.get_params()

In [None]:
# Search space for SGDRegressor: each key is a hyperparameter, each list its candidate values.
# The lists multiply out to 18,432 combinations, so searching all of them takes a very long time.
param = dict(
    alpha=[0.0001, 0.001],
    average=[False, True],
    early_stopping=[False, True],
    epsilon=[0.1, .001],
    eta0=[0.01, .1],
    fit_intercept=[True, False],
    l1_ratio=[0.15, 0.2, 0.1],
    learning_rate=['invscaling', 'optimal', 'constant', 'adaptive'],
    loss=['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    max_iter=[1000],
    n_iter_no_change=[5, 7],
    penalty=['l2', 'l1', 'elasticnet'],
    power_t=[0.25],
    random_state=[None],
    shuffle=[True],
    tol=[0.001],
    validation_fraction=[0.1],
    verbose=[0],
    warm_start=[False],
)

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
# 10-fold cross-validation repeated 3 times, with a fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
# Candidates are scored by negated mean absolute error; n_jobs=-1 requests all available processors
search = GridSearchCV(sgd, param, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X_train, y_train)

In [None]:
# summarize result
# The search was scored with 'neg_mean_absolute_error', so the best score is a negated error (closer to 0 is better)
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)