# Introducción a métodos de muestreo

In [1]:
# import os for operating system dependent functionalities
import os

# import other required libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# change your working folder where you have stored the dataset
#os.chdir(".../Chapter 4")
#os.getcwd()

In [3]:
# Load the house-prices dataset straight from the cookbook's GitHub repository.
# Data frames carry a "df_" prefix throughout this notebook.
df_housingdata = pd.read_csv(
    "https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning-Cookbook/master/Chapter04/Linear%20regression/Final_HousePrices.csv"
)

In [4]:
# Dimensions of the housing frame as (rows, columns)
df_housingdata.shape

(1460, 80)

In [5]:
# Count the missing values in every column of the housing frame
df_housingdata.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 80, dtype: int64

In [6]:
# Separate predictors and response by column name rather than by position, so
# the selection keeps working if the column order ever changes. 'SalePrice' is
# the last of the 80 columns, so this keeps the same 79 predictor columns.
X = df_housingdata.drop(columns=['SalePrice'])
Y = df_housingdata['SalePrice']

# Hold out 30% of the rows for testing. The fixed seed makes the partition
# reproducible under Restart & Run All, matching the seeded split used for
# the auto-mpg data later in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.7, test_size=0.3, random_state=1
)

In [7]:
# Report the dimensions of each partition, in the order:
# train features, train response, test features, test response.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

(1021, 79)
(1021,)
(438, 79)
(438,)


#### Sampling a dataset with a categorical response variable

In [8]:
# Load the credit-card default data from a CSV in the current working directory.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError, so
# "creditcarddefault.csv" has to be present in the working directory first.
df_creditcarddata = pd.read_csv("creditcarddefault.csv")
# Dimensions of the loaded frame as (rows, columns)
df_creditcarddata.shape

FileNotFoundError: ignored

In [None]:
# Predictors are selected by position, the response by name.
# NOTE(review): assumes 'default payment next month' is not among the first
# 24 columns — confirm against the CSV schema.
X = df_creditcarddata.iloc[:, 0:24]
Y = df_creditcarddata['default payment next month']

# stratify=Y keeps the class proportions of the response the same in both
# partitions. The fixed seed makes the partition reproducible under
# Restart & Run All, matching the seeded auto-mpg split later in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.7, test_size=0.3, stratify=Y, random_state=1
)

In [None]:
# Report the dimensions of each partition, in the order:
# train features, train response, test features, test response.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

In [None]:
# Class counts of the response in each partition. Wrapping the raw values in a
# Series and calling its value_counts() method replaces the deprecated
# top-level pd.value_counts() function.
train_counts = pd.Series(Y_train.values).value_counts()
test_counts = pd.Series(Y_test.values).value_counts()
print(train_counts)
print(test_counts)

# Class shares in percent. The denominator is the row count as a plain integer;
# the original divided by .shape, a one-element tuple, which only worked through
# array broadcasting.
print(train_counts * 100 / len(Y_train))
print(test_counts * 100 / len(Y_test))

# Cross-Validation

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the auto-mpg data from a CSV in the current working directory.
# Data frames carry a "df_" prefix throughout this notebook.
df_autodata = pd.read_csv("autompg.csv")

In [None]:
# Dimensions of the auto-mpg frame as (rows, columns)
df_autodata.shape

In [None]:
# Data type of every column, to check which ones were parsed as numeric
df_autodata.dtypes

In [None]:
# Count the missing values in every column of the auto-mpg frame
df_autodata.isna().sum()

In [None]:
# Impute missing horsepower values with the column median. The result is
# assigned back to the column: fillna(inplace=True) on a selected column is
# chained assignment, which raises a FutureWarning in pandas 2.x and no longer
# updates df_autodata once Copy-on-Write is active.
df_autodata['horsepower'] = df_autodata['horsepower'].fillna(df_autodata['horsepower'].median())

In [None]:
# Remove the 'carname' column from the frame in place
df_autodata.drop(columns=['carname'], inplace=True)

In [None]:
# The response is the first column and the predictors are the columns at
# positions 1 through 7. Both are materialised as NumPy arrays.
Y = np.array(df_autodata.iloc[:, 0])
X = np.array(df_autodata.iloc[:, 1:8])

In [None]:
# 70/30 train/test partition; random_state=1 pins the shuffle so reruns match.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Report the dimensions of each partition, in the order:
# train features, train response, test features, test response.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

In [None]:
# Baseline without cross-validation: fit a linear regression on the training
# partition and predict the held-out rows.
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator itself, so both names refer to one object
lm_model = lm = LinearRegression().fit(X_train, Y_train)
predictedvalues = lm_model.predict(X_test)

In [None]:
# Score the hold-out predictions with mean squared error and R^2
mse = mean_squared_error(Y_test, predictedvalues)
r2score = r2_score(Y_test, predictedvalues)

print(
    "Results without Cross-Validation:",
    "R^2: {:.2f}, MSE: {:.2f}".format(r2score, mse),
    sep="\n",
)

In [None]:
# Scatter the hold-out predictions (y axis) against the reported values
# (x axis); points near the diagonal are accurate predictions.
plt.scatter(Y_test, predictedvalues)
plt.xlabel('Reported mpg')
plt.ylabel('Predicted mpg')

### 1. K-Fold Cross-Validation

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold 

In [None]:
# Reload the auto-mpg data so this section starts again from the raw file.
df_autodata = pd.read_csv("autompg.csv")

# Median-impute horsepower by assigning the result back to the column.
# fillna(inplace=True) on a selected column is chained assignment: it raises a
# FutureWarning in pandas 2.x and stops updating the frame under Copy-on-Write.
df_autodata['horsepower'] = df_autodata['horsepower'].fillna(df_autodata['horsepower'].median())
df_autodata = df_autodata.drop(columns=['carname'])

# The response is the first column and the predictors are the columns at
# positions 1 through 7. They are converted to NumPy arrays because the fold
# loop indexes them with integer position arrays.
X = np.array(df_autodata.iloc[:, 1:8])
Y = np.array(df_autodata.iloc[:, 0])

In [None]:
# 10-fold cross-validation: each fold is held out once and predicted by a
# model fitted on the remaining folds.
kfoldcv = KFold(n_splits=10)
kf_ytests = []
kf_predictedvalues = []

for train_index, test_index in kfoldcv.split(X):
    X_train, Y_train = X[train_index], Y[train_index]
    X_test, Y_test = X[test_index], Y[test_index]

    # A fresh estimator per fold, fitted only on that fold's training rows
    model = LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Pool this fold's actual and predicted values with those of earlier folds
    kf_ytests.extend(Y_test)
    kf_predictedvalues.extend(Y_pred)

# Score the pooled out-of-fold predictions
r2score = r2_score(kf_ytests, kf_predictedvalues)
mse = mean_squared_error(kf_ytests, kf_predictedvalues)

print(
    "K-Fold Cross Validation Results:",
    "R^2: {:.2f}, MSE: {:.2f}".format(r2score, mse),
    sep="\n",
)

In [None]:
# Scatter the pooled out-of-fold predictions (y axis) against the reported
# values (x axis); points near the diagonal are accurate predictions.
plt.scatter(kf_ytests, kf_predictedvalues)
plt.xlabel('Reported mpg')
plt.ylabel('Predicted mpg')

### 2. Leave-One-Out Cross-Validation (LOOCV)

In [None]:
# Reload the auto-mpg data so this section starts again from the raw file.
df_autodata = pd.read_csv("autompg.csv")

# Median-impute horsepower by assigning the result back to the column.
# fillna(inplace=True) on a selected column is chained assignment: it raises a
# FutureWarning in pandas 2.x and stops updating the frame under Copy-on-Write.
df_autodata['horsepower'] = df_autodata['horsepower'].fillna(df_autodata['horsepower'].median())
df_autodata = df_autodata.drop(columns=['carname'])

# The response is the first column and the predictors are the columns at
# positions 1 through 7. They are converted to NumPy arrays because the
# leave-one-out loop indexes them with integer position arrays.
X = np.array(df_autodata.iloc[:, 1:8])
Y = np.array(df_autodata.iloc[:, 0])

In [None]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out: each split holds out a single observation and trains on the rest
loocv = LeaveOneOut()
loo_ytests = []
loo_predictedvalues = []

for train_index, test_index in loocv.split(X):
    # Integer-array indexing is why X and Y were converted to NumPy arrays
    X_train, Y_train = X[train_index], Y[train_index]
    X_test, Y_test = X[test_index], Y[test_index]

    # A fresh estimator per split, fitted on everything but the held-out row
    model = LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Each split contributes one actual value and one prediction
    loo_ytests.extend(Y_test)
    loo_predictedvalues.extend(Y_pred)

# Score the pooled held-out predictions
r2score = r2_score(loo_ytests, loo_predictedvalues)
mse = mean_squared_error(loo_ytests, loo_predictedvalues)

print(
    "LOOCV Cross Validation Results:",
    "R^2: {:.2f}, MSE: {:.2f}".format(r2score, mse),
    sep="\n",
)

In [None]:
# Scatter the leave-one-out predictions (y axis) against the reported values
# (x axis); points near the diagonal are accurate predictions.
plt.scatter(loo_ytests, loo_predictedvalues)
plt.xlabel('Reported mpg')
plt.ylabel('Predicted mpg')

# Bootstrap

### Using scikit-learn's `resample()` function

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

from sklearn.utils import resample

In [None]:
# Reload the auto-mpg data so this section starts again from the raw file.
df_autodata = pd.read_csv("autompg.csv")

# Median-impute horsepower by assigning the result back to the column.
# fillna(inplace=True) on a selected column is chained assignment: it raises a
# FutureWarning in pandas 2.x and stops updating the frame under Copy-on-Write.
df_autodata['horsepower'] = df_autodata['horsepower'].fillna(df_autodata['horsepower'].median())
df_autodata = df_autodata.drop(columns=['carname'])

In [None]:
# Dimensions of the cleaned auto-mpg frame as (rows, columns)
df_autodata.shape

In [None]:
def create_bootstrap_oob(df, n_samples=100, random_state=None):
    """Draw a bootstrap sample from ``df`` and collect its out-of-bag rows.

    The two frames are stored in the module-level names
    ``df_bootstrap_sample`` and ``df_OOB`` and are also returned, so callers
    do not have to read them from global state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are sampled.
    n_samples : int, default 100
        Number of rows drawn with replacement.
    random_state : int or None, default None
        Forwarded to ``sklearn.utils.resample``; pass an integer to make the
        draw reproducible.

    Returns
    -------
    tuple
        ``(df_bootstrap_sample, df_OOB)``.
    """
    global df_OOB
    global df_bootstrap_sample

    # Bootstrap sample: rows drawn with replacement
    df_bootstrap_sample = resample(
        df, replace=True, n_samples=n_samples, random_state=random_state
    )

    # Out-of-bag sample: rows whose index label was never drawn
    drawn_mask = df.index.isin(df_bootstrap_sample.index)
    df_OOB = df[~drawn_mask]

    return df_bootstrap_sample, df_OOB

In [None]:
iteration = 50
bootstap_statistics = []
originalsample_statistics = []

for i in range(iteration):
    # Refresh the global bootstrap sample from the full auto-mpg frame
    create_bootstrap_oob(df_autodata)

    # Mean of the first column of this bootstrap draw
    # (presumably mpg, the column the original-sample statistic uses)
    bootstap_statistics += [df_bootstrap_sample.iloc[:, 0].mean()]

    # Mean of mpg over the full data set, recorded once per iteration
    originalsample_statistics += [df_autodata['mpg'].mean()]

In [None]:
import matplotlib.pyplot as plt

# Bootstrap-sample means against the original-sample mean, per iteration,
# drawn through the explicit Axes interface.
f, ax = plt.subplots(figsize=(6, 6))

ax.plot(bootstap_statistics, 'c--', label='Bootstrap Sample Statistic')
ax.plot(originalsample_statistics, 'grey', label='Original Sample Statistic')
ax.set_xlabel('Iterations')
ax.set_ylabel('Statistic (Mean of mpg)')
ax.legend(loc=4)
plt.show()

## Mean Squared Error for Each Bootstrap Iteration

In [None]:
iteration = 50
mse_each_iterations = list()
total_mse = 0
average_mse = list()

for i in range(iteration):
    # Refresh the global bootstrap and out-of-bag frames
    create_bootstrap_oob(df_autodata)

    X_BS = df_bootstrap_sample.iloc[:, 1:8]  # bootstrap sample features
    Y_BS = df_bootstrap_sample.iloc[:, 0]    # bootstrap sample response

    X_OOB = df_OOB.iloc[:, 1:8]  # out-of-bag features
    Y_OOB = df_OOB.iloc[:, 0]    # out-of-bag response

    # Fit a fresh model on the bootstrap sample.
    # NOTE(review): SGDRegressor is sensitive to feature scale and the
    # features are used unscaled here — consider standardising them.
    lm = SGDRegressor()
    lm.fit(X_BS, Y_BS)

    # Evaluate on the rows the model has not seen
    predictedvalues = lm.predict(X_OOB)
    mse = mean_squared_error(Y_OOB, predictedvalues)
    mse_each_iterations.append(mse)

    # Running average of the MSE. i is zero-based, so i + 1 iterations have
    # contributed to total_mse; dividing by i itself divided by zero on the
    # first pass and used the wrong denominator on every later one.
    total_mse += mse
    average_mse.append(total_mse / (i + 1))


In [None]:
import matplotlib.pyplot as plt

# Per-iteration out-of-bag MSE alongside its running average,
# drawn through the explicit Axes interface.
f, ax = plt.subplots(figsize=(6, 6))

ax.plot(mse_each_iterations, 'c--', label='MSE by Iteration')
ax.plot(average_mse, 'r^', label='Average MSE after each iteration')

ax.set_xlabel('Iterations')
ax.set_ylabel('Mean Squared Error')
ax.legend(loc=1)
plt.show()