# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

# Load Dataset

In [3]:
# Read the house-price training set from its CSV file.
train_data = pd.read_csv(r'C:\Users\Sanjeshni\Documents\DIV_IOD\Data\train.csv')

# A row without a SalePrice cannot serve as a training example, so discard it.
train_data = train_data.dropna(subset=['SalePrice'])

# Pull out the target, then strip it from the feature frame together with the
# three columns this analysis removes for containing null values.
y = train_data['SalePrice']
train_data = train_data.drop(columns=['SalePrice', 'LotFrontage', 'GarageYrBlt', 'MasVnrArea'])

# Only int64/float64 columns are used as model inputs.
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
X = train_data[numeric_cols].copy()

print("Shape of input data: {} and shape of target variable: {}".format(X.shape, y.shape))

X.head()  # Show first 5 training examples

Shape of input data: (1460, 34) and shape of target variable: (1460,)


Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,8450,7,5,2003,2003,706,0,150,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,9600,6,8,1976,1976,978,0,284,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,11250,7,5,2001,2002,486,0,434,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,9550,7,5,1915,1970,216,0,540,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,14260,8,5,2000,2000,655,0,490,...,836,192,84,0,0,0,0,0,12,2008


# Modelling Score using KFold


In [6]:
# Build a 5-fold splitter that shuffles the rows with a fixed seed.
# This 'kf' (KFold splitting strategy) object is what gets handed to cross_val_score() via cv=.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# split() generates one (train indices, test indices) pair per fold; report each fold's sizes.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 1168, Test set:292
Fold:2, Train set: 1168, Test set:292
Fold:3, Train set: 1168, Test set:292
Fold:4, Train set: 1168, Test set:292
Fold:5, Train set: 1168, Test set:292


In [7]:
"""
Why do we negate the score ('-' sign) before taking the square root to get the RMSE?
ANS: Classification accuracy is a reward function, i.e. something you want to maximize. Mean squared error is a loss function,
i.e. something you want to minimize. If 'cross_val_score' returned the raw loss, the best (highest) score would correspond to the
worst model! Other sklearn functions also depend on 'cross_val_score' to select the best model by
looking for the highest score, so a design decision was made for 'cross_val_score' to negate the output of every loss function.
That way, when other sklearn functions call 'cross_val_score', they can always assume that a higher score indicates a better model.
In short, ignore the negative sign and rate the error based on its absolute value.
"""
def rmse(score):
    """Print the root-mean-squared error that corresponds to a negated MSE score.

    Parameters
    ----------
    score : float
        A negated mean squared error (for example the mean of the fold scores
        produced with scoring="neg_mean_squared_error"). The sign is flipped
        back before the square root is taken.

    Returns
    -------
    None
        The value is only printed, as ``rmse= <value>`` with two decimals.
    """
    root_error = np.sqrt(-score)
    print(f'rmse= {root_error:.2f}')

# Using Linear Regression

In [8]:
# Cross-validate a linear regression on the shared folds; each fold score is a negated MSE.
lin_reg = linear_model.LinearRegression()
score = cross_val_score(lin_reg, X, y, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')

# Summarise the folds as a single RMSE computed from the mean negated MSE.
rmse(score.mean())

Scores for each fold: [-1.39334669e+09 -1.32533433e+09 -3.39493937e+09 -9.31045536e+08
 -7.16620849e+08]
rmse= 39398.70


# Using Decision Tree Regressor

In [9]:
# Cross-validate an untuned decision tree regressor (seeded for repeatability) on the shared folds.
tree_reg = tree.DecisionTreeRegressor(random_state=42)
score = cross_val_score(tree_reg, X, y, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')

# Summarise the folds as a single RMSE computed from the mean negated MSE.
rmse(score.mean())

Scores for each fold: [-2.28396934e+09 -1.70193863e+09 -2.50505513e+09 -1.48547479e+09
 -1.66691378e+09]
rmse= 43916.63


# Using Random Forest Regressor

In [10]:
# Cross-validate a random forest regressor with default settings (seeded) on the shared folds.
forest_reg = ensemble.RandomForestRegressor(random_state=42)
score = cross_val_score(forest_reg, X, y, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold are: {score}')

# Summarise the folds as a single RMSE computed from the mean negated MSE.
rmse(score.mean())

Scores for each fold are: [-8.58316418e+08 -6.13821216e+08 -2.06121160e+09 -7.97273029e+08
 -5.68429309e+08]
rmse= 31301.92


# Decision Tree Regressor Tuning

In [11]:
# Candidate tree depths: 1 through 10.
max_depth = list(range(1, 11))

# Re-run the cross-validation once per depth and report the resulting RMSE.
for depth in max_depth:
    depth_limited_tree = tree.DecisionTreeRegressor(max_depth=depth, random_state=42)
    score = cross_val_score(depth_limited_tree, X, y, cv=kf, scoring="neg_mean_squared_error")
    print(f'For max depth: {depth}')
    rmse(score.mean())

For max depth: 1
rmse= 58803.64
For max depth: 2
rmse= 50060.31
For max depth: 3
rmse= 42152.85
For max depth: 4
rmse= 39218.54
For max depth: 5
rmse= 40185.90
For max depth: 6
rmse= 40522.15
For max depth: 7
rmse= 41089.08
For max depth: 8
rmse= 41161.27
For max depth: 9
rmse= 41441.94
For max depth: 10
rmse= 41758.39


# Random Forest Regressor Tuning

In [12]:
# Candidate forest sizes: 50 to 350 trees in steps of 50.
estimators = list(range(50, 351, 50))

# Re-run the cross-validation once per forest size and report the resulting RMSE.
for n_trees in estimators:
    sized_forest = ensemble.RandomForestRegressor(n_estimators=n_trees, random_state=42)
    score = cross_val_score(sized_forest, X, y, cv=kf, scoring="neg_mean_squared_error")
    print(f'For estimators: {n_trees}')
    rmse(score.mean())


For estimators: 50
rmse= 31450.86
For estimators: 100
rmse= 31301.92
For estimators: 150
rmse= 31187.45
For estimators: 200
rmse= 31176.16
For estimators: 250
rmse= 31246.61
For estimators: 300
rmse= 31242.74
For estimators: 350
rmse= 31313.74


# K Fold: Classification Example

# Load Dataset

In [19]:
# Load the training set for the classification example; this rebinds 'train_data'.
classification_csv = r'C:\Users\Sanjeshni\Documents\DIV_IOD\Data\train (2).csv'
train_data = pd.read_csv(classification_csv)

In [20]:
# Rows lacking the 'Survived' label cannot be used for training, so discard them.
train_data = train_data.dropna(subset=['Survived'])

# Pull out the target, then remove it from the features along with 'Age',
# which this analysis drops for containing null values.
y = train_data['Survived']
train_data = train_data.drop(columns=['Survived', 'Age'])

# Only int64/float64 columns are used as model inputs.
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
X = train_data[numeric_cols].copy()

print("Shape of input data: {} and shape of target variable: {}".format(X.shape, y.shape))
pd.concat([X, y], axis=1).head()  # shows the first 5 training examples

Shape of input data: (891, 5) and shape of target variable: (891,)


Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Fare,Survived
0,1,3,1,0,7.25,0
1,2,1,1,0,71.2833,1
2,3,3,0,0,7.925,1
3,4,1,1,0,53.1,1
4,5,3,0,0,8.05,0


# Understanding the Data

## Model Score Using StratifiedKFold

In [22]:
# Split the data into 5 stratified folds (shuffled, fixed seed); 'kf' is reused by the scoring cells.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() generates one (train indices, test indices) pair per fold; report each fold's sizes.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'fold:{cnt}, Train set: {len(train_index)}, Test set: {len(test_index)}')
    
    

fold:1, Train set: 712, Test set: 179
fold:2, Train set: 713, Test set: 178
fold:3, Train set: 713, Test set: 178
fold:4, Train set: 713, Test set: 178
fold:5, Train set: 713, Test set: 178


## Using Logistic Regression

In [25]:
# Cross-validate a logistic regression classifier on the shared folds, scored by accuracy.
log_reg = linear_model.LogisticRegression(random_state=42)
score = cross_val_score(log_reg, X, y, cv=kf, scoring="accuracy")

print(f'Scores for each fold are: {score}')
print(f'Average score: {score.mean():.2f}')
                        

Scores for each fold are: [0.66480447 0.69662921 0.70224719 0.69101124 0.66292135]
Average score: 0.68


## Using Decision Tree Classifier

In [28]:
# Cross-validate a decision tree classifier (seeded) on the shared folds, scored by accuracy.
tree_clf = tree.DecisionTreeClassifier(random_state=42)
score = cross_val_score(tree_clf, X, y, cv=kf, scoring="accuracy")

print(f'Scores for each fold are: {score}')
print(f'Average score: {score.mean():.2f}')


Scores for each fold are: [0.67039106 0.61235955 0.5505618  0.64044944 0.69101124]
Average score: 0.63


## Using Random Forest Classifier

In [29]:
# Cross-validate a random forest classifier (seeded) on the shared folds, scored by accuracy.
forest_clf = ensemble.RandomForestClassifier(random_state=42)
score = cross_val_score(forest_clf, X, y, cv=kf, scoring="accuracy")

print(f'Scores for each fold are: {score}')
print(f'Average score: {score.mean():.2f}')

Scores for each fold are: [0.74301676 0.66292135 0.65730337 0.70786517 0.73033708]
Average score: 0.70
