## Random Forests and Gradient Boosting models

In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
housing_data = pd.read_csv('Housing_data.csv', sep=';')
housing_df = pd.DataFrame(housing_data)
housing_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
X = housing_data[['area']] 
y = housing_data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# List to maintain the cross-validation scores
cross_val_scores_lasso = []

# List to maintain the different values of Lambda
Lambda = []

# Loop to compute the cross-validation scores
for i in range(1, 9):
	lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925)
	lassoModel.fit(X_train, y_train)
	scores = cross_val_score(lassoModel, X, y, cv = 10)
	avg_cross_val_score = np.mean(scores)*100
	cross_val_scores_lasso.append(avg_cross_val_score)
	Lambda.append(i * 0.25)

# Loop to print the different values of cross-validation scores
for i in range(0, len(Lambda)):
	print(str(Lambda[i])+' : '+str(cross_val_scores_lasso[i]))

0.25 : -7550.679102775206
0.5 : -7550.6791023616615
0.75 : -7550.679101948117
1.0 : -7550.679101534574
1.25 : -7550.679101121031
1.5 : -7550.679100707486
1.75 : -7550.679100293943
2.0 : -7550.679099880401


In [5]:
# List to maintain the different cross-validation scores
cross_val_scores_ridge = []

# List to maintain the different values of alpha
alpha = []

# Loop to compute the different values of cross-validation scores
for i in range(1, 9):
	ridgeModel = Ridge(alpha = i * 0.25)
	ridgeModel.fit(X_train, y_train)
	scores = cross_val_score(ridgeModel, X, y, cv = 10)
	avg_cross_val_score = np.mean(scores)*100
	cross_val_scores_ridge.append(avg_cross_val_score)
	alpha.append(i * 0.25)

# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
	print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i]))

0.25 : -7550.679102761498
0.5 : -7550.679102334251
0.75 : -7550.679101907001
1.0 : -7550.67910147975
1.25 : -7550.679101052505
1.5 : -7550.679100625254
1.75 : -7550.679100198005
2.0 : -7550.679099770756


### Practising on the NAC dataset

In [6]:
NAC_data = 'NAC_data.csv'
file = pd.read_csv(NAC_data)
NAC_df = pd.DataFrame(file)

In [11]:
# Handle NaN values in the target variable
NAC_df.dropna(subset=['Aerial duels won, %'], inplace=True)

# Splitting the dataset into testing and training sets
X = NAC_df[['Goals', 'Age', 'xG', 'Duels per 90', 'Height', 'Assists per 90']]
y = NAC_df['Aerial duels won, %']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using one-hot encoding for adressing categorical data
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Handling NaN values using a simple imputer
imputer = SimpleImputer(strategy='mean')
X_train_encoded = pd.DataFrame(imputer.fit_transform(X_train_encoded), columns=X_train_encoded.columns)
X_test_encoded = pd.DataFrame(imputer.transform(X_test_encoded), columns=X_test_encoded.columns)

In [15]:
# List to maintain the cross-validation scores
cross_val_scores_lasso = []

# List to maintain the different values of Lambda
Lambda = []

# Loop to compute the cross-validation scores
for i in range(1, 9):
	lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925)
	lassoModel.fit(X_train_encoded, y_train)
	scores = cross_val_score(lassoModel, X_train_encoded, y_train, cv = 10)
	avg_cross_val_score = np.mean(scores)*100
	cross_val_scores_lasso.append(avg_cross_val_score)
	Lambda.append(i * 0.25)

# Loop to print the different values of cross-validation scores
for i in range(0, len(Lambda)):
	print(str(Lambda[i])+' : '+str(cross_val_scores_lasso[i]))

0.25 : 31.58007610305143
0.5 : 31.577060765609172
0.75 : 31.562543837626695
1.0 : 31.540777702751623
1.25 : 31.511762360983937
1.5 : 31.475497812323677
1.75 : 31.431984056770823
2.0 : 31.38122109432538


In [16]:
# List to maintain the different cross-validation scores
cross_val_scores_ridge = []

# List to maintain the different values of alpha
alpha = []

# Loop to compute the different values of cross-validation scores
for i in range(1, 9):
	ridgeModel = Ridge(alpha = i * 0.25)
	ridgeModel.fit(X_train_encoded, y_train)
	scores = cross_val_score(ridgeModel, X_train_encoded, y_train, cv = 10)
	avg_cross_val_score = np.mean(scores)*100
	cross_val_scores_ridge.append(avg_cross_val_score)
	alpha.append(i * 0.25)

# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
	print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i]))

0.25 : 33.099867123050934
0.5 : 33.0998983261935
0.75 : 33.09990978270535
1.0 : 33.099901793188145
1.25 : 33.09987465366572
1.5 : 33.099828655661426
1.75 : 33.09976408627398
2.0 : 33.09968122825192
