In [3]:
# Download and Read the Real Estate Valuation Dataset:

# Download the dataset from the UCI repository.
# Read the dataset using pandas.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np
import openpyxl




# Path to the Excel file
file_path = 'C:/Deakin/MachineLearning/MLAssignment/Week6/realestate.xlsx'

# Read the dataset into a pandas DataFrame
real_estate_data = pd.read_excel(file_path)
real_estate_data


Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500000,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.666667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250000,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000000,8.1,104.81010,5,24.96674,121.54067,52.5


In [4]:
# Data Splitting and Linear Regression Model:

# Split the dataset into training and testing sets.
# Train a linear regression model on the training set.
# Evaluate the model using at least four performance metrics.

# Prepare the features and target variable
X = real_estate_data.drop(columns=['No', 'Y house price of unit area'])
y = real_estate_data['Y house price of unit area']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

# Evaluate the model using various performance metrics
metrics = {
    'Mean Squared Error': mean_squared_error,
    'R^2 Score': r2_score,
    'Mean Absolute Error': mean_absolute_error,
    'Explained Variance Score': explained_variance_score
}

train_performance = {metric: func(y_train, y_pred_train) for metric, func in metrics.items()}
test_performance = {metric: func(y_test, y_pred_test) for metric, func in metrics.items()}

(train_performance, test_performance)



({'Mean Squared Error': 83.11097569290108,
  'R^2 Score': 0.5581330550666339,
  'Mean Absolute Error': 6.3397048524170945,
  'Explained Variance Score': 0.5581330550666339},
 {'Mean Squared Error': 53.50225236118001,
  'R^2 Score': 0.6810781244679236,
  'Mean Absolute Error': 5.305232952585015,
  'Explained Variance Score': 0.6811692967585974})

In [5]:
# PCA and Linear Regression:

# Apply PCA to the dataset and select the first three principal components.
# Split the dataset into training and testing sets using the same method.
# Train a linear regression model on the training set with the principal components.
# Compare the performance of this model with the original model

# Apply PCA and select the first three principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train a linear regression model on the PCA-transformed data
linear_model_pca = LinearRegression()
linear_model_pca.fit(X_train_pca, y_train_pca)

# Make predictions
y_pred_train_pca = linear_model_pca.predict(X_train_pca)
y_pred_test_pca = linear_model_pca.predict(X_test_pca)

# Evaluate the model using the same performance metrics
train_performance_pca = {metric: func(y_train_pca, y_pred_train_pca) for metric, func in metrics.items()}
test_performance_pca = {metric: func(y_test_pca, y_pred_test_pca) for metric, func in metrics.items()}

(train_performance_pca, test_performance_pca)


({'Mean Squared Error': 91.28310770735649,
  'R^2 Score': 0.5146851833900654,
  'Mean Absolute Error': 6.7208985427717485,
  'Explained Variance Score': 0.5146851833900654},
 {'Mean Squared Error': 58.87842544111352,
  'R^2 Score': 0.6490312642672955,
  'Mean Absolute Error': 5.63185148927695,
  'Explained Variance Score': 0.6506599140893383})

In [7]:
# Iris Dataset and PCA with Logistic Regression:

# Load the Iris dataset from sklearn.
# Apply PCA and select the first three principal components.
# Split the dataset into training and testing sets.
# Train a logistic regression model on the training set.
# Evaluate the model using at least four performance metrics

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the Iris dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Apply PCA and select the first three principal components
pca_iris = PCA(n_components=3)
X_iris_pca = pca_iris.fit_transform(X_iris)

# Split the dataset into training and testing sets
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X_iris_pca, y_iris, test_size=0.2, random_state=42)

# Train a logistic regression model
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train_iris, y_train_iris)

# Make predictions
y_pred_train_iris = logistic_model.predict(X_train_iris)
y_pred_test_iris = logistic_model.predict(X_test_iris)

# Evaluate the model using various performance metrics
iris_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score
}

train_performance_iris = {
    metric: func(y_train_iris, y_pred_train_iris, average='macro') if metric != 'Accuracy' else func(y_train_iris, y_pred_train_iris)
    for metric, func in iris_metrics.items()
}
test_performance_iris = {
    metric: func(y_test_iris, y_pred_test_iris, average='macro') if metric != 'Accuracy' else func(y_test_iris, y_pred_test_iris)
    for metric, func in iris_metrics.items()
}

(train_performance_iris, test_performance_iris)



({'Accuracy': 0.975,
  'Precision': 0.9761904761904763,
  'Recall': 0.975609756097561,
  'F1 Score': 0.9749960931395533},
 {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0})

In [9]:
# Regularization and Logistic Regression:

# Apply L1 or L2 regularization to the logistic regression model.
# Evaluate the new model on the same training and testing data.
# Compare the performance with the previous logistic regression model

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the Iris dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Apply PCA and select the first three principal components
pca_iris = PCA(n_components=3)
X_iris_pca = pca_iris.fit_transform(X_iris)

# Split the dataset into training and testing sets
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X_iris_pca, y_iris, test_size=0.2, random_state=42)

# Train a logistic regression model
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train_iris, y_train_iris)

# Make predictions
y_pred_train_iris = logistic_model.predict(X_train_iris)
y_pred_test_iris = logistic_model.predict(X_test_iris)

# Evaluate the model using various performance metrics
iris_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score
}

train_performance_iris = {
    metric: func(y_train_iris, y_pred_train_iris, average='macro') if metric != 'Accuracy' else func(y_train_iris, y_pred_train_iris)
    for metric, func in iris_metrics.items()
}
test_performance_iris = {
    metric: func(y_test_iris, y_pred_test_iris, average='macro') if metric != 'Accuracy' else func(y_test_iris, y_pred_test_iris)
    for metric, func in iris_metrics.items()
}

# Output the performance
print("Train Performance:", train_performance_iris)
print("Test Performance:", test_performance_iris)

# Apply L2 regularization to the logistic regression model
logistic_model_l2 = LogisticRegression(penalty='l2', C=1.0, max_iter=200)
logistic_model_l2.fit(X_train_iris, y_train_iris)

# Make predictions with the regularized model
y_pred_train_iris_l2 = logistic_model_l2.predict(X_train_iris)
y_pred_test_iris_l2 = logistic_model_l2.predict(X_test_iris)

# Evaluate the regularized model using the same performance metrics
train_performance_iris_l2 = {
    metric: func(y_train_iris, y_pred_train_iris_l2, average='macro') if metric != 'Accuracy' else func(y_train_iris, y_pred_train_iris_l2)
    for metric, func in iris_metrics.items()
}
test_performance_iris_l2 = {
    metric: func(y_test_iris, y_pred_test_iris_l2, average='macro') if metric != 'Accuracy' else func(y_test_iris, y_pred_test_iris_l2)
    for metric, func in iris_metrics.items()
}

# Output the performance of the regularized model
print("Train Performance with L2 Regularization:", train_performance_iris_l2)
print("Test Performance with L2 Regularization:", test_performance_iris_l2)


Train Performance: {'Accuracy': 0.975, 'Precision': 0.9761904761904763, 'Recall': 0.975609756097561, 'F1 Score': 0.9749960931395533}
Test Performance: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
Train Performance with L2 Regularization: {'Accuracy': 0.975, 'Precision': 0.9761904761904763, 'Recall': 0.975609756097561, 'F1 Score': 0.9749960931395533}
Test Performance with L2 Regularization: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}
