<a href="https://colab.research.google.com/github/fatma-othman/EC3310-Project/blob/main/EC3310_Code_Final_Project_Draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **EC3310 Final Project: The Relationship Between Energy Efficiency and House Prices**

## **Data Collection and Pre-Processing**

### **Loading Libraries**

In [1]:
# Loading Libraries

import base64
import json
import os
import random
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import sklearn
import statsmodels.api as sm
from bs4 import BeautifulSoup
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### **House Price Data**

In [None]:
import requests
# Download the complete Price Paid file and save it next to the notebook.
# The response is streamed to disk in binary chunks rather than read through
# `response.text`: that avoids holding the whole file in memory, avoids
# re-decoding it with a guessed encoding, and avoids platform newline
# translation on write. The timeout stops a stalled connection from hanging
# the cell indefinitely.
url = 'http://prod1.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.txt'
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        # Save the content of the text file locally, byte-for-byte
        with open('pp-complete.txt', 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")

### **Energy Efficiency Data**

In [None]:
#importing energy ratings data

import requests

# EPC API credentials are read from the environment so that no secret is
# stored in the notebook. Set EPC_API_EMAIL and EPC_API_KEY before running.
# NOTE(review): a key was previously committed in this cell; it should be
# treated as compromised and rotated.
api_key = os.environ.get('EPC_API_KEY')
epc_api_email = os.environ.get('EPC_API_EMAIL')
if not api_key or not epc_api_email:
    raise RuntimeError(
        "EPC API credentials not found: set the EPC_API_EMAIL and "
        "EPC_API_KEY environment variables before running this cell."
    )

# HTTP Basic authentication token: base64 of "<email>:<api key>".
basic_auth_token = base64.b64encode(
    f"{epc_api_email}:{api_key}".encode("utf-8")
).decode("ascii")

headers = {
    "Accept": "application/json",
    "Authorization": f"Basic {basic_auth_token}"
}

import requests

# One bounding box per English region, keyed by region name. Each value is
# passed unchanged as the `bounding_box` query parameter of the EPC search.
# NOTE(review): the values look like "min_lon,min_lat,max_lon,max_lat" and
# several boxes overlap each other - confirm against the API documentation.
regions_bounding_boxes = dict([
    ("Greater London", "-0.510375,51.286760,0.334015,51.691874"),
    ("South East", "-1.719970,50.539137,1.763634,51.800636"),
    ("South West", "-5.776567,49.824672,-1.166504,52.993573"),
    ("East Midlands", "-1.638000,52.099000,0.944000,53.625000"),
    ("West Midlands", "-3.235000,51.825000,-1.238000,53.235000"),
    ("East of England", "0.052978,51.448341,1.768343,52.992273"),
    ("North West", "-3.764000,53.057000,-1.833000,55.218000"),
    ("Yorkshire and The Humber", "-2.985000,53.325000,-0.783000,54.577000"),
    ("North East", "-2.689000,54.410000,-0.890000,55.810000"),
])

# Endpoint queried by fetch_energy_data.
base_url = "https://epc.opendatacommunities.org/api/v1/domestic/search"

def fetch_energy_data(region, bounding_box, num_pages=10, page_size=100, timeout=30):
    """Fetch EPC search results for one bounding box, page by page.

    Parameters
    ----------
    region : str
        Label of the region being fetched. It is not sent to the API; it is
        accepted so callers can pass the key of `regions_bounding_boxes`.
    bounding_box : str
        Value sent as the ``bounding_box`` query parameter.
    num_pages : int, default 10
        Maximum number of pages to request.
    page_size : int, default 100
        Number of results requested per page (``size`` query parameter).
    timeout : float, default 30
        Seconds to wait for each HTTP request before `requests` raises.

    Returns
    -------
    list
        The ``rows`` entries of every page, concatenated in request order.
        An empty list is returned if any request answers with a non-200
        status (the status and body are printed).
    """
    epcs = []
    for page in range(num_pages):
        params = {
            "size": page_size,          # Number of results per page
            "from": page * page_size,   # Starting index for the results
            "bounding_box": bounding_box
        }

        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print("Error:", response.status_code, response.text)
            return []

        # An empty body cannot be parsed as JSON; treat it as "no more results".
        if not response.content.strip():
            break

        rows = response.json().get("rows", [])
        epcs.extend(rows)

        # A short page means the result set is exhausted, so further requests
        # would only return nothing.
        if len(rows) < page_size:
            break
    return epcs


# Preview the EPC data for each region. Only the first few certificates are
# printed: dumping every fetched record would bury the notebook in output.
preview_rows = 5

for region, bbox in regions_bounding_boxes.items():
    epcs = fetch_energy_data(region, bbox)
    print(f"Energy Ratings Data for {region}:")
    print(f"Fetched {len(epcs)} certificates; showing the first {min(preview_rows, len(epcs))}.")
    for epc in epcs[:preview_rows]:
        print("Address:", epc.get("address", "N/A"))
        print("Postcode:", epc.get("postcode", "N/A"))
        print("Current energy rating:", epc.get("current-energy-rating", "N/A"))
        print("Current energy efficiency:", epc.get("current-energy-efficiency", "N/A"))
        print("\n")

### **Cleaning Data**

#### **Cleaning House Price Data**

In [None]:
# Read the downloaded Price Paid file. header=None tells pandas the first
# line is data, so the columns are numbered positionally until renamed.
ppd_data = pd.read_csv('pp-complete.txt', sep=',', header=None)

In [None]:
# Give the 16 positional columns of ppd_data descriptive names.
# The order of this list must match the column order of the file.
ppd_column_names = [
    'transaction_id',
    'price',
    'transaction_date',
    'postcode',
    'property_type',
    'newly_built',
    'tenure',
    'PAON',
    'SAON',
    'street',
    'locality',
    'town',
    'district',
    'county',
    'ppd_category',
    'record_status',
]
ppd_data.columns = ppd_column_names

In [None]:
# Show the column labels of ppd_data under a heading line.
print("Columns in ppd_data:", ppd_data.columns, sep="\n")

In [None]:
# Preprocessing ppd_data
# Keep only rows with full market value. In the Price Paid schema that is
# ppd_category 'A' (standard entry); record_status only marks additions,
# changes and deletions, so filtering on it alone does not select
# full-market-value sales. NOTE(review): confirm against the HM Land Registry
# column definitions.
full_market_value = (ppd_data['record_status'] == 'A') & (ppd_data['ppd_category'] == 'A')
has_postcode = ppd_data['postcode'].notna()  # Remove rows with missing postcodes

# .copy() makes the filtered result an independent frame, so adding the
# address column below does not trigger SettingWithCopyWarning.
ppd_data = ppd_data[full_market_value & has_postcode].copy()

# Build "POSTCODE PAON SAON STREET". Missing parts become empty strings, so a
# missing street no longer turns the whole address into NaN and a missing PAON
# no longer appears as the text "nan".
address_parts = ppd_data[['postcode', 'PAON', 'SAON', 'street']].fillna('').astype(str)
ppd_data['address'] = (
    address_parts['postcode'] + ' ' + address_parts['PAON'] + ' '
    + address_parts['SAON'] + ' ' + address_parts['street']
)
ppd_data['address'] = (
    ppd_data['address']
    .str.replace(r'\s+', ' ', regex=True)  # collapse gaps left by empty parts
    .str.strip()
    .str.upper()  # Capitalize address strings
)

#### **Cleaning EPC Data**

In [None]:
# Initialize an empty list to store the EPC data
epc_list = []

for region, bbox in regions_bounding_boxes.items():
    epcs = fetch_energy_data(region, bbox)
    epc_list.extend(epcs)  # Append the fetched data to the list

# Convert the list of EPC data into a pandas DataFrame
epc_data = pd.DataFrame(epc_list)

# The regional boxes overlap (for example the Greater London box lies inside
# the South East box), so the same certificate can be fetched more than once.
# Duplicates would multiply rows in any later merge, so drop them here.
# NOTE(review): 'lmk-key' is assumed to be the certificate identifier returned
# by the API; when that column is absent, rows are compared on all columns.
dedup_subset = ['lmk-key'] if 'lmk-key' in epc_data.columns else None
epc_data = epc_data.drop_duplicates(subset=dedup_subset).reset_index(drop=True)

In [None]:
# Show the column labels of epc_data under a heading line.
print("Columns in epc_data:", epc_data.columns, sep="\n")

### **Merging Data**

In [None]:
# Inner-join sales and EPC records that share a postcode; rows without a
# match on either side are dropped.
# NOTE(review): the join key is the postcode only, so every sale is paired
# with every certificate in the same postcode - presumably an address-level
# match is intended; confirm before interpreting results.
linked_data = ppd_data.merge(epc_data, how='inner', on='postcode')
print(linked_data)

## **Data Analysis**

### **Descriptive Statistics**

In [None]:
# Descriptive statistics
print(linked_data.describe())

# Correlations
# Restrict to numeric columns first: the merged table also holds text columns
# (postcode, address, ...), and recent pandas versions raise when corr() is
# asked to correlate non-numeric data.
print(linked_data.select_dtypes(include='number').corr())

### **Regression Analysis**

#### **Hedonic Pricing Model**

In [None]:
# Hedonic pricing model: linear regression of sale price on the energy-rating
# dummies plus any numeric columns of the merged sales/EPC table.

# Step 1: Convert categorical variables into numerical values or dummy variables
# Start from a fresh copy of the merge result so this cell runs on a clean
# kernel and can be re-run without failing on an already-dropped column.
merged_data_df = linked_data.copy()

# Assuming energy ratings are categorical, convert them to dummy variables.
# dtype=float keeps the dummies numeric so they survive the numeric-only
# feature selection below.
energy_rating_dummies = pd.get_dummies(
    merged_data_df['current-energy-rating'], prefix='energy-rating', dtype=float
)
merged_data_df = pd.concat(
    [merged_data_df.drop(columns='current-energy-rating'), energy_rating_dummies],
    axis=1,
)
# NOTE(review): every rating level gets a dummy, so the dummies sum to one and
# are collinear with the model intercept; consider drop_first=True before
# reading individual coefficients as effects.

# Step 2: Split the dataset into training and testing sets
# StandardScaler and LinearRegression need numeric input, so text columns
# (identifiers, addresses, ...) are left out of the feature matrix. Columns
# stored as text must be converted explicitly before they can enter the model.
X = merged_data_df.drop(columns='price').select_dtypes(include='number')
y = merged_data_df['price']

# Drop rows with missing values in the features or the target; the estimator
# does not accept NaN.
complete_rows = X.notna().all(axis=1) & y.notna()
X = X[complete_rows]
y = y[complete_rows]

if X.empty:
    raise ValueError("No numeric features or no complete rows available for the regression.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Standardize/normalize the numerical features, if necessary
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set information leaks into the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train a regression model on the training data
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# Step 5: Evaluate the model on the testing data
y_pred = regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean squared error: ", mse)
print("R-squared: ", r2)

# Step 6: Interpret the results
# Coefficients are per standard deviation of each feature, because the model
# was fitted on the standardized matrix.
coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': regressor.coef_})
print(coefficients)

#### **Regional Differences**