In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point
import mapclassify
import scipy.stats
import folium

In [None]:
# Read the two raw CSV inputs from the working directory: the property
# records and the location lookup table.
# NOTE: the name `property` shadows the Python builtin of the same name; it is
# kept because the following cells reference it.
property, location = (
    pd.read_csv(csv_name)
    for csv_name in ('London_property.csv', 'London_loc.csv')
)

# Display the property table as the cell output.
property

In [None]:
# Clean the property table in one chain: drop rows with any missing value,
# then discard rows carrying the 'NO DATA!' construction-age placeholder,
# rows priced at 50000 or less, and rows whose tfarea is 1 or less.
property = (
    property
    .dropna()
    .loc[lambda rows: rows.CONSTRUCTION_AGE_BAND != 'NO DATA!']
    .loc[lambda rows: rows.price > 50000]
    .loc[lambda rows: rows.tfarea > 1]
)

# Display the filtered table as the cell output.
property

In [None]:
# Pull the price column out of the property table for inspection.
prices = property['price']

prices

In [None]:
# Give the location table's join key the same spelling as the property table,
# then join the two tables on postcode (pandas' default inner join, so rows
# without a match on both sides are dropped).
location = location.rename({"Postcode": "postcode"}, axis="columns")
comb = property.merge(location, on='postcode')

In [None]:
# Build point geometries from the Eastings/Northings columns (vectorized,
# instead of constructing one shapely Point per row in Python) and wrap the
# merged table in a GeoDataFrame.
# The CRS is passed as an authority string: the {'init': 'epsg:27700'} dict
# form is deprecated by pyproj/GeoPandas and emits a FutureWarning.
geometry = gpd.points_from_xy(comb.Eastings, comb.Northings)
gdf = gpd.GeoDataFrame(comb, geometry=geometry, crs='EPSG:27700')

In [None]:
# Reference location given as raw x/y coordinates; presumably in the same
# coordinate system as gdf and marking the City of London ("Square Mile")
# -- TODO confirm.
point = Point(532704, 181111)

# Planar distance from every row's geometry to the reference point, in the
# units of the GeoDataFrame's coordinate system (a length, not an area in
# square miles, despite what the column name may suggest).
gdf['distance_sqmile'] = gdf['geometry'].distance(point)

print(gdf.columns)

In [None]:
# Quantile-classified maps of distance and price. They are drawn on explicit
# axes with a title and a class legend each, so the colour bins can actually
# be read and the figure stands on its own.
fig, (ax_distance, ax_price) = plt.subplots(1, 2, figsize=(16, 8))

gdf.plot(column='distance_sqmile', cmap='Reds', scheme='quantiles',
         legend=True, ax=ax_distance)
ax_distance.set_title('distance_sqmile (quantiles)')

gdf.plot(column='price', cmap='Reds', scheme='quantiles',
         legend=True, ax=ax_price)
ax_price.set_title('price (quantiles)')

plt.show()

In [None]:
# Inspect the computed distance column.
gdf['distance_sqmile']

In [None]:
# Inspect the price column.
gdf['price']

In [None]:
from scipy.stats import pearsonr

# Pearson correlation coefficient between price and distance, together with
# the p-value reported by scipy for that coefficient.
corr, p_value = pearsonr(gdf.price, gdf.distance_sqmile)
print(corr, p_value)

In [None]:
# Correlate only the numeric columns. gdf also carries non-numeric columns
# (at least the geometry column), and DataFrame.corr() raises on those in
# pandas 2.x, where numeric_only defaults to False.
correlation_matrix = gdf.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)

# plt.show has to be called; the bare attribute reference did nothing.
plt.show()

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Baseline model: ordinary least squares predicting price from the single
# distance feature.
X = gdf[['distance_sqmile']]
y = gdf['price']

# Hold out 20% of the rows for evaluation. random_state is fixed so the split
# -- and therefore the metrics printed below -- is reproducible after a
# kernel restart instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set mean squared error and R^2.
print(mean_squared_error(y_test, y_pred))
print ('R2: ', r2_score(y_test,y_pred))


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest (100 trees) predicting price from the single distance feature.
X = gdf[['distance_sqmile']]
y = gdf['price']

# Hold out 20% of the rows for evaluation. random_state is fixed on both the
# split and the forest (which bootstraps rows at random) so the printed
# metrics are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set mean squared error and R^2.
print(mean_squared_error(y_test, y_pred))
print ('R2: ', r2_score(y_test,y_pred))

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Gradient-boosted trees (XGBoost defaults) predicting price from the single
# distance feature.
X = gdf[['distance_sqmile']]
y = gdf['price']

# Hold out 20% of the rows for evaluation. random_state is fixed on the split
# and on the model so the printed metrics are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = xgb.XGBRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set mean squared error and R^2.
print(mean_squared_error(y_test, y_pred))
print ('R2: ', r2_score(y_test,y_pred))

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# L1-regularised linear regression (alpha=0.1) predicting price from the
# single distance feature.
X = gdf[['distance_sqmile']]
y = gdf['price']

# Hold out 20% of the rows for evaluation. random_state is fixed so the split
# -- and therefore the metrics printed below -- is reproducible. These
# X_train / y_train are also what any later cell reusing those names sees.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = linear_model.Lasso(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set mean squared error and R^2.
print(mean_squared_error(y_test, y_pred))
print ('R2: ', r2_score(y_test,y_pred))


In [None]:
# Refit the Lasso on every row (no train/test split) and produce in-sample
# price predictions, one per row of gdf.
X = gdf[['distance_sqmile']]
model = linear_model.Lasso(alpha=0.1).fit(X, gdf['price'])
y_pred = model.predict(X)

print(y_pred)

In [None]:
# Attach the contents of y_pred as a new predicted_price column.
# NOTE(review): y_pred is reassigned by several cells, so this uses whichever
# one ran last -- presumably the full-data Lasso predictions; confirm, since a
# test-split y_pred would not have one value per row of gdf.
gdf = gdf.assign(predicted_price=y_pred)

print(gdf.columns)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 3 x 3 x 3 = 27 XGBoost settings, each scored by
# 5-fold cross-validated R^2 on whichever X_train / y_train are currently
# defined in the notebook.
param_grid = dict(
    n_estimators=[100, 250, 500],
    max_depth=[2, 4, 6],
    learning_rate=[0.01, 0.1, 1],
)
model = xgb.XGBRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

model = xgb.XGBRegressor()

# Candidate values the search samples from.
param_dist = {
    'n_estimators': np.arange(50, 1000, 50),
    'max_depth': np.arange(1, 20, 2),
    'learning_rate': np.logspace(-3, 0, 4)  # 0.001, 0.01, 0.1, 1
}

# Create the RandomizedSearchCV object: 100 sampled combinations, each scored
# by 5-fold cross-validated negative MSE (closer to zero is better).
# random_state fixes which combinations are drawn so the search is repeatable.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score. The score is read
# from random_search itself; the earlier code printed grid_search.best_score_,
# which belongs to a different search with a different metric.
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

In [None]:
# Inspect the geometry column of the GeoDataFrame.
gdf['geometry']

In [None]:

# folium/Leaflet expect WGS84 latitude/longitude, while gdf was created with
# EPSG:27700 coordinates, so reproject before using the geometries. Without
# this, both the map centre and the GeoJson layer land at invalid coordinates.
gdf_wgs84 = gdf.to_crs(epsg=4326)

# Centre the map on the mean latitude (y) and longitude (x) of the points.
m = folium.Map(
    location=[gdf_wgs84.geometry.y.mean(), gdf_wgs84.geometry.x.mean()],
    zoom_start=10,
    tiles='OpenStreetMap',
)

# Plot the reprojected GeoDataFrame on the map
folium.GeoJson(gdf_wgs84, name='geojson').add_to(m)

# Show the map
m