In [None]:
import os
import pandas as pd
import configparser as cp
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr, spearmanr

In [None]:
# Location of the settings file, resolved against the current working directory.
CONFIG = os.path.abspath('../config/config.ini')

config = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list therefore means the settings were
# not loaded, which would otherwise only surface as an opaque KeyError below.
if not config.read(CONFIG):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {CONFIG}")

# Path of the house-sales CSV, taken from the [KC_HOUSE_DATA] section.
HOUSE_DATA = config['KC_HOUSE_DATA']['HouseData']

In [None]:
# Read the house-sales table from the CSV path taken from the configuration.
df = pd.read_csv(HOUSE_DATA)

# Report the number of rows that were loaded.
print(df.shape[0])

# Preview the first rows (rendered as the cell output).
df.head()

# House Price Correlation Report

## Intro
	The report investigates the correlation between house prices and various house characteristics in the dataset.
	The data will be explored both with and without the aid of specialized data analysis tools, identifying the relevant functions in the Pandas and Seaborn libraries.

## Price calculation
	Price correlation has been calculated mainly with the Pearson statistical correlation, using both library functions and custom functions.
	The use of different types of charts (scatterplots, lineplots, heatmaps, histplots)
	show different movements of the data that can justify the density and correlation between data in terms of time and quantity

## Unraveling Correlations
    In statistics correlation usually refers to the degree to which a pair of variables are linearly related.
    
    Correlation is a statistical measure that illustrates the extent to which two variables change together. A positive correlation indicates that as one variable increases, the other also tends to increase, and vice versa. Conversely, a negative correlation implies that as one variable increases, the other tends to decrease.


In [None]:
# Pearson correlation of every numeric feature with 'price', strongest first.
correlations = df.corr(numeric_only=True)['price'].sort_values(ascending=False)

# Remove the target by label instead of slicing from position 1: a positional
# slice assumes 'price' always sorts into first place, which is not guaranteed
# when another column ties with it at a correlation of 1.0.
top_correlations = correlations.drop('price').head(10)

# The ten strongest features plus the target itself.
selected_features = list(top_correlations.index) + ['price']

# Pairwise correlations among the selected features.
correlation_matrix = df[selected_features].corr()
print(correlation_matrix)

# Heatmap on a fixed [-1, 1] colour scale so that colours are comparable
# across runs regardless of the observed minimum and maximum.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=.5, fmt=".2f", vmin=-1, vmax=1)
plt.title("Heatmap of Correlations among Top Features with Price", fontsize=16)
plt.show()

In [None]:
# Summary statistics of the target column, as plain text.
print(df['price'].describe())

# Summary statistics of every numeric column, one row per column, with the
# 'mean' column drawn as bars and 'std' / '50%' shaded by magnitude.
(
    df.describe()
    .transpose()
    .style
    .bar(subset=['mean'])
    .background_gradient(subset=['std'], cmap='Blues')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

# sns.histplot(df['price'], kde=True, bins=20)
# plt.xlabel('House Price ($) per 10^6')
# plt.ylabel('Number of Houses')
# plt.title('Distribution of House Prices')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

# sns.histplot(df['sqft_living'], kde=True, bins=20)
# plt.xlabel('Square feet living')
# plt.ylabel('Density')
# plt.title('Sqft Living Distributions')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

In [None]:
# Two-column frame holding the target and the living area.
df_pairplot = df.loc[:, ['price', 'sqft_living']]
# sns.pairplot(df_pairplot, diag_kind='auto', kind='scatter')

# Wider feature subset; rows with a missing value in any selected column are removed.
dataset = df.loc[
    :,
    ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_above', 'yr_renovated'],
].dropna()
# sns.pairplot(dataset, hue="price", kind='scatter')

In [None]:
# Parse the sale date in place; entries that cannot be parsed become NaT
# because of errors='coerce'.
# Alternative for strictly ISO-formatted input: format='ISO8601'.
df['date'] = pd.to_datetime(df['date'], format='mixed', errors='coerce')

# Derive the calendar parts from the already-parsed column.
for part in ('month', 'year', 'quarter'):
    df[part] = getattr(df['date'].dt, part)

print(df[['date', 'month', 'year', 'quarter']])

# Total sale price per distinct sale date.
price_per_timeframe = df['price'].groupby(df['date']).sum()
print(price_per_timeframe)

In [None]:
# Smooth the per-date price totals with a rolling mean over 15 consecutive
# observations (the first 14 positions have no full window and are NaN).
price_per_timeframe_smoothed = price_per_timeframe.rolling(window=15).mean()
plt.plot(price_per_timeframe_smoothed)
plt.title('Time Series Data')
plt.xlabel('Timeframe')
plt.ylabel('Price per 10^7')
plt.show()

# Distribution of sale prices. The title used to read 'Time Series Data', a
# copy-paste leftover from the plot above.
# sns.displot(df['price'], kind="kde")  # smoothed alternative
sns.displot(df['price'])
plt.title('Distribution of House Prices')
plt.xlabel('Price per 10^6')
plt.ylabel('Quantity sold')
plt.show()

# Distribution of living area. displot draws a histogram of counts by default,
# so the y axis is labelled as a count rather than a density.
sns.displot(df['sqft_living'])
plt.title('House dimension quantity')
plt.xlabel('Square feet living')
plt.ylabel('Number of houses')
plt.show()

In [None]:
print(df[['price', 'sqft_living']])
price = df['price']
sqft_living = df['sqft_living']
grade = df['grade']

# Pearson (linear) and Spearman (rank) correlation of price against each
# feature. Every line names the pair it refers to; previously both pairs were
# printed under identical labels and could not be told apart in the output.
for label, other in (('sqft_living', sqft_living), ('grade', grade)):
    corr_P, _ = pearsonr(price, other)
    corr_S, _ = spearmanr(price, other)
    print('price vs %s - Pearsons correlation: %.3f' % (label, corr_P))
    print('price vs %s - Spearmanr correlation: %.3f' % (label, corr_S))

# Interpreting Scatterplots: 

## Positive association
	This occurs when the points in the scatterplot tend to move diagonally upward from left to right
	As the value of one increases, the value of the other variable also tends to increase


## Negative association
	This occurs when the points in the scatterplot tend to move diagonally downward from left to right
	As the value of one variable increases, the value of the other variable tends to decrease

## Linear vs Non-linear
	Linear is a trend where the points tend to follow a straight line (positive/negative association)
	Non-linear ->
		curved trend : more complex relationship where rate of change of one variable is not constant as the other variable changes
		clusters : points might form distinct clusters in different areas of the plot, indicating potential subgroups within data

## Strength of association
	Strong association : points form a tight cluster around a well-defined trendline
	Weak association : points are more scattered and do not follow a clear trendline 

In [None]:
price = df['price']
sqft_living = df['sqft_living']


def _scatter_with_trendline(x, y, color):
    """
    Draw a scatter plot of y against x with a least-squares trendline.

    Args:
        x, y: pandas Series of equal length; their names label the axes.
        color: Matplotlib colour used for the scatter points.

    Returns:
        tuple: (correlation, slope, intercept), where correlation is the
        Pearson coefficient from Series.corr and slope/intercept come from a
        degree-1 np.polyfit of y on x.
    """
    correlation = x.corr(y)
    slope, intercept = np.polyfit(x, y, 1)  # Linear regression

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(x, y, alpha=0.5, s=50, c=color)
    ax.plot(x, slope * x + intercept, color='red')  # Trendline
    ax.set_xlabel(x.name)
    ax.set_ylabel(y.name)
    ax.set_title('Scatter Plot with Trendline (Correlation: {:.3f})'.format(correlation))
    ax.grid(True, linestyle='--', alpha=0.6)
    plt.show()
    return correlation, slope, intercept


# Scatter Plot with Trendline [price - square feet living]
correlation, m, b = _scatter_with_trendline(price, sqft_living, 'royalblue')

# Scatter Plot with Trendline [price - zipcode]
correlation_z, m, b = _scatter_with_trendline(price, df['zipcode'], 'purple')


# Joint plot: scatter of the two variables plus their marginal histograms.
sns.set_theme(style="darkgrid")
df_sns = pd.DataFrame({'x': df['price'], 'y': df['sqft_living']})

# jointplot builds its own figure, so no plt.figure() call is needed (the
# previous one only produced an empty extra figure).
joint_grid = sns.jointplot(
    x="sqft_living",
    y="price",
    color="orange",
    data=df
)

# Label and grid the joint axes explicitly. The labels follow the plotted
# data (x is living area, y is price); they used to be swapped and were
# applied through pyplot's "current axes" rather than the joint axes.
joint_grid.set_axis_labels('Square Feet Living', 'Price')
joint_grid.ax_joint.grid(True, linestyle='--', alpha=0.6)
plt.show()

# Pearson Correlation Coefficient

## Measures the strength and direction of the linear relationship between two variables
## Denoted by symbol r and ranges from -1 to +1

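    r = Σ(x - x̄)(y - ȳ) / (Σ(x - x̄)² * Σ(y - ȳ)²)^(1/2)

    r is the correlation coefficient
    n is the number of data pairs (each sum runs over all n pairs)
    Σ (sigma) represents the sum
    x and y are the variables, x̄ and ȳ are their means
    (x - x̄)(y - ȳ) is the product of each centered x and y value
    (x - x̄)² and (y - ȳ)² are each centered variable squared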

## Steps to calculate the correlation coefficient manually
    - Find the mean of each variable (x and y)
    - For each data pair, subtract the mean of x from x and subtract the mean of y from y. (this is called centering data)
    - Multiply the centered values of x and y together for each data pair
    - Square each of the centered x values and each of the centered y values
    - Sum the products of xy (3rd step), the squared centered x values (4th step), and the squared centered y values (4th step)
    - Use the sum of the centered xy products (5th step) as the numerator; because the data is already centered it is not multiplied by n
    - Multiply the sum of the squared centered x values (5th step) by the sum of the squared centered y values (5th step)
    - Take the square root of the product in 7th step
    - Divide the result from 6th step by the result from 8th step


<!-- x = df['price']. -->

In [None]:
def pearson_correlation(x, y):
    """
    Calculate the Pearson correlation coefficient between two samples.

    Args:
        x, y: Equal-length iterables of numerical values (lists, tuples,
            pandas Series, numpy arrays, iterators, ...).

    Returns:
        The Pearson correlation coefficient between x and y, or 0 when either
        sample has zero variance (the coefficient is undefined in that case).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Materialise the inputs once so that one-shot iterators are supported and
    # the lengths can be compared before any arithmetic happens.
    xs = list(x)
    ys = list(y)

    # Pairing with zip() would silently truncate to the shorter input while the
    # means were still taken over the full inputs, giving a wrong coefficient.
    if len(xs) != len(ys):
        raise ValueError(
            "x and y must have the same length (got %d and %d)" % (len(xs), len(ys))
        )
    if not xs:
        raise ValueError("x and y must not be empty")

    # Calculate means
    mean_x = sum(xs) / len(xs)
    mean_y = sum(ys) / len(ys)

    # Centering the data
    centered_x = [value - mean_x for value in xs]
    centered_y = [value - mean_y for value in ys]

    # Numerator: sum of products of the centered data
    numerator = sum(a * b for a, b in zip(centered_x, centered_y))

    # Denominator: square root of the product of the sums of squares
    squared_sum_x = sum(a ** 2 for a in centered_x)
    squared_sum_y = sum(b ** 2 for b in centered_y)
    denominator = (squared_sum_x * squared_sum_y) ** 0.5

    # A constant sample has no variance, so the ratio would be 0/0.
    if denominator == 0:
        return 0

    return numerator / denominator


custom = pearson_correlation(price, sqft_living)
print("Custom pearson correlation coefficient:", custom)

# Correlate price with every numeric column, in alphabetical order.
# The columns are used with their own dtype: casting them to int truncated
# fractional features (e.g. latitude/longitude collapse to a constant, which
# reported a correlation of 0) and raised on non-numeric or missing values.
numeric_columns = df.select_dtypes(include='number').columns.sort_values()

correlation_list = [
    round(pearson_correlation(price, df[col]), 3) for col in numeric_columns
]

# Print each coefficient next to the column it belongs to.
for col, value in zip(numeric_columns, correlation_list):
    print(f"{col}: {float(value)}")


## Linear Regression is a statistical technique used for understanding the relationship between two variables
	It's essentially the way to predict the value of one variable (Dependent variable) based on the value of another variable (independent variable)
	
	Prediction of house prices:
	Size of the house	=	Independent variable
	House price		=	Dependent variable

	Linear regression helps find a straight line that best fits the data points of house size and price

	Y = x * w + b

	Y = Represents the dependent variable
	x = Represents the independent variable
	w = Represents the slope of the line
	b = Represents the y-intercept of the line (the point where the line crosses the Y-axis)


In [None]:
def linear_regression(x, y):
    """
    Fit a straight line y = slope * x + intercept by ordinary least squares.

    Args:
        x: Independent variable values (numpy array, pandas Series, list, ...).
        y: Dependent variable values, same shape as x.

    Returns:
        tuple: (slope, intercept) of the best-fit line. Both are NaN when the
        slope is undefined, i.e. when the input is empty or every x value is
        identical. Inputs are paired by position, and NaN values in the data
        propagate to the result.

    Raises:
        ValueError: If x and y do not have the same shape.
    """
    # Work on float arrays so that plain Python sequences are accepted too.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)

    # Reject mismatched inputs up front; broadcasting would otherwise pair a
    # length-1 input with every element of the other one without complaint.
    if x_values.shape != y_values.shape:
        raise ValueError(
            "x and y must have the same shape (got %s and %s)"
            % (x_values.shape, y_values.shape)
        )

    # No data: nothing to fit.
    if x_values.size == 0:
        return np.nan, np.nan

    # Calculate the mean of x and y
    x_mean = np.mean(x_values)
    y_mean = np.mean(y_values)

    # Calculate the numerator and denominator for the slope
    x_centered = x_values - x_mean
    numerator = np.sum(x_centered * (y_values - y_mean))
    denominator = np.sum(x_centered ** 2)

    # Zero variance in x: return NaN explicitly instead of relying on a 0/0
    # division and its RuntimeWarning.
    if denominator == 0:
        return np.nan, np.nan

    # Calculate the slope and intercept
    slope = numerator / denominator
    intercept = y_mean - slope * x_mean

    return slope, intercept

# Sanity check of the helper on a five-point toy sample.
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])

slope, intercept = linear_regression(x, y)
print(f"Slope: {slope:.2f}", f"Intercept: {intercept:.2f}", sep="\n")

# Same fit on the real data, with price as x and living area as y.
# NOTE(review): the accompanying text describes house size as the independent
# variable and price as the dependent one, i.e. the opposite direction -
# confirm which direction is intended.
slope, intercept = linear_regression(price, sqft_living)
print(f"Price - Sqft_living\nSlope: {slope:.2f}", f"Intercept: {intercept:.2f}", sep="\n")



In [None]:
# Scatter plot of living area against price, annotated and fitted with the
# hand-written Pearson correlation and linear regression helpers.
correlation = pearson_correlation(price, sqft_living)
w, b = linear_regression(price, sqft_living)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['price'], df['sqft_living'], alpha=0.5, s=50, c='royalblue')
ax.plot(price, w * price + b, color='red')  # fitted line w * x + b
ax.set_xlabel('price')
ax.set_ylabel('sqft_living')
ax.set_title('Scatter Plot with Trendline (Correlation: {})'.format(correlation))
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()