# Linear Regression Model

In [None]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the CSV at ../Resources/output/TX_full_data.csv into a DataFrame
file_path = Path("../Resources/output/TX_full_data.csv")
df_texas = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_texas.head(n=5)

In [None]:
# Replace every missing value (NaN) in the DataFrame with 0.
# NOTE(review): this zero-fills all columns alike — confirm 0 is a sensible
# stand-in for each of them before modelling.
df_texas = df_texas.fillna(0)

In [None]:
# Print a concise summary of df_texas (column names, non-null counts, dtypes)
df_texas.info()

In [None]:
# Keep only the columns listed below; every other column is dropped.
selected_columns = [
    "zipcode",
    "population",
    "Net_Income",
    "P_2015",
    "P_2016",
    "P_2017",
    "P_2018",
    "P_2019",
    "P_2020",
    "P_2021",
    "P_2022",
    "P_2023",
    "bedroom_number",
    "bathroom_number",
    "living_space",
    "land_space",
]
df_texas = df_texas[selected_columns]

# Preview the trimmed DataFrame
df_texas.head()

In [None]:
# Feature matrix: every column of df_texas except the target column P_2023
X = df_texas.drop(columns=["P_2023"])

# Show the first five feature rows
X.head()

In [None]:
# Target vector: the P_2023 column
y = df_texas["P_2023"]

# Show the first five target values
y.head()

In [None]:
# Instantiate scikit-learn's LinearRegression estimator with its default settings
model = LinearRegression()

In [None]:
# Fit the regression on the full feature matrix X and target y (no hold-out split)
model.fit(X, y)

In [None]:
# Predict on the same X the model was fitted on, so these are in-sample fitted values
predicted_y_values = model.predict(X)

In [None]:
# Build a new DataFrame with the original columns plus the model's predictions;
# df_texas itself is left untouched.
# NOTE(review): the new column is named "p_2024", but predicted_y_values come from
# a model whose target is P_2023 — confirm the intended meaning of this column.
df_texas_predicted = df_texas.assign(p_2024=predicted_y_values)

# Display sample data
df_texas_predicted.head()

In [None]:
# Evaluate the fitted model against the same X and y it was trained on.
# model.score returns the coefficient of determination (R^2) of model.predict(X)
# against y, so it reports the same quantity as r2 below.
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Error magnitudes: mean squared error and its square root
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the target itself, as a yardstick for the RMSE
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")