# Linear Regression Model

In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the Texas housing dataset; the path is relative to the notebook's working directory
file_path = Path("../Resources/output/TX_full_data.csv")
df_texas = pd.read_csv(file_path)
# Preview the first five rows
df_texas.head()

Unnamed: 0,zipcode,latitude,longitude,state,population,P_2015,P_2016,P_2017,P_2018,P_2019,P_2020,P_2021,P_2022,P_2023,Net_Income,bedroom_number,bathroom_number,living_space,land_space
0,75001,32.96,-96.84,TX,16287.0,273859.58,295281.26,319308.56,332937.39,333861.35,339818.07,378728.24,440760.63,454387.59,31081.0,2.0,3.0,1981.0,1873.0
1,75002,33.09,-96.61,TX,71102.0,239753.96,264265.55,284191.72,296218.09,296925.02,305948.22,360917.02,456706.81,464720.11,97936.0,4.0,3.0,3044.0,3827.0
2,75006,32.96,-96.9,TX,48104.0,166228.03,190571.63,211957.41,230517.64,237664.55,247785.26,278907.45,332128.56,341701.77,115491.0,3.0,2.0,1938.0,5075.0
3,75007,33.0,-96.9,TX,54448.0,202984.63,229672.38,252570.85,270069.18,275111.24,285478.08,325031.65,389614.5,400732.13,100570.0,3.0,2.0,2049.0,6753.0
4,75009,33.34,-96.75,TX,24737.0,280691.57,310914.56,336086.42,346691.19,347364.97,354443.47,428784.45,567323.67,557517.04,16893.0,4.0,4.0,3153.0,5374.0


In [3]:
# Summarize column names, non-null counts, and dtypes
df_texas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1164 entries, 0 to 1163
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   zipcode          1164 non-null   int64  
 1   latitude         1164 non-null   float64
 2   longitude        1164 non-null   float64
 3   state            1164 non-null   object 
 4   population       1164 non-null   float64
 5   P_2015           1164 non-null   float64
 6   P_2016           1164 non-null   float64
 7   P_2017           1164 non-null   float64
 8   P_2018           1164 non-null   float64
 9   P_2019           1164 non-null   float64
 10  P_2020           1164 non-null   float64
 11  P_2021           1164 non-null   float64
 12  P_2022           1164 non-null   float64
 13  P_2023           1164 non-null   float64
 14  Net_Income       1164 non-null   float64
 15  bedroom_number   1164 non-null   float64
 16  bathroom_number  1164 non-null   float64
 17  living_space  

In [4]:
# Keep the columns used for modelling: zip code, population, income, the yearly
# price columns, and the house attributes. Any column not listed here is dropped.
df_texas = df_texas[
    [
        "zipcode", "population", "Net_Income",
        "P_2015", "P_2016", "P_2017", "P_2018", "P_2019",
        "P_2020", "P_2021", "P_2022", "P_2023",
        "bedroom_number", "bathroom_number", "living_space", "land_space",
    ]
]

# Preview the reduced frame
df_texas.head()

Unnamed: 0,zipcode,population,Net_Income,P_2015,P_2016,P_2017,P_2018,P_2019,P_2020,P_2021,P_2022,P_2023,bedroom_number,bathroom_number,living_space,land_space
0,75001,16287.0,31081.0,273859.58,295281.26,319308.56,332937.39,333861.35,339818.07,378728.24,440760.63,454387.59,2.0,3.0,1981.0,1873.0
1,75002,71102.0,97936.0,239753.96,264265.55,284191.72,296218.09,296925.02,305948.22,360917.02,456706.81,464720.11,4.0,3.0,3044.0,3827.0
2,75006,48104.0,115491.0,166228.03,190571.63,211957.41,230517.64,237664.55,247785.26,278907.45,332128.56,341701.77,3.0,2.0,1938.0,5075.0
3,75007,54448.0,100570.0,202984.63,229672.38,252570.85,270069.18,275111.24,285478.08,325031.65,389614.5,400732.13,3.0,2.0,2049.0,6753.0
4,75009,24737.0,16893.0,280691.57,310914.56,336086.42,346691.19,347364.97,354443.47,428784.45,567323.67,557517.04,4.0,4.0,3153.0,5374.0


In [5]:
# Feature matrix: every remaining column except the target, P_2023.
# NOTE(review): this keeps `zipcode` as a plain numeric feature — confirm that is intended.
X = df_texas.drop(columns=["P_2023"])

# Preview the first five feature rows
X.head()

Unnamed: 0,zipcode,population,Net_Income,P_2015,P_2016,P_2017,P_2018,P_2019,P_2020,P_2021,P_2022,bedroom_number,bathroom_number,living_space,land_space
0,75001,16287.0,31081.0,273859.58,295281.26,319308.56,332937.39,333861.35,339818.07,378728.24,440760.63,2.0,3.0,1981.0,1873.0
1,75002,71102.0,97936.0,239753.96,264265.55,284191.72,296218.09,296925.02,305948.22,360917.02,456706.81,4.0,3.0,3044.0,3827.0
2,75006,48104.0,115491.0,166228.03,190571.63,211957.41,230517.64,237664.55,247785.26,278907.45,332128.56,3.0,2.0,1938.0,5075.0
3,75007,54448.0,100570.0,202984.63,229672.38,252570.85,270069.18,275111.24,285478.08,325031.65,389614.5,3.0,2.0,2049.0,6753.0
4,75009,24737.0,16893.0,280691.57,310914.56,336086.42,346691.19,347364.97,354443.47,428784.45,567323.67,4.0,4.0,3153.0,5374.0


In [6]:
# Target vector: the P_2023 column
y = df_texas["P_2023"]

# Preview the first five target values
y.head()

0    454387.59
1    464720.11
2    341701.77
3    400732.13
4    557517.04
Name: P_2023, dtype: float64

In [7]:
# Instantiate a linear regression estimator with its default settings
model = LinearRegression()

In [8]:
# Fit on the full feature matrix and target; no held-out split is made in the cells shown here
model.fit(X, y)

In [9]:
# Predict the target for the same rows passed to `fit` (in-sample predictions)
predicted_y_values = model.predict(X)

In [10]:
# Build a new frame (df_texas itself is left untouched) with the model's
# predictions appended as the last column.
# NOTE(review): the values are the model's in-sample predictions of P_2023, not a
# forecast for a later year — the column name `p_2024` may mislead; confirm intent.
df_texas_predicted = df_texas.assign(p_2024=predicted_y_values)

# Preview the first five rows, including the new prediction column
df_texas_predicted.head()

Unnamed: 0,zipcode,population,Net_Income,P_2015,P_2016,P_2017,P_2018,P_2019,P_2020,P_2021,P_2022,P_2023,bedroom_number,bathroom_number,living_space,land_space,p_2024
0,75001,16287.0,31081.0,273859.58,295281.26,319308.56,332937.39,333861.35,339818.07,378728.24,440760.63,454387.59,2.0,3.0,1981.0,1873.0,449404.47896
1,75002,71102.0,97936.0,239753.96,264265.55,284191.72,296218.09,296925.02,305948.22,360917.02,456706.81,464720.11,4.0,3.0,3044.0,3827.0,464225.372772
2,75006,48104.0,115491.0,166228.03,190571.63,211957.41,230517.64,237664.55,247785.26,278907.45,332128.56,341701.77,3.0,2.0,1938.0,5075.0,341165.725607
3,75007,54448.0,100570.0,202984.63,229672.38,252570.85,270069.18,275111.24,285478.08,325031.65,389614.5,400732.13,3.0,2.0,2049.0,6753.0,394075.130057
4,75009,24737.0,16893.0,280691.57,310914.56,336086.42,346691.19,347364.97,354443.47,428784.45,567323.67,557517.04,4.0,4.0,3153.0,5374.0,582176.81712


In [11]:
# Goodness-of-fit metrics, all computed on the same rows the model was trained on,
# so they describe in-sample fit rather than out-of-sample performance.
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
# `model.score` on a regressor reports R^2, so `score` and `r2` are the same quantity.
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)
# Spread of the target itself, for comparison against the RMSE.
std = np.std(y)

# Report every metric, one per line.
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.9965625842587348.
The r2 is 0.9965625842587348.
The mean squared error is 103702332.45026757.
The root mean squared error is 10183.434216916588.
The standard deviation is 173691.366343013.


In [19]:
# Label each fitted coefficient with its feature name.
coefficients = pd.Series(data=model.coef_, index=X.columns)
# Rank by magnitude, largest first; taking the absolute value discards the sign.
# NOTE(review): the features are not rescaled before fitting, so these raw
# magnitudes depend on each feature's units and are not directly comparable.
sorted_coefficients = abs(coefficients).sort_values(ascending=False)
print(sorted_coefficients)


bedroom_number     2495.621020
bathroom_number    2074.302356
P_2022                1.426468
P_2021                1.188220
zipcode               1.139120
P_2020                0.681378
population            0.279090
P_2018                0.274003
P_2016                0.226300
P_2015                0.219968
Net_Income            0.143073
land_space            0.107706
P_2017                0.092849
living_space          0.059914
P_2019                0.058546
dtype: float64
