In [None]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import datetime as dt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder

### Read realtor data

In [None]:
# Load the 2018 and 2019 realtor extracts, then preview the 2018 table.
realtor_2018_data, realtor_2019_data = (
    pd.read_csv(Path(csv_path))
    for csv_path in (
        "realtor_data/2018_realtor_data.csv",
        "realtor_data/2019_realtor_data.csv",
    )
)
display(realtor_2018_data)

In [None]:
# Collect the realtor CSV extracts to merge.
# The pattern is restricted to *.csv so stray files in the folder (checkpoints,
# OS metadata, ...) are never handed to pd.read_csv, and the list is sorted
# because glob returns matches in arbitrary order.
csv_files = sorted(glob.glob('realtor_data/*.csv'))
csv_files

### Merge realtor data

In [None]:
# Stack every realtor CSV listed in csv_files into one table indexed by zip code.
# A single pd.concat replaces the DataFrame.append loop: append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies all rows on each pass.
realtor_data = pd.concat(
    [pd.read_csv(file) for file in csv_files]
).set_index("zip_code", drop=True)

display(realtor_data)

### Merge IRS data

In [None]:
# Collect the IRS income-tax CSV files to merge.
# NOTE: this rebinds csv_files, which previously held the realtor paths, so the
# realtor merge cell must not be re-run after this one without re-running its glob.
# Sorted because glob returns matches in arbitrary order.
csv_files = sorted(glob.glob('IRS_Income_tax_data/*.csv'))
csv_files

In [None]:
# Stack every IRS CSV listed in csv_files into one table with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies all rows on each pass.
IRS_sliced_data = pd.concat(
    [pd.read_csv(file) for file in csv_files],
    ignore_index=True,
)

display(IRS_sliced_data)

In [None]:
# Set the Index to zip-code
IRS_sliced_data.rename(columns = {'zipcode':'zip_code'}, inplace=True)
IRS_sliced_data.set_index('zip_code', drop=True, inplace=True)

display(IRS_sliced_data)

### Merge realtor and IRS data on zip_code (left join)

In [None]:
# Left-join the IRS table onto the realtor table by zip_code, keeping every
# realtor row, then discard the realtor "state" column.
df1 = (
    realtor_data
    .merge(IRS_sliced_data, how="left", on="zip_code")
    .drop(columns=["state"])
)

## The final DataFrame we are going to work with!!!

In [None]:
# Show the merged realtor + IRS table.
display(df1)

# Prepare the realtor data

In [None]:
# Show the realtor table with zip_code moved back from the index to a column.
# reset_index returns a new frame rather than modifying realtor_data, so the
# result has to be displayed directly; the earlier bare call discarded it and
# then displayed the unchanged frame.
display(realtor_data.reset_index())

# Encode the dataset’s categorical variables using OneHotEncoder

In [None]:
# Create a list of categorical variables 
# Names of the df1 columns whose dtype is "object"; these are the columns
# passed to the one-hot encoder.
categorical_variables = [
    column_name
    for column_name, column_dtype in df1.dtypes.items()
    if column_dtype == "object"
]

In [None]:
# Show which columns were picked up as categorical.
display(categorical_variables)

In [None]:
# One-hot encoder that returns a dense NumPy array instead of a sparse matrix.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword was removed in 1.4, so the new name is required on current releases.
enc = OneHotEncoder(sparse_output=False)

In [None]:
# Fit the encoder on the categorical columns of df1 and encode them in one step.
encoded_data = enc.fit_transform(df1[categorical_variables])

In [None]:
# Wrap the encoded array in a DataFrame whose columns are labelled with the
# feature names generated by the fitted encoder.
# Note: no index is passed, so encoded_df gets a default integer index rather
# than df1's index.
encoded_df = pd.DataFrame(
    data=encoded_data,
    columns=enc.get_feature_names_out(categorical_variables),
)

In [None]:
# Show the one-hot encoded columns.
display(encoded_df)

# Apply Logistic Regression

# Split the data into training and testing sets (create X, the features DataFrame, and y, the target Series)

In [None]:
# Build the modelling table from the columns the classifier uses.
# Rows with a missing value in any of them are dropped: the left join leaves
# NaN in the IRS columns for realtor rows without an IRS match, and
# LogisticRegression does not accept NaN.
model_frame = df1[['STATE', 'A18425', 'price']].dropna()

# One-hot encode STATE so the feature matrix is fully numeric.
# NOTE(review): assumes STATE holds state labels rather than numeric values - confirm.
features = pd.get_dummies(model_frame[['STATE', 'A18425']], columns=['STATE'])

# LogisticRegression is a classifier, so the target must be a class label
# rather than the raw price: 1 when the price is above the median, else 0.
# NOTE(review): the median cut-off is an assumed reading of "higher or lower" - confirm.
target = (model_frame['price'] > model_frame['price'].median()).astype(int)

In [None]:
# Split features and target into training and testing subsets (default 75/25).
# A fixed random_state makes the shuffle, and therefore every downstream
# metric, reproducible under Restart & Run All.
training_features, testing_features, training_targets, testing_targets = train_test_split(
    features,
    target,
    random_state=42,
)

In [None]:
# Inspect the training subset of the feature table.
training_features

# Create and use a classifier that can predict whether the house's sold price will be higher or lower

In [None]:
# Instantiate the classifier; no arguments are passed, so scikit-learn's
# default hyperparameters apply.
logistic_regression_model = LogisticRegression()

# Fit: Train the model by supplying it with some training data

In [None]:
# Train the model on the training features and their targets.
logistic_regression_model.fit(training_features, training_targets)

# Generate predictions from the model we just fit

In [None]:
# In-sample predictions: the model predicts on the same rows it was fitted on.
predictions = logistic_regression_model.predict(training_features)

In [None]:
# Put each training-set prediction next to the target value it should match.
results_df = pd.DataFrame(
    data={
        "Prediction": predictions,
        "Actual": training_targets,
    }
)
results_df

# Apply the fitted model to the test dataset

In [None]:
# Out-of-sample predictions: apply the fitted model to the held-out test features.
testing_predictions = logistic_regression_model.predict(testing_features)

In [None]:
# Put each test-set prediction next to the actual test target.
# Note: this rebinds results_df, replacing the training-set comparison table.
results_df = pd.DataFrame(
    data={
        "Testing Data Predictions": testing_predictions,
        "Testing Data Actual Targets": testing_targets,
    }
)
results_df

# Compare each predicted value to its actual value

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of test rows whose predicted label equals the actual label.
accuracy_score(testing_targets, testing_predictions)

# We evaluated the model predictions. If the accuracy is very high (closer to 1), it may mean that there is overfitting, which may mean that the model won't perform well on new data it was not trained on

# We can break down the predictions for higher and lower house prices using a confusion matrix.