# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Load only a random sample of the data: the full CSV is too large to work with
# comfortably, so it is streamed in chunks and a fraction of each chunk is kept.
sample_fraction = 0.1  # keep 10% of the rows of every chunk
y16_iter = pd.read_csv("/work/properties_2016.csv", chunksize=100000)
# One shared, seeded generator for all chunks: passing the same integer seed to
# every call would select identical row positions in each equally sized chunk.
# The run stays reproducible because the generator itself is seeded once.
sample_rng = np.random.RandomState(42)
y16_df = pd.concat(
    chunk.sample(frac=sample_fraction, random_state=sample_rng) for chunk in y16_iter
)

# Normalise the hot tub / spa flag to real booleans. pandas.read_csv may already
# have parsed the "true"/"false" literals into Python bools, in which case a
# string-only mapping would turn every value into NaN, so both spellings are
# accepted here. Missing values stay NaN.
spa_flag_map = {'true': True, 'false': False, True: True, False: False}
y16_df['hashottuborspa'] = y16_df['hashottuborspa'].map(spa_flag_map)

# Quick inspection of the sample. A bare expression is only echoed when it is
# the last statement of a notebook cell, so the results are printed explicitly.
print(y16_df.head())

y16_df.info()
# The 15 columns with the most missing values.
print(y16_df.isnull().sum().sort_values(ascending=False).head(15))

# Rows missing any of the key modelling columns are discarded outright.
y16_df = y16_df.dropna(
    axis=0,
    how='any',
    subset=['bathroomcnt', 'bedroomcnt', 'taxvaluedollarcnt'],
)

# Every remaining gap in a numeric column is filled with that column's median.
numeric_cols = y16_df.select_dtypes(include=[np.number]).columns
numeric_part = y16_df[numeric_cols]
y16_df[numeric_cols] = numeric_part.fillna(value=numeric_part.median())
del numeric_part  # temporary helper only; keep the namespace unchanged
print("Cleaned:", y16_df.shape)

# Histogram of the target column (50 bins) to eyeball its distribution.
plt.hist(x=y16_df['taxvaluedollarcnt'], bins=50, color='skyblue')
plt.ylabel("Count")
plt.xlabel("Tax Value ($)")
plt.title("Distribution of Property Values")
plt.show()

# Pairwise correlation matrix of the three candidate predictors and the target.
correlation = y16_df.loc[
    :,
    ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet', 'taxvaluedollarcnt'],
].corr()
print(correlation)

# Fit a ridge regression of `taxvaluedollarcnt` on three numeric columns and
# report the root-mean-squared error on a held-out test split.
features = ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']
target = 'taxvaluedollarcnt'

X = y16_df[features]
y = y16_df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): missing numeric values were median-filled on the whole sample
# before this split, so the test rows influenced the imputation values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

preds = ridge.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# this form works across versions.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE:", rmse)