In [4]:
# Import libraries here
import warnings
from glob import glob
import plotly.express as px
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder
from IPython.display import VimeoVideo
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

In [5]:
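# Wrangle function, step by step:
# - read in a CSV file
# - keep apartments in Distrito Federal priced under $100,000 (USD)
# - remove outliers in covered surface area
# - split the "lat-lon" column into separate "lat" and "lon" columns
# - create the "borough" column from "place_with_parent_names"
# - drop columns with a high share of null values
# - drop low- and high-cardinality categorical columns
# - drop leaky columns
# - handle multicollinearity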

def wrangle(filepath):
    """Load one real-estate CSV and return a cleaned frame.

    Steps, in order:

    1. Keep rows whose ``place_with_parent_names`` contains
       "Distrito Federal", whose ``property_type`` is "apartment" and whose
       ``price_aprox_usd`` is below 100,000.
    2. Keep rows whose ``surface_covered_in_m2`` lies between the 10th and
       90th percentiles (bounds included) of the rows left after step 1.
    3. Split ``lat-lon`` on "," into float columns ``lat`` and ``lon``.
    4. Store the second "|"-separated piece of ``place_with_parent_names``
       as ``borough``.
    5. Drop the source columns of steps 3-4 plus the high-null,
       low/high-cardinality and leaky columns listed below.

    Parameters
    ----------
    filepath : str or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept) with ``lat``,
        ``lon`` and ``borough`` appended and the listed columns removed.

    Raises
    ------
    KeyError
        If a column this function reads or drops is missing from the file.
    """
    raw = pd.read_csv(filepath)

    # Subset data: apartments in Distrito Federal, less than 100,000 USD
    in_city = raw["place_with_parent_names"].str.contains("Distrito Federal")
    is_apartment = raw["property_type"] == "apartment"
    is_affordable = raw["price_aprox_usd"] < 100_000
    df = raw[in_city & is_apartment & is_affordable]

    # Subset data: remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    # Take an explicit copy: the column assignments below would otherwise be
    # made on a slice of `raw`, which is the pattern pandas reports with
    # SettingWithCopyWarning.
    df = df[df["surface_covered_in_m2"].between(low, high)].copy()

    # Split "lat-lon" into two float columns
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Extract the borough name
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]

    source_cols = ["lat-lon", "place_with_parent_names"]
    # "surface_total_in_m2" and "rooms" were also the candidates for the
    # multicollinearity drop; removing them here already covers that step.
    high_null_cols = ["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", "expenses"]
    cardinality_cols = ["operation", "property_type", "currency", "properati_url"]
    leaky_cols = ["price", "price_aprox_local_currency", "price_per_m2"]

    # One non-inplace drop instead of several inplace ones; the remaining
    # columns keep their original order.
    return df.drop(columns=source_cols + high_null_cols + cardinality_cols + leaky_cols)


In [8]:
# Use this cell to test your wrangle function and explore the data

# Run wrangle() on a single file, then show the frame's size and a short
# preview instead of dumping the whole DataFrame.
df = wrangle("data/mexico-city-real-estate-1.csv")
print("df shape:", df.shape)
df.head()

In [None]:
# `files` was not defined anywhere in the notebook, so this cell failed with a
# NameError on a fresh kernel. The pattern below is inferred from the
# single-file path "data/mexico-city-real-estate-1.csv" -- adjust it if the
# data files are named differently.
files = sorted(glob("data/mexico-city-real-estate-*.csv"))
assert files, "No files matched data/mexico-city-real-estate-*.csv"

# Clean every file with wrangle() and stack the results into one frame
frames = [wrangle(file) for file in files]
df = pd.concat(frames, ignore_index=True)

# info() prints its own report and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
df.info()
df.head()

In [None]:
# Histogram of apartment prices, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(df["price_aprox_usd"])

# Axis labels and title in one call
ax.set(
    xlabel="Price [$]",
    ylabel="Count",
    title="Distribution of Apartment Prices",
)

# Don't delete the code below 👇
plt.savefig("images/2-5-4.png", dpi=150)


In [None]:
# Scatter plot of price against covered area.
# matplotlib.pyplot is already imported as `plt` in the notebook's import
# cell, so the duplicate import that used to sit here was removed.
fig, ax = plt.subplots()
ax.scatter(x=df["surface_covered_in_m2"], y=df["price_aprox_usd"])

# Label axes
ax.set_xlabel("Area [sq meters]")
ax.set_ylabel("Price [USD]")

# Add title
ax.set_title("Mexico City: Price vs. Area")

# Show plot
plt.show()


In [None]:
# Preview the first rows of df (head() shows five rows by default)
df.head()

In [None]:
# Split data into feature matrix `X_train` and target vector `y_train`.
# The target vector is what the model predicts; the feature matrix holds the
# columns it predicts from. Here: predict a property's price from its
# covered area, coordinates and borough.
target = "price_aprox_usd"
features = ["surface_covered_in_m2", "lat", "lon", "borough"]

# Training data
X_train, y_train = df[features], df[target]

In [None]:
# Baseline model: predict the mean training price for every observation
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

In [None]:
# Pipeline steps: one-hot encode categoricals, impute missing values, then
# fit a ridge regression
encoder = OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
regressor = Ridge()
model = make_pipeline(encoder, imputer, regressor)

# Fit on the training data (the fitted pipeline is the cell's output)
model.fit(X_train, y_train)

In [None]:
# The pipeline classes, pandas and check_is_fitted are all imported in the
# notebook's import cell, so the duplicate imports that used to sit here were
# removed. The model is fitted in the training cell; instead of re-fitting it,
# verify that the pipeline's final estimator has been fitted.
check_is_fitted(model[-1])

# Read the CSV file for test features
X_test = pd.read_csv("data/mexico-city-test-features.csv")

# Ensure X_test has the same columns, in the same order, as X_train
X_test = X_test[X_train.columns]

# Predict using the trained model
y_pred = model.predict(X_test)

# Display information about X_test. info() prints its own report and returns
# None, so it is called directly rather than wrapped in print().
X_test.info()
X_test.head()


In [None]:
# Wrap the test-set predictions in a Series and preview the first few
test_predictions = model.predict(X_test)
y_test_pred = pd.Series(test_predictions)
y_test_pred.head()



In [None]:
# Pull the fitted regressor and encoder out of the pipeline
fitted_ridge = model.named_steps["ridge"]
fitted_encoder = model.named_steps["onehotencoder"]

# Retrieve intercept and coefficients
intercept = fitted_ridge.intercept_
coefficients = fitted_ridge.coef_

# Retrieve the encoded column names. Prefer get_feature_names_out() when the
# installed category_encoders provides it and fall back to the older
# get_feature_names() otherwise.
# NOTE(review): newer releases are understood to deprecate
# get_feature_names() -- confirm against the installed version.
get_names = getattr(fitted_encoder, "get_feature_names_out", None) or fitted_encoder.get_feature_names
# Stored under a new name so the `features` column list defined in the
# train-split cell is not overwritten with a different kind of value.
feature_names = get_names()

# Create a series of names and values
feat_imp = pd.Series(coefficients, index=feature_names)
feat_imp


In [None]:
# Horizontal bars for the ten coefficients with the largest magnitude
top_ten = feat_imp.sort_values(key=abs).tail(10)
ax = top_ten.plot.barh()

# Label axes and add title
ax.set_xlabel("Importance [USD]")
ax.set_ylabel("Feature")
ax.set_title("Feature Importances for Apartment Price")

# Don't delete the code below 👇
plt.savefig("images/2-5-13.png", dpi=150)
