In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [2]:
# Load the two raw "m1" listing extracts; the first CSV column becomes the index.
m1_csv_paths = [
    '../data/raw_extraction/vancouver_real_estate_m1.csv',
    '../data/raw_extraction/vancouver_real_estate2_m1.csv',
]
df_m1, df_m1_2 = [pd.read_csv(csv_path, index_col=0) for csv_path in m1_csv_paths]


In [3]:
# Stack the two m1 extracts row-wise (each keeps its own index labels) and
# discard rows that are exact duplicates of an earlier row.
df_m1 = pd.concat([df_m1, df_m1_2], axis='index').drop_duplicates()

In [4]:
# Treat empty / whitespace-only strings as missing values (NaN).
df_m1 = df_m1.replace(r'^\s*$', np.nan, regex=True)

# Price: strip "$" and thousands separators from string values, then cast to float32.
df_m1["price"] = df_m1["price"].replace(r"[\$,]", "", regex=True).astype("float32")

# Bedrooms: keep the first run of digits. The nullable Int32 dtype lets rows
# without a match stay missing instead of forcing the column to float.
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
df_m1["bed"] = (
    df_m1["bed"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype("Int32")
)

# Bathrooms: keep the first (possibly decimal) number so half baths such as 3.5 survive.
df_m1["bath"] = (
    df_m1["bath"]
    .astype(str)
    .str.extract(r"(\d+\.?\d*)", expand=False)
    .astype("float32")
)

# Square footage: drop thousands separators (a literal comma, so no regex is
# needed), keep the leading integer part, and cast to float32 so missing
# values are representable as NaN.
df_m1["sqr_footage"] = (
    df_m1["sqr_footage"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+)", expand=False)
    .astype("float32")
)



In [5]:
# Type cleaning can make previously distinct rows identical, so de-duplicate
# again before applying the final column names.
df_m1 = df_m1.drop_duplicates()

# Positional relabel: the list must match the current column order one-to-one.
df_m1.columns = [
    'address',
    'postalCode',
    'price',
    'bedroom',
    'bathroom',
    'square_footage',
    'property_link',
]
df_m1

Unnamed: 0,address,postalCode,price,bedroom,bathroom,square_footage,property_link
0,4091 W 34th Ave,V6N 2L6,2788000.0,5,2.0,1653.0,https://www.redfin.com/bc/vancouver/4091-W-34t...
1,4067 W 37th Ave,V6N 2W6,4800000.0,6,4.0,4189.0,https://www.redfin.com/bc/vancouver/4067-W-37t...
2,4056 W 36th Ave,V6N 2S9,6750000.0,4,3.0,3837.0,https://www.redfin.com/bc/vancouver/4056-W-36t...
3,3968 W 23rd Ave,V6S 1L2,3388000.0,4,3.5,2359.0,https://www.redfin.com/bc/vancouver/3968-W-23r...
4,4022 W 30th Ave,V6S 1X5,6590000.0,5,5.5,4190.0,https://www.redfin.com/bc/vancouver/4022-W-30t...
...,...,...,...,...,...,...,...
5352,550 Taylor St #2207,V6B 1R1,829000.0,2,2.0,865.0,https://www.redfin.com/bc/vancouver/550-Taylor...
5353,63 Keefer Pl #1510,V6B 6N6,529900.0,1,1.0,579.0,https://www.redfin.com/bc/vancouver/63-Keefer-...
5354,188 Keefer St #816,V6A 0E3,599000.0,1,1.0,608.0,https://www.redfin.com/bc/vancouver/188-Keefer...
5355,511 E Georgia St,V6A 1Z8,1299000.0,,,,https://www.redfin.com/bc/vancouver/511-E-Geor...


In [6]:
# Load the three raw "m2" extracts; the first column of each file becomes the index.
# NOTE(review): unlike the m1 files, these paths carry no ".csv" extension —
# confirm the files on disk are really named this way.
m2_paths = [
    "../data/raw_extraction/vancouver_real_estate_m2",
    "../data/raw_extraction/vancouver_real_estate_event_m2",
    "../data/raw_extraction/vancouver_real_estate_event2_m2",
]
df_m2, df_event_m2, df_event_list_m2 = [
    pd.read_csv(raw_path, index_col=0) for raw_path in m2_paths
]

In [7]:
# Report the (rows, columns) shape of each raw m2 frame, one per line.
print(
    f'df_m2: {df_m2.shape}',
    f'df_event_m2: {df_event_m2.shape}',
    f'df_event_list_m2: {df_event_list_m2.shape}',
    sep='\n',
)

df_m2: (14907, 8)
df_event_m2: (4786, 6)
df_event_list_m2: (1724, 5)


In [8]:
# Show the column labels of each raw m2 frame, one frame per print argument.
print(
    f'columns of df_m2: {df_m2.columns}',
    f'columns of df_event_m2: {df_event_m2.columns}',
    f'columns df_event_list_m2: {df_event_list_m2.columns}',
    sep='\n',
)

columns of df_m2: Index(['address', 'postalCode', 'latitude', 'longitude', 'price',
       'square_footage', 'bedroom', 'url'],
      dtype='object')
columns of df_event_m2: Index(['address', 'postalCode', 'latitude', 'longitude', 'price', 'url'], dtype='object')
columns df_event_list_m2: Index(['address', 'postalCode', 'latitude', 'longitude', 'url'], dtype='object')


In [9]:
# Stack the main m2 listings with both event-derived extracts row-wise.
# The event frames lack some columns (e.g. square_footage, bedroom); those
# cells come out as NaN in the combined frame.
m2_frames = [df_m2, df_event_m2, df_event_list_m2]
df_m2 = pd.concat(m2_frames, axis='index')

In [10]:
# Drop exact duplicate rows left over from stacking the three m2 extracts, then
# align the link column's name with df_m1.
# Only `url` actually changes name, so rename it by label rather than assigning
# the whole column list positionally: a positional assignment would silently
# mislabel columns if the concat order of the source frames ever changed.
# NOTE(review): in the displayed output, rows from the main extract show
# longitude equal to latitude (e.g. 49.269275 in both columns), while the
# event-derived rows show values around -123.1 — this looks like an upstream
# extraction problem; verify before using the coordinates.
df_m2 = df_m2.drop_duplicates().rename(columns={'url': 'property_link'})
df_m2

Unnamed: 0,address,postalCode,latitude,longitude,price,square_footage,bedroom,property_link
0,4615 W 4th Ave,V6R 1R6,49.269275,49.269275,2888000.0,2300.0,5.0,https://www.redfin.ca/bc/vancouver/4615-W-4th-...
1,4922 Queensland Rd,V6T 1G4,49.266380,49.266380,3550000.0,2392.0,5.0,https://www.redfin.ca/bc/greater-vancouver-reg...
2,4650 W 6th Ave,V6R 1V7,49.267092,49.267092,4088800.0,2548.0,5.0,https://www.redfin.ca/bc/vancouver/4650-W-6th-...
3,4343 W 14 Ave,V6R 2X9,49.260349,49.260349,2699000.0,2202.0,3.0,https://www.redfin.ca/bc/vancouver/4343-W-14th...
4,4688 W 10th Ave #402,V6R 2J5,49.263644,49.263644,1198000.0,1165.0,2.0,https://www.redfin.ca/bc/vancouver/4688-W-10th...
...,...,...,...,...,...,...,...,...
297,189 Keefer St #1006,V6A 0C8,49.279651,-123.100137,,,,https://www.redfin.ca/bc/vancouver/189-Keefer-...
298,120 Powell St #31,V6A 1G1,49.283004,-123.101533,,,,https://www.redfin.ca/bc/vancouver/120-Powell-...
299,518 Beatty St #804,V6B 6G8,49.280869,-123.108795,,,,https://www.redfin.ca/bc/vancouver/518-Beatty-...
300,188 Keefer Pl #226,V6B 0J1,49.279640,-123.108857,,,,https://www.redfin.ca/bc/vancouver/188-Keefer-...


In [24]:
# Outer-join both sources on address so listings present in only one source
# are kept; column names shared by both frames get a _df1 / _df2 suffix.
merged_df = pd.merge(
    left=df_m1,
    right=df_m2,
    how='outer',
    on='address',
    suffixes=('_df1', '_df2'),
)

# Row counts: merged result, then each input.
print(*(len(frame) for frame in (merged_df, df_m1, df_m2)))

6362 4492 6192


In [25]:
# Distinct addresses seen in each source.
addresses_df1, addresses_df2 = set(df_m1['address']), set(df_m2['address'])

# Addresses found in both sources, and the merged rows that carry them.
common_addresses = addresses_df1 & addresses_df2
common_rows_merged_df = merged_df.loc[merged_df['address'].isin(common_addresses)]

# Addresses found in exactly one source, and the merged rows that carry them.
unique_addresses_df1 = addresses_df1 - addresses_df2
unique_addresses_df2 = addresses_df2 - addresses_df1
unique_addresses = unique_addresses_df1 | unique_addresses_df2
unique_rows_merged_df = merged_df.loc[merged_df['address'].isin(unique_addresses)]


In [27]:
# Collapse the "_df1" / "_df2" column pairs created by the outer merge into a
# single column per metric, for rows whose address occurs in only one source.

# Manual adjustment for name consistency: columns that exist in only one source
# are not suffixed by the merge, so suffix the df_m2-only coordinates explicitly.
# (The former 'bath' key never matched anything — that column is already named
# 'bathroom' by this point — so it is dropped; 'bathroom' is carried over
# unchanged by the unsuffixed branch below, exactly as before.)
unique_rows_merged_df = unique_rows_merged_df.rename(columns={
    'latitude': 'latitude_df2',
    'longitude': 'longitude_df2'
})

union_columns = set(df_m1.columns).union(set(df_m2.columns))

# Iterate the metrics in a stable order (df_m1's columns first, then the
# columns only df_m2 has). Iterating the set directly would make the column
# order of combined_df depend on string hashing, which can differ between
# kernel restarts.
ordered_metrics = list(dict.fromkeys([*df_m1.columns, *df_m2.columns]))
merged_columns = unique_rows_merged_df.columns

combined_data = {}
for metric in ordered_metrics:
    df1_col = f"{metric}_df1"
    df2_col = f"{metric}_df2"

    if df1_col in merged_columns and df2_col in merged_columns:
        # Both sources carry this metric: prefer df_m1's value, fall back to df_m2's.
        combined_data[metric] = unique_rows_merged_df[df1_col].combine_first(unique_rows_merged_df[df2_col])
    elif df1_col in merged_columns:
        combined_data[metric] = unique_rows_merged_df[df1_col]
    elif df2_col in merged_columns:
        combined_data[metric] = unique_rows_merged_df[df2_col]
    elif metric in merged_columns:
        # Unsuffixed column (the merge key, or a metric only one source has).
        combined_data[metric] = unique_rows_merged_df[metric]

# Carry over every column that has no source suffix. Test the suffix with
# endswith so a name that merely contains "_df1"/"_df2" is not skipped.
non_suffixed_columns = [col for col in merged_columns if not col.endswith(('_df1', '_df2'))]
for col in non_suffixed_columns:
    combined_data[col] = unique_rows_merged_df[col]

combined_df = pd.DataFrame(combined_data)


In [28]:
# Display the coalesced frame of listings whose address appears in only one source.
# NOTE(review): the rendered output contains pairs of rows for the same address
# (e.g. "1010 W 42nd Ave #203") that differ only in longitude / missing fields,
# so they survive drop_duplicates — confirm whether these should be collapsed.
combined_df

Unnamed: 0,bedroom,postalCode,price,property_link,longitude,square_footage,latitude,address,bathroom
52,2.0,V6E 1T7,1599000.0,https://www.redfin.com/bc/vancouver/1010-Beach...,,1602.0,,1010 Beach Ave #401,2.0
59,1.0,V6Z 1P5,579000.0,https://www.redfin.ca/bc/vancouver/1010-Howe-S...,49.279610,500.0,49.279610,1010 Howe St #909,
63,2.0,V6M 2A8,749000.0,https://www.redfin.ca/bc/vancouver/1010-W-42nd...,49.232849,956.0,49.232849,1010 W 42nd Ave #203,
64,,V6M 2A8,749000.0,https://www.redfin.ca/bc/vancouver/1010-W-42nd...,-123.128576,,49.232849,1010 W 42nd Ave #203,
96,3.0,V6Z 2W1,1599000.0,https://www.redfin.ca/bc/vancouver/1019-Expo-B...,49.275064,1593.0,49.275064,1019 Expo Blvd,
...,...,...,...,...,...,...,...,...,...
6309,2.0,V6A 0G9,599900.0,https://www.redfin.ca/bc/vancouver/983-E-Hasti...,49.281349,642.0,49.281349,983 E Hastings St #303,
6310,,V6A 0G9,599900.0,https://www.redfin.ca/bc/vancouver/983-E-Hasti...,-123.082745,,49.281349,983 E Hastings St #303,
6324,1.0,V5Z 1Z1,549900.0,https://www.redfin.ca/bc/vancouver/988-W-21st-...,49.252492,503.0,49.252492,988 W 21st Ave #306,
6325,,V5Z 1Z1,549900.0,https://www.redfin.ca/bc/vancouver/988-W-21st-...,-123.127033,,49.252492,988 W 21st Ave #306,
