In [78]:
import pandas as pd
import numpy as np
import os
my_dir = os.path.dirname(os.path.abspath('__file__'))
os.chdir(my_dir)

data = 'data/uncleaned data/local data/zillow_listings_combined.csv'

In [79]:
df = pd.read_csv(data)

In [80]:
# Clean up home type
house_conditions = [
    df['type'].str.contains('condo', case=False, na=False),
    df['type'].str.contains('townhouse', case=False, na=False),
    df['type'].str.contains('manufactured', case=False, na=False)
]
house_values = ['condominium', 'townhouse', 'manufactured']
default_house_value = 'single family'
df['housing category'] = np.select(house_conditions, house_values, default_house_value)
df.drop('type', axis=1, inplace=True)

# Simplify and clean heating
heat_conditions = [
    df['heat type'].str.contains('ceiling', case=False, na=False),
    df['heat type'].str.contains('baseboard', case=False, na=False),
    df['heat type'].str.contains('ductless', case=False, na=False),
    df['heat type'].str.contains('forced air', case=False, na=False)
]
heat_values = ['ceiling', 'baseboard', 'ductless', 'forced air']
default_heat_value = 'other'
df['heat type'] = np.select(heat_conditions, heat_values, default_heat_value)
df.drop('heat type', axis=1, inplace=True)

# Convert HOA to yearly cost
def calculate_hoa(value):
    if "monthly" in value:
        return float(value.replace(" monthly HOA fee", "").replace("$", "").replace(",","").replace("monthly", "")) * 12
    elif "semi-annually" in value:
        return float(value.replace(" semi-annually HOA fee", "").replace("$", "").replace(",","")) * 2
    else:
        return 0

df['HOA yearly'] = df['HOA cost'].apply(calculate_hoa)
df.drop('HOA cost', axis=1, inplace=True)

# Convert sqft to integer type
def sqft_to_int(value):
    if "sqft" in value:
        return int(value.replace(" sqft", "").replace(",", ""))
    elif "Acres" in value:
        # Assuming 1 acre = 43560 sqft
        #return int(float(value.replace(" Acres", "")) * 43560)
        return None
    else:
        return None

df['sqft'] = df['sqft'].apply(sqft_to_int)

# Convert price to int
df['listing price'] = df['listing price'].astype(int)

# Convert 'price per sqft' to integer
df['price per sqft'] = df['price per sqft'].str.replace("$", "").str.replace(" price/sqft", "").astype(int)

# Convert 'buyers fee' to float
df['buyers fee'] = df['buyers fee'].str.replace("buyers agency fee", "")
df['buyers fee'] = df['buyers fee'].str.replace("%", "")
df['buyers fee'] = df['buyers fee'].str.replace("¬†", "")
# Handling for when there is a fixed dollar value
def clean_buyers_fee(buyers_fee, price):
    if isinstance(buyers_fee, str) and buyers_fee.startswith('$'):
        buyers_fee = int(buyers_fee.replace('$', '').replace(',', ''))
        buyers_fee = (buyers_fee / price) * 100
    return buyers_fee
df['buyers fee'] = df.apply(lambda row: clean_buyers_fee(row['buyers fee'], row['price per sqft']), axis=1)

#df['buyers fee'] = df['buyers fee'].astype(float) / 100

# Convert 'year built' to int
df['year built'] = df['year built'].str.replace('Built in ', "")
df['year built'] = df['year built'].astype(int)

# Drop duplicates and then drop the address column
df = df.drop_duplicates(subset = 'address')
df = df.drop('address', axis=1)

print(df)

          city state    zip  year built    cooling type   
0     Portland    OR  97233        1983     Central air  \
1     Portland    OR  97225        1964     Central air   
2     Portland    OR  97215        1913  Window unit(s)   
3     Portland    OR  97229        1957     Central air   
4     Portland    OR  97218        1928       Heat pump   
..         ...   ...    ...         ...             ...   
461  Corvallis    OR  97330        1990     Central air   
462  Corvallis    OR  97330        2006     Central air   
463  Corvallis    OR  97333        1935         No data   
464  Corvallis    OR  97330        1967     Central air   
465  Corvallis    OR  97333        2005     Central air   

                  garage type    sqft  price per sqft buyers fee   
0    2 Attached garage spaces  7840.0             340       2.5   \
1    2 Attached garage spaces     NaN             312       2.5    
2              1 Garage space  3920.0             358       2.5    
3             2 Gar

In [81]:
df.head()

Unnamed: 0,city,state,zip,year built,cooling type,garage type,sqft,price per sqft,buyers fee,listing price,housing category,HOA yearly
0,Portland,OR,97233,1983,Central air,2 Attached garage spaces,7840.0,340,2.5,402500,single family,0.0
1,Portland,OR,97225,1964,Central air,2 Attached garage spaces,,312,2.5,695000,single family,0.0
2,Portland,OR,97215,1913,Window unit(s),1 Garage space,3920.0,358,2.5,575000,single family,0.0
3,Portland,OR,97229,1957,Central air,2 Garage spaces,,312,2.5,1000000,single family,0.0
4,Portland,OR,97218,1928,Heat pump,Open parking,4791.0,280,2.5,295000,single family,0.0


In [82]:
df.to_csv('data/cleaned data/zillow_listings_combined_clean.csv', index = False)