In [1]:
import requests
import pandas as pd
from datetime import datetime


url = "https://unofficial-redfin.p.rapidapi.com/properties/list"

querystring = {
    "region_id": "10297",
    "region_type": "6",
    "uipt": "1,2,3,4,7,8",
    "status": "9",
    "sf": "1,2,3,5,6,7",
    "num_homes": "500"
}

headers = {
    "X-RapidAPI-Key": "b60faef7a6msh2c985c19232815ap1c4e46jsna04a608045c9",
    "X-RapidAPI-Host": "unofficial-redfin.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)
data = response.json()

# Extracting nested data from 'homes' -> 'homeData'
homes_data = [home['homeData'] for home in data['homes']]


# Convert the extracted data to pandas DataFrame
df = pd.json_normalize(homes_data)

# Get the current date and time
current_datetime = datetime.now()

# Add the current date and time as a new column in the DataFrame
df['Data Requested Date'] = current_datetime



In [2]:
import re

# Function to format column names
def format_column_name(column_name):
    # Replace periods with spaces
    column_name = column_name.replace('.', ' ')

    # Add space before capital letters and make lowercase
    formatted_name = ''.join([' ' + char if char.isupper() else char for char in column_name]).strip().lower()
    
    # Remove leading/trailing spaces and replace multiple spaces with a single space
    formatted_name = re.sub(' +', ' ', formatted_name.strip()).lower()

    return formatted_name
# Format the column headers
df.columns = [format_column_name(col) for col in df.columns]

In [3]:
# Convert 'days on market days on market value' to numeric
df['days on market days on market value'] = pd.to_numeric(df['days on market days on market value'], errors='coerce')
# Convert 'days on market days on market value' to a timedelta and subtract from 'data requested date'
df['date added to market'] = df['data requested date'] - pd.to_timedelta(df['days on market days on market value'], unit='D')


In [4]:
df['days on market time on redfin seconds'] = pd.to_numeric(df['days on market time on redfin seconds'], errors='coerce')
# Convert 'days on market time on redfin seconds' to a timedelta and subtract from 'data requested date'
df['date added to redfin'] = df['data requested date'] - pd.to_timedelta(df['days on market time on redfin seconds'], unit='s')

In [5]:
# Convert 'last sale data last sold date seconds' to numeric, filling NaNs with 0
df['last sale data last sold date seconds'] = pd.to_numeric(df['last sale data last sold date seconds'], errors='coerce').fillna(0)

# Apply the calculation to all rows
df['date of last sale'] = df['data requested date'] - pd.to_timedelta(df['last sale data last sold date seconds'], unit='s')

# For rows where 'last sale data last sold date seconds' is 0, set 'date of last sale' to NaN or some default value
df.loc[df['last sale data last sold date seconds'] == 0, 'date of last sale'] = pd.NaT  # or some other default value


In [6]:
df.to_excel('data/dshs_test_lq.xlsx')

In [7]:
df.describe()

Unnamed: 0,beds value,baths value,days on market days on market value,days on market time on redfin seconds,days on market time on redfin nanos,days on market listing added date nanos,year built year built value,last sale data last sold date seconds,direct access info time zone id,full baths value,...,bath info computed full baths,bath info computed total baths,address info centroid centroid latitude,address info centroid centroid longitude,bath info raw three quarter baths,partial baths value,bath info raw half baths,bath info computed partial baths,bath info raw quarter baths,last sale data last sold date nanos
count,396.0,396.0,395.0,395.0,393.0,395.0,377.0,399.0,399.0,396.0,...,396.0,396.0,399.0,399.0,62.0,209.0,190.0,209.0,24.0,1.0
mean,3.467172,3.311869,61.081013,5274481.0,492946600.0,515673400.0,2002.31565,1009651000.0,7.0,3.106061,...,3.106061,3.400253,33.66401,-116.271145,1.790323,1.114833,1.078947,1.114833,1.125,743000000.0
std,4.270764,4.087488,56.34681,4870745.0,288336200.0,286244100.0,14.316283,712362900.0,0.0,4.045171,...,4.045171,4.096096,0.032359,0.025742,1.118211,0.434365,0.369583,0.434365,0.612372,
min,1.0,1.0,1.0,74590.0,1000000.0,3000000.0,1937.0,0.0,7.0,1.0,...,1.0,1.0,33.600363,-116.318621,1.0,1.0,1.0,1.0,1.0,743000000.0
25%,3.0,2.0,23.0,2006488.0,255000000.0,257000000.0,1990.0,0.0,7.0,2.0,...,2.0,2.0,33.638818,-116.295514,1.0,1.0,1.0,1.0,1.0,743000000.0
50%,3.0,3.0,47.0,4058540.0,491000000.0,526000000.0,2003.0,1409209000.0,7.0,3.0,...,3.0,3.0,33.659116,-116.267968,1.0,1.0,1.0,1.0,1.0,743000000.0
75%,4.0,3.5,75.5,6522057.0,718000000.0,766000000.0,2013.0,1612210000.0,7.0,3.0,...,3.0,3.5,33.683373,-116.251388,2.0,1.0,1.0,1.0,1.0,743000000.0
max,86.0,80.0,414.0,35769160.0,992000000.0,997000000.0,2024.0,1696316000.0,7.0,80.0,...,80.0,80.0,33.736599,-116.233736,5.0,4.0,4.0,4.0,4.0,743000000.0
