In [0]:
%sh
pip install -U homeharvest

Collecting homeharvest
  Downloading homeharvest-0.3.33-py3-none-any.whl (17 kB)
Collecting pydantic<3.0.0,>=2.7.4
  Downloading pydantic-2.8.2-py3-none-any.whl (423 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 423.9/423.9 kB 8.9 MB/s eta 0:00:00
Collecting requests<3.0.0,>=2.31.0
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.9/64.9 kB 7.7 MB/s eta 0:00:00
Collecting pandas<3.0.0,>=2.1.1
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 80.6 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 26.5 MB/s eta 0:00:00
Collecting pydantic-core==2.20.1
  Downloading pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 80.5 MB/s et

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
databricks-sdk 0.1.6 requires requests<2.29.0,>=2.28.1, but you have requests 2.32.3 which is incompatible.


Successfully installed annotated-types-0.7.0 homeharvest-0.3.33 pandas-2.2.2 pydantic-2.8.2 pydantic-core-2.20.1 requests-2.32.3 typing-extensions-4.12.2 tzdata-2024.1



[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: pip install --upgrade pip


## **Scraping data**

In [0]:
from homeharvest import scrape_property
from datetime import datetime

# Fixed local CSV name (not timestamped); a later cell writes the cleaned export to it.
filename = "REdata.csv"

# Search options handed to homeharvest's scrape_property.
scrape_options = {
    "location": "New York, NY",
    "listing_type": "sold",  # alternatives: for_sale, for_rent, pending
    # Look-back window in days: sold within the last 120 days
    # (for for_sale / for_rent it applies to the listing date instead).
    "past_days": 120,
    "mls_only": True,  # only fetch MLS listings
}
properties = scrape_property(**scrape_options)

print(f"Number of properties: {len(properties)}")
print(properties.head())

# Stage the untouched scrape as a CSV under dbfs:/tmp (written through the /dbfs mount)
# so it can be copied to external storage afterwards.
dbutils.fs.mkdirs("/tmp")
dbfs_path = "/dbfs/tmp/raw_file.csv"
properties.to_csv(dbfs_path, index=False)

Number of properties: 4035
                                        property_url  ...                                         alt_photos
0  https://www.realtor.com/realestateandhomes-det...  ...  http://ap.rdcpix.com/8ba992edb20c6f21bb700f63d...
1  https://www.realtor.com/realestateandhomes-det...  ...  http://ap.rdcpix.com/d26219b8f70df7ea20c11042a...
2  https://www.realtor.com/realestateandhomes-det...  ...  http://ap.rdcpix.com/06cee0ee39e9771d359ef30fb...
3  https://www.realtor.com/realestateandhomes-det...  ...  http://ap.rdcpix.com/cb251f60979988caed227a782...
4  https://www.realtor.com/realestateandhomes-det...  ...  http://ap.rdcpix.com/e47b9b9a69ac98762cf373df9...

[5 rows x 43 columns]


### Storing raw data in ADLS

In [0]:
# ADLS Gen2 endpoint of the storage account that receives the raw scrape.
storage_account = "djrestores.dfs.core.windows.net"

# Read the container SAS token from a Databricks secret scope rather than embedding it in
# the notebook, where it would be visible to anyone who can read or export the notebook.
# NOTE(review): the scope and key names are placeholders - create them (or substitute the
# names used in this workspace) before running.
sas_token = dbutils.secrets.get(scope="adls", key="djrestores-rawdata-sas")

# Configure the ABFS driver to authenticate against this account with a fixed SAS token.
spark.conf.set(f"fs.azure.account.auth.type.{storage_account}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_account}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{storage_account}", sas_token)

source_path = "/tmp/raw_file.csv"
adls_path = f"abfss://rawdata@{storage_account}/rawfile.csv"

# Copy the staged file from DBFS to ADLS (cp leaves the DBFS copy in place).
dbutils.fs.cp(f"dbfs:{source_path}", adls_path)
     

True

### Data cleaning

In [0]:
# Drop the 'agent' column and every column to its right.
# Guarded so that re-running the cell after the columns are already gone is a no-op
# instead of a KeyError from get_loc.
if 'agent' in properties.columns:
    agent_index = properties.columns.get_loc('agent')
    properties = properties.iloc[:, :agent_index]

# Replace missing HOA fees and garage counts with 0.
# A per-column dict on the frame avoids the chained `df[col].fillna(..., inplace=True)`
# pattern, which operates on an intermediate object and is deprecated in pandas 2.2.
properties = properties.fillna({'hoa_fee': 0, 'parking_garage': 0})

# Export the cleaned frame to the local CSV and preview the first rows.
properties.to_csv(filename, index=False)
print(properties.head())

                                        property_url  ... parking_garage
0  https://www.realtor.com/realestateandhomes-det...  ...              1
1  https://www.realtor.com/realestateandhomes-det...  ...              2
2  https://www.realtor.com/realestateandhomes-det...  ...              0
3  https://www.realtor.com/realestateandhomes-det...  ...              0
4  https://www.realtor.com/realestateandhomes-det...  ...              0

[5 rows x 34 columns]


### Saving processed data

In [0]:
# ADLS Gen2 endpoint of the storage account that receives the processed data.
storage_account = "djrestores.dfs.core.windows.net"

# Read the container SAS token from a Databricks secret scope rather than embedding it in
# the notebook, where it would be visible to anyone who can read or export the notebook.
# NOTE(review): the scope and key names are placeholders - create them (or substitute the
# names used in this workspace) before running.
sas_token = dbutils.secrets.get(scope="adls", key="djrestores-processed-sas")

# Configure the ABFS driver to authenticate against this account with a fixed SAS token.
spark.conf.set(f"fs.azure.account.auth.type.{storage_account}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_account}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{storage_account}", sas_token)

# Stage the cleaned frame as a CSV under dbfs:/tmp (written through the /dbfs mount).
dbfs_path = "/dbfs/tmp/proc_file.csv"
properties.to_csv(dbfs_path, index=False)

source_path = "/tmp/proc_file.csv"
adls_path = f"abfss://processed@{storage_account}/processedfile.csv"

# Copy the staged file from DBFS to ADLS (cp leaves the DBFS copy in place).
dbutils.fs.cp(f"dbfs:{source_path}", adls_path)

True