## Step 00: Load data

In this notebook, we retrieve the dataset "Crop mapping using fused optical-radar data set" from the UC Irvine Machine Learning Repository and unzip the file into the directory "../data/crop_mapping/", renaming the extracted "WinnipegDataset.txt" to "WinnipegDataset.csv". We also scrape the dataset's web page for the list of variable attributes and save this list to the file "../data/crop_mapping/features.txt".

In [1]:
import os
import urllib.request as request
import zipfile
from pathlib import Path

from bs4 import BeautifulSoup

In [2]:
# Download location of each dataset archive, keyed by dataset name.
DATA_PUBLIC_URLS = dict(
    crop_mapping='https://archive.ics.uci.edu/static/public/525/crop+mapping+using+fused+optical+radar+data+set.zip',
)

In [3]:
def download_file(dataset_name, url):
    """Download ``url`` and save it as ``../data/<dataset_name>.zip``.

    Args:
        dataset_name: Base name (without extension) used for the saved archive.
        url: Location of the file to fetch.

    Returns:
        The path the file was saved to, as reported by ``urlretrieve``.
    """
    destination = f"../data/{dataset_name}.zip"
    # Create the target directory up front so the download does not depend on
    # the data directory having been created beforehand.
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    saved_path, _headers = request.urlretrieve(url=url, filename=destination)
    return saved_path

def extract_zip_file(dataset_name):
    """Unpack ``../data/<dataset_name>.zip`` into ``../data/<dataset_name>``.

    The destination directory is created when it does not exist yet.

    Args:
        dataset_name: Base name shared by the archive and its output directory.
    """
    archive_path = f"../data/{dataset_name}.zip"
    target_dir = f"../data/{dataset_name}"

    os.makedirs(target_dir, exist_ok=True)

    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(path=target_dir)

In [4]:
# Make sure the shared data directory exists. exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating.
os.makedirs("../data", exist_ok=True)

In [5]:
# Fetch the crop mapping archive from its public URL, then unpack it
# into ../data/crop_mapping
dataset_name = 'crop_mapping'
url = DATA_PUBLIC_URLS[dataset_name]

download_file(dataset_name=dataset_name, url=url)
extract_zip_file(dataset_name=dataset_name)

In [6]:
# Give the extracted dataset file a .csv extension.
# The rename only happens while the .txt file is present, so this cell can be
# re-run on its own after it has already done its job.
winnipeg_txt_path = "../data/crop_mapping/WinnipegDataset.txt"
winnipeg_csv_path = "../data/crop_mapping/WinnipegDataset.csv"
if os.path.exists(winnipeg_txt_path):
    os.replace(winnipeg_txt_path, winnipeg_csv_path)
elif not os.path.exists(winnipeg_csv_path):
    # Neither file is there: something went wrong upstream, so do not stay silent.
    raise FileNotFoundError(
        f"Neither {winnipeg_txt_path} nor {winnipeg_csv_path} exists; "
        "run the download/extract cell first."
    )

In [7]:
# Retrieve the dataset's web page, which is parsed for the feature list below.
features_page_url = 'https://archive.ics.uci.edu/dataset/525/crop+mapping+using+fused+optical+radar+data+set'
# The with-block closes the HTTP response once the body has been read.
with request.urlopen(features_page_url) as response:
    # Stop here on an unexpected status: otherwise html_content would be left
    # undefined (or stale from an earlier run) for the cells that follow.
    if response.status != 200:
        raise RuntimeError(
            f"Failed to retrieve the page. Status code: {response.status}"
        )
    html_content = response.read()

In [8]:
soup = BeautifulSoup(html_content, 'html.parser')

# The third <p> carrying this exact class string holds the text we need.
# NOTE(review): the class includes a generated-looking suffix and the index is
# hard-coded; both presumably depend on the current page layout — confirm.
description_paragraphs = soup.find_all(
    "p", attrs={"class": "whitespace-pre-wrap svelte-1xc1tf7"}
)
features_description = description_paragraphs[2].text

# Keep the text from the first 'label' up to 4 characters before
# 'For more information' (presumably trailing line breaks — TODO confirm).
features_start = features_description.index('label')
features_end = features_description.index('For more information') - 4
features = features_description[features_start:features_end]

# Drop carriage returns so only '\n' line endings remain.
features = features.replace('\r', '')

In [9]:
# Save the scraped feature list next to the dataset. The encoding is pinned to
# UTF-8 so writing does not depend on the platform's default locale encoding.
with open("../data/crop_mapping/features.txt", "w", encoding="utf-8") as features_file:
    features_file.write(features)