In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd

def load_mushroom_data():
    """Fetch OpenML dataset 24 and prepare it for binary classification.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature frame after ``pd.get_dummies``
        and ``y`` is an integer Series that is 1 where the original label
        equals ``"p"`` (poisonous) and 0 otherwise.
    """
    raw_features, labels = fetch_openml(data_id=24, return_X_y=True, as_frame=True)
    is_poisonous = labels == "p"
    return pd.get_dummies(raw_features), is_poisonous.astype(int)


# Load the mushroom features/target. Note: later cells rebind X and y.
X, y = load_mushroom_data()

In [2]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

def load_california_housing_data():
    """Return the scikit-learn California housing data as pandas objects.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame whose columns are the dataset's
        ``feature_names`` and ``y`` is a Series named ``"target"``.
    """
    bunch = fetch_california_housing()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    target = pd.Series(bunch.target, name="target")
    return features, target

# Load the California housing features/target. Note: this rebinds X and y.
X, y = load_california_housing_data()

In [3]:
# https://www.axxio.io/wp-content/uploads/2018/05/documentation.pdf
import numpy as np
import pandas as pd


def load_german_credit_data(source=None):
    """Load the numeric Statlog German credit data as a binary task.

    Parameters
    ----------
    source : str, path-like or file-like, optional
        Anything ``pd.read_csv`` can read (URL, local path, open buffer)
        containing the whitespace-delimited ``german.data-numeric`` table.
        Defaults to the copy hosted in the UCI repository.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the 24 feature columns and ``y`` is an
        integer Series that is 1 where ``credit_risk == 2`` ("Bad") and 0
        otherwise.
    """
    if source is None:
        source = (
            "https://archive.ics.uci.edu/ml/machine-learning-databases"
            "/statlog/german/german.data-numeric"
        )

    col_names = [
        "checking_account", "duration", "credit_history", "credit_amount",
        "savings_account", "employment_duration", "personal_status",
        "residence_duration", "property", "age", "other_installment_plans",
        "number_credits", "people_liable", "telephone", "foreign_worker",
        "purpose_car_new", "purpose_car_used", "other_debtors_none",
        "other_debtors_coapplicant", "housing_rent", "housing_own",
        "job_unskilled_non_resident", "job_unskilled_resident",
        "job_employee", "credit_risk"]

    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True: any run of whitespace separates fields.
    df = pd.read_csv(source, header=None, names=col_names, sep=r"\s+")

    X = df.drop(columns="credit_risk")
    y = (df["credit_risk"] == 2).astype(int)  # 2 = "Bad"

    return X, y

# Load the German credit features/target. Note: this rebinds X and y.
X, y = load_german_credit_data()

In [4]:
import pandas as pd

def load_palmer_penguins(only_numeric=True, no_missing=True, multiclass=True, url=None):
    """Load the Palmer penguins table as features plus an integer species target.

    Parameters
    ----------
    only_numeric : bool, default True
        If True, return only the four numeric measurement columns; otherwise
        also include the categorical ``island`` and ``sex`` columns.
    no_missing : bool, default True
        If True, drop every row that has a missing value in any column of the
        raw table before building features and target.
    multiclass : bool, default True
        If True, encode species as Adelie=1, Gentoo=2, Chinstrap=3; otherwise
        encode Adelie=1 and the other two species as 0.
    url : str, path-like or file-like, optional
        Anything ``pd.read_csv`` can read. Defaults to the CSV in the
        ``allisonhorst/palmerpenguins`` GitHub repository.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the selected feature frame and ``y`` is the
        encoded ``species`` Series, row-aligned with ``X``.
    """
    if url is None:
        url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/main/inst/extdata/penguins.csv"
    numeric_features = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
    categorical_features = ["island", "sex"]
    if multiclass:
        species_codes = {"Adelie": 1, "Gentoo": 2, "Chinstrap": 3}
    else:
        species_codes = {"Adelie": 1, "Gentoo": 0, "Chinstrap": 0}

    data = pd.read_csv(url)
    if no_missing:
        data = data.dropna()

    # Series.map gives an integer target directly. Series.replace on a string
    # column relies on silent downcasting, which newer pandas deprecates.
    # A species outside the three listed above would map to NaN.
    target = data["species"].map(species_codes)

    selected = numeric_features if only_numeric else numeric_features + categorical_features
    return data[selected], target
    
# Binary variant: numeric features only, rows with missing values dropped,
# target is 1 for Adelie and 0 for the other species. This rebinds X and y.
X, y = load_palmer_penguins(only_numeric=True, no_missing=True, multiclass=False)

In [5]:
import pandas as pd
import urllib.request
import zipfile

def get_movielens(url="https://files.grouplens.org/datasets/movielens/ml-1m.zip",
                  filename="ml-1m.zip", force_download=False):
    """Return the MovieLens 1M ratings joined with user and movie attributes.

    The archive is downloaded to ``filename`` only when that file does not
    exist yet (or when ``force_download`` is True), so re-running the cell
    does not fetch it again.

    Parameters
    ----------
    url : str
        Location of the ``ml-1m.zip`` archive.
    filename : str
        Local path where the archive is stored and read from.
    force_download : bool, default False
        If True, download the archive even if ``filename`` already exists.

    Returns
    -------
    pandas.DataFrame
        Inner join of ratings with users (on ``user_id``) and then with
        movies (on ``movie_id``).
    """
    import os  # local import: this cell's import block only has pandas/urllib/zipfile

    if force_download or not os.path.exists(filename):
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated archive behind under the final name.
        partial = filename + ".part"
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, filename)

    # The .dat files use "::" as separator, which needs the python engine.
    params = {"sep": "::", "engine": "python", "encoding": "latin-1"}
    tables = {
        "users": ["user_id", "gender", "age", "occupation", "zip"],
        "ratings": ["user_id", "movie_id", "rating", "timestamp"],
        "movies": ["movie_id", "title", "genres"],
    }

    frames = {}
    with zipfile.ZipFile(filename, "r") as zip_file:
        for table, columns in tables.items():
            with zip_file.open("ml-1m/{}.dat".format(table)) as file:
                frames[table] = pd.read_csv(file, names=columns, **params)

    rated_by = pd.merge(frames["ratings"], frames["users"], on="user_id")
    return pd.merge(rated_by, frames["movies"], on="movie_id")
        
# Ratings joined with user and movie attributes (archive is stored in the working directory).
movielens = get_movielens()