In [5]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether this notebook is running inside Google Colab.

    Returns:
        True when the text "google.colab" appears in the string form of the
        active IPython shell object returned by `get_ipython()`, else False.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into the cloned directory.

    Relies on IPython's `!` shell escape and the `%cd` magic, so it only
    works inside an IPython/Jupyter kernel. After it returns, the kernel's
    working directory is `mlfs-book` (relative to where it was called).
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's Python dependencies with `uv`.

    First installs/upgrades `uv` via pip, then runs `uv pip install` with the
    `--all-extras` and `--system` flags against `pyproject.toml`. The
    `pyproject.toml` path is relative, so this must run from the directory
    that contains it. Uses IPython's `!` shell escape, so it only works
    inside an IPython/Jupyter kernel.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    # clone_repository() has already changed into the checkout, so the
    # current working directory is taken as the repository root.
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Walk up to the repository root when the notebook was started from
    # `<root>/notebooks/titanic` or from `<root>/notebooks`.
    if root_dir.name == 'titanic':
        root_dir = root_dir.parent
    if root_dir.name == 'notebooks':
        root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")

# Add the root directory to `sys.path` so that the project's Python package
# (`mlfs`, imported in the next cell) can be imported from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

Local environment
Added the following directory to the PYTHONPATH: /home/jdowling/Projects/mlfs-book


In [6]:
import os
import hopsworks
import pandas as pd
from mlfs import config
# Build the project settings object from the `.env` file at the repository root.
# NOTE(review): presumably this is what makes credentials (e.g. the Hopsworks
# API key) available to `hopsworks.login()` below — confirm in `mlfs.config`.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

# Log in to Hopsworks and obtain a handle to the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

2025-01-08 23:43:30,879 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-08 23:43:30,892 INFO: Initializing external client
2025-01-08 23:43:30,893 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-08 23:43:32,377 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/398


In [3]:
# Read the Titanic passenger data from `data/titanic.csv` under the repository root.
titanic_df = pd.read_csv(f"{root_dir}/data/titanic.csv")
# Bare last expression: display the dataframe as the cell output.
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Feature Engineering

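 * Keep only the passenger key, the feature columns, and the `Survived` label, then impute any missing values for `Age` (with the column mean) and `Embarked` (with the most frequent value)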

In [4]:
# Keep the passenger key, the feature columns, and the `Survived` label.
titanic_df = titanic_df.loc[
    :,
    ['PassengerId', 'Sex', 'Age', 'Pclass', 'Fare', 'Parch', 'SibSp', 'Embarked', 'Survived'],
]
# Per-column replacement values for missing entries:
# the mean age, and the most frequently occurring port of embarkation.
def_values = {
    'Age': titanic_df['Age'].mean(),
    'Embarked': titanic_df['Embarked'].value_counts().idxmax(),
}
titanic_df = titanic_df.fillna(def_values)
titanic_df

Unnamed: 0,PassengerId,Sex,Age,Pclass,Fare,Parch,SibSp,Embarked,Survived
0,1,male,22.000000,3,7.2500,0,1,S,0
1,2,female,38.000000,1,71.2833,0,1,C,1
2,3,female,26.000000,3,7.9250,0,0,S,1
3,4,female,35.000000,1,53.1000,0,1,S,1
4,5,male,35.000000,3,8.0500,0,0,S,0
...,...,...,...,...,...,...,...,...,...
886,887,male,27.000000,2,13.0000,0,0,S,0
887,888,female,19.000000,1,30.0000,0,0,S,1
888,889,female,29.699118,3,23.4500,2,1,S,0
889,890,male,26.000000,1,30.0000,0,0,C,1


In [5]:
# Get a handle to version 1 of the `titanic` feature group, keyed by `PassengerId`.
# NOTE(review): per the method name this reuses an existing feature group or
# defines a new one — confirm the exact create semantics in the Hopsworks docs.
titanic_fg = fs.get_or_create_feature_group(
    name="titanic",
    version=1,
    primary_key=['PassengerId'],
    description="Titanic passengers dataset"
)

In [6]:
# Write the imputed dataframe into the feature group. As the cell output shows,
# this uploads the rows and launches the offline materialization job.
# NOTE(review): `wait=True` presumably blocks until that job finishes — confirm
# against the Hopsworks `FeatureGroup.insert` documentation.
titanic_fg.insert(titanic_df, wait=True)





Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████████████████████| Rows 891/891 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: titanic_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/17565/jobs/named/titanic_1_offline_fg_materialization/executions


(Job('titanic_1_offline_fg_materialization', 'SPARK'), None)