In [1]:
import pandas as pd

# Data loading and simple exploration

In [2]:
data = pd.read_csv('../data/03_primary/preprocessed_data.csv')
data.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-0.282401,-0.62872,1.237076,0.240536,0.644799,0.700348,-0.662809,-1.186289,0.051565,-0.212036,...,-0.120263,-0.238889,-0.705162,0.179966,0.699409,0.165166,0.075183,-0.030301,0.021784,0
1,1.186334,-0.083001,-7.13906,2.773082,-6.757845,4.446456,-5.464428,-1.713401,-6.485365,3.409395,...,-0.616949,1.30325,-0.016118,-0.87667,0.38223,-1.054624,-0.614606,-0.766848,0.409424,1
2,0.391253,0.870217,-1.01847,-0.800243,2.73563,-1.710196,-0.756206,-0.299152,-0.86865,0.263516,...,0.360591,0.499789,1.190104,-0.230243,0.016621,0.394464,0.046844,0.052241,0.082638,0
3,2.29344,0.289078,-1.000611,3.34685,-5.534491,6.835802,-0.299803,0.095951,-2.440419,1.286301,...,1.189814,0.439757,-0.694099,0.29966,-0.657601,0.101648,0.430457,0.824685,0.326952,1
4,-0.135401,0.969807,0.211042,1.352425,-2.257035,0.154344,0.530051,0.207244,-2.051166,-5.231705,...,1.100233,-2.511672,0.943468,-0.019769,0.586162,0.618007,0.621341,-0.098063,0.13397,0


The data is already preprocessed and it looks like there is no room for further feature engineering.<br>
<br>
That way, the only thing left to do is to push all the columns to our feature store.

# Hopsworks 

## Upload

In [2]:
from dotenv import load_dotenv
import os

In [3]:
load_dotenv()

True

In [4]:
import hopsworks
from great_expectations.core import ExpectationSuite

In [5]:
from keyword import iskeyword

In [7]:
iskeyword('class_')

False

In [8]:
def to_feature_store(
    df: pd.DataFrame,
    group_name: str,
    description: str,
    group_description: dict,
    feature_group_version: Union[int, None] = None,
    validation_expectation_suite: ExpectationSuite = None
):
    '''
    This function takes in a pandas DataFrame and a validation expectation suite,
      performs validation on the data using the suite, and then saves the data to a
      feature store in the feature store.

    Args:
        - data (pd.DataFrame): Dataframe with the data to be stored
        - group_name (str): Name of the feature group.
        - feature_group_version (int | None): Version of the feature group. If None, increment 1 from last version
        - description (str): Description for the feature group.
        - group_description (dict): Description of each feature of the feature group.
        - validation_expectation_suite (ExpectationSuite): group of expectations to check data.

    Returns:
        - A dictionary with the feature view version, feature view name and training dataset feature version.
    '''
    ##### DELETE THIS BIT AFTER GX IS IMPLEMENTED #####
    if validation_expectation_suite:
        raise NotImplementedError
    ###################################################

    if not isinstance(df, pd.DataFrame) and not isinstance(df, pd.Series):
        raise TypeError(f'Expect pd.DataFrame, got {type(df)}')
    if isinstance(df, pd.Series):
        new_df = pd.DataFrame({'index': df.index, df.name: df})
        df = new_df

    # Create primary key to posteriorly joins
    if 'index' not in df.columns:
        df = df.reset_index()

    # Hopsworks only accepts lowercase column names, better to sanitize beforehand. Also, try to protect from Python's reserved words
    df.columns = list(map(lambda x: x.lower() if not iskeyword(x.lower()) else x.lower() + '_', df.columns))

    # Get credentials
    project_name = os.environ.get('FS_PROJECT_NAME')
    api_key = os.environ.get('FS_API_KEY')

    # Connect to feature store.
    project = hopsworks.login(
        api_key_value=api_key, project=project_name
    )
    feature_store = project.get_feature_store()

    # Create feature group.
    object_feature_group = feature_store.get_or_create_feature_group(
        name=group_name,
        version=feature_group_version,
        primary_key=['index'],
        description= description,
        online_enabled=False,
        expectation_suite=validation_expectation_suite
    )

    # Upload data.
    object_feature_group.insert(
        features=df,
        overwrite=False,
        write_options={
            "wait_for_job": True,
        },
    )

    # Add feature descriptions.
    for description in group_description:
        object_feature_group.update_feature_description(
            description["name"], description["description"]
        )

    # Update statistics.
    object_feature_group.statistics_config = {
        "enabled": True,
        "histograms": True,
        "correlations": True,
    }
    object_feature_group.update_statistics_config()
    object_feature_group.compute_statistics()

    return object_feature_group

NameError: name 'Union' is not defined

### Test run

In [18]:
data.columns

Index(['scaled_amount', 'scaled_time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
       'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
       'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26',
       'V27', 'V28', 'Class'],
      dtype='object')

### Feature descriptions

In [82]:
feature_descriptions = [{'name': f'v{i}', 'description': 'Anonymized credit card data', 'validation_rules': 'TO DETERMINE'} for i in range(1, 29)]
feature_descriptions += [
    {'name': 'scaled_amount', 'description': 'Scaled amount of transaction', 'validation_rules': 'TO DETERMINE'},
    {'name': 'scaled_time', 'description': 'Scaled amount of time, relative to first transaction observation', 'validation_rules': 'TO DETERMINE'},
    {'name': 'index', 'description': 'Index of the observations', 'validation_rules': 'Positive integer, unique'},
]

feature_descriptions

[{'name': 'v1',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v2',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v3',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v4',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v5',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v6',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v7',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v8',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v9',
  'description': 'Anonymized credit card data',
  'validation_rules': 'TO DETERMINE'},
 {'name': 'v10',
  'description': 'Anonymized credit card data',
  'valid

In [115]:
class_description = [
    {'name': 'index', 'description': 'Index of the observations', 'validation_rules': 'Positive integer, unique'},
    {'name': 'class_', 'description': 'Predicted class of the observation. 1 for fraud, 0 otherwise', 'validation_rules': '0 or 1'}
]

In [84]:
X = data.drop(columns=['Class'])
y = data['Class']

In [85]:
to_feature_store(
    df=X, group_name='features',
    feature_group_version=1, description='Test run of features',
    group_description=feature_descriptions
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 0.00% |          | Rows 0/946 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/features_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x7b1253e48890>

In [116]:
to_feature_store(
    df=y, group_name='target',
    feature_group_version=5, description='Test run of target',
    group_description=class_description
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 0.00% |          | Rows 0/946 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: target_5_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/target_5_offline_fg_materialization/executions
Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/824468/jobs/named/target_5_compute_stats_22062024193250/executions


<hsfs.feature_group.FeatureGroup at 0x7b1253e067d0>

## Download

In [25]:
from typing import Union
from operator import attrgetter

def get_features(
    group_name: str,
    version: Union[int, None] = None
):
    '''
    This function takes in the group name of the desired features in Hopsworks and returns ad pd.DataFrame with them.

    Args:
        - group_name (str): Name of the feature group.
        - version (int | None): Version number of feature group. If None, latest is returned

    Returns:
        - A pd.DataFrame with the features.
    '''
    project_name = os.environ.get('FS_PROJECT_NAME')
    api_key = os.environ.get('FS_API_KEY')

    project = hopsworks.login(api_key_value=api_key, project=project_name)
    fs = project.get_feature_store()

    if version:
        features = fs.get_feature_group(name=group_name, version=version)
    else:
        # Get a list with all the versions and chooses the latest
        features = max(fs.get_feature_groups(name=group_name), key=attrgetter('version'))

    df = features.read()

    return df

In [26]:
y = get_features(
    'target'
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.68s) 


In [27]:
y

Unnamed: 0,index,class_
0,866,1
1,848,1
2,21,1
3,762,0
4,294,0
...,...,...
943,707,1
944,239,0
945,171,1
946,874,1


In [28]:
X = get_features(
    'features'
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/824468
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.43s) 


In [29]:
X

Unnamed: 0,index,scaled_amount,scaled_time,v1,v2,v3,v4,v5,v6,v7,...,v19,v20,v21,v22,v23,v24,v25,v26,v27,v28
0,866,4.570391,0.970947,-1.374424,2.793185,-4.346572,2.400731,-1.688433,0.111136,-0.922038,...,0.422342,0.650196,-0.870779,0.504849,0.137994,0.368275,0.103137,-0.414209,0.454982,0.096711
1,848,-0.293440,0.013769,-3.365265,2.928541,-5.660999,3.891160,-1.840375,-1.800887,-5.558679,...,2.355919,0.576379,0.875260,-0.102501,-0.606283,-0.743165,0.096319,-0.135060,1.238695,0.099824
2,21,8.747293,-0.489949,-4.423508,1.648048,-6.934388,4.894601,-5.078131,0.010849,-3.409096,...,2.590173,-0.562264,0.698359,0.487478,1.228698,-0.535217,0.388278,-0.009466,2.300164,0.081231
3,762,-0.237546,-0.245915,-1.117810,1.448685,2.040043,0.894239,-0.421149,-0.384121,0.181589,...,-0.080405,0.132128,-0.058182,-0.147958,-0.031986,0.592719,0.107354,-0.477520,0.240366,0.080978
4,294,11.482149,0.947615,0.676239,-3.088923,-3.719711,-0.077681,-0.133599,-0.605352,1.562385,...,-0.235869,1.085748,0.379340,-0.197804,-0.969146,-0.988181,0.302357,0.978962,-0.286391,0.031691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,707,-0.177601,0.778404,-2.423535,1.659093,-3.071421,2.588033,1.135791,-1.892388,-2.588418,...,0.028485,0.353898,-0.934127,0.922038,-0.180255,-0.281719,0.299285,-0.263801,0.150156,0.292112
944,239,-0.169077,-0.573891,1.158798,0.088561,0.224882,1.022582,-0.103449,0.063610,-0.180103,...,-0.408696,-0.263354,0.109261,0.258725,-0.114308,-0.344156,0.547584,-0.242593,0.016818,0.002048
945,171,18.615944,-0.226824,-2.923827,1.524837,-3.018758,3.289291,-5.755542,2.218276,-0.509995,...,1.176446,-0.447039,-0.511657,-0.122724,-4.288639,0.563797,-0.949451,-0.204532,1.510206,-0.324706
946,874,9.020471,0.706258,-2.405207,2.943823,-7.616654,3.533374,-5.417494,-0.112632,-1.329372,...,3.490069,-0.338707,0.652683,0.414132,0.023869,-0.260616,0.405316,0.029107,0.519807,-0.469537


In [30]:
import pandas as pd
df = pd.merge(left=X, right=y, how='inner', on='index')

In [34]:
df = df.set_index('index').sort_index()
df.head()

Unnamed: 0_level_0,scaled_amount,scaled_time,v1,v2,v3,v4,v5,v6,v7,v8,...,v20,v21,v22,v23,v24,v25,v26,v27,v28,class_
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.098093,0.26322,0.393037,-2.448608,-3.237613,1.792742,0.648011,0.401792,1.725625,-0.517292,...,1.547939,0.516173,-0.1028,-0.951883,-0.342286,0.390038,-0.504712,-0.228074,0.066442,0
1,9.905121,-0.663189,-2.787248,-0.07134,-1.505288,3.361777,-3.357422,0.565835,0.303653,0.966914,...,1.607397,0.88294,-0.246202,1.752227,0.219925,0.156282,-0.265894,0.220694,0.256077,1
2,-0.296793,-0.569227,1.287651,0.4976,-1.016074,0.350477,0.989378,0.157572,0.216842,0.028636,...,-0.026498,-0.14145,-0.32467,-0.232334,-1.391343,0.661412,0.462251,-0.020189,0.010466,0
3,2.426605,0.66528,-6.352337,-2.370335,-4.875397,2.335045,-0.809555,-0.413647,-4.082308,2.239089,...,0.186898,1.325218,1.226745,-1.485217,-1.470732,-0.240053,0.112972,0.910591,-0.650944,1
4,-0.148397,-0.511178,-7.334341,4.960892,-8.45141,8.174825,-7.237464,-2.382711,-11.508842,4.635798,...,0.723314,2.153755,0.033922,-0.014095,0.62525,-0.05339,0.164709,1.411047,0.315645,1


In [38]:
df.rename(columns=lambda x: x.upper() if 'v' in x else 'Class' if x == 'class_' else x)

Unnamed: 0_level_0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.098093,0.263220,0.393037,-2.448608,-3.237613,1.792742,0.648011,0.401792,1.725625,-0.517292,...,1.547939,0.516173,-0.102800,-0.951883,-0.342286,0.390038,-0.504712,-0.228074,0.066442,0
1,9.905121,-0.663189,-2.787248,-0.071340,-1.505288,3.361777,-3.357422,0.565835,0.303653,0.966914,...,1.607397,0.882940,-0.246202,1.752227,0.219925,0.156282,-0.265894,0.220694,0.256077,1
2,-0.296793,-0.569227,1.287651,0.497600,-1.016074,0.350477,0.989378,0.157572,0.216842,0.028636,...,-0.026498,-0.141450,-0.324670,-0.232334,-1.391343,0.661412,0.462251,-0.020189,0.010466,0
3,2.426605,0.665280,-6.352337,-2.370335,-4.875397,2.335045,-0.809555,-0.413647,-4.082308,2.239089,...,0.186898,1.325218,1.226745,-1.485217,-1.470732,-0.240053,0.112972,0.910591,-0.650944,1
4,-0.148397,-0.511178,-7.334341,4.960892,-8.451410,8.174825,-7.237464,-2.382711,-11.508842,4.635798,...,0.723314,2.153755,0.033922,-0.014095,0.625250,-0.053390,0.164709,1.411047,0.315645,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,-0.293440,-0.092048,1.140208,1.156431,-1.471578,2.076278,0.774809,-1.002532,0.264948,0.013162,...,-0.125097,-0.387895,-0.866812,-0.121583,-0.356109,0.634573,-0.306311,0.094087,0.121065,1
944,4.108992,0.107626,-5.839192,7.151532,-12.816760,7.031115,-9.651272,-2.938427,-11.543207,4.843627,...,0.055684,2.462056,1.054865,0.530481,0.472670,-0.275998,0.282435,0.104886,0.254417,1
945,1.020052,0.371597,1.990802,-1.241515,-0.569048,-0.974138,-1.047198,-0.211196,-1.030209,-0.031967,...,0.227852,0.478431,1.254151,-0.019371,-0.426786,-0.170647,-0.067825,0.001676,-0.043149,0
946,0.539370,-0.574737,-3.218952,2.708535,-3.263042,1.361866,-1.645776,-1.852982,-3.069958,-1.796876,...,0.571654,1.807877,-0.890421,-0.325814,0.123040,-0.093014,0.232106,-0.310519,-0.745295,1
