## <span style='color:#ff5f27'> 📝 Imports </span>

In [None]:
# !pip install -U 'hopsworks[python]' --quiet

In [None]:
!pip install -r requirements.txt -q

In [None]:
import config
import pandas as pd

from functions.utils import (
    load_image,
    show_image,
    download_and_extract_zip,
)
from functions.donut import (
    load_cheque_parser,
    parse_text,
)
from features.cheque import (
    spell_check,
    amount_letter_number_match,
    get_amount_match_column,
)

## <span style='color:#ff5f27'>🗄️ Data Loading </span>

In [None]:
# Fetch the dataset archive referenced by config.DOWNLOAD_URL.
# NOTE(review): the next cell reads 'data/res.csv', presumably produced by this
# step — confirm the extraction target in functions.utils.
download_and_extract_zip(config.DOWNLOAD_URL)

In [None]:
# Read the cheque metadata, lower-case every column header, then give the two
# amount columns descriptive names.
data = (
    pd.read_csv('data/res.csv')
    .rename(columns=str.lower)
    .rename(
        columns={
            'value_letters': 'amount_in_text',
            'value_numbers': 'amount_in_numbers',
        },
    )
)
data.head()

In [None]:
# Number of cheques per issuing bank.
data['bank_name'].value_counts()

In [None]:
# Distribution of the `valid` flag as shipped with the dataset.
data['valid'].value_counts()

## <span style='color:#ff5f27'> 👨🏻‍🎨 Data Visualization </span>

In [None]:
# Load the cheque scan '1.jpg' and display it.
# NOTE(review): load_image receives only the file name; the base directory it
# resolves against is not visible here — see functions.utils.
image1 = load_image('1.jpg')

show_image(image1)

In [None]:
# First metadata row, for comparison with the image shown above.
data.head(n=1)

In [None]:
# Preview the first five cheques whose `valid` flag is 0.
data.loc[data['valid'].eq(0)].head(5)

In [None]:
# Display cheque scan '501.jpg'.
# NOTE(review): presumably one of the valid == 0 cheques listed above — confirm.
image501 = load_image('501.jpg')
show_image(image501)

In [None]:
# Display cheque scan '502.jpg'.
# NOTE(review): presumably one of the valid == 0 cheques listed above — confirm.
image502 = load_image('502.jpg')
show_image(image502)

In [None]:
# Display cheque scan '503.jpg'.
# NOTE(review): presumably one of the valid == 0 cheques listed above — confirm.
image503 = load_image('503.jpg')
show_image(image503)

## <span style='color:#ff5f27'>👩🏻‍🔬 Feature Engineering </span>


### <span style='color:#ff5f27'>⛳️ Spell Check </span>


In [None]:
# Baseline: an amount phrase in which every word is spelled correctly.
spell_check('Three Thousand Seven Hundred and Fifty Five')

In [None]:
# Same phrase with a misspelled first word ('Threee').
spell_check('Threee Thousand Seven Hundred and Fifty Five')

In [None]:
# Lower-case phrase starting with 'for' — a real word, but presumably a slip
# for 'four'; shows how spell_check treats a wrong-but-valid word.
spell_check('for thousand seven hundred and thirty six')

In [None]:
# Blank input (a single space); rows with amount_in_text == ' ' are inspected
# later in this notebook.
spell_check(' ')

In [None]:
# The literal string 'missing' — presumably the placeholder for an absent
# amount; confirm against features.cheque.
spell_check('missing')

In [None]:
# Run spell_check on every textual amount and unpack its two results into the
# `spelling_is_correct` and `amount_in_text_corrected` columns.
data[['spelling_is_correct', 'amount_in_text_corrected']] = (
    data['amount_in_text']
    .apply(lambda text: pd.Series(spell_check(text)))
)
data.head(n=3)

In [None]:
# Inspect rows whose textual amount is a single blank space.
data.loc[data['amount_in_text'].eq(' ')].head(3)

### <span style='color:#ff5f27'>⛳️ Amount in Letter and Number Match </span>


In [None]:
# Words and digits describe the same amount (3755).
amount_letter_number_match('three thousand seven hundred and fifty five', '3755')

In [None]:
# Misspelled words ('ThreeE') that also disagree with the digits (7203).
amount_letter_number_match('ThreeE Thousand Eight Hundred and Twenty Three', '7203')

In [None]:
# Textual amount given as the literal 'missing', with digits present.
amount_letter_number_match('missing', '3754')

In [None]:
# Textual amount present, numeric amount given as the literal 'missing'.
amount_letter_number_match('Three Thousand Eight', 'missing')

In [None]:
# Compare, row by row, the spell-corrected textual amount with the numeric
# amount and store the outcome of get_amount_match_column.
data['amount_letter_number_match'] = data[
    ['amount_in_text_corrected', 'amount_in_numbers']
].apply(
    lambda row: get_amount_match_column(
        row['amount_in_text_corrected'],
        row['amount_in_numbers'],
    ),
    axis=1,
)
data.head(n=3)

In [None]:
# A cheque is forced to invalid only when BOTH checks fail: the spelling was
# wrong and the textual amount does not match the numeric one.
condition = (
    data['spelling_is_correct'].eq(False)
    & data['amount_letter_number_match'].eq(False)
)

data.loc[condition, 'valid'] = 0

data.loc[condition].head(3)

In [None]:
# Distribution of the `valid` flag after the feature-based invalidation above.
data['valid'].value_counts()

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
# Imported here rather than in the first cell, so the Hopsworks dependency is
# only needed from this section onward.
import hopsworks

# Log in to Hopsworks and keep a handle on the returned project object.
# NOTE(review): login() is called without arguments, so credentials must come
# from the environment or an interactive prompt — confirm for unattended runs.
project = hopsworks.login()

# Feature store handle used for the feature group created below.
fs = project.get_feature_store() 

## <span style="color:#ff5f27;"> 🪄 Feature Group Creation </span>

In [None]:
# Get or create the 'cheque_fg' feature group
cheque_fg = fs.get_or_create_feature_group(
    name="cheque_fg",
    description='Parsed Cheque Information',
    primary_key=['cheque_no'],
    version=1,
)

cheque_fg.insert(data)

---