In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from loguru import logger
import yaml
import sys
from pyspark.sql import SparkSession
import pandas as pd

from mlops_course.config import ProjectConfig
from mlops_course.data_processor import DataProcessor
from marvelous.logging import setup_logging
from marvelous.timer import Timer

config = ProjectConfig.from_yaml(config_path="../project_config.yml", env="dev")

setup_logging(log_file="logs/marvelous-1.log")

logger.info("Configuration loaded:")
logger.info(yaml.dump(config, default_flow_style=False))

[32m2025-05-10 10:23:04[0m | [1mINFO[0m | [36m2428171464[0m:[36m<module>[0m - [1mConfiguration loaded:[0m
[32m2025-05-10 10:23:04[0m | [1mINFO[0m | [36m2428171464[0m:[36m<module>[0m - [1m!!python/object:mlops_course.config.ProjectConfig
__dict__:
  cat_features:
  - name
  - full_name
  - birth_date
  - positions
  - nationality
  - preferred_foot
  - body_type
  - national_team
  - national_team_position
  catalog_name: mlops_dev
  num_features:
  - age
  - height_cm
  - weight_kgs
  - overall_rating
  - potential
  - wage_euro
  - international_rep
  - weak_foot
  - skill_moves
  - release_clause_euro
  - national_rating
  - national_jersey_number
  - crossing
  - finishing
  - heading_accuracy
  - short_passing
  - volleys
  - dribbling
  - curve
  - freekick_accuracy
  - long_passing
  - ball_control
  - acceleration
  - sprint_speed
  - agility
  - reactions
  - balance
  - shot_power
  - jumping
  - stamina
  - strength
  - long_shots
  - aggression
  - interce

In [3]:
# Load the fifa players dataset
spark = SparkSession.builder.getOrCreate()

filepath = "../data/fifa_players.csv"

# Load the data
df = pd.read_csv(filepath)
print(df.columns.values.tolist())


['name', 'full_name', 'birth_date', 'age', 'height_cm', 'weight_kgs', 'positions', 'nationality', 'overall_rating', 'potential', 'value_euro', 'wage_euro', 'preferred_foot', 'international_reputation(1-5)', 'weak_foot(1-5)', 'skill_moves(1-5)', 'body_type', 'release_clause_euro', 'national_team', 'national_rating', 'national_team_position', 'national_jersey_number', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'freekick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'marking', 'standing_tackle', 'sliding_tackle']


In [4]:
# Load the fifa players dataset
with Timer() as preprocess_timer:
    # Initialize DataProcessor
    data_processor = DataProcessor(df, config, spark)

    # Preprocess the data
    data_processor.preprocess()

logger.info(f"Data preprocessing: {preprocess_timer}")

['name', 'full_name', 'birth_date', 'age', 'height_cm', 'weight_kgs', 'positions', 'nationality', 'overall_rating', 'potential', 'value_euro', 'wage_euro', 'preferred_foot', 'international_rep', 'weak_foot', 'skill_moves', 'body_type', 'release_clause_euro', 'national_team', 'national_rating', 'national_team_position', 'national_jersey_number', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'freekick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'marking', 'standing_tackle', 'sliding_tackle']
[32m2025-05-10 10:23:04[0m | [1mINFO[0m | [36m3306976369[0m:[36m<module>[0m - [1mData preprocessing: Elapsed time: 0.0564 seconds[0m


In [5]:
# Split the data
X_train, X_test = data_processor.split_data()
logger.info("Training set shape: %s", X_train.shape)
logger.info("Test set shape: %s", X_test.shape)

[32m2025-05-10 10:23:04[0m | [1mINFO[0m | [36m2594902236[0m:[36m<module>[0m - [1mTraining set shape: %s[0m
[32m2025-05-10 10:23:04[0m | [1mINFO[0m | [36m2594902236[0m:[36m<module>[0m - [1mTest set shape: %s[0m


In [6]:
# Save to catalog
logger.info("Saving data to catalog")
data_processor.save_to_catalog(X_train, X_test)

# Enable change data feed (only once!)
logger.info("Enable change data feed")
data_processor.enable_change_data_feed()

[32m2025-05-10 10:23:04[0m | [1mINFO[0m | [36m737852268[0m:[36m<module>[0m - [1mSaving data to catalog[0m
[32m2025-05-10 10:24:01[0m | [1mINFO[0m | [36m737852268[0m:[36m<module>[0m - [1mEnable change data feed[0m
