In [None]:
from typing import Tuple, Any, Optional

import pandas as pd
import numpy as np
from pydantic import Field
from dagster import op, Out, In, job, Definitions,AssetKey, Config
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [None]:
class SplitDataConfig(Config):
    """Configuration schema accepted by the ``split_data`` op."""

    # Location of the CSV file that holds the data set.
    data_path: str = Field(
        default="./data/genres_standardized.csv",
        description="File path of the input data",
    )
    # Name of the column that carries the class label.
    target_column: str = Field(
        default="genre",
        description="Column name of the target column",
    )

In [None]:
@op(out={"input_train":Out(),"input_test":Out(),"target_train":Out(),"target_test":Out(),"target_names":Out()})
def split_data(config:SplitDataConfig)->Tuple[pd.DataFrame,pd.DataFrame,pd.Series,pd.Series,pd.Series]:
    """Load the data set and split it into train and test partitions.

    The CSV at ``config.data_path`` is read with ``;`` as separator. The column
    named by ``config.target_column`` is converted to a categorical and its
    integer category codes are used as the classification target; all other
    columns are used as features.

    Args:
        config: Supplies the input file path and the name of the target column.

    Returns:
        A tuple ``(input_train, input_test, target_train, target_test,
        target_names)`` where the first four elements are the 80/20
        feature/target split and ``target_names`` is the full categorical
        target column (its ``.cat.categories`` map codes back to labels).

    Raises:
        ValueError: If the configured target column is not present in the file.
    """
    target_column = config.target_column
    data = pd.read_csv(config.data_path, sep=";")
    if target_column not in data.columns:
        raise ValueError(
            f"Target column {target_column!r} not found in {config.data_path!r}"
        )

    # Derive the label series without mutating the loaded frame, so that an
    # existing feature column is never overwritten by the encoded target.
    target_names = data[target_column].astype("category")
    target_codes = target_names.cat.codes.rename("target")
    features = data.drop(columns=[target_column])

    # No random_state is passed, so the shuffle differs from run to run.
    X_train, X_test, y_train, y_test = train_test_split(features, target_codes, test_size=.2)
    return X_train, X_test, y_train, y_test, target_names

In [None]:
class TrainConfig(Config):
    """Configuration schema accepted by the ``train_classifier`` op.

    Each field is forwarded to the ``XGBClassifier`` constructor by that op.
    """

    number_of_estimators: int = Field(
        default=500,
        description="Number of boosting rounds",
    )
    learning_rate: float = Field(
        default=0.1,
        description="Boosting learning rate",
    )
    max_depth: int = Field(
        default=8,
        description="Maximum tree depth for base learners",
    )
    min_child_weight: float = Field(
        default=1,
        description="Minimum sum of instance weight(hessian) needed in a child",
    )
    gamma: float = Field(
        default=0,
        description="Minimum loss reduction required to make a further partition on a leaf node of the tree",
    )
    number_of_jobs: int = Field(
        default=4,
        description="Number of parallel threads used to run xgboost",
    )

In [None]:
@op(ins={"input_train": In(), "target_train": In()}, out={"classifier": Out()})
def train_classifier(config: TrainConfig, input_train: pd.DataFrame, target_train: pd.Series) -> XGBClassifier:
    """Fit an ``XGBClassifier`` on the training partition.

    Args:
        config: Hyperparameters passed to the ``XGBClassifier`` constructor.
        input_train: Feature columns of the training partition.
        target_train: Target values of the training partition.

    Returns:
        The fitted classifier.
    """
    # Map the config field names onto the constructor's keyword names.
    hyperparameters = {
        "learning_rate": config.learning_rate,
        "n_estimators": config.number_of_estimators,
        "max_depth": config.max_depth,
        "min_child_weight": config.min_child_weight,
        "gamma": config.gamma,
        "n_jobs": config.number_of_jobs,
    }
    classifier = XGBClassifier(**hyperparameters)
    classifier.fit(input_train, target_train)
    return classifier

In [None]:
@op(ins={"classifier": In(), "input_test": In()}, out={"predictions": Out()})
def predict(classifier: XGBClassifier, input_test: pd.DataFrame) -> np.ndarray:
    """Run the fitted classifier on the test features and return its predictions.

    Args:
        classifier: Classifier produced by the training op.
        input_test: Feature columns of the test partition.

    Returns:
        Whatever ``classifier.predict`` returns for ``input_test``.
    """
    return classifier.predict(input_test)

In [None]:
class AnalyzeConfig(Config):
    """Configuration schema accepted by the ``analyze`` op (output locations)."""

    # NOTE(review): "materix" in the default file name looks like a typo for
    # "matrix"; it is left unchanged so existing output locations do not move.
    confusion_matrix_path: str = Field(
        description="File path the confusion matrix figure is saved to",
        default="./data/confusion_materix.png",
    )
    report_path: str = Field(
        description="File path the classification report CSV is written to",
        default="./data/classification_report.csv",
    )

In [None]:
@op(ins={"target_test":In(),"predictions":In(),"target_names":In()})
def analyze(config:AnalyzeConfig,target_test:pd.Series,predictions:np.ndarray,target_names:pd.Series):
    """Write a confusion matrix image and a classification report CSV.

    Args:
        config: Supplies the output paths for the figure and the report.
        target_test: True target codes of the test partition.
        predictions: Predicted target codes for the test partition.
        target_names: Categorical series whose ``.cat.categories`` provide the
            display label for each target code.

    Returns:
        None. The results are written to ``config.confusion_matrix_path`` and
        ``config.report_path``.
    """
    category_labels = target_names.cat.categories
    # Category codes are 0..n-1. Passing them explicitly keeps the matrix axes
    # aligned with ``display_labels`` even when a class does not occur in the
    # test partition (otherwise the label count would not match the matrix).
    class_codes = np.arange(len(category_labels))

    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        ConfusionMatrixDisplay.from_predictions(
            target_test,
            predictions,
            labels=class_codes,
            display_labels=category_labels,
            ax=ax,
        )
        ax.tick_params(axis='x',labelrotation=70,labelbottom=True)
        # NOTE(review): matplotlib documents pad_inches as applying only with
        # bbox_inches="tight"; kept as in the original so the image is unchanged.
        fig.savefig(config.confusion_matrix_path,pad_inches=20)
    finally:
        # Release the figure so repeated runs in one process do not pile up
        # open figures.
        plt.close(fig)

    report = classification_report(target_test,predictions,output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report.to_csv(config.report_path)

In [None]:
@job
def spotify_genre_classification():
    """Wire the ops together: split the data, train, predict, then analyze."""
    features_train, features_test, labels_train, labels_test, label_names = split_data()
    model = train_classifier(input_train=features_train, target_train=labels_train)
    predicted = predict(classifier=model, input_test=features_test)
    analyze(target_test=labels_test, predictions=predicted, target_names=label_names)

In [None]:
# Definitions object listing the jobs declared in this notebook.
defs = Definitions(
    jobs=[spotify_genre_classification],
)