# KAGGLE : **San Francisco Crime Classification**

## __. **REFERENCE**

#### __.1. **COLUMNS SPECIFICATION**

<b></b>

## 00. **SET WORK ENVIRONMENT**

#### 00.1. **DEFINE PRE-VARIABLES**

In [1]:
# Notebook-wide constants.
seed_num = 2024
compet_nm = 'SFCC'
run_time_limit = 3 * 60 * 60  # seconds (3 hours); passed to `predictor.fit` as `time_limit`

#### 00.2. **IMPORT PACKAGES AND SET OPTIONS**

In [None]:
#(1) Import packages
import os
import warnings
import datetime
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from autogluon.tabular import TabularPredictor
import shap

#(2) Set system options
# Silence every warning for the rest of the session.
warnings.simplefilter(action='ignore')
# Compact numeric output: 3 decimals for numpy arrays (no scientific notation), 4 decimals for pandas floats.
np.set_printoptions(suppress=True, precision=3)
pd.set_option('display.float_format', '{:.4f}'.format)
# Load the JavaScript that SHAP's interactive plots need inside the notebook.
shap.initjs()

#### 00.3. **CREATE FUNCTIONS**

In [3]:
#(1) Create `relocate_col()` function
def relocate_col(df:pd.DataFrame, tar_col:str, std_col:str, how:str='after') -> pd.DataFrame : 
    '''
    Reorder columns in a DataFrame by moving a target column relative to a standard column.

    The target column is placed immediately after (or immediately before) the standard column,
    wherever the two columns currently sit relative to each other. All other columns keep their
    relative order.

    Parameters :
    - df (pd.DataFrame): The DataFrame from which the column will be relocated.
    - tar_col (str): The name of the column to be relocated.
    - std_col (str): The column relative to which `tar_col` will be relocated.
    - how (str, optional): Specifies the placement of `tar_col` relative to `std_col`.
      It can be 'after' (default) or 'before'.

    Returns :
    - pd.DataFrame: A new DataFrame with the column `tar_col` relocated as specified.
      If `tar_col` and `std_col` are the same column, the column order is left unchanged.

    Raises :
    - IndexError: If `tar_col` or `std_col` is not a column of `df`.
    - ValueError: If `how` is neither 'after' nor 'before'.
    '''
    col_ary = np.array(object=df.columns)
    tar_col_idx = np.where(col_ary == tar_col)[0][0]
    if how not in ('after', 'before') :
        # Falling through here used to return a frame with `tar_col` silently dropped.
        raise ValueError(f"`how` must be 'after' or 'before', got {how!r}")
    if tar_col == std_col :
        return df.loc[:, col_ary]
    col_ary = np.delete(arr=col_ary, obj=tar_col_idx)
    # Look `std_col` up AFTER the deletion: removing `tar_col` shifts every column to its
    # right by one, so an index taken beforehand can be off by one.
    std_col_idx = np.where(col_ary == std_col)[0][0]
    insert_idx = std_col_idx + 1 if how == 'after' else std_col_idx
    col_ary = np.insert(arr=col_ary, obj=insert_idx, values=tar_col)
    return df.loc[:, col_ary]

#(2) Create `diagnose_df()` function
def diagnose_df(df:pd.DataFrame) -> pd.DataFrame : 
    '''
    Build a per-column quality report for a DataFrame.

    Parameters :
    - df (pd.DataFrame): The DataFrame to be diagnosed.

    Returns :
    - pd.DataFrame: One row per column of `df`, with the columns
      `COLUMN_NM`, `DATA_TYPE`, `ROW_CNT`, `NA_CNT`, `NA_RATE`, `UNIQUE_CNT` and `UNIQUE_RATE`.
      The three count columns are returned as strings with thousands separators;
      the two rate columns are the corresponding counts divided by the number of rows.
    '''
    total_rows = len(df)
    report = pd.DataFrame(data=df.dtypes).reset_index()
    report.columns = ['COLUMN_NM', 'DATA_TYPE']
    report['ROW_CNT'] = total_rows
    report['NA_CNT'] = df.isna().sum().to_numpy()
    report['NA_RATE'] = report['NA_CNT'] / report['ROW_CNT']
    report['UNIQUE_CNT'] = df.nunique().to_numpy()
    report['UNIQUE_RATE'] = report['UNIQUE_CNT'] / report['ROW_CNT']
    # Rates are computed above from the numeric counts; only now are the counts turned into text.
    for count_col in ('ROW_CNT', 'NA_CNT', 'UNIQUE_CNT') :
        report[count_col] = report[count_col].map('{:,.0f}'.format)
    return report

#(3) Create `diagnose_num_df()` function
def diagnose_num_df(df:pd.DataFrame) -> pd.DataFrame : 
    '''
    Summarise the numeric columns of a DataFrame with descriptive statistics.

    Parameters :
    - df (pd.DataFrame): The DataFrame to be diagnosed. Non-numeric columns are ignored.

    Returns :
    - pd.DataFrame: One row per numeric column. The columns are the output of
      `DataFrame.describe` at the 1/10/25/50/75/90/99 percentiles, plus `MEDIAN` and `MODE`
      inserted right after `MEAN`. All column names are upper-cased and `COUNT` is
      renamed to `ROW_CNT` (cast to int).
    '''
    numeric = df.select_dtypes(include='number')
    quantiles = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
    stats = numeric.describe(include='all', percentiles=quantiles).T
    stats['median'] = numeric.median()
    # `mode()` can return several rows per column; only the first row is kept.
    stats['mode'] = numeric.mode().values[0]
    for moved, anchor in (('median', 'mean'), ('mode', 'median')) :
        stats = relocate_col(df=stats, tar_col=moved, std_col=anchor, how='after')
    stats.columns = np.array(object=[np.char.upper(name) for name in stats.columns])
    stats = stats.rename(columns={'COUNT' : 'ROW_CNT'})
    stats['ROW_CNT'] = stats['ROW_CNT'].astype(dtype='int')
    return stats

#(3) Create `split_dt()` function
def split_dt(df:pd.DataFrame, tar_col:str) -> pd.DataFrame :
    '''
    Replace a datetime column with integer `year`, `month`, `day`, `hour` and `minute` columns.

    The five new columns take the position of `tar_col`, in that order; `tar_col` itself is
    removed. The input DataFrame is not modified.

    Parameters :
    - df (pd.DataFrame): The DataFrame holding the datetime column.
    - tar_col (str): The name of the datetime column to be split.

    Returns :
    - pd.DataFrame: A new DataFrame with `tar_col` replaced by the five integer columns.

    Raises :
    - KeyError: If `tar_col` is not a column of `df`.
    - AttributeError: If `tar_col` is not datetime-like (no `.dt` accessor).
    - ValueError: If `tar_col` contains missing timestamps (NaT), which cannot be cast to int.
    '''
    part_names = ('year', 'month', 'day', 'hour', 'minute')
    # Read the components straight from the datetime accessor instead of formatting every
    # value to a string and parsing it back.
    stamps = df.loc[:, tar_col].dt
    output = df.copy()
    for part in part_names :
        output[part] = getattr(stamps, part).astype(dtype='int')
    # Build the final column order once: the parts take the slot of `tar_col`.
    ordered = []
    for col in df.columns :
        if col == tar_col :
            ordered.extend(part_names)
        elif col not in part_names :
            ordered.append(col)
    return output.loc[:, ordered]

#(4)
def split_addr(df:pd.DataFrame, tar_col:str) -> pd.DataFrame : 
    '''
    Normalise the spacing of an address column.

    Every run of two or more whitespace characters in `tar_col` is collapsed into a single
    space. Despite the function name, no further splitting of the address is implemented yet.
    The input DataFrame is not modified.

    Parameters :
    - df (pd.DataFrame): The DataFrame holding the address column.
    - tar_col (str): The name of the (string) address column to clean.

    Returns :
    - pd.DataFrame: A copy of `df` with the cleaned `tar_col`.
    '''
    output = df.copy()
    # `regex=True` is passed explicitly because the default of `Series.str.replace`
    # differs between pandas versions.
    output.loc[:, tar_col] = output.loc[:, tar_col].str.replace(pat=r'\s{2,}', repl=' ', regex=True)
    return output

#### 00.4. **CREATE CLASSES**

In [4]:
#(1) Create `AutogluonWrapper`
class AutogluonWrapper :
    '''
    Adapter exposing a fitted predictor through a plain `predict_proba(X)` callable.

    Attributes :
    - ag_model: The wrapped predictor; it must provide `problem_type` and `predict_proba`.
    - feature_names: Column names used when a raw array has to be turned into a DataFrame.
    - target_class: Label of the single class whose probability is returned, or None
      to return the predictor's full output.
    '''
    def __init__(self, predictor, feature_names, target_class=None):
        self.ag_model = predictor
        self.feature_names = feature_names
        self.target_class = target_class
        if target_class is None and predictor.problem_type != 'regression' :
            print("Since target_class not specified, SHAP will explain predictions for each class")

    def predict_proba(self, X) :
        '''
        Return the wrapped predictor's probabilities for `X`.

        `X` may be a DataFrame, a 2-D array, or a single row given as a pandas Series;
        anything that is not a DataFrame is converted to one using `feature_names`.
        '''
        if isinstance(X, pd.Series):
            # A single row: reshape to (1, n_features) before building the frame.
            X = X.values.reshape(1, -1)
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X, columns=self.feature_names)
        probabilities = self.ag_model.predict_proba(frame)
        return_everything = self.ag_model.problem_type == "regression" or self.target_class is None
        if return_everything:
            return probabilities
        return probabilities[self.target_class]

<b></b>

## 01. **READ AND CONCATENATE DATASETS**

##### 01.1. **READ DATASETS**

In [5]:
#(1) Read the raw train / test CSV files from `../data`, resolved against the current working directory
train_raw = pd.read_csv(f'{os.getcwd()}/../data/train.csv', sep=',')
test_raw = pd.read_csv(f'{os.getcwd()}/../data/test.csv', sep=',')

<b></b>

## 02. **EDA**

#### 02.1. **CHECK DATASET SHAPE**

In [None]:
# Report the (rows, columns) shape of each raw dataset.
print(f'>> Train raw dataset shape : {train_raw.shape}')
print(f'>> Test raw dataset shape : {test_raw.shape}')

#### 02.2. **CHECK COLUMN NOT IN TEST**

In [None]:
# Boolean mask over the train columns: True where the column does not exist in the test set.
CON = np.isin(element=train_raw.columns, test_elements=test_raw.columns, invert=True)
train_raw.columns[CON]

#### 02.3. **DIAGNOSE DATASETS**

##### 02.3.1. **CHECK _**

In [None]:
#(1) Diagnose `train_raw` : dtype, missing-value count/rate and unique-value count/rate per column
diag_train = diagnose_df(df=train_raw)

#(2) Print `diag_train`
diag_train

In [None]:
#(3) Diagnose `test_raw` : dtype, missing-value count/rate and unique-value count/rate per column
diag_test = diagnose_df(df=test_raw)

#(4) Print `diag_test`
diag_test

##### 02.3.2. **CHECK _**

In [None]:
#(5) Collect the names of the float / int columns of `train_raw`
num_cols = train_raw.select_dtypes(include=['float', 'int']).columns

#(6) Diagnose number columns (descriptive statistics per column)
diagnose_num_df(df=train_raw.loc[:, num_cols])

<b></b>

## 03. **CLEANSE DATASETS**

##### 03.1. **SET FORMAT**

In [None]:
#(1) Work on copies so that `train_raw` / `test_raw` stay untouched
train = train_raw.copy()
test = test_raw.copy()

#(2) Drop fully duplicated rows from the train set
train = train.drop_duplicates()

#(3) Drop `Descript` / `Resolution` from train and `Id` from test
# (presumably because `Descript` / `Resolution` are absent from the test set — see section 02.2; confirm)
train = train.drop(labels=['Descript', 'Resolution'], axis=1)
test = test.drop(labels=['Id'], axis=1)

#(3) Move the `Category` column behind the current last column
train = relocate_col(df=train, tar_col='Category', std_col=train.columns[-1], how='after')

#(4) Parse `Dates` as datetimes; values that cannot be parsed become NaT (`errors='coerce'`)
train['Dates'] = pd.to_datetime(arg=train['Dates'], errors='coerce')
test['Dates'] = pd.to_datetime(arg=test['Dates'], errors='coerce')

#(5) Preview the first rows of `train`
train.head()

In [None]:
#(6) Preview the first rows of `test`
test.head()

#### 03.2. **_**

##### 03.2.1. **SPLIT DATE**

- `YYYY-mm-DD HH:MM:SS` $ \rightarrow{} $ `YYYY` / `mm` / `DD` / `HH` / `MM`

In [None]:
#(1) Replace `Dates` with `year` / `month` / `day` / `hour` / `minute` columns in both datasets
train = split_dt(df=train, tar_col='Dates')
test = split_dt(df=test, tar_col='Dates')

#(2) Display `train`
train

In [None]:
#(3) Display `test`
test

##### 03.2.2. **_ ADDRESS**

- `[\s{2,}]` $ \rightarrow{} $ `[\s]`

In [15]:
#(1) Clean up the spacing of the `Address` column in both datasets
train = split_addr(df=train, tar_col='Address')
test = split_addr(df=test, tar_col='Address')

##### 03.2.3. **MODIFY ABBREVIATION**

- AL :

- AV :

- BL :

- CR :

- CT :

- DR :

- EX :

- HY : 

- LN : 

In [None]:
#(2) Split every address into its space-separated tokens
temp = train.loc[:, 'Address'].str.split(pat=' ')
# temp = temp.explode().tolist()
# temp = list(np.unique(ar=temp))
# temp
# List the distinct last tokens of the addresses
# (presumably the street-type abbreviations itemised above — confirm)
np.unique(ar=temp.apply(lambda x: x[-1]).tolist())

#### 03.3. **DIAGNOSE DATASETS**

In [None]:
#(1) Diagnose `train` after cleansing (dtypes, missing values, unique values)
diagnose_df(df=train)

In [None]:
#(2) Diagnose `test` after cleansing (dtypes, missing values, unique values)
diagnose_df(df=test)

<b></b>

## 04. **BUILD MODELS BY `AUTO-GLUON`**

#### 04.0. **ABOUT `AUTO-GLUON`**

- 📃 [API document](https://auto.gluon.ai/stable/index.html)

- 📃 [paper](https://arxiv.org/pdf/2003.06505)

#### 04.1. **SET UP**

In [19]:
#(1) Define `req_dttm` : run timestamp (YYYYmmddHHMMSS) that gives every run its own model folder
req_dttm = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

#(2) Define `predictor` : multiclass predictor for the `Category` label, evaluated with log loss
predictor = TabularPredictor(
    label='Category',
    eval_metric='log_loss',
    problem_type='multiclass',
    path=f'../temp/ATOGL_{req_dttm}'
)

#### 04.2. **FIT TRAIN DATASET**

In [None]:
# Fit on the cleansed train set with the `best_quality` preset; KNN models are excluded and
# the run is capped at `run_time_limit` seconds.
predictor.fit(
    train_data=train,
    excluded_model_types=['KNN'],
    presets='best_quality',
    # num_bag_folds=10,
    # auto_stack=True,
    num_cpus='auto',
    num_gpus='auto',
    time_limit=run_time_limit,
    verbosity=2
)

#### 04.3. **CHECK SCORE**

In [None]:
# Show the leaderboard of the trained models, without the extra-info columns.
predictor.leaderboard(extra_info=False)

<b></b>

## 05. **CHECK MODEL INTERPRETABILITY**

#### 05.1. **CHECK FEATURE-IMPORTANCE**

In [None]:
#(1) Print feature importance
# NOTE(review): importance is computed on the training data itself; confirm whether held-out data was intended.
predictor.feature_importance(data=train)

#### 05.2. **CHECK SHAPLEY VALUE**

##### 05.2.1. **SET UP**

In [47]:
#(1) Run both datasets through the predictor's feature pipeline and cast the result to float
X_train_pp = predictor.transform_features(data=train).astype(dtype=float)
X_test_pp = predictor.transform_features(data=test).astype(dtype=float)

#(2)
# baseline = X_train_pp.loc[:, :].sample(n=100, random_state=seed_num)

#(3) Wrap the predictor for SHAP
# `target_class` has to be ONE label of `Category`: the wrapper uses it to select a single
# probability column. The whole label column was passed here before. The most frequent
# category is an arbitrary default; change it to explain another class, or pass `None`
# to explain every class.
shap_target_class = train.loc[:, 'Category'].mode().iloc[0]
ag_wrapper = AutogluonWrapper(predictor=predictor, feature_names=X_train_pp.columns, target_class=shap_target_class)
# explainer = shap.KernelExplainer(model=ag_wrapper.predict_proba, data=X_train_pp)
# print(f'>> Baseline prediction : {np.mean(ag_wrapper.predict_proba(X=baseline)):.4f}')  # this is the same as explainer.expected_value

In [26]:
#(3)
# shap_samples = 15
# shap_values = explainer.shap_values(X=X_train_pp, nsamples=shap_samples)

##### 05.2.3. **DRAW FORCE PLOT (ONE-SAMPLE)**

In [None]:
#(1) Draw force plot by 1 sample
# row_idx = 0  # index of an example datapoint

#(2)
# single_datapoint = X_train_pp.iloc[[row_idx], :]
# single_prediction = ag_wrapper.predict_proba(X=single_datapoint)

#(3)
# shap_values_single = explainer.shap_values(X=single_datapoint, nsamples=shap_samples)
# shap.force_plot(explainer.expected_value, shap_values_single, X_train_pp.iloc[row_idx, :])

##### 05.2.4. **DRAW FORCE PLOT (N-SAMPLES)**

In [None]:
# shap.force_plot(base_value=explainer.expected_value, shap_values=shap_values, features=X_test_pp)

##### 05.2.5. **DRAW SUMMARY PLOT**

In [None]:
# shap.summary_plot(shap_values=shap_values, features=X_test_pp)

<b></b>

## 06. **PREDICT**

#### 06.1. **PREDICT TARGET**

In [None]:
#(1) Make `pred` : one predicted `Category` label per test `Id`
# NOTE(review): the predictor is evaluated with `log_loss`, which scores class probabilities;
# confirm whether the submission needs `predictor.predict_proba(...)` (one column per category)
# instead of hard labels.
pred = pd.DataFrame(data={
   'Id'       : test_raw.loc[:, 'Id'].values,
   'Category' : predictor.predict(data=test).values
})

#(2) Print `pred`
pred

#### 06.2. **WRITE PREDICT**

In [None]:
# #(1) Define `submission_nm`
# submission_nm = f'atogl_stacking_{req_dttm}'

# #(2) Write `pred`
# NOTE(review): `pred` is built with the columns 'Id' and 'Category'; the 'id' / 'class' selection
# below would raise a KeyError if this cell is re-enabled as-is.
# pred.loc[:, ['id', 'class']].to_csv(path_or_buf=f'../output/{compet_nm}_{submission_nm}.csv', index=False)