In [None]:
# Local file name that the %%writefile cell below writes the
# data-preparation script to.
data_prep_script_file = 'data_prep.py'


In [None]:
# Presumably ensures the parent directory of the script path exists before the
# %%writefile cell runs — TODO confirm against sagemaker_utils.make_dirs.
sagemaker_utils.make_dirs(data_prep_script_file)


In [None]:
%%writefile $data_prep_script_file
import argparse
import pickle
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def to_pkl(data, file):
    """Pickle *data* into the file at path *file*, overwriting any existing file."""
    with open(file, 'wb') as sink:
        pickle.Pickler(sink).dump(data)
        
# Columns kept after feature engineering. 'id' and 'status' are split off
# from the model inputs in main().
FEATURE_COLUMNS = [
    'id', 'age', 'years_on_the_job', 'nb_previous_loans',
    'avg_amount_loans_previous', 'flag_own_car', 'status',
]

# Seed shared by the train/test split and SMOTE so that runs are reproducible.
RANDOM_STATE = 123


def build_features(df, reference_date=None):
    """Derive the per-loan model features from the raw loans table.

    Args:
        df: DataFrame with the columns ``id``, ``loan_date``, ``loan_amount``,
            ``birthday``, ``job_start_date``, ``flag_own_car`` and ``status``.
            It is not modified.
        reference_date: Timestamp that ``age`` and ``years_on_the_job`` are
            measured against. Defaults to today at midnight.

    Returns:
        A new DataFrame sorted by ``id`` and ``loan_date`` that holds only
        ``FEATURE_COLUMNS``. Missing values are left as NaN.
    """
    if reference_date is None:
        # Computed once so both date-based features use the same reference.
        reference_date = pd.to_datetime('today').normalize()

    df = df.copy()

    # Parse the dates BEFORE sorting: sorting the raw strings only gives
    # chronological order when they happen to be ISO formatted.
    df['loan_date'] = pd.to_datetime(df['loan_date'])
    df = df.sort_values(by=['id', 'loan_date']).reset_index(drop=True)

    loans_by_customer = df.groupby('id')

    # Number of loans the customer took before this one (0 for the first).
    nb_previous = loans_by_customer['loan_date'].rank(method='first') - 1

    # Mean amount of the customer's earlier loans; NaN for the first loan.
    # transform() keeps the original row index, so the assignment aligns
    # regardless of how groupby.apply labels its result.
    avg_previous = loans_by_customer['loan_amount'].transform(
        lambda amounts: amounts.shift().expanding().mean()
    )

    df['nb_previous_loans'] = nb_previous
    df['avg_amount_loans_previous'] = avg_previous

    # Whole years elapsed, approximated as 365-day years. Unparseable dates
    # become NaT and therefore NaN.
    df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce')
    df['age'] = (reference_date - df['birthday']).dt.days // 365

    df['job_start_date'] = pd.to_datetime(df['job_start_date'], errors='coerce')
    df['years_on_the_job'] = (reference_date - df['job_start_date']).dt.days // 365

    # Anything other than 'N' (including missing values) is encoded as 1.
    df['flag_own_car'] = df['flag_own_car'].apply(lambda x: 0 if x == 'N' else 1)

    return df[FEATURE_COLUMNS]


def main():
    """Prepare the loans dataset and write the train/test splits as CSV files.

    Reads ``--data-file`` from ``/opt/ml/processing/input``, builds the
    features, splits them (stratified on ``status``), oversamples the training
    split with SMOTE and writes the four CSV files under
    ``/opt/ml/processing/output``.
    """
    script_name = os.path.basename(__file__)

    print(f'INFO: {script_name}: Iniciando la preparación de los datos')

    parser = argparse.ArgumentParser()
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--data-file', type=str, default='train.csv')
    # The output file names are required: without them os.path.join() would
    # fail with a TypeError only after all the processing had been done.
    parser.add_argument('--train-data-file', type=str, required=True)
    parser.add_argument('--train-target-file', type=str, required=True)
    parser.add_argument('--test-data-file', type=str, required=True)
    parser.add_argument('--test-target-file', type=str, required=True)
    parser.add_argument('--encoder-file', type=str)

    args, _ = parser.parse_known_args()

    print(f'INFO: {script_name}: Parámetros recibidos: {args}')

    # Imported here so a missing imbalanced-learn package is reported only
    # when the script actually runs.
    from imblearn.over_sampling import SMOTE

    input_path = '/opt/ml/processing/input'
    output_path = '/opt/ml/processing/output'

    data_path = os.path.join(input_path, args.data_file)

    # Load the dataset and derive the features.
    features = build_features(pd.read_csv(data_path))

    cust_df = features.fillna(0)
    Y = cust_df['status'].astype('int')
    X = cust_df.drop(columns=['status', 'id'])

    # Honour --test-size; the previous version hard-coded 0.3 and silently
    # ignored the argument.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=args.test_size, random_state=RANDOM_STATE
    )

    # Synthetic Minority Over-Sampling Technique (SMOTE) to counter class
    # imbalance. Applied to the training split only, and seeded.
    X_train, y_train = SMOTE(random_state=RANDOM_STATE).fit_resample(X_train, y_train)
    # Older imbalanced-learn versions return NumPy arrays; normalise both
    # outputs to pandas objects so .to_csv() is always available.
    X_train = pd.DataFrame(X_train, columns=X.columns)
    y_train = pd.Series(y_train, name=Y.name)

    # Save the resulting splits, creating each output directory if needed.
    outputs = (
        (X_train, 'train_data', args.train_data_file),
        (y_train, 'train_target', args.train_target_file),
        (X_test, 'test_data', args.test_data_file),
        (y_test, 'test_target', args.test_target_file),
    )
    for frame, subdir, file_name in outputs:
        target_dir = os.path.join(output_path, subdir)
        os.makedirs(target_dir, exist_ok=True)
        frame.to_csv(os.path.join(target_dir, file_name), index=False)

    # No encoder is fitted in this pipeline (every selected feature is already
    # numeric). The previous version pickled an undefined `encoder` name here,
    # which raised NameError after the CSV files had been written.
    if args.encoder_file:
        print(f'WARN: {script_name}: No se generó ningún encoder; se omite --encoder-file')

    print(f'INFO: {script_name}: Finalizando la preparación de los datos')


if __name__ == '__main__':
    main()

A continuación, subimos el script de preparación de datos (`data_prep.py`) a un bucket de Amazon S3.

In [None]:
# Upload the generated script under s3://{bucket}/{code_prefix} (both names are
# defined elsewhere in the notebook). Presumably returns the S3 location of the
# uploaded file — TODO confirm against sagemaker_utils.upload.
data_prep_script_path = sagemaker_utils.upload(data_prep_script_file, f's3://{bucket}/{code_prefix}')
