In [3]:
%%writefile rf_tuning.py

import numpy as np
from mpi4py import MPI
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from itertools import product
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


# MPI.COMM_WORLD is the default communicator that contains all processes of
# the MPI job.
comm = MPI.COMM_WORLD
# Index of this process within the communicator, and the total process count.
rank, size = comm.Get_rank(), comm.Get_size()

# NOTE: the triple-quoted string below is a bare expression statement, not a
# docstring (it follows the imports), so nothing inside it is ever executed.
# It preserves an earlier variant of the setup that read pre-split train/test
# CSV files, dropped the 'SK_ID_CURR' column, flattened the label frames with
# np.ravel, and built the same parameter grid as the live code further down.
"""
X_train = pd.read_csv('preprocessed_train.csv')
X_test = pd.read_csv('preprocessed_test.csv')
y_train = pd.read_csv('preprocessed_train_y.csv')
y_test = pd.read_csv('preprocessed_test_y.csv')
X_test.drop(columns=['SK_ID_CURR'],inplace=True)
X_train.drop(columns=['SK_ID_CURR'],inplace=True)
y_train = np.ravel(y_train.values)
y_test = np.ravel(y_test.values)

n_estimators = [50, 100, 150, 200, 250]
#max_features = ['sqrt','log2']
max_depth = [5,10,20,30]
min_samples_split = [10,20,30,40]
param_grid_prod = list(product(n_estimators,max_depth,min_samples_split))
num_params = len(param_grid_prod)
rank_size = num_params // size
"""

# Load the full table, split off the 'target' column as the label Series, and
# keep the remaining columns as a plain NumPy feature matrix.
X = pd.read_csv('input.csv')
y = X.pop('target')
X = X.values

# Hold out 30% of the rows for scoring. random_state=0 fixes the shuffle, so
# each process running this script on the same file obtains the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

# Axes of the hyper-parameter search grid.
n_estimators = [50, 100, 150, 200, 250]
max_depth = [5, 10, 20, 30]
min_samples_split = [10, 20, 30, 40]

# Cartesian product of the axes as (n_estimators, max_depth, min_samples_split)
# tuples, with n_estimators varying slowest and min_samples_split fastest.
param_grid_prod = [
    (trees, depth, split)
    for trees in n_estimators
    for depth in max_depth
    for split in min_samples_split
]
num_params = len(param_grid_prod)

# Baseline number of combinations per process (floor of num_params / size).
rank_size = num_params // size


def rf_model(X_train, y_train, X_test, y_test, param):
    """Fit one random forest and return its ROC AUC on the held-out data.

    Args:
        X_train: Feature matrix used to fit the classifier.
        y_train: Labels matching ``X_train``.
        X_test: Feature matrix the fitted classifier is scored on.
        y_test: Labels matching ``X_test``.
        param: Indexable of hyper-parameters, read positionally as
            ``param[0]`` -> ``n_estimators``, ``param[1]`` -> ``max_depth``,
            ``param[2]`` -> ``min_samples_split``. Any further entries are
            ignored.

    Returns:
        The value of ``roc_auc_score`` computed from the second column of
        ``predict_proba`` on ``X_test``.

    Side effects:
        Prints ``param`` and the resulting AUC to stdout.
    """
    forest = RandomForestClassifier(
        n_estimators=param[0],
        max_depth=param[1],
        min_samples_split=param[2],
    )
    forest.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in the
    # classifier's class ordering (the positive class for 0/1 labels).
    class_probabilities = forest.predict_proba(X_test)
    positive_scores = class_probabilities[:, 1]

    score = roc_auc_score(y_test, positive_scores)
    print(param, score)
    return score

# Per-rank driver: evaluate this rank's share of the parameter grid, report the
# best-scoring combination found locally, and report the elapsed wall time.
start = time.time()

# Each rank owns a contiguous slice of rank_size combinations. The last rank's
# slice runs to the end of the grid so it also covers the remainder left over
# by the integer division num_params // size.
slice_begin = rank * rank_size
slice_end = num_params if rank == (size - 1) else slice_begin + rank_size
sub_param_grid = param_grid_prod[slice_begin:slice_end]

# One AUC per combination, in the same order as sub_param_grid.
auc_result = [
    rf_model(X_train, y_train, X_test, y_test, param)
    for param in sub_param_grid
]

if auc_result:
    best_auc = np.max(auc_result)
    # The index must come from the full score list. Taking argmax of the
    # scalar best_auc always yields 0 and would report the first combination
    # of the slice regardless of which one actually scored best.
    idx = int(np.argmax(auc_result))
    best_param = sub_param_grid[idx]
    print('Rank: ', rank, 'Best AUC: ', best_auc, 'Best Parameter: ', best_param)
else:
    # With more ranks than grid entries rank_size is 0, so every rank except
    # the last receives an empty slice; report that instead of letting
    # np.max fail on an empty list.
    best_auc = None
    best_param = None
    print('Rank: ', rank, 'has no parameter combinations to evaluate')

end = time.time()
print('Rank', rank, 'takes ', end - start, 's')

Overwriting rf_tuning.py
