In [1]:
import pandas as pd
from pandas import plotting
from pandas.plotting import scatter_matrix
import numpy as np
import re
import datetime
import ast
import seaborn as sns
import string
import csv
import warnings
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import numpy as np
from mpi4py import MPI
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from itertools import product
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


# BaseLine

In [3]:
X = pd.read_csv('final_metadata_2.csv')
y = X['Y']
X = X.drop(columns = ['Y','score','id'])
X = X.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_test_prob = rf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_test_prob)
print(auc)

0.7892881839064193


# MPI

In [4]:
%%writefile rf_tuning.py

import numpy as np
from mpi4py import MPI
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from itertools import product
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()


X = pd.read_csv('final_metadata_2.csv')
y = X['Y']
X = X.drop(columns = ['Y','score','id'])
X = X.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

n_estimators = [25,50, 100, 150, 200]
max_depth = [5,10,20,30]
min_samples_split = [5,10,20,30]
param_grid_prod = list(product(n_estimators,max_depth,min_samples_split))
num_params = len(param_grid_prod)
rank_size = num_params // size

def rf_model (X_train,y_train,X_test,y_test,param):
    """
    Args:
    parm = [max_depth, n_estimators, 'min_samples_split', 'max_features']
    
    """
    rf = RandomForestClassifier(n_estimators = param[0],max_depth = param[1], min_samples_split = param[2])
    
    rf.fit(X_train, y_train)
    y_test_prob = rf.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_test_prob)
    
    print(param, auc)
    
    return auc



start = time.time()
if rank == (size-1):
    sub_param_grid = param_grid_prod[rank*rank_size:num_params]
    auc_result = []

    for param in sub_param_grid:
        auc = rf_model(X_train, y_train, X_test, y_test,param)
        
        auc_result.append(auc)
    best_auc = np.max(auc_result)
    idx = np.argmax(auc_result)
    best_param = sub_param_grid[idx]
    all_best_param=comm.gather(best_param, root=0)
    all_best_auc=comm.gather(best_auc, root=0)
    print('Rank: ',rank, 'Best AUC: ', best_auc, 'Best Parameter: ',best_param)

else:
    sub_param_grid = param_grid_prod[rank*rank_size:(rank+1)*rank_size]
    auc_result = []

    
    for i in range(len(sub_param_grid)):
        auc = rf_model(X_train, y_train, X_test, y_test,sub_param_grid[i])
        
        auc_result.append(auc)
    
    best_auc = np.max(auc_result)
    idx = np.argmax(auc_result)
    best_param = sub_param_grid[idx]
    all_best_param=comm.gather(best_param, root=0)
    all_best_auc=comm.gather(best_auc, root=0)
    print('Rank: ',rank, 'Best AUC: ', best_auc, 'Best Parameter: ',best_param)
if rank ==0:
    best_auc = np.max(all_best_auc)
    best_param = all_best_param[np.argmax(all_best_auc)]
    print("final result",best_auc,"best param",best_param)
end = time.time()
print('Rank',rank, 'takes ', end-start,'s')

Overwriting rf_tuning.py


In [5]:
!mpirun -n 16 python3 rf_tuning.py

--------------------------------------------------------------------------
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available due to restrictive ptrace settings.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

  Local host: NarakuX
--------------------------------------------------------------------------
[NarakuX:00061] 15 more processes have sent help message help-btl-vader.txt / cma-permission-denied
[NarakuX:00061] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
(25, 20, 20) 0.8065038381967513
(50, 30, 20) 0.8069445094760134
(25, 5, 5) 0.7666604544248937
(50, 10, 5) 0.798228862160539
(25, 20, 30) 0.8077209804883267
(25, 30, 30) 0.8058473134233235
(25, 5, 10) 0.7670105354457418
(25, 10, 10) 0.795089797208792
(100, 5, 30) 0.7695240894654709
(50, 10, 10) 0.7948048938065465
(25, 30, 5) 0.7964718848218639
(25, 10, 20) 0.7931209215584755


# Without MPI

In [6]:
X = pd.read_csv('final_metadata_2.csv')
y = X['Y']
X = X.drop(columns = ['Y','score','id'])
X = X.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
n_estimators = [25,50, 100, 150, 200]
max_depth = [5,10,20,30]
min_samples_split = [5,10,20,30]
param_grid_prod = list(product(n_estimators,max_depth,min_samples_split))


In [7]:
def rf_model (X_train,y_train,X_test,y_test,param):
    """
    Args:
    parm = [max_depth, n_estimators, 'min_samples_split', 'max_features']
    
    """
    rf = RandomForestClassifier(n_estimators = param[0],max_depth = param[1], min_samples_split = param[2])
    rf.fit(X_train, y_train)
    y_test_prob = rf.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_test_prob)
    
    print(param, auc)

    return auc

In [10]:
start = time.time()
auc_result = []
for param in param_grid_prod:
        auc = rf_model(X_train, y_train, X_test, y_test,param)
        #auc,precision,recall = rf_model(X_train, y_train, X_test, y_test,sub_param_grid[i])
        auc_result.append(auc)
best_auc = np.max(auc_result)
idx = np.argmax(auc_result)
best_param = param_grid_prod[idx]
    #comm.Gather(best_auc, auc_rank, root=0)
print('Best AUC: ', best_auc, 'Best Parameter: ',best_param)
end = time.time()
print('takes ', end-start,'s')

(25, 5, 5) 0.7708082922320002
(25, 5, 10) 0.7666472507467066
(25, 5, 20) 0.7667038193302937
(25, 5, 30) 0.7718485644451875
(25, 10, 5) 0.7945954822863124
(25, 10, 10) 0.7943147472201402
(25, 10, 20) 0.7933685112477458
(25, 10, 30) 0.7945066919912029
(25, 20, 5) 0.8047458898252409
(25, 20, 10) 0.8047015302032776
(25, 20, 20) 0.8071751889837131
(25, 20, 30) 0.8049200718005363
(25, 30, 5) 0.794820738220371
(25, 30, 10) 0.8029137758604694
(25, 30, 20) 0.8042459973848901
(25, 30, 30) 0.8044378355791785
(50, 5, 5) 0.7633392971328028
(50, 5, 10) 0.7666250117264056
(50, 5, 20) 0.7690654777624837
(50, 5, 30) 0.7681504214175928
(50, 10, 5) 0.7953608101047476
(50, 10, 10) 0.7939748028347433
(50, 10, 20) 0.7966483404350175
(50, 10, 30) 0.7954946942173786
(50, 20, 5) 0.8062013614685597
(50, 20, 10) 0.8075514168399309
(50, 20, 20) 0.8088696768068785
(50, 20, 30) 0.8093536182566299
(50, 30, 5) 0.8020643352831436
(50, 30, 10) 0.8044972225263607
(50, 30, 20) 0.8082547169755462
(50, 30, 30) 0.8084831820