In [1]:
import pandas as pd
from numpy import mean
from numpy import std
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek 
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.metrics import geometric_mean_score
from matplotlib import pyplot

In [None]:
# Load the landslide dataset. The target is the 'y' column; per the messages
# printed below, 1 marks a landslide and 0 marks no landslide.
df = pd.read_csv("../landslide_data.csv")

# Count the classes once and reuse the result for both summary lines.
class_counts = df['y'].value_counts()
print('Landslides are', round(class_counts[1]/len(df) * 100,2), '% of the dataset')
print('No Landslides are', round(class_counts[0]/len(df) * 100,2), '% of the dataset')

# Define the modelling inputs explicitly so the notebook survives
# Restart Kernel -> Run All (the evaluation cell reads X and y).
# NOTE(review): assumes every column other than 'y' is a numeric feature
# (no id/index columns) -- confirm against the original feature selection.
X = df.drop(columns=['y'])
y = df['y']

In [4]:
def get_models(random_state=1):
    """Build the resampling strategies that will be compared.

    Args:
        random_state: Seed forwarded to every sampler so the resampled
            training sets are reproducible between runs. Pass ``None`` to
            get the previous unseeded behaviour.

    Returns:
        tuple: ``(models, names)`` -- two parallel lists where ``models[i]``
        is the sampler instance labelled ``names[i]``.
    """
    # Keep each label next to its sampler so the two returned lists can never
    # fall out of step when a strategy is added or removed.
    candidates = [
        ('ROS', RandomOverSampler(random_state=random_state)),
        ('SMOTE', SMOTE(random_state=random_state)),
        ('BLSMOTE', BorderlineSMOTE(random_state=random_state)),
        ('SVMSMOTE', SVMSMOTE(random_state=random_state)),
        ('ADASYN', ADASYN(random_state=random_state)),
        # Combined strategy: SMOTE over-sampling followed by Tomek-link cleaning.
        ('SMOTETomek', SMOTETomek(random_state=random_state)),
    ]
    names = [label for label, _ in candidates]
    models = [sampler for _, sampler in candidates]
    return models, names

In [5]:
from sklearn.metrics import make_scorer
def evaluate_model(X, y, model, n_splits=10, n_repeats=7, random_state=1, n_jobs=17):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    The metric is the geometric mean score from imbalanced-learn, wrapped
    with ``make_scorer`` so ``cross_val_score`` can use it.

    Args:
        X: Feature matrix passed straight to ``cross_val_score``.
        y: Target labels passed straight to ``cross_val_score``.
        model: Estimator or pipeline to evaluate.
        n_splits: Number of folds per repetition.
        n_repeats: Number of times the k-fold split is repeated.
        random_state: Seed for the fold assignment.
        n_jobs: Number of parallel workers used by ``cross_val_score``.
            The default of 17 is the value originally hard-coded here;
            adjust it to the machine running the notebook.

    Returns:
        The array of scores returned by ``cross_val_score``, one entry per
        evaluated fold.
    """
    # Evaluation procedure: stratified folds, repeated with a fixed seed.
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    # Evaluation metric: geometric mean score.
    metric = make_scorer(geometric_mean_score)
    return cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=n_jobs)

In [6]:

# Evaluate every resampling strategy with the same classifier and CV protocol.
# X (features) and y (target) are expected to be defined by an earlier cell.
models, names = get_models()
results = list()
for name, sampler in zip(names, models):
    # Seed the forest so the reported scores are reproducible between runs.
    model = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=1)
    # Scaling and resampling are pipeline steps, so cross-validation fits
    # them together with the classifier rather than on the full dataset.
    steps = [('s', MinMaxScaler()), ('o', sampler), ('m', model)]
    pipeline = Pipeline(steps=steps)
    # Evaluate the pipeline and keep the per-fold scores for the box plot.
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)
    # Summarise as: >name mean (std)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>ROS 0.906 (0.004)
>SMOTE 0.936 (0.003)


In [None]:
# Box plot of the cross-validation scores collected in `results`, one box per
# entry in `names`; showmeans adds a marker for the mean of each distribution.
fig, ax = pyplot.subplots()
ax.boxplot(results, showmeans=True)
# Label the boxes through the axes instead of boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate in favour of `tick_labels=`.
ax.set_xticklabels(names)
ax.set_xlabel('Resampling strategy')
ax.set_ylabel('Geometric mean score')
ax.set_title('Random forest G-mean by resampling strategy')
pyplot.show()