## This method gives a very poor result when the algorithms are used with their default parameters.
## With 20% of the full data set held out as a test set, the ROC AUC is 0.53 (barely better than the 0.5 of random guessing).

## As the data set is unbalanced, we use an oversampling method (SMOTE) to obtain a balanced set. After that, we train a Random Forest classifier ##

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## Build the data set from file

In [None]:
# Load the training data from disk.
df_train = pd.read_csv('../input/train.csv')
columns = df_train.columns

# The model inputs are every column except 'id' and the 'target' label;
# the 'target' column on its own is what we want to predict.
features = df_train.drop(columns=['id', 'target'])
labels = df_train['target']

## Build train and test sets (20% of data reserved to test set)

In [None]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split reproducible from one run to the next.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.2, random_state=0)

## Build a balanced training set by oversampling the train set with SMOTE

In [None]:
# Oversample the minority class of the *training* split only, so that the
# test split keeps the original class distribution.
smote = SMOTE(random_state=0)
# `fit_sample` was deprecated in imbalanced-learn 0.4 and removed in later
# releases; `fit_resample` is the supported replacement and returns the
# resampled (features, labels) pair.
os_features, os_labels = smote.fit_resample(features_train, labels_train)

In [None]:
# Sanity check: print the number of oversampled rows in each class
# (0 first, then 1); the two counts should match if the set is balanced.
for class_label in (0, 1):
    print(len(os_labels[os_labels == class_label]))

## Perform training of the random forest using the (over sampled) train set

In [None]:
# Random forest with default hyper-parameters; only the seed is fixed so
# that results are reproducible.
clf = RandomForestClassifier(random_state=0)
# Train on the oversampled (balanced) training data.
clf.fit(X=os_features, y=os_labels)

In [None]:
# Evaluate on the held-out test split (which was not oversampled).
actual = labels_test

# Hard class predictions, used for the confusion matrix.
predictions = clf.predict(features_test)

# Per-class probabilities; the second column is kept as the continuous score
# needed for the ROC computation.
test_probabilities = clf.predict_proba(features_test)
predictions_scores = test_probabilities[:, 1]

## confusion matrix on test set

In [None]:
# Confusion matrix of true labels versus predicted labels on the test set.
confusion_matrix(y_true=actual, y_pred=predictions)

## Let's go further and use the roc_auc indicator

In [None]:
from sklearn.metrics import roc_curve, auc

# Compute the ROC curve from the predicted scores, then integrate it to get
# the area under the curve.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_true=actual, y_score=predictions_scores)
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)
print(roc_auc)

In [None]:
import matplotlib.pyplot as plt
# Titles and axis labels.
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# ROC curve of the classifier, labelled with its AUC in the legend.
auc_label = 'AUC = %0.2f' % roc_auc
plt.plot(false_positive_rate, true_positive_rate, 'b', label=auc_label)
plt.legend(loc='lower right')

# Dashed red diagonal for visual comparison with the curve.
plt.plot([0, 1], [0, 1], 'r--')

# Axis limits extend slightly beyond the [0, 1] range of the rates.
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.show()

## Acknowledgments:
Many thanks to https://www.kaggle.com/chtaret

