import all the required modules.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [2]:
train = pd.read_csv("data/Opportunities_train.csv")    # Read the data
train.head()

Unnamed: 0,Opp No,Opp Type,Status,Region,Country,BU,Sector,Customer Group,Customer,Product,...,Cust Dec Date,Go Live Date,Date Exp Gain,Payment Terms,Contract Term,Milestone,Prob to win %,BCA_Status,Ann Rev,Ann GP
0,9WG2RW,2. Renewal,5. Lost,UK & IRELAND,UK,ELLP,Automotive,FORD MOTOR,Ford Motor,Inbound-to-Manufacturing (I2M),...,19/11/2016,01/12/2016,31/12/2016,60,< 1 Year,Closed - Lost,,Approved,0.33,0.13
1,9YG5FC,1. New,5. Lost,Greater China,China,China,Automotive,FORD MOTOR,Ford Motor,Aftermarket Services,...,03/11/2017,29/12/2017,24/11/2017,40,2-3 Years,Closed - Lost,,Not started,73.59,57.31
2,A4THP4,1. New,2. Opportunity,UK & IRELAND,UK,ELLP,Automotive,FORD MOTOR,Ford Motor,DHL Lead Logistics Partner,...,04/11/2016,31/07/2017,31/03/2017,60,2-3 Years,Verbal Customer Commitment Received,90.0,Awaiting Approval,17.74,35.41
3,A54GJ8,1. New,2. Opportunity,UK & IRELAND,UK,ELLP,Automotive,FORD MOTOR,Ford Motor,DHL Lead Logistics Partner,...,01/08/2017,08/01/2018,02/10/2017,60,3-5 Years,Qualified and signed off by Sponsor,20.0,Not started,176.25,12.14
4,9XY2SC,2. Renewal,2. Opportunity,Greater China,China,China,Automotive,FORD MOTOR,Ford Motor,Warehousing,...,09/06/2017,01/01/2017,16/06/2017,60,3-5 Years,Qualified and signed off by Sponsor,85.0,Awaiting Approval,23.69,18.87


In [3]:
# Impute median Payment Terms for NA Payment Terms values
new_payment_terms_var = np.where(train["Payment Terms"].isnull(), # Logical check
                       60,                       # Value if check is true
                       train["Payment Terms"])     # Value if check is false

train["Payment Terms"] = new_payment_terms_var 

# Impute median  Prob to win %  for NA Prob to win %  values
new_prob_to_win_var = np.where(train["Prob to win %"].isnull(), # Logical check
                       70,                       # Value if check is true
                       train["Prob to win %"])     # Value if check is false

train["Prob to win %"] = new_payment_terms_var 

# Impute median Ann Rev for NA Ann Rev values
new_ann_rev_var = np.where(train["Ann Rev"].isnull(), # Logical check
                       73,                       # Value if check is true
                       train["Ann Rev"])     # Value if check is false

train["Ann Rev"] = new_ann_rev_var 

# Impute median Ann GP for NA Ann GP values
new_ann_gp_var = np.where(train["Ann GP"].isnull(), # Logical check
                       28,                       # Value if check is true
                       train["Ann GP"])     # Value if check is false

train["Ann GP"] = new_ann_gp_var 

In [4]:
# Set the seed
np.random.seed(12)

# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# Convert some variables to numeric
train["Opp Type"] = label_encoder.fit_transform(train["Opp Type"])
train["Region"] = label_encoder.fit_transform(train["Region"])
train["Country"] = label_encoder.fit_transform(train["Country"])
train["BU"] = label_encoder.fit_transform(train["BU"])
train["Sector"] = label_encoder.fit_transform(train["Sector"])
train["Customer Group"] = label_encoder.fit_transform(train["Customer Group"])
train["Customer"] = label_encoder.fit_transform(train["Customer"])
train["Product"] = label_encoder.fit_transform(train["Product"])
train["Contract Term"] = label_encoder.fit_transform(train["Contract Term"])
train["Milestone"] = label_encoder.fit_transform(train["Milestone"])
train["BCA_Status"] = label_encoder.fit_transform(train["BCA_Status"])

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=1000, # Number of trees
                                  max_features=2,    # Num features considered
                                  oob_score=True)    # Use OOB scoring*

features = ["Opp Type","Region","Country","BU","Sector","Customer Group","Customer","Product","Payment Terms","Contract Term","Milestone","Prob to win %","BCA_Status","Ann Rev","Ann GP"]

# Train the model
rf_model.fit(X=train[features],
             y=train["Status"])

print("OOB accuracy: ")
print(rf_model.oob_score_)

OOB accuracy: 
0.8


Since random forest models involve building trees from random subsets or "bags" of data, model performance can be estimated by making predictions on the out-of-bag (OOB) samples instead of using cross validation. You can use cross validation on random forests, but OOB validation already provides a good estimate of performance and building several random forest models to conduct K-fold cross validation with random forest models can be computationally expensive.
The random forest classifier assigns an importance value to each feature used in training. Features with higher importance were more influential in creating the model, indicating a stronger association with the response variable. Let's check the feature importance for our random forest model:

In [5]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature, imp)

('Opp Type', 0.045194830342816426)
('Region', 0.039989182982889095)
('Country', 0.039607178281129675)
('BU', 0.057833092697555023)
('Sector', 0.026592517358930777)
('Customer Group', 0.031445440403546125)
('Customer', 0.040674120908347947)
('Product', 0.097860919081353095)
('Payment Terms', 0.041258077831037014)
('Contract Term', 0.036751685508059621)
('Milestone', 0.23572587739869463)
('Prob to win %', 0.039672027625222268)
('BCA_Status', 0.11787216737260303)
('Ann Rev', 0.081119024119597802)
('Ann GP', 0.068403858088217209)


Feature importance can help identify useful features and eliminate features that don't contribute much to the model.
As a final exercise, let's use the random forest model to make predictions on the opportunity test set to see how our actual generalization performance compares to the OOB estimate:

In [6]:
# Read and prepare test data
test = pd.read_csv("data/Opportunities_test.csv")    # Read the data

# Impute median Payment Terms for NA Payment Terms values
new_payment_terms_var = np.where(test["Payment Terms"].isnull(), # Logical check
                       60,                       # Value if check is true
                       test["Payment Terms"])     # Value if check is false

test["Payment Terms"] = new_payment_terms_var 

# Impute median  Prob to win %  for NA Prob to win %  values
new_prob_to_win_var = np.where(test["Prob to win %"].isnull(), # Logical check
                       70,                       # Value if check is true
                       test["Prob to win %"])     # Value if check is false

test["Prob to win %"] = new_payment_terms_var 

# Impute median Ann Rev for NA Ann Rev values
new_ann_rev_var = np.where(test["Ann Rev"].isnull(), # Logical check
                       73,                       # Value if check is true
                       test["Ann Rev"])     # Value if check is false

test["Ann Rev"] = new_ann_rev_var 

# Impute median Ann GP for NA Ann GP values
new_ann_gp_var = np.where(test["Ann GP"].isnull(), # Logical check
                       28,                       # Value if check is true
                       test["Ann GP"])     # Value if check is false

test["Ann GP"] = new_ann_gp_var 

# Convert some variables to numeric
test["Opp Type"] = label_encoder.fit_transform(test["Opp Type"])
test["Region"] = label_encoder.fit_transform(test["Region"])
test["Country"] = label_encoder.fit_transform(test["Country"])
test["BU"] = label_encoder.fit_transform(test["BU"])
test["Sector"] = label_encoder.fit_transform(test["Sector"])
test["Customer Group"] = label_encoder.fit_transform(test["Customer Group"])
test["Customer"] = label_encoder.fit_transform(test["Customer"])
test["Product"] = label_encoder.fit_transform(test["Product"])
test["Contract Term"] = label_encoder.fit_transform(test["Contract Term"])
test["Milestone"] = label_encoder.fit_transform(test["Milestone"])
test["BCA_Status"] = label_encoder.fit_transform(test["BCA_Status"])

In [7]:
# Make test set predictions
test_preds = rf_model.predict(X= test[features])

# Create a submission for Kaggle
submission = pd.DataFrame({"Opp No":test["Opp No"],
                           "Status":test_preds})

# Save submission to CSV
submission.to_csv("data/result.csv", 
                  index=False)        # Do not save index values

ValueError: could not convert string to float: Transactional