In [76]:
# Pandas is used for data manipulation
import pandas as pd

# Read in the data
features = pd.read_csv("../processed_data/2018_Clusters_Ratios_AF.csv")

# Keep the full, unmodified column index for reference. It still includes the
# 'Name' and 'cluster' columns, which are removed from `features` further down.
columns = features.columns

# Display the first 5 rows. This has to be the final expression in the cell:
# a notebook only renders the value of a cell's last expression.
features.head(5)

In [77]:
# Use the "Name" column as the row index; this also removes it from the
# columns, so no separate drop is needed
features = features.set_index(keys='Name')

In [78]:
# numpy supplies the array type handed to scikit-learn
import numpy as np

# The column holding the value we want to predict
target_column = 'cluster'

# Pull the target out as a plain array
labels = np.array(features[target_column])

# Everything except the target is a predictor
predictors = features.drop(columns=[target_column])

# Remember the predictor names, in column order, for later use
feature_list = predictors.columns.tolist()

# Downstream cells expect `features` to be the numeric matrix
features = np.array(predictors)

In [79]:
# scikit-learn provides the helper that partitions data into train/test subsets
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [80]:
# Sanity-check the split: report the dimensions of each partition
for caption, partition in [
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
]:
    print(caption, partition.shape)

Training Features Shape: (312, 17)
Training Labels Shape: (312,)
Testing Features Shape: (105, 17)
Testing Labels Shape: (105,)


In [81]:
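# NOTE: this baseline is left over from a regression example and is disabled here.
# It relies on an 'average' feature column, which this dataset does not have, and on
# an absolute-error metric (reported in degrees), which is not meaningful for cluster labels.
# The baseline predictions are the historical averages
#baseline_preds = test_features[:, feature_list.index('average')]

# Baseline errors, and display average baseline error
#baseline_errors = abs(baseline_preds - test_labels)
#print('Average baseline error: ', round(np.mean(baseline_errors), 2))
#Average baseline error:  5.06 degrees.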

In [82]:
# Random forest classifier from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestClassifier

# Build a forest of 1000 trees (seeded for reproducibility) and train it on the
# training split; fit() returns the estimator itself, so `rf` is the fitted model
rf = RandomForestClassifier(n_estimators=1000, random_state=42).fit(
    train_features, train_labels
)

In [83]:
from sklearn.metrics import confusion_matrix

# Predict a cluster for every held-out row, then tabulate true labels (rows)
# against predicted labels (columns)
test_pred = rf.predict(test_features)
confusion_matrix(y_true=test_labels, y_pred=test_pred)


array([[ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 25,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 16,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  3,  1, 14,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  2,  0, 14,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 11,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0

In [84]:
# Print the held-out labels, i.e. the true 'cluster' value of every test row
print(test_labels)

[ 7 20  4  4 11 10 11  9  4  9  7  9 11 10  9  4 10  9 16  9 10  9 12 11
  4  7  9  4 10  7  9 15  4 10 10  4 11  7  7  7  4  4  4  4  3  4  9  1
 10  7  7  9  7  7 11  4 10 10  4  9 11 11  4  9  4 10  4  4 17  4  7  2
  9  4  4  4 11  9 10  1 11  4 10  9 10 10  4  7 10  4  2  9  7  7  7  9
 19 16  7  0 15  9  7  4 11]


In [85]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted cluster equals the true cluster
accuracy_score(y_true=test_labels, y_pred=test_pred)


0.780952380952381

In [86]:
# Retrieve the fitted forest's per-feature importance scores. The model was
# trained on the `features` array, whose columns follow the order of `feature_list`.
importances = rf.feature_importances_

In [87]:
# Print the raw importance scores (one value per training feature)
print(importances)

[0.02671701 0.04025036 0.03797245 0.13179443 0.03256649 0.0321568
 0.04210209 0.028532   0.28645134 0.12306367 0.06146572 0.02817685
 0.03505742 0.00803659 0.02590206 0.03390786 0.02584685]


In [88]:
# Print the column index captured right after the CSV was read.
# NOTE(review): `columns` still contains 'Name' and 'cluster' (19 entries), so it
# is offset by one from the 17 values in `importances` and must not be matched to
# them by position. `feature_list` holds the 17 feature names in training order.
print(columns)

Index(['Name', 'Net cash flow / Change in cash', 'Average Payables',
       'Average Receivables', 'currentRatio', 'SG&A to Revenue',
       'daysOfPayablesOutstanding', 'daysOfInventoryOutstanding',
       'eBITperRevenue', 'Intangibles to Total Assets', 'Debt to Assets',
       'Debt to Equity', 'Payout Ratio', 'ROE', 'R&D to Revenue', 'PE ratio',
       'returnOnAssets', 'Dividend Yield', 'cluster'],
      dtype='object')
