# Classification Using Random Forests

In [None]:
import os
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

# Acquire data in an accessible format


In [None]:
# Build the CSV location relative to the working directory, then load it.
csv_path = os.path.join(".", "Cleaned_Data", "default.csv")
df = pd.read_csv(csv_path)
# Show the last rows as a quick sanity check of the load.
df.tail()

# Prepare data for the machine learning model Using the 'Sub_Region' Column as Input

In [None]:
# Count the distinct non-null values in the 'Sub_Region' column.
df["Sub_Region"].nunique(dropna=True)

In [None]:
# Remove the columns that are not used in the rest of the notebook.
# DataFrame.drop returns a new frame rather than modifying df in place,
# so the result is assigned back to df.
df = df.drop(columns=['Latitude', 'Longitude', 'Country', 'Region'])

# One-Hot Encoding

In [None]:
# Step 0: Reformat data
# Pull the frame out as a plain array and slice it by column position.
data = df.values
# NOTE(review): both positions are hard-coded and column 67 ends up in
# neither X nor y -- confirm they match the layout of the CSV.
X, y = data[:, :67], data[:, 68]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
# fit_transform learns the classes present in y and maps each value to its
# integer code in a single call.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

In [None]:
from tensorflow.keras.utils import to_categorical

# Step 2: One-hot encoding
# Turn the integer labels in encoded_y into a one-hot matrix.
one_hot_y = to_categorical(encoded_y)
# Display the encoded matrix.
one_hot_y

In [None]:
# Print every original class next to the integer label assigned to it.
separator = '-' * 15
for original_class, label in zip(y, encoded_y):
    print(f'Original Class: {original_class!s}')
    print(f'Encoded Label: {label!s}')
    print(separator)

# Separate The Data into Features & Targets

In [None]:
# Human-readable names for the classes.
# NOTE(review): two names are hard-coded here -- confirm that this matches
# the number of columns in one_hot_y.
target_names = ["negative", "positive"]
# The one-hot encoded labels serve as the model target.
target = one_hot_y

In [None]:
# The feature frame is every remaining column except 'Sub_Region'.
data = df.drop(columns=["Sub_Region"])
feature_names = data.columns
# Preview the first rows of the feature frame.
data.head()

# Training and Testing Sets

In [None]:
# Inspect the dtype of each column in df.
df.dtypes

In [None]:
# Convert 'Sub_Region' Column to Float
# NOTE(review): errors="coerce" turns every value that cannot be parsed as a
# number into NaN -- confirm this is intended for this column.
df["Sub_Region"] = pd.to_numeric(df["Sub_Region"], errors="coerce")
# Re-check the column dtypes after the conversion.
df.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# Split features and targets into train and test partitions; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

# Shape of all the Data


In [None]:
# Report the dimensions of each partition produced by the split.
for name, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{name} Shape:", part.shape)


# Metrics and Scoring

In [None]:
# Fit a single decision tree on the training split, then report its score
# on the held-out split.
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a forest of 200 trees on the training split, then score it on the
# held-out split.
rf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Rank the features by importance, most important first.
# feature_importances_ holds one value per input feature, so it is paired
# with the feature column names rather than with the class names.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Visualizing the Decision Tree in a Regression Task

In [None]:
# Fit the regressor, set max_depth = 3
# The fixed random_state keeps the fitted tree reproducible between runs.
regr = DecisionTreeRegressor(random_state=1234, max_depth=3)
model = regr.fit(X, one_hot_y)


In [None]:
# Render the fitted tree's decision rules as plain text and print them.
text_representation = tree.export_text(decision_tree=regr)
print(text_representation)

In [None]:
# Draw the fitted regression tree and save the figure to disk.
fig = plt.figure(figsize=(25, 20))
# regr was fitted on X, which holds the leading columns of df, so the labels
# are taken from those same columns to stay aligned with the feature indices.
# A plain list is passed because some scikit-learn releases only accept a
# list (not a pandas Index) for feature_names.
tree_feature_names = list(df.columns[:X.shape[1]])
_ = tree.plot_tree(regr, feature_names=tree_feature_names, filled=True)
fig.savefig("decision_tree.png")