# RF Intro

* Based on: https://blogs.oracle.com/ai-and-datascience/post/an-introduction-to-building-a-classification-model-using-random-forests-in-python
* Dataset from: https://archive.ics.uci.edu/ml/datasets/Autistic+Spectrum+Disorder+Screening+Data+for+Adolescent+++
* Dataset info converted do markdown: https://github.com/efurlanm/ml/blob/master/datasets/Autism-Adolescent-Data_description.md

---------------------------------------------------------------------------

## Load Python packages

In [2]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import arff
import pandas as pd

## Pre-Process the data

In [3]:
data = arff.loadarff('datasets/Autism-Adolescent-Data.arff')
data[1]

Dataset: adolescent
	A1_Score's type is nominal, range is ('0', '1')
	A2_Score's type is nominal, range is ('0', '1')
	A3_Score's type is nominal, range is ('0', '1')
	A4_Score's type is nominal, range is ('0', '1')
	A5_Score's type is nominal, range is ('0', '1')
	A6_Score's type is nominal, range is ('0', '1')
	A7_Score's type is nominal, range is ('0', '1')
	A8_Score's type is nominal, range is ('0', '1')
	A9_Score's type is nominal, range is ('0', '1')
	A10_Score's type is nominal, range is ('0', '1')
	age's type is numeric
	gender's type is nominal, range is ('m', 'f')
	ethnicity's type is nominal, range is ('Hispanic', 'Black', 'White-European', 'Middle Eastern ', 'South Asian', 'Others', 'Latino', 'Asian')
	jundice's type is nominal, range is ('yes', 'no')
	austim's type is nominal, range is ('yes', 'no')
	contry_of_res's type is nominal, range is ('Austria', 'AmericanSamoa', 'United Kingdom', 'Albania', 'Belgium', 'Afghanistan', 'Australia', 'Bahrain', 'Azerbaijan', 'United Ara

In [7]:
data[0][:2]

array([(b'0', b'0', b'0', b'1', b'1', b'1', b'1', b'1', b'1', b'0', 15., b'm', b'Hispanic', b'yes', b'yes', b'Austria', b'no', 6., b'12-16 years', b'Parent', b'NO'),
       (b'0', b'0', b'0', b'0', b'0', b'0', b'0', b'0', b'1', b'1', 15., b'm', b'Black', b'no', b'no', b'Austria', b'no', 2., b'12-16 years', b'Relative', b'NO')],
      dtype=[('A1_Score', 'S1'), ('A2_Score', 'S1'), ('A3_Score', 'S1'), ('A4_Score', 'S1'), ('A5_Score', 'S1'), ('A6_Score', 'S1'), ('A7_Score', 'S1'), ('A8_Score', 'S1'), ('A9_Score', 'S1'), ('A10_Score', 'S1'), ('age', '<f8'), ('gender', 'S1'), ('ethnicity', 'S15'), ('jundice', 'S3'), ('austim', 'S3'), ('contry_of_res', 'S20'), ('used_app_before', 'S3'), ('result', '<f8'), ('age_desc', 'S11'), ('relation', 'S24'), ('Class/ASD', 'S3')])

In [8]:
df = pd.DataFrame(data[0])
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'0',b'0',b'0',b'1',b'1',b'1',b'1',b'1',b'1',b'0',...,b'm',b'Hispanic',b'yes',b'yes',b'Austria',b'no',6.0,b'12-16 years',b'Parent',b'NO'
1,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'1',...,b'm',b'Black',b'no',b'no',b'Austria',b'no',2.0,b'12-16 years',b'Relative',b'NO'
2,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'1',...,b'f',b'?',b'no',b'no',b'AmericanSamoa',b'no',2.0,b'12-16 years',b'?',b'NO'
3,b'0',b'1',b'1',b'1',b'1',b'1',b'0',b'1',b'1',b'0',...,b'f',b'White-European',b'no',b'no',b'United Kingdom',b'no',7.0,b'12-16 years',b'Self',b'YES'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'0',b'0',b'0',...,b'f',b'?',b'no',b'no',b'Albania',b'no',7.0,b'12-16 years',b'?',b'YES'


In [9]:
list(df)

['A1_Score',
 'A2_Score',
 'A3_Score',
 'A4_Score',
 'A5_Score',
 'A6_Score',
 'A7_Score',
 'A8_Score',
 'A9_Score',
 'A10_Score',
 'age',
 'gender',
 'ethnicity',
 'jundice',
 'austim',
 'contry_of_res',
 'used_app_before',
 'result',
 'age_desc',
 'relation',
 'Class/ASD']

In [10]:
df = df.apply(lambda x: x.astype(str).str.lower())    # Lowercasing
df = df.replace("1", 1)
df = df.replace("0", 0)
df = df.replace("b'1'", 1)
df = df.replace("b'0'", 0)
df = df.replace("b'yes'", 1)
df = df.replace("b'no'", 0)
df = df.replace("b'f'", 1)
df = df.replace("b'm'", 0)
df = df.replace("b'Y'", 1)
df = df.replace("b'N'", 0)

In [11]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,0,0,0,1,1,1,1,1,1,0,...,0,b'hispanic',1,1,b'austria',0,6.0,b'12-16 years',b'parent',0
1,0,0,0,0,0,0,0,0,1,1,...,0,b'black',0,0,b'austria',0,2.0,b'12-16 years',b'relative',0
2,0,0,0,0,0,0,0,0,1,1,...,1,b'?',0,0,b'americansamoa',0,2.0,b'12-16 years',b'?',0
3,0,1,1,1,1,1,0,1,1,0,...,1,b'white-european',0,0,b'united kingdom',0,7.0,b'12-16 years',b'self',1
4,1,1,1,1,1,1,1,0,0,0,...,1,b'?',0,0,b'albania',0,7.0,b'12-16 years',b'?',1


## Subset the data

| **Attribute**                 | **Type**            | **Description**                                                                  |
| ----------------------------- | ------------------- | -------------------------------------------------------------------------------- |
| Question 1 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 2 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 3 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 4 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 5 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 6 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 7 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 8 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 9 Answer             | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Question 10 Answer            | Binary (0, 1)       | The answer code of the question based on the screening method used               |
| Gender                        | String              | Male or Female                                                                   |
| Born with jaundice            | Boolean (yes or no) | Whether the case was born with jaundice                                          |
| Austim                        | Boolean (yes or no) | If the case has autism                                                           |

In [12]:
X = list(df.loc[:,'A1_Score':'A10_Score']) + ['gender'] + ['jundice'] + ['austim']
X

['A1_Score',
 'A2_Score',
 'A3_Score',
 'A4_Score',
 'A5_Score',
 'A6_Score',
 'A7_Score',
 'A8_Score',
 'A9_Score',
 'A10_Score',
 'gender',
 'jundice',
 'austim']

* iloc: location based indexing for selection by position
* Y contains only the column "Class/ASD" (column 20, starting from 0)

In [13]:
Y = df.iloc[:,20]
Y

0      0
1      0
2      0
3      1
4      1
      ..
99     0
100    0
101    1
102    0
103    0
Name: Class/ASD, Length: 104, dtype: int64

New database with only the selected variables

In [14]:
df2 = df[X]
df2.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,jundice,austim
0,0,0,0,1,1,1,1,1,1,0,0,1,1
1,0,0,0,0,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,0,1,1,1,0,0
3,0,1,1,1,1,1,0,1,1,0,1,0,0
4,1,1,1,1,1,1,1,0,0,0,1,0,0


one-hot encoding the categorical variables so that we can turn the categorical variables to numeric

## Split the data into train and test sets

In [15]:
XTrain, XTest, YTrain, YTest = train_test_split(df2, Y, test_size=0.2)    # 80/20%
XTrain.shape, YTrain.shape, XTest.shape, YTest.shape

((83, 13), (83,), (21, 13), (21,))

In [16]:
YTest.head()

33    1
85    1
31    1
75    0
10    0
Name: Class/ASD, dtype: int64

In [17]:
XTest.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,jundice,austim
33,1,1,1,1,1,1,1,0,1,1,0,1,0
85,0,0,1,1,1,0,1,1,1,1,0,1,0
31,1,1,1,1,1,1,0,0,0,1,0,0,0
75,1,1,1,0,1,0,1,0,1,0,0,0,1
10,1,0,0,0,1,1,0,1,1,0,1,1,0


# Build a Random Forest Classifier

* n_jobs = int : The number of jobs to run in parallel over the trees, default=None=1
* random_state = int : Controls the sampling of the features to consider when looking for the best split at each node (if max_features < n_features), default=None, random_state=0=seed used by the random number generator

In [18]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf

RandomForestClassifier(n_jobs=2, random_state=0)

* bootstrap (True) : Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree
* class_weight (None) : Weights associated with classes
* criterion ('gini') : The function to measure the quality of a split. “gini” for the Gini impurity
* max_depth (None) : The maximum depth of the tree
* max_features ('auto') : The number of features to consider when looking for the best split. If “auto”, then max_features=sqrt(n_features)
* max_leaf_nodes (None) : Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes
* **min_impurity_split (deprecated)** : Threshold for early stopping in tree growth
* min_samples_leaf (1) : The minimum number of samples required to be at a leaf node
* min_samples_split (2) : The minimum number of samples required to split an internal node
* min_weight_fraction_leaf (0.0) : The minimum weighted fraction of the sum total of weights
* **n_estimators (100)** : The number of trees in the forest
* **n_jobs (None)** : The number of jobs to run in parallel
* oob_score (False) : Whether to use out-of-bag samples to estimate the generalization score
* **random_state (None)** : Controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True) and the sampling of the features to consider when looking for the best split at each node (if max_features < n_features)
* verbose (0) : Verbosity
* warm_start (False) : When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest

## Fit
fit(X, Y) : Build a forest of trees from the training set, where X is the training input samples, and Y is the target values.

In [19]:
clf.fit(XTrain, YTrain)    # train de model

RandomForestClassifier(n_jobs=2, random_state=0)

In [20]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

RandomForestClassifier(min_impurity_split=1e-07, n_estimators=10, n_jobs=2,
                       random_state=0)

## Predict

In [21]:
preds = clf.predict(XTest)

In [22]:
preds

array([1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1])

## Check de accuracy of the model

In [24]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(YTest, preds))

Accuracy: 0.8571428571428571


In [25]:
pd.crosstab(YTest, preds, rownames=['Actual Result'], colnames=['Predicted Result'])

Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,0
1,3,11


## Check feature importance

In [26]:
list(zip(XTrain, clf.feature_importances_))

[('A1_Score', 0.03827731992746742),
 ('A2_Score', 0.04971970187695681),
 ('A3_Score', 0.11902304990051994),
 ('A4_Score', 0.1012294925765204),
 ('A5_Score', 0.17991865936303242),
 ('A6_Score', 0.09008866379161827),
 ('A7_Score', 0.07474806909839479),
 ('A8_Score', 0.07851124059405166),
 ('A9_Score', 0.05724248340041242),
 ('A10_Score', 0.14525546367053693),
 ('gender', 0.029159907264994605),
 ('jundice', 0.021117635693473438),
 ('austim', 0.0157083128420207)]