# Random Forest: College Admission

## Step 1: Load the Data

In [None]:
import os
import urllib.request

data_url = 'https://elephantscale-public.s3.amazonaws.com/data/college-admissions/admission-data.csv'
shared_copy = "../data/college-admissions/admission-data.csv"

# Resolve where the CSV lives: the copy under ../data wins; otherwise use a
# same-named file in the working directory, downloading it if it is absent.
if os.path.exists(shared_copy):
    data_location = shared_copy
else:
    data_location = os.path.basename(shared_copy)
    if not os.path.exists(data_location):
        print("Downloading : ", data_url)
        urllib.request.urlretrieve(data_url, data_location)

print('data_location:', data_location)


In [None]:
import pandas as pd

# Read the CSV at the path resolved by the download cell into a DataFrame.
dataset = pd.read_csv(data_location)
# Bare trailing expression: the notebook renders the frame as the cell output.
dataset

## Step 2 : Basic Analysis

### 2.1 Look at data split by 'admit' column
Looks like a reasonable distribution.

In [None]:
## TODO : group by 'admit' column
# Row count for each distinct value of 'admit' (i.e. the class balance).
dataset.groupby('admit').size()

### 2.2 Summary of data
use 'describe' function

In [None]:
## TODO : use 'describe'
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()

## Step 3: Build feature vectors

In [None]:
## TODO : select the model inputs: 'gre', 'gpa', 'rank'
input_columns = ['gre', 'gpa', 'rank']
x = dataset.loc[:, input_columns]

## TODO : the label to predict is the 'admit' column
y = dataset.loc[:, 'admit']

# Sanity check: x and y must have the same number of rows.
print ('x : ', x.shape)
print ('y : ', y.shape)

## Step 4: Split Data into training and test.

We will split the data into training and test sets.  (You know the drill by now).

In [None]:
## TODO: Use training / test split of 80%/20%

from sklearn.model_selection import train_test_split

# No random_state is passed, so each run draws a different random 80/20 partition.
x_train,x_test,y_train, y_test = train_test_split(x,y,  test_size=0.2)
## to control train/test split set random_state to a number
# x_train,x_test,y_train, y_test = train_test_split(x,y, random_state=0, test_size=0.2)

# The row counts of the train and test pieces should add up to the full dataset.
print ("x_train :" , x_train.shape )
print ("x_test :", x_test.shape)
print ("y_train :", y_train.shape)
print ("y_test :", y_test.shape)

## Step 5: Random Forest

In [None]:
%%time
## TODO : Create a RandomForest model

from sklearn.ensemble import RandomForestClassifier

# Default hyperparameters; no random_state, so the fitted forest differs between runs.
rf = RandomForestClassifier()

## TODO : train on (x_train, y_train)
# fit() returns the estimator itself, so `model` and `rf` refer to the same object.
model = rf.fit(x_train, y_train)
print(model)

## Step 6 : Get Predictions on Test

In [None]:
## TODO : Get predictions on test data
## Hint : 'test' set name
# Predicted 'admit' label for every row of the held-out test features.
y_pred = model.predict(x_test)
y_pred

In [None]:
import pandas as pd

# Side-by-side view of the true test labels and the model's predictions,
# one row per test sample.
a = pd.DataFrame({'label' : y_test, 'prediction': y_pred})
a

## Step 7: Evaluate the model.

Let us check to see how the model did, using accuracy as a measure.

### 7.1 Model Accuracy

In [None]:
# Mean accuracy of the fitted model on each partition. A training score far
# above the test score suggests the model is overfitting the training rows.
## TODO : score with (x_test, y_test)
train_accuracy, test_accuracy = (
    model.score(x_train, y_train),
    model.score(x_test, y_test),
)

print ("Train accuracy: ", train_accuracy)
print ("Test accuracy: ", test_accuracy)

In [None]:
from sklearn.metrics import accuracy_score

# Same test accuracy as model.score(x_test, y_test), computed from the
# predictions already stored in y_pred.
accuracy_score(y_test, y_pred)

### 7.2 Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Explicit figure/axes so the heatmap can be labelled: confusion_matrix puts
# the true labels on the rows and the predicted labels on the columns.
fig, ax = plt.subplots(figsize=(8, 5))

# colormaps : cmap="YlGnBu" , cmap="Greens", cmap="Blues",  cmap="Reds"
sns.heatmap(cm, annot=True, cmap="Reds", fmt='d', ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion matrix (test set)")

# The original chained a no-argument Axes.plot() call, which draws nothing and
# only printed a stray `[]`; plt.show() renders the figure without that noise.
plt.show()

## Step 8 Feature Importances

How important were our features?

check out model.feature_importances_

What can you conclude?



In [None]:
# One importance value per input column, in the same order as `input_columns`.
model.feature_importances_

In [None]:
# Pair each input column with its importance and rank them, most important first.
# The displayed column label was misspelled ('importantance'); it is now 'importance'.
feature_importance_df = pd.DataFrame({'feature' : input_columns, 'importance' : model.feature_importances_})

# Backward-compatible alias for the original (misspelled) variable name.
feature_importantance_df = feature_importance_df

feature_importance_df.sort_values(['importance'], ascending=False)

## Step 9: Do a Few Runs
- Click 'Cell --> Run All'
- Observe how the 'accuracy' output changes above
- Why?  Can we get the same accuracy every time?

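To get a 'consistent' train/test split, set the seed parameter `random_state`. The forest itself is also randomized, so pass `random_state` to `RandomForestClassifier` as well if you want identical accuracy on every run.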
   
```python
x_train,x_test,y_train, y_test = train_test_split(x,y, random_state=0, test_size=0.2)

```

## Step-10: Hyper Parameter Tuning

Let's use cross-validation to find the best parameters for the forest!



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

# Candidate values per hyperparameter. GridSearchCV evaluates every combination:
# 2 * 2 * 18 * 4 * 8 * 2 = 4608 candidates, each fitted cv=5 times.
grid_param = {
    "n_estimators" : [50,100],
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf' : range(2,10,2),
    'min_samples_split': range(2,10,1),
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # makes the fit fail. For classifiers it meant 'sqrt', which works in all versions.
    'max_features' : ['sqrt','log2']
}
# n_jobs=-1 uses all CPU cores; verbose=2 logs progress for each fit.
grid_search = GridSearchCV(estimator=rf,
                           param_grid=grid_param,
                           cv=5,
                           n_jobs =-1,
                           verbose = 2)

In [None]:
%%time

# Runs the full cross-validated search on the training split only; the test
# split stays untouched so it can still give an unbiased final score.
grid_search.fit(x_train,y_train)

## Create the Best Tree

In [None]:
# Find the best params
# best_params_ is the winning combination from the grid; best_score_ is its
# mean cross-validated score on the training data.
print("best parameters", grid_search.best_params_)
print("best score: ", grid_search.best_score_)

In [None]:
#  Use the best params
from sklearn.ensemble import RandomForestClassifier

# Pull the winning value of each tuned hyperparameter out of the grid search
# and build a fresh forest from exactly those settings.
tuned_names = ('n_estimators', 'criterion', 'max_depth',
               'min_samples_leaf', 'min_samples_split', 'max_features')
tuned_kwargs = {name: grid_search.best_params_[name] for name in tuned_names}
rf = RandomForestClassifier(**tuned_kwargs)

# Retrain on the training split; `model` now refers to the tuned forest.
model = rf.fit(x_train, y_train)

print ("Train accuracy: ", model.score(x_train,y_train))
print ("Test accuracy: ", model.score(x_test, y_test))

In [None]:
from sklearn.metrics import confusion_matrix

# Recompute predictions with the tuned model. The original cell reused `y_pred`
# from the untuned forest, so its matrix did not describe the tuned model.
y_pred_tuned = model.predict(x_test)

# Rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, y_pred_tuned)
print (cm)

import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 5))

# colormaps : cmap="YlGnBu" , cmap="Greens", cmap="Blues",  cmap="Reds"
sns.heatmap(cm, annot=True, cmap="Reds", fmt='d', ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion matrix (tuned model, test set)")

# plt.show() replaces the original no-argument Axes.plot() call, which drew nothing.
plt.show()