# Random Forest Tree Model

#### Add libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ast
import seaborn as sns
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
import pickle

#### Add the database 2005-2010

In [None]:
# Load the feature table covering 2005 to 2010; it is later split into train and test sets.
dataset = pd.read_csv('../Data/features_2005_2010_yearly_citation.csv')
# NOTE: a notebook cell only displays its last expression, so this head() call is not shown.
dataset.head()
# Print the column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Data for 2010: only used to extract the ids of the papers from year 2010
# (those ids define the test set) and to look up paper metadata in the demo.
id_2010 = pd.read_csv('../Data/papers2010.csv')
id_2010.head()

In [None]:
# Split data for train and test: papers whose id appears in papers2010.csv
# form the test set, every other paper in the dataset forms the training set.
ids = id_2010['id'].tolist()
# Build the membership mask once and negate it with ~ instead of comparing
# a boolean Series against False.
in_2010 = dataset['id'].isin(ids)
train = dataset[~in_2010]
test = dataset[in_2010]
train.info()

In [None]:
# Set ytrain and Xtrain
# Column positions 14..20 are used as the targets for the
# 1, 2, 3, 4, 5, 7 and 10 year horizons, in that order.
(y_train_1yr, y_train_2yr, y_train_3yr, y_train_4yr,
 y_train_5yr, y_train_7yr, y_train_10yr) = (
    train.iloc[:, column] for column in range(14, 21)
)

# The ten columns at positions 2..11 are the model inputs.
X_train = train.iloc[:, 2:12]
print(y_train_5yr)
X_train.head()

In [None]:
# Print the column names, dtypes and non-null counts of the test split.
test.info()

In [None]:
# Set ytest and Xtest
# Same column layout as the training targets: positions 14..20 are the
# 1, 2, 3, 4, 5, 7 and 10 year horizons, in that order.
(y_test_1yr, y_test_2yr, y_test_3yr, y_test_4yr,
 y_test_5yr, y_test_7yr, y_test_10yr) = (
    test.iloc[:, column] for column in range(14, 21)
)

# Column 1 holds the paper id; copy it so later changes do not touch `test`.
test_ids = test.iloc[:, 1].copy()
# The ten columns at positions 2..11 are the model inputs.
X_test = test.iloc[:, 2:12]
print(y_test_5yr)
X_test.head()

In [None]:
# Turn the id Series into a one-column DataFrame.
# NOTE: this cell is not re-runnable on its own; a second run would call
# to_frame() on a DataFrame, which has no such method.
test_ids = test_ids.to_frame()

#### Check the length of training-set vs. testing-set

In [None]:
# Report the row count of the full dataset and of each split.
for label, frame in (("dataset length:", dataset),
                     ("trainset length:", train),
                     ("testset length:", test)):
    print(label, len(frame))
# Summary statistics of the training split (displayed as the cell output).
train.describe()

# Model Prediction

#### random forest regressor after hyperparameter optimization

In [None]:
def save_model(model, name):
    """Pickle ``model`` to ``../Results/<name>.pkl``.

    Args:
        model: Any picklable object (here: a fitted regressor).
        name: File name without extension; the file is written to the
            ``../Results/`` folder, relative to the current working directory.

    Raises:
        OSError: If the target file cannot be opened for writing
            (for example because ``../Results/`` does not exist).
    """
    model_name = '../Results/' + name + '.pkl'
    # The context manager closes the handle even when pickling fails, instead
    # of leaving an open file object for the garbage collector to clean up.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)

In [None]:
def train_and_Predict(X_train, y_train, X_test, name=None, save=False):
    """Fit a random forest regressor and return its predictions for ``X_test``.

    A new ``RandomForestRegressor`` with fixed hyperparameters is trained on
    every call; nothing is cached between calls.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values, aligned with ``X_train``.
        X_test: Feature matrix to predict for (2-D, one row per sample).
        name: File name (without extension) used when the fitted model is
            saved. Optional, because it is only needed when ``save`` is true.
        save: When true, pickle the fitted model via ``save_model`` so it can
            be reloaded later. Defaults to ``False`` (no file is written).

    Returns:
        The array returned by ``regressor.predict(X_test)``.

    Raises:
        ValueError: If ``save`` is true but no ``name`` was given.
    """
    if save and name is None:
        raise ValueError("a model name is required when save=True")
    regressor = RandomForestRegressor(
        n_estimators=32,
        random_state=0,
        min_samples_split=2,
        min_samples_leaf=2,
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
        # regressor it meant "consider all features", which the fraction 1.0
        # expresses on old and new versions alike.
        max_features=1.0,
        max_depth=60,
        bootstrap=True,
    )
    regressor.fit(X_train, y_train)
    # Saving is opt-in so that plain prediction runs do not touch the disk.
    if save:
        save_model(regressor, name)
    y_predict = regressor.predict(X_test)
    return y_predict

In [None]:
# Train on the 1-year target and predict it for the whole test set.
y_predict_1yr = train_and_Predict(X_train=X_train, y_train=y_train_1yr,
                                  X_test=X_test, name='rf_1yr')
y_predict_1yr

In [None]:
# Train on the 2-year target and predict it for the whole test set.
y_predict_2yr = train_and_Predict(X_train=X_train, y_train=y_train_2yr,
                                  X_test=X_test, name='rf_2yr')
y_predict_2yr

In [None]:
# Train on the 3-year target and predict it for the whole test set.
y_predict_3yr = train_and_Predict(X_train=X_train, y_train=y_train_3yr,
                                  X_test=X_test, name='rf_3yr')
y_predict_3yr

In [None]:
# Train on the 4-year target and predict it for the whole test set.
y_predict_4yr = train_and_Predict(X_train=X_train, y_train=y_train_4yr,
                                  X_test=X_test, name='rf_4yr')
y_predict_4yr

In [None]:
# Train on the 5-year target and predict it for the whole test set.
y_predict_5yr = train_and_Predict(X_train=X_train, y_train=y_train_5yr,
                                  X_test=X_test, name='rf_5yr')
y_predict_5yr

In [None]:
# Train on the 7-year target and predict it for the whole test set.
y_predict_7yr = train_and_Predict(X_train=X_train, y_train=y_train_7yr,
                                  X_test=X_test, name='rf_7yr')
y_predict_7yr

In [None]:
# Train on the 10-year target and predict it for the whole test set.
y_predict_10yr = train_and_Predict(X_train=X_train, y_train=y_train_10yr,
                                   X_test=X_test, name='rf_10yr')
y_predict_10yr

#### In case of loading a model from the results folder

In [None]:
# Reload a previously saved 1-year model and predict with it.
# NOTE: ../Results/rf_1yr.pkl only exists once a fitted model has been written
# with save_model(regressor, 'rf_1yr'); otherwise open() raises FileNotFoundError.
# SECURITY: unpickling executes code stored in the file, so only load model
# files that you created yourself or otherwise trust.
with open('../Results/rf_1yr.pkl', 'rb') as model_file:
    new_model = pickle.load(model_file)
y_predict_new = new_model.predict(X_test)
y_predict_new

## Evaluation

#### Results:

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# One entry per prediction horizon: (report title, true values, predictions),
# listed in the order in which the reports are printed.
evaluation_runs = [
    ("Results for 1 year prediction:", y_test_1yr, y_predict_1yr),
    ("Results for 2 year prediction:", y_test_2yr, y_predict_2yr),
    ("Results for 3 year prediction:", y_test_3yr, y_predict_3yr),
    ("Results for 4 year prediction:", y_test_4yr, y_predict_4yr),
    ("Results for 5 year prediction:", y_test_5yr, y_predict_5yr),
    ("Results for 7 year prediction:", y_test_7yr, y_predict_7yr),
    ("Results for 10 year prediction:", y_test_10yr, y_predict_10yr),
]

for position, (report_title, y_true, y_pred) in enumerate(evaluation_runs):
    # The separator goes between two reports, never after the last one.
    if position > 0:
        print('----------------------------------------')
    squared_error = metrics.mean_squared_error(y_true, y_pred)
    print(report_title)
    print("R squared:", r2_score(y_true, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    print('Mean Squared Error:', squared_error)
    print('Root Mean Squared Error:', np.sqrt(squared_error))

#### Histograms:

In [None]:
from matplotlib.ticker import PercentFormatter
def graph_hist(y_train, y_test, y_predict, bins, title):
    """Draw a grouped, normalised histogram of train, test and predicted targets.

    Args:
        y_train: Target values of the training set.
        y_test: Target values of the test set.
        y_predict: Predicted values for the test set.
        bins: Number of bins; also used as the upper end of the plotted
            value range, i.e. the histogram covers ``(0, bins)``.
        title: Title shown above the plot.
    """
    samples = [y_train, y_test, y_predict]
    legend_labels = ['train', 'test', 'predict']
    plt.hist(samples, bins=bins, range=(0, bins), density=True, label=legend_labels)
    plt.legend(loc='upper right')
    # Histogram heights are densities (density=True); label the y ticks as
    # percentages, where 1 corresponds to 100%.
    current_axes = plt.gca()
    current_axes.yaxis.set_major_formatter(PercentFormatter(1))
    plt.title(title)
    plt.show()

In [None]:
# One histogram per horizon: (train targets, test targets, predictions, bins, title).
# The 1-year plot uses 10 bins, every other horizon uses 25.
histogram_runs = [
    (y_train_1yr, y_test_1yr, y_predict_1yr, 10, "1 year prediction"),
    (y_train_2yr, y_test_2yr, y_predict_2yr, 25, "2 year prediction"),
    (y_train_3yr, y_test_3yr, y_predict_3yr, 25, "3 year prediction"),
    (y_train_4yr, y_test_4yr, y_predict_4yr, 25, "4 year prediction"),
    (y_train_5yr, y_test_5yr, y_predict_5yr, 25, "5 year prediction"),
    (y_train_7yr, y_test_7yr, y_predict_7yr, 25, "7 year prediction"),
    (y_train_10yr, y_test_10yr, y_predict_10yr, 25, "10 year prediction"),
]
for histogram_args in histogram_runs:
    graph_hist(*histogram_args)

## Demo

In [None]:
# Choose a paper: display the first ten rows of the test feature matrix
# to pick a candidate for the demo.
X_test.head(10)


In [None]:
# Keep the id column (position 1) next to the feature columns (positions 2..11).
X_test_demo = test.iloc[:, 1:12]
# The demo uses the test row at position 310; its id is cast to a NumPy
# 64-bit integer so it can be compared against the ids in papers2010.csv.
id_test = np.int64(X_test_demo.iloc[310]['id'])
id_test

In [None]:
# Look the demo paper up once in the 2010 paper table (first row with a
# matching id), then print its descriptive fields.
demo_paper = id_2010.loc[id_2010['id'] == id_test].iloc[0]
for label, column in (("title:", 'title'),
                      ("authors:", 'authors'),
                      ("year:", 'year'),
                      ("venue:", 'venue')):
    print(label, str(demo_paper[column]))

In [None]:
# Get the true targets and the features of the demo paper (test row 310).
# The target columns use the same positions as the y_train_* / y_test_*
# definitions: 14 = 1 year, 15 = 2 years, 18 = 5 years, 20 = 10 years.
y_test_1yr = test.iloc[310, 14]
y_test_2yr = test.iloc[310, 15]
y_test_5yr = test.iloc[310, 18]
y_test_10yr = test.iloc[310, 20]
# NOTE(review): these assignments replace the full y_test_* Series with single
# values, so the evaluation and histogram cells cannot be re-run afterwards
# without first re-creating the Series.

# Feature row of the demo paper (columns 2..11), as a 1-D Series.
X_test_demo = test.iloc[310, 2:12]
X_test_demo

In [None]:
# Display every column of the test row at position 300.
# NOTE(review): the demo cells use row 310, not 300 — confirm which row is intended.
test.iloc[300,:]

In [None]:
# Train the 1-year model and predict for the single demo paper.
# np.array([...]) wraps the 1-D feature row into a 2-D array with one row,
# which is the input shape predict() works on.
# The model name is passed explicitly: train_and_Predict takes four arguments,
# and the original three-argument call raised a TypeError.
y_predict_1yr_demo = train_and_Predict(X_train, y_train_1yr, np.array([X_test_demo]), 'rf_1yr_demo')
y_predict_1yr_demo

In [None]:
# Train the 2-year model and predict for the single demo paper.
# np.array([...]) wraps the 1-D feature row into a 2-D array with one row,
# which is the input shape predict() works on.
# The model name is passed explicitly: train_and_Predict takes four arguments,
# and the original three-argument call raised a TypeError.
y_predict_2yr_demo = train_and_Predict(X_train, y_train_2yr, np.array([X_test_demo]), 'rf_2yr_demo')
y_predict_2yr_demo

In [None]:
# Train the 5-year model and predict for the single demo paper.
# np.array([...]) wraps the 1-D feature row into a 2-D array with one row,
# which is the input shape predict() works on.
# The model name is passed explicitly: train_and_Predict takes four arguments,
# and the original three-argument call raised a TypeError.
y_predict_5yr_demo = train_and_Predict(X_train, y_train_5yr, np.array([X_test_demo]), 'rf_5yr_demo')
y_predict_5yr_demo

In [None]:
# Train the 10-year model and predict for the single demo paper.
# np.array([...]) wraps the 1-D feature row into a 2-D array with one row,
# which is the input shape predict() works on.
# The model name is passed explicitly: train_and_Predict takes four arguments,
# and the original three-argument call raised a TypeError.
y_predict_10yr_demo = train_and_Predict(X_train, y_train_10yr, np.array([X_test_demo]), 'rf_10yr_demo')
y_predict_10yr_demo

## Feature Engineering

In [None]:
# Feature importance in the eyes of the random forest regressor model.
# NOTE: ../Results/rf_1yr.pkl only exists once a fitted model has been written
# with save_model(regressor, 'rf_1yr'); otherwise open() raises FileNotFoundError.
# SECURITY: unpickling executes code stored in the file, so only load model
# files that you created yourself or otherwise trust.
with open('../Results/rf_1yr.pkl', 'rb') as model_file:
    regressor = pickle.load(model_file)
feature_list = list(X_train.columns)
# Get numerical feature importances
importances = list(regressor.feature_importances_)
# List of tuples with variable and importance (rounded to two decimals)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
# Print out the features and importances. A plain loop is used because a list
# comprehension would build (and display) a throwaway list of None values.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))


In [None]:
# One x position per feature.
x_values = list(range(len(importances)))
# Split the (feature, importance) pairs into two parallel lists; the pairs
# keep the order they have in feature_importances.
sorted_features = [feature_name for feature_name, _ in feature_importances]
sorted_importances = [score for _, score in feature_importances]
# Running total of the importances.
cumulative_importances = np.cumsum(sorted_importances)
# Green line: cumulative importance per feature.
plt.plot(x_values, cumulative_importances, 'g-')
# Red dashed line marks 95% of the importance.
plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')
# Label each x position with its feature name, written vertically.
plt.xticks(x_values, sorted_features, rotation='vertical')
# Axis labels and title; the trailing semicolon suppresses the cell's text output.
plt.xlabel('Variable')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances');

Next, we keep only the features that together account for the top 95% of the cumulative importance.

In [None]:
# After feature engineering: reduce both splits to the same column subset.
# The selection is applied identically to train and test so that their
# columns stay aligned; .copy() makes the new frames independent of the originals.
selected_columns = ['author_rank', 'venue_MPI', 'author_MPI', 'versatility', 'diversity', 'n_citation']
new_train = train[selected_columns].copy()
new_test = test[selected_columns].copy()
new_train.head()

In [None]:
# Set ytrain and Xtrain
# The last selected column (n_citation) is the target; all columns before it
# are features. The slice is ":-1" rather than "1:-1" so that the first
# selected feature (author_rank) is not silently dropped.
y_train_5yr = new_train.iloc[:, -1]
X_train = new_train.iloc[:, :-1]
# Set ytest and Xtest
# NOTE(review): the training target is bound to y_train_5yr but the test target
# to y_test_2yr — confirm which horizon n_citation is meant to represent.
y_test_2yr = new_test.iloc[:, -1]
X_test = new_test.iloc[:, :-1]

In [None]:
# Check the (rows, columns) size of the reduced training feature matrix.
X_train.shape

## Correlation graph between features

In [None]:
# Use seaborn for pair plots
import seaborn as sns
sns.set(style="ticks", color_codes=True);
# Create a custom color palette
palette = sns.xkcd_palette(['dark blue', 'dark green', 'gold', 'orange'])
# Make the pair plot with some aesthetic changes: KDE curves on the diagonal
# and slightly transparent scatter points elsewhere.
# `fill=True` replaces `shade=True`, an alias that seaborn has deprecated
# since v0.11 in favour of `fill`.
sns.pairplot(train, diag_kind='kde', palette=palette, plot_kws=dict(alpha=0.7),
             diag_kws=dict(fill=True))

## Hyperparameter tuning

#### Checking the current random forest parameters:

In [None]:
from pprint import pprint

# Look at parameters used by our current forest
# (`regressor` is the model loaded from ../Results/rf_1yr.pkl in the
# feature-importance cell).
print('Parameters currently in use:\n')
pprint(regressor.get_params())

Let's optimize the following hyperparameters:

- n_estimators = number of trees in the forest
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)

#### Creating the parameter grid:

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest: 5 evenly spaced values from 10 to 100
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=5)]
# Number of features to consider at every split.
# 1.0 (all features) stands in for 'auto', which was deprecated in
# scikit-learn 1.1 and removed in 1.3; for a regressor 'auto' meant all
# features, so the candidate set is unchanged.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid: one list of candidate values per hyperparameter
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

#### Random forest training:

In [None]:
# Base model whose hyperparameters are tuned by the search.
regressor_optimized = RandomForestRegressor()
# Randomized search over the grid: 10 sampled parameter combinations, each
# scored with 3-fold cross validation (two folds train, the third validates),
# using all available cores.
rf_random = RandomizedSearchCV(
    estimator=regressor_optimized,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the current features and the 2-year target.
rf_random.fit(X_train, y_train_2yr)

#### The best parameters:

In [None]:
# Parameter combination that scored best in the randomized search.
rf_random.best_params_