In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import ast
import seaborn as sns
from sklearn import metrics
from statistics import mean
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV , RepeatedStratifiedKFold , cross_val_predict, cross_val_score
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, median_absolute_error

In [None]:
# Feature data for 2005 to 2010.
dataset = pd.read_csv('../Data/features_2005_2010_new.csv')
# info() prints its summary itself; head() goes last because a notebook cell
# only renders the value of its final expression.
dataset.info()
dataset.head()

In [None]:
# Load the 2010 paper listing; it is read only to obtain the ids of the
# papers from year 2010.
id_2010 = pd.read_csv(filepath_or_buffer='../Data/papers2010.csv')
id_2010.head()

In [None]:
# Split data for train and test: rows whose id appears in the 2010 listing
# form the test set, every other row is used for training.
ids = id_2010['id'].tolist()
# Compute the membership mask once and negate it with ~ rather than
# comparing a boolean Series against False.
is_2010_paper = dataset['id'].isin(ids)
train = dataset[~is_2010_paper]
test = dataset[is_2010_paper]
train.info()

In [None]:
# Training targets: positional columns 12-15 are taken as the 1-, 2-, 5- and
# 10-year targets, in that order (per the variable names; confirm against the
# CSV header).
y_train_1yr, y_train_2yr, y_train_5yr, y_train_10yr = (
    train.iloc[:, position] for position in range(12, 16)
)

# Training features: positional columns 2 through 11.
X_train = train.iloc[:, 2:12]
print(y_train_5yr)
X_train.head()

In [None]:
# Print a summary of the test frame to sanity-check the split.
test.info()

In [None]:
# Test targets: positional columns 12-15 are taken as the 1-, 2-, 5- and
# 10-year targets, in that order (per the variable names; confirm against the
# CSV header).
y_test_1yr, y_test_2yr, y_test_5yr, y_test_10yr = (
    test.iloc[:, position] for position in range(12, 16)
)

# Test features: positional columns 2 through 11, matching the training slice.
X_test = test.iloc[:, 2:12]
print(y_test_5yr)
X_test.head()

# Train Model

In [None]:
def train_and_Predict(X_train, y_train, X_test, random_state=42):
    """Fit a HistGradientBoostingRegressor and predict on ``X_test``.

    Parameters
    ----------
    X_train
        Training features passed to ``fit``.
    y_train
        Training target passed to ``fit``.
    X_test
        Features passed to ``predict``.
    random_state : int or None, default 42
        Seed forwarded to the estimator so repeated runs of the notebook
        produce the same predictions. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    The predictions returned by the fitted estimator for ``X_test``.
    """
    hgbr = HistGradientBoostingRegressor(
        learning_rate=0.15,
        max_iter=100,
        max_leaf_nodes=31,
        min_samples_leaf=20,
        random_state=random_state,
    )
    hgbr.fit(X_train, y_train)
    return hgbr.predict(X_test)

In [None]:
# Fit on the 1-year target and predict it for the test rows.
y_predict_1yr = train_and_Predict(
    X_train=X_train,
    y_train=y_train_1yr,
    X_test=X_test,
)
y_predict_1yr

In [None]:
# Fit on the 2-year target and predict it for the test rows.
y_predict_2yr = train_and_Predict(X_train, y_train_2yr, X_test)
# Show the 2-year predictions (the cell previously echoed the 1-year array).
y_predict_2yr

In [None]:
# Fit on the 5-year target and predict it for the test rows.
y_predict_5yr = train_and_Predict(
    X_train=X_train,
    y_train=y_train_5yr,
    X_test=X_test,
)
y_predict_5yr

In [None]:
# Fit on the 10-year target and predict it for the test rows.
y_predict_10yr = train_and_Predict(
    X_train=X_train,
    y_train=y_train_10yr,
    X_test=X_test,
)
y_predict_10yr

# Evaluation

In [None]:
def _print_regression_report(horizon, y_true, y_pred):
    """Print R squared, MAE, MSE and RMSE for one prediction horizon.

    Parameters
    ----------
    horizon
        Number of years, used only in the heading line.
    y_true
        Observed target values.
    y_pred
        Predicted target values.
    """
    # Compute MSE once; RMSE is its square root.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print(f"Results for {horizon} year prediction:")
    print("R squared:", r2_score(y_true, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))


_horizon_results = [
    (1, y_test_1yr, y_predict_1yr),
    (2, y_test_2yr, y_predict_2yr),
    (5, y_test_5yr, y_predict_5yr),
    (10, y_test_10yr, y_predict_10yr),
]

for _position, (_horizon, _y_true, _y_pred) in enumerate(_horizon_results):
    # The separator goes between reports only, never after the last one.
    if _position > 0:
        print('----------------------------------------')
        print()
    _print_regression_report(_horizon, _y_true, _y_pred)

In [None]:
def graph_hist(y_test, y_predict, bins, title):
    """Plot grouped histograms of observed versus predicted values.

    Parameters
    ----------
    y_test
        Observed values.
    y_predict
        Predicted values.
    bins : int
        Number of bins. It is also used as the upper edge of the plotted
        range, so the bins span ``[0, bins]`` with unit width and values
        outside that interval are not drawn.
    title : str
        Figure title.
    """
    # Use a dedicated figure so repeated calls never draw onto whatever
    # figure pyplot currently considers active.
    _, ax = plt.subplots()
    ax.hist([y_test, y_predict], range=(0, bins), bins=bins, label=['test', 'predict'])
    ax.legend(loc='upper right')
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Draw one observed-vs-predicted histogram per horizon, 25 bins each.
for _observed, _predicted, _caption in (
    (y_test_1yr, y_predict_1yr, "1 year prediction"),
    (y_test_2yr, y_predict_2yr, "2 year prediction"),
    (y_test_5yr, y_predict_5yr, "5 year prediction"),
    (y_test_10yr, y_predict_10yr, "10 year prediction"),
):
    graph_hist(_observed, _predicted, 25, _caption)

# Hyperparameter tuning

In [None]:
# TUNING: grid-search HistGradientBoostingRegressor on the 5-year target.
# GridSearchCV is already imported in the notebook's import cell.
# The number of boosting iterations is `max_iter` for this estimator;
# `n_estimators` is not one of its parameters and makes the search fail.
# Each learning rate is listed once (0.10 and 0.1 are the same value) so no
# combination is fitted twice.
LR = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    'max_iter': [5, 25, 50, 100, 150, 200, 250, 300],
    'learning_rate': [0.35, 0.30, 0.25, 0.20, 0.15, 0.10, 0.05],
}

tuning = GridSearchCV(
    # Seeded so the selected parameters are reproducible across runs.
    estimator=HistGradientBoostingRegressor(random_state=42),
    cv=3,
    param_grid=LR,
    scoring='r2',
)

tuning.fit(X_train, y_train_5yr)
tuning.best_params_, tuning.best_score_
