In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import math as m
import pickle

# Needed to classify the data (not regression)
from sklearn.tree import DecisionTreeClassifier

# Used to assign values in a column to a # 
from sklearn.preprocessing import LabelEncoder 

# Splits data into training and testing datasets 
from sklearn.model_selection import train_test_split

# The metrics is used to calculate the accuracy of the model
from sklearn import metrics

In [None]:
import warnings
# Silence every Python warning for the rest of the session (no category or
# module filter is given, so this is a blanket "ignore").
warnings.filterwarnings("ignore")

import logging
# Configure the root logger to emit INFO and above.
logging.basicConfig(level=logging.INFO)
# Raise the threshold of the "transformers" logger to WARNING.
# NOTE(review): no visible cell imports `transformers`; this looks like
# boilerplate carried over from another notebook — confirm it is still needed.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Feature table covering 2005 to 2010 (path is relative to the notebook's
# working directory).
dataset = pd.read_csv('../Data/features_2005_2010_new.csv')

# Papers from 2010; its 'id' column is used below to select the test rows.
id_2010 = pd.read_csv('../Data/papers2010.csv')
# Preview the first rows of the feature table.
dataset.head()

### Applying train and test split 

In [None]:
# Split data for train and test:
# rows whose id appears in the 2010 paper list form the test set, every other
# row goes to the training set.
ids = id_2010['id'].tolist()

# Compute the membership mask once and negate it with `~` instead of
# re-running isin() and comparing the result to False.
is_2010_paper = dataset['id'].isin(ids)
train = dataset[~is_2010_paper]
test = dataset[is_2010_paper]
train.info()

In [None]:
# Build the training inputs and targets by column position.
# Positions 2..11 hold the ten feature columns.
X_train = train.iloc[:, 2:12]

# Positions 12..15 hold the targets for the 1-, 2-, 5- and 10-year horizons,
# in that order.
y_train_1yr, y_train_2yr, y_train_5yr, y_train_10yr = (
    train.iloc[:, target_position] for target_position in range(12, 16)
)

X_train.head()

In [None]:
# Preview the first rows of the 2-year training target.
y_train_2yr.head()

In [None]:
# Build the test inputs and targets by column position.
# Positions 2..11 hold the ten feature columns.
X_test = test.iloc[:, 2:12]

# Positions 12..15 hold the targets for the 1-, 2-, 5- and 10-year horizons,
# in that order.
y_test_1yr, y_test_2yr, y_test_5yr, y_test_10yr = (
    test.iloc[:, target_position] for target_position in range(12, 16)
)

X_test.head()

In [None]:
# Preview the first rows of the 2-year test target.
y_test_2yr.head()

# Model training

In [None]:
def model_train_and_predict(X_train, y_train, X_test, *, max_depth=25,
                            min_samples_split=4, min_samples_leaf=2,
                            random_state=0):
    """Fit a decision-tree classifier and predict labels for ``X_test``.

    A fresh ``DecisionTreeClassifier`` (gini criterion, best splitter) is
    trained on every call, so calls are independent of each other. Because
    this is a classifier, each distinct value in ``y_train`` is treated as a
    separate class label.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training target values, one per row of ``X_train``.
    X_test : array-like
        Feature matrix to predict for; must have the same columns as
        ``X_train``.
    max_depth : int, default 25
        Maximum depth of the tree.
    min_samples_split : int, default 4
        Minimum number of samples required to split an internal node.
    min_samples_leaf : int, default 2
        Minimum number of samples required at a leaf node.
    random_state : int, default 0
        Seed passed to the classifier so repeated runs give the same tree.

    Returns
    -------
    The predicted labels for ``X_test``, as returned by ``clf.predict``.
    """
    # Creates the decision tree classifier object
    clf = DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
    )

    # Train decision tree classifier (fit() trains the estimator in place)
    clf.fit(X_train, y_train)

    # Predicts response for test data
    return clf.predict(X_test)

In [None]:
y_prediction_1yr = model_train_and_predict(X_train, y_train_1yr, X_test)

In [None]:
y_prediction_2yr = model_train_and_predict(X_train, y_train_2yr, X_test)

In [None]:
y_prediction_5yr = model_train_and_predict(X_train, y_train_5yr, X_test)

In [None]:
y_prediction_10yr = model_train_and_predict(X_train, y_train_10yr, X_test)

# Evaluation

In [None]:
# `metrics` and `np` come from the imports cell at the top of the notebook.
from sklearn.metrics import r2_score


def _print_prediction_report(horizon_label, y_true, y_pred):
    """Print accuracy, R squared, MAE, MSE and RMSE for one prediction horizon.

    Parameters
    ----------
    horizon_label : str
        Text inserted into the header line, e.g. ``"1 year"``.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    """
    # Compute MSE once and reuse it for the RMSE line.
    mse = metrics.mean_squared_error(y_true, y_pred)

    print("Results for " + horizon_label + " prediction:")
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("R squared:", r2_score(y_true, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))


_reports = [
    ("1 year", y_test_1yr, y_prediction_1yr),
    ("2 year", y_test_2yr, y_prediction_2yr),
    ("5 year", y_test_5yr, y_prediction_5yr),
    ("10 year", y_test_10yr, y_prediction_10yr),
]

for _position, (_label, _observed, _predicted) in enumerate(_reports):
    # Separator goes between reports only (none after the last one).
    if _position > 0:
        print('----------------------------------------')
        print()
    _print_prediction_report(_label, _observed, _predicted)

In [None]:
def graph_hist(y_test_2yr, y_predict, bins, title):
    """Draw side-by-side histograms of observed and predicted values.

    Parameters
    ----------
    y_test_2yr : array-like
        Observed values. Despite the name, callers pass the target of any
        horizon here.
    y_predict : array-like
        Predicted values.
    bins : int
        Number of bins; also used as the upper bound of the plotted range,
        so the histogram covers ``0..bins``.
    title : str
        Figure title.
    """
    axes = plt.gca()
    series = [y_test_2yr, y_predict]
    series_labels = ['test', 'predict']

    # Range and bin count are both derived from `bins`; values outside
    # 0..bins are not counted by the histogram.
    axes.hist(series, bins=bins, range=(0, bins), label=series_labels)
    axes.legend(loc='upper right')
    axes.set_title(title)
    plt.show()

In [None]:
# One histogram per prediction horizon, each with 25 bins.
for _horizon, _observed, _predicted in (
    ("1 year", y_test_1yr, y_prediction_1yr),
    ("2 year", y_test_2yr, y_prediction_2yr),
    ("5 year", y_test_5yr, y_prediction_5yr),
    ("10 year", y_test_10yr, y_prediction_10yr),
):
    graph_hist(_observed, _predicted, 25, _horizon + " prediction")