# Baseline Models

#### Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ast
import seaborn as sns
from statistics import mean
from sklearn.ensemble import RandomForestRegressor

#### Load the 2005-2010 dataset

In [None]:
# Data for 2005 to 2010 (path is relative to the directory the notebook runs in)
dataset = pd.read_csv('../Data/features_2005_2010_new.csv')
# info() prints its summary as a side effect, so it can run first; head() is
# kept as the final expression because a notebook cell by default only echoes
# its last expression -- in the previous order the preview was discarded.
dataset.info()
dataset.head()

In [None]:
# Table for year 2010; its `id` column is what identifies the 2010 papers.
papers_2010_csv = '../Data/papers2010.csv'
id_2010 = pd.read_csv(papers_2010_csv)
# Preview the first rows (last expression, so the notebook displays it).
id_2010.head()

In [None]:
# Split data for train and test: rows whose id appears in the 2010 table form
# the test set, every other row of the dataset is used for training.
ids = id_2010['id'].tolist()
# Boolean mask computed once and shared by both selections.
is_2010_paper = dataset['id'].isin(ids)
# `~` negates the mask element-wise (idiomatic replacement for `== False`).
train = dataset[~is_2010_paper]
test = dataset[is_2010_paper]
train.info()

In [None]:
# Set Xtrain and ytrain by column position: columns 2-11 are the model inputs,
# columns 12-15 the targets (1, 2, 5 and 10 year, going by the variable names).
# NOTE(review): positional selection depends on the CSV column order -- confirm
# against the header of the features file.
X_train = train.iloc[:, 2:12]
y_train_1yr, y_train_2yr, y_train_5yr, y_train_10yr = (
    train.iloc[:, column] for column in range(12, 16)
)
print(y_train_5yr)
X_train.head()

In [None]:
# Print the column summary of the test split (counterpart of train.info()).
test.info()

In [None]:
# Set Xtest and ytest by column position, using the same positions as for the
# training split: columns 2-11 are the inputs, columns 12-15 the targets.
# NOTE(review): positional selection depends on the CSV column order -- confirm
# against the header of the features file.
X_test = test.iloc[:, 2:12]
y_test_1yr, y_test_2yr, y_test_5yr, y_test_10yr = (
    test.iloc[:, column] for column in range(12, 16)
)
print(y_test_5yr)
X_test.head()

#### Check the length of training-set vs. testing-set

In [None]:
# Report the row count of the full dataset and of each split, one per line,
# then show summary statistics of the training split.
print("\n".join(
    f"{label} length: {len(frame)}"
    for label, frame in (("dataset", dataset), ("trainset", train), ("testset", test))
))
train.describe()

# Model Prediction

#### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def train_and_Predict_linear_regression(X_train, y_train, X_test):
    """Fit a ``LinearRegression`` model and predict targets for ``X_test``.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_test: Features to predict for.

    Returns:
        The output of ``predict(X_test)`` from the fitted model.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    model = LinearRegression().fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
# Fit one linear-regression model per target (1, 2, 5 and 10 year), always on
# the same training features, and predict on the shared test features.
(
    y_predict_1yr_lr,
    y_predict_2yr_lr,
    y_predict_5yr_lr,
    y_predict_10yr_lr,
) = (
    train_and_Predict_linear_regression(X_train, target, X_test)
    for target in (y_train_1yr, y_train_2yr, y_train_5yr, y_train_10yr)
)

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# Report R squared, MAE, MSE and RMSE of the linear-regression predictions for
# each horizon. A single loop replaces four copy-pasted report sections; the
# printed output is unchanged.
lr_reports = (
    ("1", y_test_1yr, y_predict_1yr_lr),
    ("2", y_test_2yr, y_predict_2yr_lr),
    ("5", y_test_5yr, y_predict_5yr_lr),
    ("10", y_test_10yr, y_predict_10yr_lr),
)
for report_index, (horizon, y_true, y_pred) in enumerate(lr_reports):
    if report_index:
        # Separator goes between reports only, never after the last one.
        print('----------------------------------------')
    print(f"Results for {horizon} year prediction:")
    print("R squared:", r2_score(y_true, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    # Computed once and reused for the RMSE instead of being evaluated twice.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

#### SVM Model

In [None]:
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
def train_and_Predict_svm(X_train, y_train, X_test):
    """Fit an ``SVR`` with a degree-2 polynomial kernel and predict for ``X_test``.

    NOTE: a later cell defines another function with this same name (a scaled
    ``LinearSVR`` pipeline); whichever definition was executed last is the one
    the prediction cell calls.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_test: Features to predict for.

    Returns:
        The output of ``predict(X_test)`` from the fitted model.
    """
    svr_settings = {'kernel': 'poly', 'C': 1e3, 'degree': 2}
    model = SVR(**svr_settings)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [None]:
def train_and_Predict_svm(X_train, y_train, X_test):
    """Fit a ``StandardScaler`` + ``LinearSVR`` pipeline and predict for ``X_test``.

    The features go through ``StandardScaler`` before reaching the linear SVR,
    both at fit and at predict time, because the two steps are chained with
    ``make_pipeline``.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_test: Features to predict for.

    Returns:
        The output of ``predict(X_test)`` from the fitted pipeline.
    """
    linear_svr = LinearSVR(random_state=0, tol=1e-5, dual=True, max_iter=2000)
    pipeline = make_pipeline(StandardScaler(), linear_svr)
    # fit() returns the pipeline itself, so predict() can be chained.
    return pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Fit one SVM regressor per target (1, 2, 5 and 10 year), always on the same
# training features, and predict on the shared test features.
(
    y_predict_1yr_svm,
    y_predict_2yr_svm,
    y_predict_5yr_svm,
    y_predict_10yr_svm,
) = (
    train_and_Predict_svm(X_train, target, X_test)
    for target in (y_train_1yr, y_train_2yr, y_train_5yr, y_train_10yr)
)

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# Report R squared, MAE, MSE and RMSE of the SVM predictions for each horizon.
# A single loop replaces four copy-pasted report sections; the printed output
# is unchanged.
svm_reports = (
    ("1", y_test_1yr, y_predict_1yr_svm),
    ("2", y_test_2yr, y_predict_2yr_svm),
    ("5", y_test_5yr, y_predict_5yr_svm),
    ("10", y_test_10yr, y_predict_10yr_svm),
)
for report_index, (horizon, y_true, y_pred) in enumerate(svm_reports):
    if report_index:
        # Separator goes between reports only, never after the last one.
        print('----------------------------------------')
    print(f"Results for {horizon} year prediction:")
    print("R squared:", r2_score(y_true, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    # Computed once and reused for the RMSE instead of being evaluated twice.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

#### KNN (k-nearest neighbours regression)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def train_and_Predict_k_mean(X_train, y_train, X_test, n_neighbors=5):
    """Fit a k-nearest-neighbours regressor and predict targets for ``X_test``.

    Despite the ``k_mean`` in the name, the model is ``KNeighborsRegressor``
    (k-NN regression), not k-means clustering.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_test: Features to predict for.
        n_neighbors: Number of neighbours handed to ``KNeighborsRegressor``.
            Defaults to 5, the value that used to be hard-coded, so existing
            three-argument calls behave as before.

    Returns:
        The output of ``predict(X_test)`` from the fitted model.
    """
    regressor = KNeighborsRegressor(n_neighbors=n_neighbors)
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)

In [None]:
# Fit one KNN regressor per target (1, 2, 5 and 10 year), always on the same
# training features, and predict on the shared test features.
(
    y_predict_1yr_km,
    y_predict_2yr_km,
    y_predict_5yr_km,
    y_predict_10yr_km,
) = (
    train_and_Predict_k_mean(X_train, target, X_test)
    for target in (y_train_1yr, y_train_2yr, y_train_5yr, y_train_10yr)
)

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# Report R squared, MAE, MSE and RMSE of the KNN predictions for each horizon.
# A single loop replaces four copy-pasted report sections; the printed output
# is unchanged.
knn_reports = (
    ("1", y_test_1yr, y_predict_1yr_km),
    ("2", y_test_2yr, y_predict_2yr_km),
    ("5", y_test_5yr, y_predict_5yr_km),
    ("10", y_test_10yr, y_predict_10yr_km),
)
for report_index, (horizon, y_true, y_pred) in enumerate(knn_reports):
    if report_index:
        # Separator goes between reports only, never after the last one.
        print('----------------------------------------')
    print(f"Results for {horizon} year prediction:")
    print("R squared:", r2_score(y_true, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_true, y_pred))
    # Computed once and reused for the RMSE instead of being evaluated twice.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))