In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from 'Data.csv' (a relative path, so the file must sit in
# the notebook's working directory) into a DataFrame named `data`.
data = pd.read_csv('Data.csv')

In [3]:
# Data inspection: missing-value check.
# Yields one boolean per column -- True if that column contains any NaN.
data.isna().any()

AT    False
V     False
AP    False
RH    False
PE    False
dtype: bool

In [4]:
# Data inspection: summary statistics (count, mean, std, min, quartiles, max)
# for every numeric column, to eyeball ranges and scales.
data.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


In [5]:
# Separate predictors from the response: every column except the last one is
# a feature, and the last column is the target. Both are kept as NumPy arrays.
X, y = data.iloc[:, :-1].to_numpy(), data.iloc[:, -1].to_numpy()

In [6]:
# Linear regression
def lin_reg(X, y):
    """Fit a linear regression on an 80/20 split and score the held-out part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per sample.

    Returns
    -------
    tuple
        ``(y_pred, y_test, dic)``: predictions for the held-out samples, the
        matching true targets, and a dict with the keys ``'Model'``,
        ``'MSE'`` and ``'R2'``.
    """
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression

    # Fixed seed so every model in the notebook is scored on the same split.
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=.2, random_state=42
    )

    model = LinearRegression().fit(features_fit, target_fit)
    predictions = model.predict(features_eval)

    mse = mean_squared_error(target_eval, predictions)
    r2 = r2_score(target_eval, predictions)
    return predictions, target_eval, {'Model': 'Linear', 'MSE': mse, 'R2': r2}

In [7]:
# Polynomial regression
def poly_reg(X, y, degree):
    """Fit a linear model on polynomial features and score it on a held-out split.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per sample.
    degree : int
        Degree handed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(y_pred, y_test, dic)``: predictions for the held-out samples, the
        matching true targets, and a dict with the keys ``'Model'``,
        ``'MSE'`` and ``'R2'``.
    """
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression

    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=.2, random_state=42
    )

    # The expansion is fitted on the training rows only and then re-applied,
    # unchanged, to the held-out rows.
    expander = PolynomialFeatures(degree=degree)
    train_expanded = expander.fit_transform(train_X)
    test_expanded = expander.transform(test_X)

    model = LinearRegression()
    model.fit(train_expanded, train_y)
    predicted = model.predict(test_expanded)

    scores = {
        'Model': 'Polynomial',
        'MSE': mean_squared_error(test_y, predicted),
        'R2': r2_score(test_y, predicted),
    }
    return predicted, test_y, scores

In [8]:
# Support Vector regression
def sv_reg(X, y):
    """Fit an RBF-kernel SVR on standardised data and score it on a held-out split.

    Features and target are each standardised with their own
    ``StandardScaler`` fitted on the training portion only; predictions are
    mapped back to the original target scale before scoring.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Target values, one per sample (must support ``.reshape``).

    Returns
    -------
    tuple
        ``(y_pred, y_test, dic)``: ``y_pred`` and ``y_test`` are column
        vectors (one row per held-out sample) on the original target scale;
        ``dic`` holds the keys ``'Model'``, ``'MSE'`` and ``'R2'``.
    """
    from sklearn.svm import SVR
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score, mean_squared_error
    # The target scaler works on 2-D input, so turn y into a column vector.
    y = y.reshape((len(y), 1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
    X_scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    y_train = y_scaler.fit_transform(y_train)
    regressor = SVR(kernel = 'rbf')
    # SVR expects a 1-D target. Handing it the (n, 1) column triggered a
    # DataConversionWarning on every run, so flatten it for fitting only.
    regressor.fit(X_train, y_train.ravel())
    scaled_pred = regressor.predict(X_scaler.transform(X_test))
    # Back to a column so the target scaler can undo the standardisation.
    y_pred = y_scaler.inverse_transform(scaled_pred.reshape(-1, 1))
    dic = {'Model': 'Support Vector', 'MSE':mean_squared_error(y_test, y_pred), 'R2':r2_score(y_test, y_pred)}
    return y_pred, y_test, dic

In [9]:
# Decision tree regression
def tree_reg(X, y, random_state=1):
    """Fit a decision-tree regressor on an 80/20 split and score the held-out part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per sample.
    random_state : int, optional
        Seed handed to ``DecisionTreeRegressor`` (default 1, the same seed
        ``rf_reg`` uses for its forest).

    Returns
    -------
    tuple
        ``(y_pred, y_test, dic)``: predictions for the held-out samples, the
        matching true targets, and a dict with the keys ``'Model'``,
        ``'MSE'`` and ``'R2'``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import r2_score, mean_squared_error
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
    # An unseeded tree permutes features at each split, so tied splits can be
    # resolved differently from run to run; seeding makes the reported scores
    # reproducible.
    dt = DecisionTreeRegressor(random_state=random_state)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    dic = {'Model': 'Decision Tree','MSE':mean_squared_error(y_test, y_pred), 'R2':r2_score(y_test, y_pred)}
    return y_pred, y_test, dic

In [10]:
# Random forest regression
def rf_reg(X,y, estimators):
    """Fit a random-forest regressor on an 80/20 split and score the held-out part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per sample.
    estimators : int
        Number of trees, forwarded as ``n_estimators``.

    Returns
    -------
    tuple
        ``(y_pred, y_test, dic)``: predictions for the held-out samples, the
        matching true targets, and a dict with the keys ``'Model'``,
        ``'MSE'`` and ``'R2'``.
    """
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=.2, random_state=42
    )

    # The forest is seeded so repeated runs grow the same trees.
    forest = RandomForestRegressor(n_estimators=estimators, random_state=1)
    forest.fit(train_X, train_y)
    forecast = forest.predict(test_X)

    mse = mean_squared_error(test_y, forecast)
    r2 = r2_score(test_y, forecast)
    return forecast, test_y, {'Model': 'Random Forest', 'MSE': mse, 'R2': r2}

In [11]:
# Side-by-side comparison of all five regressors
def comparison(poly_deg, rf_estimators):
    """Run every regressor on the notebook-level ``X``/``y`` and rank them by R2.

    Parameters
    ----------
    poly_deg : int
        Degree forwarded to ``poly_reg``.
    rf_estimators : int
        Number of trees forwarded to ``rf_reg``.

    Returns
    -------
    tuple
        ``(R2_dic, df)``: a dict mapping each model name to its R2 score (in
        the order the models were run), and a DataFrame of the per-model
        score dicts sorted by ``'R2'`` in descending order with a fresh
        0..n-1 index.
    """
    # Each helper returns (y_pred, y_test, scores); only the scores are kept.
    summaries = [
        lin_reg(X, y)[2],
        poly_reg(X, y, poly_deg)[2],
        sv_reg(X, y)[2],
        tree_reg(X, y)[2],
        rf_reg(X, y, rf_estimators)[2],
    ]
    r2_by_model = {entry['Model']: entry['R2'] for entry in summaries}
    ranking = pd.DataFrame(summaries).sort_values(
        ['R2'], axis=0, ascending=False, ignore_index=True
    )
    return r2_by_model, ranking

In [12]:
# Run all five models with a degree-4 polynomial and a 10-tree random forest.
# The per-model R2 dict is discarded; only the ranking table (sorted by R2,
# best first) is kept and displayed.
_, df = comparison(poly_deg=4, rf_estimators=10)
df

  y = column_or_1d(y, warn=True)


Unnamed: 0,Model,MSE,R2
0,Random Forest,11.943449,0.958824
1,Support Vector,15.417686,0.946846
2,Polynomial,16.478211,0.94319
3,Linear,20.273706,0.930105
4,Decision Tree,20.425083,0.929583
