# Homework 1
***

We are going to work with the following dataset: fluid flow in a tube.
Several statistics are collected for the dataset, including mean, skewness, kurtosis, etc. We are predicting the flow rate ('tohn/hour'). We need to build confidence and predictive intervals.

In [None]:
%matplotlib inline

import numpy as np
from sklearn import datasets, linear_model, preprocessing, model_selection
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the semicolon-separated dataset; row 0 of the file supplies the column names.
df = pd.read_csv(
    'exxsol_data.csv',
    sep=';',
    header=0,
)

## There are 10 features and 1 label to predict:

In [None]:
# List every column name of the loaded frame (features and the target column).
column_names = df.columns.values
print(column_names)

['mean' 'std' 'skew' 'kurt' 'RMS' 'crest' 'freq_peak' 'shan' 'perm' 'temp'
 'tohn/hour']


In [None]:
# Model inputs: peak frequency and temperature; target: the flow-rate column.
input_columns = ['freq_peak', 'temp']
freq_temp = df[input_columns]
y = df['tohn/hour']

Physics tells us that the flow rate is a function of the frequency peak and the temperature.

In [None]:
# Shuffle features and target together (row pairing is preserved) before
# cross-validating. No random_state is set, so results vary between runs.
freq_temp, y = shuffle(freq_temp, y)

# Series.ravel() is deprecated in recent pandas; extract the ndarray explicitly
# and pass the same target array to both cross-validation calls.
y_values = y.to_numpy()

lr = linear_model.LinearRegression()

# Out-of-fold prediction for every sample (20 folds).
predicted = model_selection.cross_val_predict(
    lr, freq_temp, y_values, cv=20)

# One R^2 value per fold (20 folds).
score = model_selection.cross_val_score(
    lr, freq_temp, y_values, scoring='r2', cv=20)

## Q0: Build a point estimate for the mean r2 score and its standard deviation

In [None]:
# Summarise the cross-validation R^2 scores: their average and their spread.
r2_mean = score.mean()
r2_spread = score.std()
print('mean r2 score: ', r2_mean)
print('its deviation: ', r2_spread)

mean r2 score:  0.8277067450774153
its deviation:  0.05317070147409783


## Q1: `predicted` is an array with predictions of the label `y`. Assuming that $\sigma = 0.1$, compute 95% confidence and predictive intervals for the mean squared error.

In [None]:
from scipy import stats

# Per-sample squared errors of the cross-validated predictions; their mean is
# the MSE. (The original cell referenced an undefined name `err` here.)
se = np.power(y.values - predicted, 2)
s = 0.1  # standard deviation assumed known, as stated in the task
n = len(se)
mse = se.mean()

# Two-sided 95% standard-normal quantile.
z = stats.norm.ppf(1 - (1 - 0.95) / 2)

# Predictive interval for one new observation: its variance adds to the
# variance of the sample mean, giving sigma * sqrt(1 + 1/n).
half_width = z * s * np.sqrt(1 + 1 / n)
lower = mse - half_width
upper = mse + half_width
print('predictive: ' + str([lower, upper]))

# Confidence interval for the mean: standard error sigma / sqrt(n).
half_width = z * s / np.sqrt(n)
lower = mse - half_width
upper = mse + half_width
print('confidence: ' + str([lower, upper]))

predictive: [-0.032932981351083135, 0.3590598155569277]
confidence: [0.1556554495575848, 0.17047138464825973]


## Q2: Compute 95% confidence and predictive intervals for the mean squared error, assuming no knowledge about $\sigma$.

In [None]:
from scipy import stats

# Per-sample squared errors of the cross-validated predictions; their mean is
# the MSE. (The original cell referenced an undefined name `err` here.)
se = np.power(y.values - predicted, 2)
n = len(se)
mse = se.mean()

# sigma is unknown: estimate it with the unbiased sample standard deviation
# (ddof=1) and use Student's t with n - 1 degrees of freedom.
sample_std = se.std(ddof=1)
t = stats.t.ppf(1 - (1 - 0.95) / 2, n - 1)

# Predictive interval for one new observation: s * sqrt(1 + 1/n).
half_width = t * sample_std * np.sqrt(1 + 1 / n)
lower = mse - half_width
upper = mse + half_width
print('predictive: ' + str([lower, upper]))

# Confidence interval for the mean: standard error s / sqrt(n).
half_width = t * sample_std / np.sqrt(n)
lower = mse - half_width
upper = mse + half_width
print('confidence: ' + str([lower, upper]))

predictive: [-0.5225089655634272, 0.8486357997692717]
confidence: [0.13715121667050556, 0.18897561753533898]


We can use additional features and a more complex model, e.g. ElasticNet.

In [None]:
# Target column, and every remaining column as a feature.
y1 = df['tohn/hour']
X = df.drop(['tohn/hour'], axis=1)

# NOTE(review): the scaling is computed from all rows before cross-validation,
# so held-out folds contribute to the scaling statistics. Consider moving the
# scaler into a Pipeline if strictly out-of-fold estimates are required.
X = preprocessing.scale(X)
X, y1 = shuffle(X, y1)

# Series.ravel() is deprecated in recent pandas; extract the ndarray explicitly.
y1_values = y1.to_numpy()

encv = linear_model.ElasticNetCV(cv=10, max_iter=3000, n_alphas=10)

# Out-of-fold prediction for every sample (20 outer folds).
predicted_encv = model_selection.cross_val_predict(
    encv, X, y1_values, cv=20)

# One R^2 value per outer fold.
score_encv = model_selection.cross_val_score(
    encv, X, y1_values, scoring='r2', cv=20)

## Q3: Compute a 95% confidence interval for the difference in mean squared error between the 2 models, assuming no knowledge about $\sigma$.

In [None]:
from scipy import stats

# Per-sample squared errors of each model's cross-validated predictions.
se = np.power(y.values - predicted, 2)
se_2 = np.power(y1.values - predicted_encv, 2)
X_mean, Y_mean = np.mean(se), np.mean(se_2)
n_1, n_2 = len(se), len(se_2)

# Pooled standard deviation of the two samples (equal-variance assumption),
# with n_1 + n_2 - 2 degrees of freedom.
dof = n_1 + n_2 - 2
std = np.sqrt(
    ((n_1 - 1) * np.var(se, ddof=1) + (n_2 - 1) * np.var(se_2, ddof=1)) / dof
)

# Two-sided 95% Student-t quantile for the pooled degrees of freedom.
# (The original cell computed t2 but then used a stale `t` from another cell.)
t2 = stats.t.ppf(1 - (1 - 0.95) / 2, dof)

diff = X_mean - Y_mean

# Predictive interval for the difference of one new pair of observations:
# two new draws plus the uncertainty of both sample means.
half_width = t2 * std * np.sqrt(2 + 1 / n_1 + 1 / n_2)
lower = diff - half_width
upper = diff + half_width
print('predictive: ' + str([lower, upper]))

# Confidence interval for the difference in means:
# standard error is sp * sqrt(1/n_1 + 1/n_2).
half_width = t2 * std * np.sqrt(1 / n_1 + 1 / n_2)
lower = diff - half_width
upper = diff + half_width
print('confidence: ' + str([lower, upper]))

predictive: [-0.4015691326294775, 0.6081227988004898]
confidence: [0.0841954491472763, 0.12235821702373594]
