In [1]:
import pandas as pd
import numpy as np

In [41]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

In [4]:
df = pd.read_csv('grafana_data_export.csv', sep = ';', skiprows=[0])
df.drop(['Unnamed: 2'], axis=1, inplace=True)
df.head()

Unnamed: 0,Time,smarttemperature1
0,2018-04-26 17:23:22,32.0
1,2018-04-26 17:23:20,31.5
2,2018-04-26 17:23:18,31.75
3,2018-04-26 17:23:16,31.75
4,2018-04-26 17:23:14,31.75


In [5]:
df.shape

(5286, 2)

In [6]:
df.describe()

Unnamed: 0,smarttemperature1
count,4594.0
mean,34.00049
std,5.844117
min,30.25
25%,31.5
50%,32.0
75%,34.75
max,187.5


In [7]:
df[df.smarttemperature1 > 100.0]

Unnamed: 0,Time,smarttemperature1
2073,2018-04-26 16:14:14,158.25
2074,2018-04-26 16:14:12,122.75
2075,2018-04-26 16:14:10,141.75
2076,2018-04-26 16:14:08,170.75
2077,2018-04-26 16:14:06,109.5
2816,2018-04-26 15:49:11,187.5


In [8]:
df.dropna(how='any', inplace= True)

In [9]:
df.shape

(4594, 2)

In [11]:
5286-4594 # number of observations dropped because of missing temperature value

692

In [11]:
temp = df.Time[0]
print(temp)

2018-04-26 17:23:22


In [13]:
from datetime import datetime
datetime_object = datetime.strptime('2018-04-26 17:23:22', '%Y-%m-%d %H:%M:%S')

In [14]:
datetime_object

datetime.datetime(2018, 4, 26, 17, 23, 22)

In [20]:
datetime_object.year, datetime_object.month, datetime_object.day, datetime_object.hour, \
datetime_object.minute, datetime_object.second

(2018, 4, 26, 17, 23, 22)

In [22]:
year, month, day, hour, minute, second = [], [], [], [], [], []
for instant in df['Time'].values:
    instant_dt = datetime.strptime(instant, '%Y-%m-%d %H:%M:%S')
    year.append(instant_dt.year)
    month.append(instant_dt.month)
    day.append(instant_dt.day)
    hour.append(instant_dt.hour)
    minute.append(instant_dt.minute)
    second.append(instant_dt.second)
print("successfully converted all the datetime objects into lists")

successfully converted all the datetime objects into lists


In [29]:
# since there is no standard deviation in year, month, day we are dropping those values

df['hour'] = hour
df['minute'] = minute
df['second'] = second

df.head()

Unnamed: 0,Time,smarttemperature1,hour,minute,second
0,2018-04-26 17:23:22,32.0,17,23,22
1,2018-04-26 17:23:20,31.5,17,23,20
2,2018-04-26 17:23:18,31.75,17,23,18
3,2018-04-26 17:23:16,31.75,17,23,16
4,2018-04-26 17:23:14,31.75,17,23,14


In [30]:
df.drop(['Time'], axis = 1, inplace = True)
df.head()

Unnamed: 0,smarttemperature1,hour,minute,second
0,32.0,17,23,22
1,31.5,17,23,20
2,31.75,17,23,18
3,31.75,17,23,16
4,31.75,17,23,14


In [33]:
X = df.drop(['smarttemperature1'], axis = 1)
y = df['smarttemperature1']

In [35]:
X.shape, y.shape

((4594, 3), (4594,))

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [43]:
models = [
  [LinearRegression(), {"fit_intercept": [True, False]}], 
  [KNeighborsRegressor(), {"n_neighbors": [1,2], "weights": ["uniform", "distance"]}], 
  [DecisionTreeRegressor(), {"criterion": ["mse", "friedman_mse"], "splitter": ["best", "random"],
    "min_samples_split": [x for x in range(2,6)] # generates a list [2,3,4,5]
  }],
  [GradientBoostingRegressor(), {"loss": ["ls", "lad", "huber", "quantile"]}],
  [GaussianProcessRegressor(), {}],
  [PLSRegression(), {}],
  [AdaBoostRegressor(), {}],
    [Lasso(), {"alpha": [0.2, 0.3, 0.7, 0.75, 0.8]}],
    [Ridge(), {"alpha": [0.2, 0.3, 0.7, 0.75, 0.8]}]
]

In [44]:
for model in models:
    regressor = model[0]
    param_grid = model[1]
    model = GridSearchCV(regressor, param_grid, cv = 10)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.best_params_, model.best_estimator_)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(mse, r2)

{'fit_intercept': True} LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
26.434672094848313 0.012217429753806797
{'n_neighbors': 2, 'weights': 'uniform'} KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=2, p=2,
          weights='uniform')
37.947214673913045 -0.4179709553343325
{'criterion': 'friedman_mse', 'min_samples_split': 2, 'splitter': 'random'} DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='random')
6.380298913043478 0.7615878102572226
{'loss': 'ls'} GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
      

In [48]:
model = DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='random')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))

6.380298913043478 0.7615878102572226


In [46]:
model = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))

8.167835866133194 0.6947930401311686
