# Linear regression with scikit-learn

Importing the modules

In [14]:
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.dates import DateFormatter

import numpy as np
from sklearn import linear_model

from cassandra.cluster import Cluster
from cassandra.metadata import KeyspaceMetadata

Connecting to SensorHUB

In [15]:
# Open a session on the SensorHUB Cassandra cluster, bound to the keyspace
# that the query below reads from.
keyspace_name = "sensorhub"
cluster = Cluster(contact_points=["172.20.16.137"], port=30241)
session = cluster.connect(keyspace=keyspace_name)

Getting data

In [16]:
# Select the 2 m temperature readings of sensor 1 for January 2017.
query = (
    "SELECT sensor, datum, temperatureat2meter FROM erti_new "
    "WHERE datum >= '2017-01-01 00:00:00' "
    "AND  datum <= '2017-01-31 23:59:00' "
    "AND sensor=1"
)
# Materialise the driver's result rows before handing them to pandas.
rows = session.execute(query)
df = pd.DataFrame(list(rows))

Cleaning the dataframe

In [17]:
# Turn the sentinel readings into NaN so they count as missing values rather
# than as real temperatures.
# NOTE(review): -888.8 / -777.7 / -999.9 are presumably the station's
# error/no-data codes — confirm against the sensor documentation.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df['temperatureat2meter'] = df['temperatureat2meter'].replace(
    to_replace=[-888.8, -777.7, -999.9],
    value=np.nan,
)

In [18]:
# True if any cell of the frame is missing (NaN/None), False otherwise.
pd.isnull(df).values.any()

False

In [19]:
# Show the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,sensor,datum,temperatureat2meter
3842,1,2017-01-31 23:00:01,0.0
3843,1,2017-01-31 23:10:01,0.1
3844,1,2017-01-31 23:20:01,0.1
3845,1,2017-01-31 23:30:01,0.2
3846,1,2017-01-31 23:50:11,0.0


In [20]:
# Chronological split: the first N_TRAIN rows train the model, the remainder
# is held out. Both slices use the same boundary so that no row is lost
# (the previous `iloc[1501:]` silently dropped row 1500 from both sets).
# `.copy()` gives each subset its own data, independent of `df`.
N_TRAIN = 1500
df_train = df.iloc[:N_TRAIN].copy()
df_test = df.iloc[N_TRAIN:].copy()

In [21]:
# Show the last five rows of the training subset.
df_train.tail(n=5)

Unnamed: 0,sensor,datum,temperatureat2meter
1495,1,2017-01-13 01:20:01,-3.2
1496,1,2017-01-13 01:30:01,-3.1
1497,1,2017-01-13 01:40:01,-3.2
1498,1,2017-01-13 02:00:11,-1.7
1499,1,2017-01-13 02:10:01,-1.6


In [22]:
# Regression feature: the positional index 0..n-1 of each training row.
# The length is taken from `df_train` rather than hard-coded so the feature
# array always has as many samples as the target built from `df_train`.
x = np.arange(len(df_train))

In [23]:
# Display the positional index array that is used as the regression feature.
x

array([   0,    1,    2, ..., 1497, 1498, 1499])

In [26]:
# Ordinary least-squares model with an intercept term (the library default).
lin_reg_model = linear_model.LinearRegression(fit_intercept=True)

In [27]:
# Shape both arrays as column vectors (n_samples, 1): one feature column for
# the estimator and a 2-D target.
x = np.reshape(x, (-1, 1))
y = np.array(df_train["temperatureat2meter"]).reshape(-1, 1)

In [28]:
# Fit temperature against the positional index of the training rows.
lin_reg_model.fit(X=x, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [29]:
# Predict the temperature at positional index 1600 (a single sample with a
# single feature, hence the nested list).
x_new = [[1600]]
predicted_temperature = lin_reg_model.predict(x_new)
print(predicted_temperature)

[[-10.04657056]]


In [30]:
# Look up the observed row at positional index 1600 in the full frame.
df.iloc[1600, :]

sensor                                   1
datum                  2017-01-13 21:30:01
temperatureat2meter                    0.8
Name: 1600, dtype: object