# Linear Regression

En este notebook aplicaremos el modelo de regresión lineal para predecir el precio de Bitcoin a partir del sentimiento en Twitter en ese instante.

## Imports

In [1]:
import datetime
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import Normalizer, MinMaxScaler

import matplotlib.pylab as plt
%matplotlib inline 

from JABA.service.scrapper import DFPicker

### Pick the data

Haremos uso de la función **get_complete_df(dateFrom, dateTo)** de **DFPicker** para recopilar los datos de las fechas sobre las que queremos realizar la observación.

In [2]:
# Observation window handed to the project data picker (YYYY-MM-DD strings).
date_init, date_limit = "2017-01-02", "2017-02-02"
df = DFPicker.get_complete_df(date_init, date_limit)

# Features: every column from 'sentiment_v' through 'sentiment_v2'
# (label slices with .loc include both endpoints).
sentiment = df.loc[:, 'sentiment_v':'sentiment_v2']
# Target: the 'Close' column.
price = df['Close']


Extraction Completed!


In [3]:
# Sanity check: target and feature matrix must have the same number of rows.
print(price.shape, sentiment.shape, sep="\n")

(1488,)
(1488, 3)


### Remove Outliers

In [4]:
def remove_outliers(data):
    """Drop every row that is an IQR outlier in at least one column.

    For each column, values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are
    considered outliers. The offending rows are shown column by column, and a
    row is removed as soon as any single column flags it.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric features, one column per feature. Any index is accepted
        (dates, strings, non-contiguous or duplicated labels).

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged rows, re-indexed from 0.
        ``data`` itself is left untouched.
    """
    # Nothing to measure on an empty frame; percentiles would be undefined.
    if data.empty:
        return data.reset_index(drop=True)

    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # function also works from a plain Python process.
    try:
        show = display
    except NameError:
        show = print

    # Positional mask (one flag per row). Keeping it positional avoids the
    # label alignment that happens when two Series with different indexes are
    # OR-ed together, which breaks on any index other than 0..n-1.
    flagged = np.zeros(len(data), dtype=bool)

    for feature in data.columns:
        # Quartiles and the 1.5 * IQR step that defines the fences.
        q1 = np.percentile(data[feature], q=25)
        q3 = np.percentile(data[feature], q=75)
        step = 1.5 * (q3 - q1)

        # `~` negates the boolean "inside the fences" mask.
        inside = (data[feature] >= q1 - step) & (data[feature] <= q3 + step)
        is_outlier = (~inside).to_numpy()

        # Display the outliers found for this feature.
        print("Data points considered outliers for the feature '{}':".format(feature))
        show(data[is_outlier])

        flagged |= is_outlier

    # Filter by position instead of dropping by label, so duplicated index
    # labels cannot remove rows that were never flagged.
    return data[~flagged].reset_index(drop=True)

### Model Evaluation

#### 1- Prueba estándar de modelos

In [12]:
# Baseline: raw sentiment features against the raw price.
# The recorded run shows 1190 training and 298 testing samples for this split
# (those two lines are presumably printed by the project splitter itself).
X_train, X_test, y_train, y_test = DFPicker.train_test_splitter(sentiment, price, 0.2)

# ---------- Ordinary least squares ----------
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# For scikit-learn regressors score() is R^2, so the "Accuracy" line below
# reports the same number as the "R2" line.
accuracy = model.score(X_test, y_test)
R2 = r2_score(y_test, y_pred)
print("Linear RMSE: {}".format(rmse))
print("Accuracy Linear: {}".format(accuracy))
print("R2 Linear: {}".format(R2))
print()

# ---------- Lasso (L1 penalty, default settings) ----------
model2 = Lasso().fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred2))
accuracy_lasso = model2.score(X_test, y_test)
R2_lasso = r2_score(y_test, y_pred2)
print("Lasso RMSE: {}".format(rmse_lasso))
print("Accuracy Lasso: {}".format(accuracy_lasso))
print("R2 Lasso: {}".format(R2_lasso))
print()

# ---------- Ridge (L2 penalty, default settings) ----------
model_ridge = Ridge().fit(X_train, y_train)
y_pred_ridge = model_ridge.predict(X_test)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
accuracy_ridge = model_ridge.score(X_test, y_test)
R2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge RMSE: {}".format(rmse_ridge))
print("Accuracy Ridge: {}".format(accuracy_ridge))
print("R2 Ridge: {}".format(R2_ridge))
print()


Training set has 1190 samples.
Testing set has 298 samples.
Linear RMSE: 61.59889840270662
Accuracy Linear: 0.202132716935155
R2 Linear: 0.202132716935155

Lasso RMSE: 61.621840802404584
Accuracy Lasso: 0.2015382777675988
R2 Lasso: 0.2015382777675988

Ridge RMSE: 61.59892361739
Accuracy Ridge: 0.20213206374246617
R2 Ridge: 0.20213206374246617



#### 2- Normalized Data (log1p of the price)

In [14]:
# Compress the target with log(1 + price). Every metric below is therefore
# expressed in log1p units and is not directly comparable with the RMSE
# obtained on the raw price.
price_normalized = np.log1p(price)

X_train, X_test, y_train, y_test = DFPicker.train_test_splitter(sentiment, price_normalized, 0.2)

# ---------- Ordinary least squares ----------
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# score() on a regressor is R^2, hence identical to the r2_score() value.
accuracy = model.score(X_test, y_test)
R2 = r2_score(y_test, y_pred)
print("Linear RMSE: {}".format(rmse))
print("Accuracy Linear: {}".format(accuracy))
print("R2 Linear: {}".format(R2))
print()

# ---------- Lasso (L1 penalty, default settings) ----------
# NOTE(review): R^2 falls to ~0.01 in the recorded output; presumably the
# default penalty is large relative to the log-scaled target -- consider
# tuning alpha.
model2 = Lasso().fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred2))
accuracy_lasso = model2.score(X_test, y_test)
R2_lasso = r2_score(y_test, y_pred2)
print("Lasso RMSE: {}".format(rmse_lasso))
print("Accuracy Lasso: {}".format(accuracy_lasso))
print("R2 Lasso: {}".format(R2_lasso))
print()

# ---------- Ridge (L2 penalty, default settings) ----------
model_ridge = Ridge().fit(X_train, y_train)
y_pred_ridge = model_ridge.predict(X_test)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
accuracy_ridge = model_ridge.score(X_test, y_test)
R2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge RMSE: {}".format(rmse_ridge))
print("Accuracy Ridge: {}".format(accuracy_ridge))
print("R2 Ridge: {}".format(R2_ridge))
print()


Training set has 1190 samples.
Testing set has 298 samples.
Linear RMSE: 0.06762843659261392
Accuracy Linear: 0.18965258493756398
R2 Linear: 0.18965258493756398

Lasso RMSE: 0.07475435170445925
Accuracy Lasso: 0.009885285733739368
R2 Lasso: 0.009885285733739368

Ridge RMSE: 0.06762846003857126
Accuracy Ridge: 0.18965202306225926
R2 Ridge: 0.18965202306225926



#### 3- Sin outliers y con escalado

In [15]:
# Rescale the price with MinMaxScaler. The scaler wants a 2-D input, so the
# series is turned into a single-column array first; the scaled target stays
# a column vector.
# NOTE(review): the section title mentions outlier removal, but
# remove_outliers() is not applied anywhere in this cell.
# NOTE(review): the scaler is fitted on the full price series before the
# train/test split, so test prices influence the scaling.
price_reshaped = price.values.reshape(-1, 1)
transformer = MinMaxScaler()
price_scaled = transformer.fit_transform(price_reshaped)

X_train, X_test, y_train, y_test = DFPicker.train_test_splitter(sentiment, price_scaled, 0.2)

# ---------- Ordinary least squares ----------
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# score() on a regressor is R^2, hence identical to the r2_score() value.
accuracy = model.score(X_test, y_test)
R2 = r2_score(y_test, y_pred)
print("Linear RMSE: {}".format(rmse))
print("Accuracy Linear: {}".format(accuracy))
print("R2 Linear: {}".format(R2))
print()

# ---------- Lasso (L1 penalty, default settings) ----------
model2 = Lasso().fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred2))
accuracy_lasso = model2.score(X_test, y_test)
R2_lasso = r2_score(y_test, y_pred2)
print("Lasso RMSE: {}".format(rmse_lasso))
print("Accuracy Lasso: {}".format(accuracy_lasso))
print("R2 Lasso: {}".format(R2_lasso))
print()

# ---------- Ridge (L2 penalty, default settings) ----------
model_ridge = Ridge().fit(X_train, y_train)
y_pred_ridge = model_ridge.predict(X_test)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
accuracy_ridge = model_ridge.score(X_test, y_test)
R2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge RMSE: {}".format(rmse_ridge))
print("Accuracy Ridge: {}".format(accuracy_ridge))
print("R2 Ridge: {}".format(R2_ridge))
print()


Training set has 1190 samples.
Testing set has 298 samples.
Linear RMSE: 0.1653738654578288
Accuracy Linear: 0.20213271693515522
R2 Linear: 0.20213271693515522

Lasso RMSE: 0.17919356176965393
Accuracy Lasso: 0.06321116663912818
R2 Lasso: 0.06321116663912818

Ridge RMSE: 0.16537393315140397
Accuracy Ridge: 0.20213206374246617
R2 Ridge: 0.20213206374246617

