# Linear Models I

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import operator

## Quantile-Quantile Plot

In [None]:
import statsmodels.api as sm

In [None]:
# Draw 10,000 observations from a normal distribution centred on 0 with a
# standard deviation of 5; these are the samples shown in the Q-Q plot.
sample = np.random.normal(loc=0, scale=5, size=10000)

In [None]:
# Compare the samples to a theoretical normal distribution
sm.qqplot(sample,line='s')

## Data Transformations

Sometimes our data does not look strictly linear. In some cases, we can transform our data so that we can easily apply linear regression. To understand various transforms, we will create a data series.

In [None]:
# Toy data set: x is a column vector holding 1..99, and y combines a quadratic
# trend with a sine-modulated linear term. The expression is evaluated
# element-wise on the whole column at once, so y keeps x's (99, 1) shape.
x = np.arange(1, 100).reshape(-1, 1)
y = x**2 + (10 * x) * (np.sin(x) + 1)

In [None]:
plt.scatter(x,y)

In [None]:
plt.hist(y)

In [None]:
sm.qqplot(y,line='s')

Our data does not look strictly linear, but let's go ahead and fit a linear model

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state keeps the
# split the same every time the cell is re-run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=100)
# Ordinary least-squares estimator; it is fitted in the next cell.
model = LinearRegression() # y=m*x + b

In [None]:
model.fit(x_train, y_train)

In [None]:
print("R^2: ",model.score(x_test, y_test))
print("Slope: ", model.coef_)
print("Intercept: ", model.intercept_)

In [None]:
# Predictions for the held-out samples; residuals are predicted minus observed.
pred_test = model.predict(x_test)
res_test = pred_test - y_test

# Left panel: test observations with the fitted line drawn on top.
plt.subplot(121)
plt.scatter(x_test,y_test,label='Data')
plt.plot(x_test,pred_test,label='Fit')
plt.legend()
# Right panel: residuals of the test samples against x.
plt.subplot(122)
plt.scatter(x_test,res_test)
plt.title("Residuals")

That looks pretty good! And the coefficient of determination is close to 1. Looks like we have a nice model. Let's plot the model for all of the input data (x).

In [None]:
# Same diagnostics as above, but evaluated on every sample (train and test).
pred = model.predict(x)
res = pred - y

# Left panel: model predictions and observations as two scatter series.
plt.subplot(121)
plt.scatter(x,pred,label='Fit')
plt.scatter(x,y,label='Data')
plt.legend()
# Right panel: residuals (predicted minus observed) against x.
plt.subplot(122)
plt.scatter(x,res)
plt.title("Residuals")

Wow! Look at those residuals. I don't think the model we have obtained is great. Looking at the residuals, the error seems to increase as the value of x increases. This could be a sign of heteroscedasticity, which violates one of our assumptions.

Let's try to transform the data (y) and see if this helps make the residuals more normal.

###  Reciprocal Transform

While not very useful for the current example, a reciprocal transform can be useful for changing the scale of the data if there is a need to make the values more manageable and understandable. It can be particularly useful with data expressed as ratios.

In [None]:
transform_rec = 1/y

In [None]:
plt.scatter(x,transform_rec)

### Log Transform

In nature, various phenomena have been shown to exhibit an exponential relationship. In the physical sciences, the exponential function shows up often enough that when we think about transformations the log transform should come to mind. The log transform is a special case of a change of base transform. From looking at the residuals, the errors are showing an interesting pattern that we want to try and account for in the model.

In [None]:
transform_log = np.log(y)

In [None]:
plt.scatter(x,transform_log)

In [None]:
plt.hist(transform_log)

In [None]:
def _align_for_plot(x, y, pred):
    """Return plotting feature, targets, predictions and residuals in one shared order.

    Every returned array is permuted with the same ascending ordering of the
    plotting feature, so element ``i`` of each array belongs to the same sample.
    """
    x = np.asarray(x)
    # With more than one column, column 1 is used as the plotting axis. This
    # assumes a PolynomialFeatures-style design matrix whose column 1 holds the
    # raw feature (as in this notebook's x_poly) -- confirm for other inputs.
    feature = x[:, 1] if x.shape[1] > 1 else x[:, 0]
    # A stable sort mirrors the behaviour of sorted() for tied feature values.
    order = np.argsort(feature, kind="stable")
    # np.asarray drops any pandas index so that all indexing is positional.
    y = np.asarray(y)
    pred = np.asarray(pred)
    res = pred - y  # residual convention: predicted minus observed
    return feature[order], y[order], pred[order], res[order]


def make_model(x, y):
    """Fit a linear regression of ``y`` on ``x`` and plot fit and residuals.

    The data are split 75/25 into train and test sets (``random_state=100``),
    a ``LinearRegression`` is fitted on the training part, and the test-set
    R^2, the coefficients and the intercept are printed. Two figures are then
    drawn, each with the data and fitted curve on the left and the residuals
    (predicted minus observed) on the right: first for the test set only,
    then for all of ``x``.

    Parameters
    ----------
    x : 2-D array-like of shape (n_samples, n_features)
        Design matrix. If it has more than one column, column 1 is used as
        the horizontal axis of the plots.
    y : array-like with n_samples entries
        Target values (a pandas Series or a NumPy array).

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=100)
    model = LinearRegression()
    model.fit(x_train, y_train)
    print("R^2: ",model.score(x_test, y_test))
    print("Slope: ", model.coef_)
    print("Intercept: ", model.intercept_)

    # Observations, predictions and residuals must be re-ordered together with
    # the x-values; sorting only x and the predictions would pair each x with
    # the wrong observation in the scatter plots.
    xplt, yobs, yplt, resplt = _align_for_plot(x_test, y_test, model.predict(x_test))

    plt.subplot(121)
    plt.scatter(xplt, yobs, label='Test data')
    plt.plot(xplt, yplt, label='Model fit', color='red', linewidth=2)
    plt.legend()
    plt.subplot(122)
    plt.scatter(xplt, resplt)
    plt.title("Residuals (test data)")
    plt.show()

    # Repeat the diagnostics on the complete data set (train and test).
    xplt, yobs, yplt, resplt = _align_for_plot(x, y, model.predict(x))

    plt.subplot(121)
    plt.plot(xplt, yplt, label='Fit', color='red', linewidth=2)
    plt.scatter(xplt, yobs, label='Data')
    plt.legend()
    plt.subplot(122)
    plt.scatter(xplt, resplt)
    plt.title("Residuals")

    return model

In this case it does not look like a log transform is a good idea. It shifted the skew of the data distribution and did not address the underlying issue with the trend still seen in the residuals.

### Square Root Transform

Let's try a square root transform

In [None]:
transform_sqrt = np.sqrt(y) #

In [None]:
sqrt_model = make_model(x,transform_sqrt)

Excellent! I would say this looks better. The errors look relatively uniform and centered.

An alternative way to do this is via the [preprocessing](https://scikit-learn.org/stable/modules/preprocessing.html) toolkit in scikit-learn. Just for fun, let's make our own square root transformer (which could be used in a scikit-learn pipeline workflow).

In [None]:
from sklearn import preprocessing
transformer = preprocessing.FunctionTransformer(np.sqrt, validate=True)

In [None]:
transformer.transform(y)[0:5]

In [None]:
# Largest absolute element-wise difference between the two results. Unlike a
# signed sum, positive and negative discrepancies cannot cancel each other,
# so a value of 0.0 really means both approaches yield the same data.
np.abs(transformer.transform(y) - transform_sqrt).max()

### Box-Cox Transform

The Box-Cox transform is a general power transform. The log and square root transforms are specific cases of a Box-Cox transform.

In [None]:
boxcox_transformer = preprocessing.PowerTransformer(method='box-cox', standardize=True).fit(y)
boxcox_transformer.lambdas_ # this is the exponent

In [None]:
y_boxcox = boxcox_transformer.transform(y)

In [None]:
boxcoxmodel = make_model(x,y_boxcox)

The fit here is perhaps not as good as the square root transform, but it would help us home in on the correct transform to use if we did not know where to start. When you have time, explore some of the preprocessing transforms that might be of use such as the [quantile](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html#sklearn.preprocessing.quantile_transform) transformer.

## Polynomial regression

If power transforms are not sufficient, we can look at introducing additional terms into our linear model and using polynomial regression. 

In [None]:
polynomial = preprocessing.PolynomialFeatures(degree=2)
x_poly = polynomial.fit_transform(x)

In [None]:
polymodel = make_model(x_poly,y)

While it visually looks like the polynomial has a better fit to the data, the residuals show a different pattern. 

## Takeaway

Overall, based on all the linear models that we have fit to transformed data, it seems the one with the square root transform has the best fit and satisfies the assumptions, so we can have confidence in using the model. Applying more complex data transformations, via Box-Cox, and using a polynomial to fit the data did not address the underlying issues we set out to address.

## Resampling

Next we will look at resampling data. Let's load data on the duration of trips taken by bicycle users from [Capital Bikeshare](https://www.capitalbikeshare.com/system-data) available [here](https://s3.amazonaws.com/capitalbikeshare-data/index.html).

In [None]:
os.getcwd()

In [None]:
data_folder = os.path.join(os.path.abspath('..'), 'data')

In [None]:
datafile = os.path.join(data_folder,'202001-capitalbikeshare-tripdata.csv')
bikes = pd.read_csv(datafile)

In [None]:
bikes.head()

In [None]:
bikes.plot('Start date','Duration')

Yikes! Let's try to sample the data at a more meaningful level. The first thing we need to do is ensure that the dataframe columns are appropriately typed.

In [None]:
bikes.info()

In [None]:
bikes['Start date']= pd.to_datetime(bikes['Start date'])
bikes['End date']= pd.to_datetime(bikes['End date'])

In [None]:
bikes.info()

In [None]:
bikes.plot('Start date','Duration')

In [None]:
bikes.set_index('Start date',inplace=True)

In [None]:
# Aggregate the trips into one value per day: the mean Duration of all trips
# whose 'Start date' (now the index of bikes) falls on that day.
daily_summary = pd.DataFrame()
daily_summary['Duration'] = bikes.Duration.resample('D').mean()
daily_summary.plot()

In [None]:
weekly_summary = pd.DataFrame()
weekly_summary['Duration'] = bikes.Duration.resample('W',label='right').mean()
weekly_summary.plot()

By resampling the data, we can now get a better idea of the average length of rides from bicycle riders for the month.

### Resampling and fitting a model

Back to our toy example. Let's treat it like a set of timeseries data.

In [None]:
# Pretend the toy series was recorded during 2019: generate one timestamp per
# sample, spread between the first and last day of the year.
df = pd.DataFrame()
df['date'] = pd.date_range(start='1/1/2019', end='12/31/2019', periods=len(y))
df['y'] = y
# Index by date so the series can be resampled; drop=False also keeps 'date'
# available as an ordinary column.
df.set_index('date', inplace=True, drop=False)

In [None]:
df.head()

In [None]:
df.y.plot()

In [None]:
monthly_summary = pd.DataFrame()
monthly_summary['y'] = df.y.resample('M',label='right').mean()
monthly_summary.plot()

In [None]:
monthly_summary.reset_index(inplace=True)

In [None]:
# Convert each date into a plain number the regression can use: the elapsed
# time since the earliest date, divided by one day (i.e. expressed in days).
monthly_summary['date_delta'] = (monthly_summary['date'] - monthly_summary['date'].min()) / np.timedelta64(1,'D')
# Display the resulting table.
monthly_summary

In [None]:
newx = np.array(monthly_summary['date_delta']).reshape(-1, 1)
month_model = make_model(newx,monthly_summary['y'])

In [None]:
make_model(newx, np.sqrt(monthly_summary['y']))