
# LINEAR REGRESSION WITH PYTHON



# Import libraries


In [None]:
%pylab inline

In [None]:
import dataiku                               # Access to Dataiku datasets
import pandas as pd, numpy as np             # Data manipulation 
from matplotlib import pyplot as plt         # Graphing
import seaborn as sns                        # Graphing
import statsmodels.api as sm                    # Statistical analysis
#sns.set(style="white")                       # Tuning the style of charts
import warnings                              # Disable some warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from scipy import stats                      # Stats


# Check out data


In [None]:
# Load the Dataiku DSS dataset named "wage" as a Pandas dataframe.
mydataset = dataiku.Dataset("wage")  # handle to the DSS dataset
mydataset_df = mydataset.get_dataframe()  # raw data; later cells derive new frames from it

In [None]:
# Preview the first rows of the raw data.
mydataset_df.head() 

In [None]:
# Summary statistics of the raw columns.
mydataset_df.describe()

In [None]:
selected_fields=mydataset_df.drop(labels=["BRTHORD","MEDUC","FEDUC"],axis=1) #remove fields with missing values

In [None]:
selected_fields=selected_fields.drop(labels=["SIBS","AGE"],axis=1) #remove fields not significant (from stepwise)

In [None]:
metric_fields=mydataset_df.drop(labels=["AFROAMERICAN","MARRIED", "SOUTH","URBAN", "BRTHORD","MEDUC","FEDUC","SIBS"],axis=1) #select only metric fields

In [None]:
selected_fields.columns

# Exploratory Data Analysis


In [None]:
metric_fields.columns

In [None]:
sns.pairplot(metric_fields)

In [None]:
sns.distplot(selected_fields['WAGE'])

In [None]:
sns.heatmap(selected_fields.corr())

## Linear Regression Model with SKLEARN

### X and y arrays

In [None]:
# Explanatory variables: every selected field except WAGE itself.
X = selected_fields.drop(columns=["WAGE"])
# Dependent variable.
y = selected_fields["WAGE"]

## Train Test Split


In [None]:
# Disabled on purpose: the model below is fitted on the full dataset.
# Uncomment this import if the data should be split.
# from sklearn.model_selection import train_test_split

In [None]:
# Disabled on purpose (see the import cell above in this section).
# NOTE(review): an integer test_size is an absolute number of rows, so
# test_size=1 would hold out a single observation; a fraction such as 0.3 was
# probably intended -- confirm before enabling.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=101)

## Creating and Training the Model

In [None]:
# Ordinary least squares estimator from scikit-learn.
from sklearn.linear_model import LinearRegression

In [None]:
# Unfitted estimator with default settings.
lm = LinearRegression()

In [None]:
# Fit on the full dataset (no train/test split is applied in this notebook).
# Later cells use both `lm` and `model`; presumably they refer to the same
# fitted estimator, since scikit-learn's fit() returns the estimator itself.
model=lm.fit(X,y)

## Model Evaluation

Let's evaluate the model by checking out its coefficients and how we can interpret them.

In [None]:
# print the intercept
print("Constant: ", model.intercept_)
print("R2: ", model.score(X,y))

In [None]:
coeff_df = pd.DataFrame(model.coef_,X.columns,columns=['Coefficient'])
coeff_df

## Predictions from our Model

Let's compute predictions on the same data the model was fitted on (no train/test split is applied in this notebook) and see how well it fits!

In [None]:
predictions = lm.predict(X)

In [None]:
plt.scatter(y,predictions)

**Residual Histogram**

In [None]:
# Residuals = actual value minus predicted value.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot once the seaborn version of this environment is confirmed.
residuals = y - predictions
sns.distplot(residuals, bins=50);

## Regression Evaluation Metrics


Here are three common evaluation metrics for regression problems:

**Mean Absolute Error** (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

**Mean Squared Error** (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

**Root Mean Squared Error** (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

Comparing these metrics:

- **MAE** is the easiest to understand, because it's the average error.
- **MSE** is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
- **RMSE** is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are **loss functions**, because we want to minimize them.

In [None]:
from sklearn import metrics

In [None]:
# Compute each metric once; RMSE is derived from the MSE value.
mae = metrics.mean_absolute_error(y, predictions)
mse = metrics.mean_squared_error(y, predictions)
rmse = np.sqrt(mse)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)



## ADDITIONAL INFO USING STATSMODELS

In [None]:
X = sm.add_constant(X) # additional model

In [None]:
result = sm.OLS( y, X).fit()
print (result.summary())