In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

#import our scripts that do data science workflow
import wrangle
import split_scale
import evaluate
import features

In [3]:
# Load the telco data through the project's wrangle helper. Regression here compares
# continuous variables, so the frame is expected to hold only the customer id plus the
# numeric charge/tenure columns (see the info() summary below).
df = wrangle.wrangle_telco()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
total_charges      1685 non-null float64
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.8+ KB


In [5]:
# Partition the wrangled frame into a training set and a testing set.
# NOTE(review): the split ratio and random seed live in split_scale.split_my_data —
# confirm a fixed random_state is used there so this notebook is reproducible.
train, test = split_scale.split_my_data(df)

In [7]:
# Scale the feature columns only. The identifier (customer_id) and the target
# (total_charges) are dropped before scaling, so y keeps its original units.
# NOTE(review): presumably the scaler is fitted on train and then applied to both
# frames — confirm in split_scale.standard_scaler.
non_feature_cols = ['customer_id', 'total_charges']
scaler, train_scaled, test_scaled = split_scale.standard_scaler(
    train.drop(columns=non_feature_cols),
    test.drop(columns=non_feature_cols),
)

In [8]:
# Build the feature matrices (X) and target frames (y) for each split.
# X comes from the scaled feature frames; y is the unscaled total_charges column,
# kept as a one-column DataFrame.
X_train, X_test = train_scaled, test_scaled
y_train, y_test = train[['total_charges']], test[['total_charges']]

In [9]:
# Ask the project's features helper how many features to keep; it receives both the
# train and the test split.
# NOTE(review): the selection criterion lives in features.py — presumably RFE, per the
# closing note of this notebook; confirm there.
features.optimal_number_of_features(X_train,y_train,X_test,y_test)

1

In [10]:
# Ask which column(s) to keep when selecting a single feature. The literal 1 is the
# count reported by optimal_number_of_features in the previous cell.
features.optimal_features(X_train,y_train,1)

Index(['monthly_charges'], dtype='object')

## Let's train two single-feature models: one on the optimal feature and one on the only other candidate

In [11]:
# Start a results frame with a single column, 'actual', holding the observed
# total_charges from y_train.
# The index is reset because the training rows carry their original, non-contiguous
# row labels; a fresh 0..n-1 index lines the frame up row-for-row with the positional
# prediction arrays added as columns later.
predictions = pd.DataFrame({'actual': y_train['total_charges']})
predictions = predictions.reset_index(drop=True)

In [16]:
# Model 1: linear regression on the single feature picked by optimal_features.
# Predictions are made on the training rows and stored next to the actual values.
lm1_features = ['monthly_charges']
lm1 = LinearRegression().fit(X_train[lm1_features], y_train)
lm1_predictions = lm1.predict(X_train[lm1_features])
predictions['lm1'] = lm1_predictions

## `predictions` gets a new column, `lm1`, holding the target values predicted by the linear regression model fitted with monthly charges as the feature matrix (X).

In [17]:
# Model 2: the only other candidate feature, tenure, fitted the same way so the two
# models can be compared side by side in the predictions frame.
lm2_features = ['tenure']
lm2 = LinearRegression().fit(X_train[lm2_features], y_train)
lm2_predictions = lm2.predict(X_train[lm2_features])
predictions['lm2'] = lm2_predictions

In [18]:
# Baseline model: predict the mean of the training target for every row.
# The mean is looked up by column label. The previous `y_train.mean()[0]` used an
# integer key on a label-indexed Series, which only worked through pandas' deprecated
# positional fallback (the blanket warnings filter at the top of the notebook hid the
# resulting FutureWarning).
predictions['baseline'] = y_train.mean()['total_charges']

### `predictions` should now have four columns: actual, lm1, lm2 and baseline

In [19]:
# Preview the results frame. Columns: 'actual' is the observed y_train target,
# 'lm1' and 'lm2' are the yhat values from the two linear models, and 'baseline'
# is the yhat obtained by always predicting the mean of y_train.
predictions.head()

Unnamed: 0,actual,lm1,lm2,baseline
0,3580.95,2911.298265,5135.054403,3759.661944
1,538.2,881.205753,974.514505,3759.661944
2,6376.55,5730.286831,5135.054403,3759.661944
3,478.75,891.724367,604.688736,3759.661944
4,1797.1,1242.344836,5135.054403,3759.661944


In [20]:
# Error scores for the baseline, derived from the predictions frame.
# The baseline uses no features at all: every row is predicted as the mean of y_train,
# which is why its R^2 comes out as 0.
y_actual, y_baseline = predictions['actual'], predictions['baseline']
MSE_baseline = mean_squared_error(y_actual, y_baseline)
SSE_baseline = len(y_actual) * MSE_baseline  # SSE = n * MSE
RMSE_baseline = sqrt(MSE_baseline)
r2_baseline = r2_score(y_actual, y_baseline)
print(MSE_baseline, SSE_baseline, RMSE_baseline, r2_baseline)

6612927.089508685 8914225716.657707 2571.5612163642313 0.0


In [21]:
# Same error scores for lm1 (monthly_charges model), again computed from the
# 'actual' and 'lm1' columns of the predictions frame.
y_actual, y_lm1 = predictions['actual'], predictions['lm1']
MSE_1 = mean_squared_error(y_actual, y_lm1)
SSE_1 = len(y_actual) * MSE_1  # SSE = n * MSE
RMSE_1 = sqrt(MSE_1)
r2_1 = r2_score(y_actual, y_lm1)
print(MSE_1, SSE_1, RMSE_1, r2_1)

710410.3104002099 957633098.4194828 842.8584165802758 0.8925724870719858


In [22]:
# Same error scores for lm2 (tenure model), computed from the 'actual' and 'lm2'
# columns of the predictions frame.
y_actual, y_lm2 = predictions['actual'], predictions['lm2']
MSE_2 = mean_squared_error(y_actual, y_lm2)
SSE_2 = len(y_actual) * MSE_2  # SSE = n * MSE
RMSE_2 = sqrt(MSE_2)
r2_2 = r2_score(y_actual, y_lm2)
print(MSE_2, SSE_2, RMSE_2, r2_2)

3930986.4171300903 5298969690.291362 1982.671535360835 0.40556029668517823


## lm1 has the highest R² on the training data (about 0.89, versus 0.41 for lm2 and 0.0 for the baseline), so it is the winner. RFE was wise to say that monthly_charges is the optimal feature to use