### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from warnings import filterwarnings
# Silence every warning for the rest of the session; note this also hides deprecation notices.
filterwarnings('ignore')

In [None]:
# Additional visualization libraries

import cufflinks as cf
import plotly.offline
cf.go_offline()
# NOTE(review): offline=False looks inconsistent with go_offline() above — confirm the intended mode.
cf.set_config_file(offline=False, world_readable=True)


import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import missingno as msno

### 2.  Import data

In [None]:
# Load the customer dataset; the relative path is resolved against the working directory.
data = pd.read_csv('Ecommerce Customers')

In [None]:
# Preview the first 10 rows.
data.head(10)

Avg. Session Length: Average length of in-store style advice sessions.

Time on App: Average time spent on App in minutes

Time on Website: Average time spent on Website in minutes

Length of Membership: How many years the customer has been a member.


### 3. EDA

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
data.describe().T

In [None]:
# Plot defaults for the cells below: 8x6 figures and the 'whitegrid' seaborn style.
sns.set(rc={'figure.figsize':(8,6)})
sns.set_style('whitegrid')

In [None]:
# Annotated correlation heatmap, restricted to numeric columns:
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead of
# silently dropping them, so select the numeric columns explicitly.
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot = True, cmap = 'Blues')

In [None]:
# Pairwise relationships between the columns, with KDE curves on the diagonal.
sns.pairplot(data = data, diag_kind = 'kde')

In [None]:
# Switch the default seaborn colour palette to 'Set2' for the following plots.
sns.set_palette('Set2')

In [None]:
# Scatter plot with a fitted regression line: membership length vs. yearly amount spent.
sns.regplot(data = data, x = 'Length of Membership', y = 'Yearly Amount Spent')

### 4. Model 

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
# Columns used as predictors and the column to be predicted.
feature_cols = ['Avg. Session Length', 'Time on App',
                'Time on Website', 'Length of Membership']
target_col = 'Yearly Amount Spent'

# Modelling frame: the predictors followed by the target.
model_data = data[feature_cols + [target_col]]

# Feature matrix and target vector.
X = model_data[feature_cols]
y = model_data[target_col]

In [None]:
# Hold out 30% of the rows as a test split; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Fit every candidate regressor on the training split with default hyper-parameters.
# All estimators that accept a seed are given random_state=42 (the same value used
# for the train/test split) so that re-running the notebook reproduces the results.

# Linear family
lm = LinearRegression().fit(X_train, y_train)
pls = PLSRegression().fit(X_train, y_train)
ridge = Ridge().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)
elasticnet = ElasticNet().fit(X_train, y_train)

# Instance-based
knnr = KNeighborsRegressor().fit(X_train, y_train)

# Tree and bagging ensembles
cartr = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
baggr = BaggingRegressor(random_state=42,bootstrap_features=True,verbose=False).fit(X_train, y_train)
rfr = RandomForestRegressor(random_state=42,verbose=False).fit(X_train, y_train)

# Gradient boosting (previously unseeded, unlike the tree models above)
gbmr = GradientBoostingRegressor(random_state=42,verbose=False).fit(X_train, y_train)
xgbr = XGBRegressor(random_state=42).fit(X_train, y_train)
lgbmr = LGBMRegressor(random_state=42).fit(X_train, y_train)
catbr = CatBoostRegressor(random_state=42,verbose=False).fit(X_train, y_train)

In [None]:
# Candidate estimators, grouped by family.
models = [
    lm, pls, ridge, lasso, elasticnet,   # linear family
    knnr,                                # nearest neighbours
    cartr, baggr, rfr,                   # tree / bagging ensembles
    gbmr, xgbr, lgbmr, catbr,            # gradient boosting
]

In [None]:
# Compare the candidate models by their mean 10-fold cross-validated R² (in percent).
# cross_val_score clones and refits the estimator on every fold, so the scores are
# computed on the training split; the test split stays untouched for the final check.
records = []
for model in models:
    name = model.__class__.__name__
    r2 = cross_val_score(model,X_train,y_train,cv=10,scoring="r2").mean()
    records.append({"MODELS": name, "r2": r2*100})

# Build the frame once (DataFrame.append was removed in pandas 2.0) and sort once,
# best model first; reset the index so rows are numbered by rank.
sc = (
    pd.DataFrame(records, columns=["MODELS","r2"])
    .sort_values('r2', ascending = False)
    .reset_index(drop=True)
)

figure = plt.figure(figsize=(20,8))
sns.barplot(x="r2",y="MODELS",data=sc, palette = 'viridis')
plt.xlabel("r2")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL ACCURACY COMPARISON")
plt.show()

display(sc)


In [None]:
# Fresh, unfitted linear regression estimator.
linear = LinearRegression()

In [None]:
# Fit the linear model on the training split.
linear.fit(X_train, y_train)

In [None]:
# Predict once per split and reuse the arrays for every metric below.
y_train_predict = linear.predict(X_train)
y_test_predict = linear.predict(X_test)
y_pred = y_test_predict  # same test predictions under the name the later cells use

# model evaluation for training set
rmse = np.sqrt(mean_squared_error(y_train, y_train_predict))
r2 = r2_score(y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('R2 score is {}'.format(r2))
print('RMSE is {}'.format(rmse))
print("\n")

# model evaluation for testing set
rmse = np.sqrt(mean_squared_error(y_test, y_test_predict))
r2 = r2_score(y_test, y_test_predict)

print("The model performance for testing set")
print("--------------------------------------")
print('R2 score is {}'.format(r2))
print('RMSE is {}'.format(rmse))


In [None]:
# Intercept term of the fitted linear model.
linear.intercept_

In [None]:
# Side-by-side table of the actual test targets and the model's predictions for them.
temp = pd.DataFrame({'Actual':y_test, 'Predict':y_pred})
temp.head(20)

In [None]:
# Predicted vs. actual values from the comparison table. No `palette` is passed:
# without a `hue` variable seaborn ignores it, so the argument had no effect.
sns.scatterplot(data = temp, x = 'Actual', y = 'Predict')

In [None]:

# One row per feature with its fitted coefficient, largest coefficient first.
coeff = (
    pd.DataFrame({'Features' : X.columns, 'Coefficient' : linear.coef_})
    .sort_values('Coefficient', ascending = False)
)
coeff

In [None]:
from sklearn import metrics

# Test-split error metrics; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))