In [1]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

In [2]:
df = pd.read_csv('../data/newdf.csv')

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df=df.rename( columns={"User_ID_x": "User_ID","Purchase_x": "Purchase"})

In [5]:
df.head(2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Purchase,User_prod_bought,Quantity_sold,Popular
0,1000001,P00069042,F,0-17,10,A,2,0,3,8370,35,227,0
1,1000149,P00069042,M,26-35,1,B,2,1,3,10715,334,227,0


In [6]:
df=pd.get_dummies(df, columns=['Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Product_Category_1'])


In [7]:

df['Product_ID'] = df['Product_ID'].str[1:] # Drop the first character ‘P’ from product id
df = df.astype({'Product_ID':int}) # Recast all strings as ints

In [8]:
X = df.drop('Purchase',axis=1)
y = df.Purchase

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=42)

In [9]:
#code to calculate RMSE
def rmse(predictor,target):
    return np.sqrt(((predictor - target) ** 2).mean())

In [None]:
#using linear regression

multivar_model = sm.OLS(y_train,sm.add_constant(X_train)).fit()

In [11]:
multivar_model.summary()

0,1,2,3
Dep. Variable:,Purchase,R-squared:,0.669
Model:,OLS,Adj. R-squared:,0.669
Method:,Least Squares,F-statistic:,15320.0
Date:,"Mon, 03 Dec 2018",Prob (F-statistic):,0.0
Time:,19:50:19,Log-Likelihood:,-4131400.0
No. Observations:,440054,AIC:,8263000.0
Df Residuals:,439995,BIC:,8264000.0
Df Model:,58,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1067.6908,1128.029,0.947,0.344,-1143.212,3278.593
User_ID,0.0060,0.003,2.342,0.019,0.001,0.011
Product_ID,-0.0005,4.44e-05,-10.174,0.000,-0.001,-0.000
Marital_Status,-42.5722,9.482,-4.490,0.000,-61.156,-23.988
User_prod_bought,-0.8077,0.031,-26.349,0.000,-0.868,-0.748
Quantity_sold,2.5948,0.022,116.856,0.000,2.551,2.638
Popular,-229.7916,18.075,-12.713,0.000,-265.219,-194.364
Gender_F,538.0808,564.217,0.954,0.340,-567.768,1643.929
Gender_M,529.6100,563.862,0.939,0.348,-575.543,1634.763

0,1,2,3
Omnibus:,34880.131,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55846.732
Skew:,-0.612,Prob(JB):,0.0
Kurtosis:,4.244,Cond. No.,9.11e+16


This shows that Marital Status, Number of products user bought, Quantity of item sold were significant features.
Also customers from Occupation 3 ,6,19 seem to be siginficant

**Also Product category seems to be a siginificant feature .**

In [12]:
predict = multivar_model.predict(sm.add_constant(X_test))
print(rmse(predict,y_test))

2894.094704449546


Feature engineering has improved the RMSE slightly,than without feature engineerign which gave us a higher score.
We tried Lasso but it didn't help improve our RMSE values.

In [16]:
# code to do individidual transformtions to see if RMSE increases
# for c in df.columns:
#     if (c == 'Purchase'): continue
#     print(c)
#     for t in ['square','sqrt','log']:
#         loopDf = df.copy()
#         if t == 'square': loopDf[c+'_'+t] = loopDf[c]**2
#         elif t == 'sqrt': loopDf[c+'_'+t] = np.sqrt(loopDf[c])
#         elif t == 'log': 
#             if loopDf[c].eq(0).any() == True: #np.log will throw a divide by 0 error if 0 exists
#                 print('0 in '+c+', skipping')
#                 continue
#             else: loopDf[c+'_'+t] = np.log(loopDf[c])
#         loopDf.drop(c,axis=1,inplace=True)
#         loopTrain, loopTest = train_test_split(loopDf,test_size=0.3)
#         X = loopTrain.drop('Purchase',axis=1)
#         y = loopTrain.Purchase
#         model = sm.OLS(y,sm.add_constant(X)).fit()
#         print(t)
#         print('Adjusted R-Squared: '+str(model.rsquared_adj))
#         prediction = model.predict(sm.add_constant(loopTest.drop('Purchase',axis=1)))
#         print('RMSE: '+str(rmse(prediction,loopTest['Purchase'])))
#     print('\n')