# Predicting Purchase 

This is a regression problem where we need to predict the Purchase amount from the given feature variables. We also need to perform ananlysis on data to find best features for prediction.

In [22]:
import seaborn as sns
import  matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble  import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [2]:
train_data = pd.read_csv('train_data/train.csv')
test_data = pd.read_csv('test_data/test.csv')

# train_data=train_data.drop(columns=['Gender','Age','Stay_In_Current_City_Years','Occupation'])

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [4]:
# sns.pairplot(train_data, diag_kind="kde")
# plt.show()

In [5]:
# #visualization using boxplot
# # %matplotlib qt
# for i,x in enumerate(train_data.columns[2:-1]):
#     ax = plt.subplot(3,3,i+1)
#     sns.boxplot(x=x,y='Purchase',data=train_data)
# plt.show()

# Analysis from boxplot

From boxplot we can see that the features Marital Status & Stay in Current City(Years) have same distribution for every category with resprect to Purchase. So, we do not use those features because they do not add value for prediction of Purchase. 

In [6]:
def preprocessing(data):
    data = data.copy()
    data.fillna(99999,inplace=True)
    # remove rows with NaN
    assert data.isna().sum().sum() <= 0  
    #drop unwanted features
    data.drop(columns=['User_ID','Product_ID','Marital_Status','Stay_In_Current_City_Years'],inplace=True)
    #convert categorical values to category dtype
    for column in data.columns[:-1]:
        data[column] = data[column].astype('category') 
    #one hot encoding
    data = pd.get_dummies(data,columns=['Age','City_Category'],prefix=['Age','City_Category'])
    #label encoding
    data['Gender'] = data['Gender'].cat.codes

    return data

In [7]:
train_data = preprocessing(train_data)
test_data = preprocessing(test_data)

In [8]:
train_data.head()

Unnamed: 0,Gender,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_A,City_Category_B,City_Category_C
0,0,10,3,99999.0,99999.0,8370,1,0,0,0,0,0,0,1,0,0
1,0,10,1,6.0,14.0,15200,1,0,0,0,0,0,0,1,0,0
2,0,10,12,99999.0,99999.0,1422,1,0,0,0,0,0,0,1,0,0
3,0,10,12,14.0,99999.0,1057,1,0,0,0,0,0,0,1,0,0
4,1,16,8,99999.0,99999.0,7969,0,0,0,0,0,0,1,0,0,1


In [9]:
#shift target variable to end 
cols = list(train_data.columns)
cols.remove('Purchase')
new_cols = cols + ['Purchase']
train_data = train_data[new_cols]

In [10]:
train_data.head()

Unnamed: 0,Gender,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_A,City_Category_B,City_Category_C,Purchase
0,0,10,3,99999.0,99999.0,1,0,0,0,0,0,0,1,0,0,8370
1,0,10,1,6.0,14.0,1,0,0,0,0,0,0,1,0,0,15200
2,0,10,12,99999.0,99999.0,1,0,0,0,0,0,0,1,0,0,1422
3,0,10,12,14.0,99999.0,1,0,0,0,0,0,0,1,0,0,1057
4,1,16,8,99999.0,99999.0,0,0,0,0,0,0,1,0,0,1,7969


In [11]:
#split featrues and target variables
X = np.asarray(train_data.iloc[:,:-1])
Y = np.asarray(train_data.iloc[:,-1:])
Y = [x[0] for x in Y]

In [12]:

#split training data and testing data
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [17]:
#train model
clf = GradientBoostingRegressor()
# clf = make_pipeline(PolynomialFeatures(degree=5),Linea())

clf.fit(x_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [18]:
#calcualte accuracy of model
acc = clf.score(x_test,y_test)

In [19]:
print("Accuracy of model: %.2f"%acc)

Accuracy of model: 0.65
