# <center>**Problem Statement**</center>
<div>
A D2C startup develops products using cutting-edge technologies like Web 3.0. Over the past few months, the company has started multiple marketing campaigns, both offline and digital. As a result, users have started showing interest in the product on the website. These users with intent to buy product(s) are generally known as leads (Potential Customers). 


Leads are captured in 2 ways - Directly and Indirectly. 


Direct leads are captured via forms embedded in the website while indirect leads are captured based on certain activity of a user on the platform such as time spent on the website, number of user sessions, etc.


Now, the marketing & sales team wants to identify the leads who are more likely to buy the product so that the sales team can manage their bandwidth efficiently by targeting these potential leads and increase the sales in a shorter span of time.


Now, as a data scientist, your task at hand is to predict the propensity to buy a product based on the user's past activities and user level information.
</div>

## <center>**Import Libraries**</center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

In [None]:
# Display all columns of a dataframe (None removes the column limit).
# Call set_option on the public `pd` namespace rather than the undocumented
# `pd.pandas` alias.
pd.set_option('display.max_columns',None)

In [None]:
# Display all rows of a dataframe (None removes the row limit).
# Call set_option on the public `pd` namespace rather than the undocumented
# `pd.pandas` alias.
pd.set_option('display.max_rows',None)

## <center>**Load Dataset**</center>

In [None]:
# Root folder
root_path = '/content/drive/MyDrive/Hackathon_Data_June'

In [None]:
# Read data
df = pd.read_csv(os.path.join(root_path, "data", "train.csv"))

In [None]:
# First five rows
df.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,1,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,0,0,1,0,0,0,0,0,0
2,3,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2021-01-01,6,7,2.0,2017-10-04,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2021-01-01,4,6,,2020-06-08,0,0,0,0,0,0,1,0,0,0,1,0,0


In [None]:
# Last five rows
df.tail()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
39156,39157,2021-12-31,11,11,2.0,2017-10-19,1,0,0,0,1,1,1,0,0,0,0,0,0
39157,39158,2021-12-31,3,9,3.0,,0,0,0,0,0,0,0,0,0,0,0,0,0
39158,39159,2021-12-31,8,7,2.0,,1,0,0,0,1,0,1,0,0,0,0,0,0
39159,39160,2021-12-31,7,12,2.0,,0,0,0,0,0,1,0,0,0,0,1,0,0
39160,39161,2021-12-31,2,5,,2019-08-11,1,0,0,0,0,0,1,0,0,0,0,0,0


## <center>**EDA**</center>

In [None]:
# Shape of dataset
print("Shape of the dataset: ", df.shape)

Shape of the dataset:  (39161, 19)


In [None]:
# NULL values per column
df.isnull().sum()

id                          0
created_at                  0
campaign_var_1              0
campaign_var_2              0
products_purchased      20911
signup_date             15113
user_activity_var_1         0
user_activity_var_2         0
user_activity_var_3         0
user_activity_var_4         0
user_activity_var_5         0
user_activity_var_6         0
user_activity_var_7         0
user_activity_var_8         0
user_activity_var_9         0
user_activity_var_10        0
user_activity_var_11        0
user_activity_var_12        0
buy                         0
dtype: int64

In [None]:
# Columns that contain at least one missing value
features_with_na = [column for column, n_missing in df.isnull().sum().items() if n_missing >= 1]

In [None]:
# Percentage of missing values
for feature in features_with_na:
    # isnull().mean() is a fraction in [0, 1]; scale it by 100 so the printed
    # number actually matches the '%' label.
    print(feature, np.round(df[feature].isnull().mean() * 100, 2),  ' % missing values')

products_purchased 0.534  % missing values
signup_date 0.3859  % missing values


*   products_purchased is missing in more than half of the rows, so it may be better to drop it if it does not contribute significantly to the prediction.
*   The first feature (products_purchased) is discrete in nature and the second one (signup_date) is a date.



In [None]:
# Missing values of signup_date
data = df.copy()

In [None]:
# Forward-fill missing signup dates from the previous row. The result is
# assigned back instead of calling fillna(..., inplace=True) on the selected
# column: the in-place form acts on a possibly temporary object and may leave
# `data` unchanged, and fillna(method=...) is deprecated in recent pandas.
data['signup_date'] = data['signup_date'].ffill()

In [None]:
df = data.copy()

In [None]:
# Missing values of products_purchased
data = df.drop(columns=['created_at','signup_date', 'buy'])
cols = data.columns

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)

In [None]:
# Fit the KNN imputer and fill the missing values; the transformed output is
# wrapped back into a DataFrame with the original column names.
# NOTE(review): `id` is still among the columns at this point, so it takes part
# in the neighbour search — confirm that is intended.
data = pd.DataFrame(imputer.fit_transform(data), columns=cols)

In [None]:
# Save imputer
import pickle
file_name=os.path.join(root_path, "models", 'imputer.pkl')
# The context manager closes the file even if pickle.dump raises.
with open(file_name,'wb') as f:
    pickle.dump(imputer,f)

In [None]:
import math
def filter_products_purchased(n):
  """Return n rounded to the nearest whole number as a built-in int.

  Rounding uses the built-in round(), so exact .5 ties go to the even integer.
  """
  nearest = round(n)
  return int(nearest)
data['products_purchased'] = data['products_purchased'].apply(filter_products_purchased)

In [None]:
# astype returns a new Series; assign it back, otherwise the cast is discarded.
data['products_purchased'] = data['products_purchased'].astype('int32')
# Re-attach the columns that were set aside before imputation.
data['created_at'] = df['created_at']
data['signup_date'] = df['signup_date']
data['buy'] = df['buy']
# Confirm that no missing values remain.
data.isnull().sum()

id                      0
campaign_var_1          0
campaign_var_2          0
products_purchased      0
user_activity_var_1     0
user_activity_var_2     0
user_activity_var_3     0
user_activity_var_4     0
user_activity_var_5     0
user_activity_var_6     0
user_activity_var_7     0
user_activity_var_8     0
user_activity_var_9     0
user_activity_var_10    0
user_activity_var_11    0
user_activity_var_12    0
created_at              0
signup_date             0
buy                     0
dtype: int64

In [None]:
print("Shape of data after cleaning: ",data.shape)

Shape of data after cleaning:  (39161, 19)


In [None]:
# Copy the data
df = data.copy()

In [None]:
# Split created_at in year, month and day
df[["created_at_year", "created_at_month", "created_at_day"]] = df["created_at"].str.split("-", expand = True)

In [None]:
# Split signup_date in year, month and day
df[["signup_date_year", "signup_date_month", "signup_date_day"]] = df["signup_date"].str.split("-", expand = True)

In [None]:
df.isnull().sum()

id                      0
campaign_var_1          0
campaign_var_2          0
products_purchased      0
user_activity_var_1     0
user_activity_var_2     0
user_activity_var_3     0
user_activity_var_4     0
user_activity_var_5     0
user_activity_var_6     0
user_activity_var_7     0
user_activity_var_8     0
user_activity_var_9     0
user_activity_var_10    0
user_activity_var_11    0
user_activity_var_12    0
created_at              0
signup_date             0
buy                     0
created_at_year         0
created_at_month        0
created_at_day          0
signup_date_year        0
signup_date_month       0
signup_date_day         0
dtype: int64

In [None]:
# Drop date columns
df.drop(columns=['created_at', 'signup_date'], inplace=True)

In [None]:
# Cast every column to 32-bit integers in a single call
df = df.astype('int32')

In [None]:
df.dtypes

id                      int32
campaign_var_1          int32
campaign_var_2          int32
products_purchased      int32
user_activity_var_1     int32
user_activity_var_2     int32
user_activity_var_3     int32
user_activity_var_4     int32
user_activity_var_5     int32
user_activity_var_6     int32
user_activity_var_7     int32
user_activity_var_8     int32
user_activity_var_9     int32
user_activity_var_10    int32
user_activity_var_11    int32
user_activity_var_12    int32
buy                     int32
created_at_year         int32
created_at_month        int32
created_at_day          int32
signup_date_year        int32
signup_date_month       int32
signup_date_day         int32
dtype: object

In [None]:
# Numerical features: every column whose dtype is not object
numerical_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
n_numerical = len(numerical_features)
print('Number of numerical variables: ', n_numerical)
print("Numerical Features: ",numerical_features)

Number of numerical variables:  23
Numerical Features:  ['id', 'campaign_var_1', 'campaign_var_2', 'products_purchased', 'user_activity_var_1', 'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4', 'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7', 'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10', 'user_activity_var_11', 'user_activity_var_12', 'buy', 'created_at_year', 'created_at_month', 'created_at_day', 'signup_date_year', 'signup_date_month', 'signup_date_day']


* Here all features are numerical.

In [None]:
# Temporal variables: columns whose name contains 'year', 'month' or 'day'
temporal_features = [feature for feature in df.columns if any(part in feature for part in ('year', 'month', 'day'))]

In [None]:
print("Temporal features are:",temporal_features)

Temporal features are: ['created_at_year', 'created_at_month', 'created_at_day', 'signup_date_year', 'signup_date_month', 'signup_date_day']


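* All `created_at` values are from 2021, so `created_at_year` carries no information. Note that `signup_date` years do vary (e.g. 2017, 2019, 2020 and 2021 in the rows shown above), so dropping `signup_date_year` should be justified separately.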

In [None]:
# Drop both year columns. Only created_at_year is checked for uniqueness here.
# NOTE(review): signup_date_year is dropped without a similar check, and the
# sample rows show signup years other than 2021 — confirm this drop is intended.
print("Created at Year: ",df.created_at_year.unique())
df.drop(columns=['created_at_year','signup_date_year'], inplace=True)

Created at Year:  [2021]


In [None]:
# Relation between created_at_month and buy
df.groupby('created_at_month')['buy'].value_counts()

created_at_month  buy
1                 0      1206
                  1        66
2                 0      1350
                  1        92
3                 0      1941
                  1       107
4                 0      2228
                  1       128
5                 0      2654
                  1       145
6                 0      3034
                  1       183
7                 0      3342
                  1       198
8                 0      3871
                  1       223
9                 0      3937
                  1       223
10                0      4465
                  1       220
11                0      4461
                  1       197
12                0      4674
                  1       216
Name: buy, dtype: int64

* Here, it is clear that there is no visible relationship between month and buy

In [None]:
# Relation between created_at_month, created_at_day and buy
df.groupby(['created_at_month', 'created_at_day'])['buy'].value_counts()

created_at_month  created_at_day  buy
1                 1               0       21
                  2               0       43
                  3               0       38
                  4               0       36
                  5               0       36
                                  1        3
                  6               0       28
                                  1        1
                  7               0       34
                  8               0       36
                                  1        1
                  9               0       37
                                  1        2
                  10              0       48
                                  1        5
                  11              0       38
                                  1        2
                  12              0       39
                                  1        3
                  13              0       38
                                  1        7
                 

In [None]:
# Relation between created_at_day (across all months) and buy
df.groupby(['created_at_day'])['buy'].value_counts()

created_at_day  buy
1               0      1122
                1        58
2               0      1156
                1        67
3               0      1145
                1        57
4               0      1217
                1        68
5               0      1139
                1        51
6               0      1219
                1        64
7               0      1180
                1        60
8               0      1152
                1        60
9               0      1190
                1        71
10              0      1300
                1        80
11              0      1167
                1        57
12              0      1187
                1        69
13              0      1196
                1        78
14              0      1199
                1        73
15              0      1262
                1        63
16              0      1198
                1        60
17              0      1218
                1        73
18              0      1198


* There is no particular relationship between created_at_day and buy

In [None]:
df.drop(columns=['created_at_month', 'created_at_day'], inplace=True)

In [None]:
# Relation between signup_date_month and buy
df.groupby('signup_date_month')['buy'].value_counts()

signup_date_month  buy
1                  0      3631
                   1       226
2                  0      3287
                   1       184
3                  0      3588
                   1       160
4                  0      2558
                   1       140
5                  0      2683
                   1       145
6                  0      2735
                   1       146
7                  0      3005
                   1       153
8                  0      2933
                   1       161
9                  0      2999
                   1       146
10                 0      3230
                   1       184
11                 0      3127
                   1       168
12                 0      3387
                   1       185
Name: buy, dtype: int64

In [None]:
# Relation between signup_date_day and buy
df.groupby('signup_date_day')['buy'].value_counts()

signup_date_day  buy
1                0      1237
                 1        64
2                0      1173
                 1        65
3                0      1304
                 1        81
4                0      1208
                 1        55
5                0      1116
                 1        59
6                0      1124
                 1        73
7                0      1215
                 1        68
8                0      1259
                 1        73
9                0      1221
                 1        64
10               0      1246
                 1        66
11               0      1260
                 1        79
12               0      1263
                 1        74
13               0      1116
                 1        66
14               0      1231
                 1        56
15               0      1225
                 1        71
16               0      1182
                 1        59
17               0      1252
                 1    

In [None]:
# Relation between signup_date_month, signup_date_day and buy
df.groupby(['signup_date_month', 'signup_date_day'])['buy'].value_counts()

signup_date_month  signup_date_day  buy
1                  1                0      118
                                    1        2
                   2                0      106
                                    1        6
                   3                0      119
                                    1       11
                   4                0      126
                                    1       11
                   5                0      130
                                    1       11
                   6                0       95
                                    1        9
                   7                0      122
                                    1       10
                   8                0      116
                                    1        6
                   9                0      106
                                    1        5
                   10               0      133
                                    1        8
                   1

* There is no particular relationship between signup_month and signup_day with buy

In [None]:
df.drop(columns=['signup_date_day', 'signup_date_month'], inplace=True)

In [None]:
# Drop id
df.drop(columns='id', inplace=True)

In [None]:
# Value counts of every remaining column, each followed by a separator line
for feature in df.columns:
  counts = df[feature].value_counts()
  print(feature, counts, "______________________________________________", sep="\n")

campaign_var_1
6     4061
5     4060
4     4018
3     3784
7     3756
2     3486
8     3407
9     2866
10    2392
11    1873
1     1593
12    1455
13    1059
14     714
15     456
16     181
Name: campaign_var_1, dtype: int64
______________________________________________
campaign_var_2
6     5744
7     5472
5     5250
8     4690
4     4213
9     3537
3     2883
10    2298
2     1788
11    1410
12     761
1      589
13     351
14     151
15      24
Name: campaign_var_2, dtype: int64
______________________________________________
products_purchased
2    23908
3     9474
1     5030
4      749
Name: products_purchased, dtype: int64
______________________________________________
user_activity_var_1
0    23869
1    14917
2      374
3        1
Name: user_activity_var_1, dtype: int64
______________________________________________
user_activity_var_2
0    38898
1      263
Name: user_activity_var_2, dtype: int64
______________________________________________
user_activity_var_3
0    35134
1    

* The distribution of values within each feature isn't balanced.
* Although these features look like discrete numeric data, they are actually categorical in nature.
* Some of the categories are rare, but eliminating them from the train data can be a problem if the same values are found in the test data.

In [None]:
# Correlation of variables
df.corr(method='pearson')

Unnamed: 0,campaign_var_1,campaign_var_2,products_purchased,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
campaign_var_1,1.0,0.561489,-0.014247,0.0241,-0.034548,-0.01267,-0.03351,-0.023116,0.029665,-0.0726,-0.014276,-0.058264,-0.004831,-0.022,-0.006369,-0.087202
campaign_var_2,0.561489,1.0,0.012345,-0.032969,-0.039833,-0.008046,-0.038928,-0.03099,0.067304,-0.053462,-0.033848,-0.057942,-0.009879,-0.00389,-0.008228,-0.080064
products_purchased,-0.014247,0.012345,1.0,-0.003339,0.004848,-0.002844,0.004374,-0.004496,0.00274,0.005418,0.000156,0.007385,0.009469,-0.003086,-0.00387,0.001754
user_activity_var_1,0.0241,-0.032969,-0.003339,1.0,0.014599,-0.02595,0.016656,-0.015441,-0.190784,-0.085343,-0.047157,0.034783,0.012811,-0.062915,0.002535,0.044811
user_activity_var_2,-0.034548,-0.039833,0.004848,0.014599,1.0,-0.001075,0.132108,0.060982,-0.005297,-0.020153,-0.035306,0.155794,0.046324,0.085794,0.02444,0.354627
user_activity_var_3,-0.01267,-0.008046,-0.002844,-0.02595,-0.001075,1.0,0.003023,0.023117,-0.050132,-0.006306,-0.025562,0.003898,0.019152,-0.027212,0.013261,0.005174
user_activity_var_4,-0.03351,-0.038928,0.004374,0.016656,0.132108,0.003023,1.0,0.071339,-0.021669,-0.004572,-0.03937,0.204941,0.046924,0.067085,0.037937,0.394706
user_activity_var_5,-0.023116,-0.03099,-0.004496,-0.015441,0.060982,0.023117,0.071339,1.0,-0.078928,-0.007705,-0.031585,0.074189,0.028037,0.021225,0.022972,0.164972
user_activity_var_6,0.029665,0.067304,0.00274,-0.190784,-0.005297,-0.050132,-0.021669,-0.078928,1.0,-0.169651,-0.06246,-0.013304,0.001175,-0.076816,0.001938,-0.010951
user_activity_var_7,-0.0726,-0.053462,0.005418,-0.085343,-0.020153,-0.006306,-0.004572,-0.007705,-0.169651,1.0,-0.035982,-0.010222,0.002007,-0.06341,0.006371,-0.028428


In [None]:
print("Shape of  filtered dataframe: ",df.shape)

Shape of  filtered dataframe:  (39161, 16)


In [None]:
# Data is balanced or not
df['buy'].value_counts()

0    37163
1     1998
Name: buy, dtype: int64

* It is clear that the data is imbalanced. If undersampling is used, a lot of data will be lost; therefore oversampling is the only option.

In [None]:
# Delete duplicate rows (drop_duplicates compares rows, not columns),
# keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Shape of the dataset
print("Shape of the dataset: ",df.shape)

Shape of the dataset:  (15822, 16)


In [None]:
# Save the dataset
df.to_csv(os.path.join(root_path, "data", "filtered_data1.csv"), index=False)

In [None]:
#  Load Training dataset

In [None]:
x = df.drop(columns='buy')
y = df['buy']

In [None]:
x = np.array(x)
y = np.array(y)

In [None]:
# Split the data into train and test set
from sklearn.model_selection import train_test_split
# Stratify on the label so that the train and test sets keep the same
# proportion of the minority `buy == 1` class; an unstratified split of an
# imbalanced label can leave the test set with a skewed class ratio.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=32, stratify=y)

In [None]:
# Balance the training split with SMOTE oversampling (the test split is left untouched)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
x_train, y_train = sm.fit_resample(x_train, np.ravel(y_train))