I load all the libraries I am going to use to explore the data, create a model and evaluate it.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn import metrics

I upload the dataset into a pandas dataframe.
I then check the data types of the columns (I will have to one-hot encode the non-numerical columns) and the percentage of NaN values in each column.

In [None]:
# Load the conversion dataset from the current working directory.
data = pd.read_csv("./conversion_data.csv")

# Report every column's dtype, then the percentage of missing values per column.
print(data.dtypes)
n_rows = len(data)
for column in data.columns:
    missing = data[column].isna().sum()
    print(column, missing / n_rows * 100, ' %')

There are no NaN values, so I don't have to worry about them.
I can proceed with one-hot encoding the two object-type columns:

In [None]:
# Collect the names of all object-typed (non-numerical) columns.
obj_cols = [name for name in data.columns if data[name].dtype == 'object']

# One-hot encode those columns with get_dummies and append the resulting
# indicator columns to the dataset; the original columns are kept as well.
one_hot = pd.get_dummies(data[obj_cols])
data = data.join(one_hot)

I usually start exploring the data using a correlation matrix.
The matrix will tell me which variables are correlated, so that I can see what influences the conversion rate very quickly:

In [None]:
# Correlate only the numeric and boolean columns. Any object-type (string)
# column still present in `data` would make DataFrame.corr() raise on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
numeric_data = data.select_dtypes(include=['number', 'bool'])
corrMatrix = numeric_data.corr()

# Large annotated heatmap so every pairwise coefficient is readable.
plt.figure(figsize=(15, 15))
sn.heatmap(corrMatrix, annot=True)
plt.show()

There is a high correlation between the number of pages visited and conversion:

In [None]:
# Stacked histogram of pages visited per session, split by conversion outcome.
pages = data['total_pages_visited']
did_convert = data['converted'] == 1
did_not_convert = data['converted'] == 0
plt.hist(
    x=[pages[did_convert], pages[did_not_convert]],
    stacked=True,
    color=['tab:green', 'tab:orange'],
    label=['yes', 'no'],
)

plt.legend()
plt.show()

The higher the number of pages visited in a session, the higher the chance of conversion.

I also notice a slight correlation between conversion and, respectively, country, age and whether the user is new or not:
exploring these relations may give some insight into how to improve the conversion rate.

conversion (yes, no) vs country:

In [None]:
# Stacked histogram of the users' country, split by conversion outcome.
country = data['country']
did_convert = data['converted'] == 1
did_not_convert = data['converted'] == 0
plt.hist(
    x=[country[did_convert], country[did_not_convert]],
    stacked=True,
    color=['tab:green', 'tab:orange'],
    label=['yes', 'no'],
)
plt.legend()
plt.show()

conversion (yes, no) vs age:

In [None]:
# Stacked histogram of the users' age, split by conversion outcome.
age = data['age']
did_convert = data['converted'] == 1
did_not_convert = data['converted'] == 0
plt.hist(
    x=[age[did_convert], age[did_not_convert]],
    stacked=True,
    color=['tab:green', 'tab:orange'],
    label=['yes', 'no'],
)
plt.legend()
plt.show()

conversion (yes, no) vs new_user:

In [None]:
# Stacked histogram of the new_user flag, split by conversion outcome.
new_user = data['new_user']
did_convert = data['converted'] == 1
did_not_convert = data['converted'] == 0
plt.hist(
    x=[new_user[did_convert], new_user[did_not_convert]],
    stacked=True,
    color=['tab:green', 'tab:orange'],
    label=['yes', 'no'],
)
# The flag only takes the values 0 and 1, so show just those two ticks.
plt.xticks([0, 1])

plt.legend()
plt.show()

The conversion vs age plot suggests the presence of implausible ages in the dataset, so I will
keep only the rows where the age is below 100:

In [None]:
# Keep only the rows whose age is strictly below 100.
age_below_100 = data['age'] < 100
data = data.loc[age_below_100]

I can now train a Logistic Regression model, using recall as the metric to evaluate its success.
Before splitting the dataset into train and test sets, I drop the object-type columns that have been one-hot encoded:
they were used for data exploration, but will not be used for modeling purposes.

In [None]:
# Drop the object-type columns listed in obj_cols so that only non-object
# columns are passed to the model. errors='ignore' keeps this cell
# re-runnable once the columns have already been removed.
data = data.drop(columns=obj_cols, errors='ignore')

# Use the 'converted' column as the label and every other column as a feature.
X = data.drop(columns=['converted'])
y = data['converted']

# Seed the split so the reported score is reproducible, and stratify on the
# label so the train and test sets keep the same share of conversions.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Train the Logistic Regression classifier: 5-fold cross-validation scored on
# recall, with balanced class weights.
clf = LogisticRegressionCV(
    cv=5, class_weight='balanced', max_iter=2000, scoring='recall'
).fit(X_train, y_train)
res = clf.predict(X_test)

# Recall on the held-out test set.
print(metrics.recall_score(y_test, res))

