In [106]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
df = pd.read_csv('https://raw.githubusercontent.com/clearbrain/clearbrain-data-science-challenge-yash2602/master/data/conversion_data.csv')


In [144]:
df.head(200)
#Trying to check if there are any missing values
df.apply(lambda x: sum(x.isnull()),axis=0) 


country                0
age                    0
new_user               0
source                 0
total_pages_visited    0
converted              0
dtype: int64

In [109]:
#Dealing with categorical data
var_mod = ['country','source']
le = LabelEncoder()
for i in var_mod:
    df[i] = le.fit_transform(df[i])
df.head(10)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,2,25,1,0,1,0
1,3,23,1,2,5,0
2,3,28,1,2,4,0
3,0,39,1,2,5,0
4,3,30,1,2,6,0
5,3,31,0,2,1,0
6,0,27,1,2,4,0
7,3,23,0,0,4,0
8,2,29,0,1,4,0
9,3,25,0,0,2,0


In [145]:
#This is a critical case of imbalanced class classification
df['converted'].value_counts()

0    306000
1     10200
Name: converted, dtype: int64

In [146]:
# Random undersampling to deal with imbalanced class classification
conv = 4* len(df[df['converted'] == 1])
nconv = df[df.converted == 0].index
random_usmpleindices = np.random.choice(nconv,conv, replace=False)
conv_ind = df[df.converted == 1].index
under_sample_indices = np.concatenate([conv_ind,random_usmpleindices])
under_sample = df.loc[under_sample_indices]

In [147]:
#They are of comparable zize now
under_sample['converted'].value_counts()

0    40800
1    10200
Name: converted, dtype: int64

In [148]:
x_data = under_sample.loc[:,under_sample.columns!='converted']
y_data = under_sample.loc[:,under_sample.columns=='converted']
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size = 0.3, random_state = 0)



In [149]:
#Question 1 Prediction of conversion rate.
#Model 1 Logistic Regression 
model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(recall_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

0.8614281056406913
0.9534640522875817


  y = column_or_1d(y, warn=True)


In [150]:
#Model 2 Decision tree Classifier
model2 = DecisionTreeClassifier()
model2.fit(x_train,y_train)
y_pred2 = model2.predict(x_test)
print(recall_score(y_test,y_pred2))
print(accuracy_score(y_test,y_pred2))

0.8271926964460384
0.9438562091503268


In [151]:
#Model 3 Randon Forest Classifier
model3 = RandomForestClassifier(n_estimators=10)
model3.fit(x_train,y_train)
y_pred3 = model3.predict(x_test)
print(recall_score(y_test,y_pred3))
print(accuracy_score(y_test,y_pred3))

0.8457776328659928
0.9458823529411765


  This is separate from the ipykernel package so we can avoid doing imports until


In [152]:
#QUESTION 2: To find out the most important features.
predictor_var= ['country','source','age','new_user', 'total_pages_visited'] 
featimp = pd.Series(model2.feature_importances_, index=predictor_var).sort_values(ascending=False)
print (featimp)


total_pages_visited    0.849562
source                 0.064672
country                0.037544
age                    0.030312
new_user               0.017910
dtype: float64


In [153]:
predictor_var= ['country','source','age','new_user', 'total_pages_visited'] 
featimp = pd.Series(model3.feature_importances_, index=predictor_var).sort_values(ascending=False)
print (featimp)

total_pages_visited    0.802605
source                 0.087040
age                    0.053226
country                0.043649
new_user               0.013480
dtype: float64
