
#  Lead Scoring Case Study by Gloriya and Bala



### Import Necessary Libraries


In [1]:

import pandas as pd

import numpy as np

import warnings

warnings.filterwarnings('ignore')


In [2]:
# Import visualisation libraries

import seaborn as sns

import plotly.express as px

import matplotlib.pyplot as plt


In [3]:
# Import all necessary scikit-learn libraries

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import r2_score,roc_auc_score,roc_curve,accuracy_score,confusion_matrix,classification_report

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.metrics import precision_score, recall_score,precision_recall_curve,f1_score

from sklearn import metrics


In [4]:

# Expand output display to see more rows and columns

pd.set_option('display.max_rows',200)

pd.set_option('display.max_columns',160)



### Inspecting dataset


In [5]:
# Read the dataset

leads_df=pd.read_csv('Leads.csv')


In [6]:
# Check the number of rows and columns

leads_df.shape


(9240, 37)

In [7]:
# View the dataframe 

leads_df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Country,Specialization,How did you hear about X Education,What is your current occupation,What matters most to you in choosing a course,Search,Magazine,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,Receive More Updates About Our Courses,Tags,Lead Quality,Update me on Supply Chain Content,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,Page Visited on Website,,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Interested in other courses,Low in Relevance,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,Email Opened,India,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,Email Opened,India,Business Administration,Select,Student,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,Unreachable,India,Media and Advertising,Word Of Mouth,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,Not Sure,No,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,Converted to Lead,India,Select,Other,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [8]:
# Check the datatype and null value counts of each column

leads_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 


#### Finding : There are null values in a number of columns


In [9]:
# Descriptive statistics of numeric variables

leads_df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Lead Number,9240.0,617188.435606,23405.995698,579533.0,596484.5,615479.0,637387.25,660737.0
Converted,9240.0,0.38539,0.486714,0.0,0.0,0.0,1.0,1.0
TotalVisits,9103.0,3.445238,4.854853,0.0,1.0,3.0,5.0,251.0
Total Time Spent on Website,9240.0,487.698268,548.021466,0.0,12.0,248.0,936.0,2272.0
Page Views Per Visit,9103.0,2.36282,2.161418,0.0,1.0,2.0,3.0,55.0
Asymmetrique Activity Score,5022.0,14.306252,1.386694,7.0,14.0,14.0,15.0,18.0
Asymmetrique Profile Score,5022.0,16.344883,1.811395,11.0,15.0,16.0,18.0,20.0


In [10]:
# List Numeric columns

num_cols=leads_df.select_dtypes(include=np.number).columns

print('Total Numeric columns  :',len(num_cols))
      
num_cols


Total Numeric columns  : 7


Index(['Lead Number', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit',
       'Asymmetrique Activity Score', 'Asymmetrique Profile Score'],
      dtype='object')

In [11]:
# List Non-numeric columns

cat_cols=leads_df.select_dtypes(exclude=np.number).columns

print('Total Categorical columns  :',len(cat_cols))

cat_cols


Total Categorical columns  : 30


Index(['Prospect ID', 'Lead Origin', 'Lead Source', 'Do Not Email',
       'Do Not Call', 'Last Activity', 'Country', 'Specialization',
       'How did you hear about X Education', 'What is your current occupation',
       'What matters most to you in choosing a course', 'Search', 'Magazine',
       'Newspaper Article', 'X Education Forums', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Courses', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [12]:

# Columns with Missing value

sum(leads_df.isnull().sum().values>0)


17


#### Finding : There are 7 numeric columns and 30 categorical columns . Also there are 17 columns with missing values.



### Check for duplicate rows in the dataset


In [13]:

leads_df.duplicated().sum()


0

#### Finding : There are no duplicate rows in the dataset


## Cleaning the dataset


In [14]:
## Count the columns that hold a single distinct value across all rows.
## Such constant columns do not add any value to the analysis, so they are candidates for dropping.

sum(leads_df.nunique().values==1)


5


#### Finding :  There are 5 columns which have identical values in all the rows, these columns do not add any value to our analysis, hence we will drop those columns.


In [15]:

unique_cols=leads_df.nunique()

unique_cols


Prospect ID                                      9240
Lead Number                                      9240
Lead Origin                                         5
Lead Source                                        21
Do Not Email                                        2
Do Not Call                                         2
Converted                                           2
TotalVisits                                        41
Total Time Spent on Website                      1731
Page Views Per Visit                              114
Last Activity                                      17
Country                                            38
Specialization                                     19
How did you hear about X Education                 10
What is your current occupation                     6
What matters most to you in choosing a course       3
Search                                              2
Magazine                                            1
Newspaper Article           

In [16]:
# List of unique value columns

unique_cols=list(unique_cols.index[unique_cols.values==1])

unique_cols


['Magazine',
 'Receive More Updates About Our Courses',
 'Update me on Supply Chain Content',
 'Get updates on DM Content',
 'I agree to pay the amount through cheque']

In [17]:
# Drop the constant (single-valued) columns collected in `unique_cols`.
# The columns are named through the `columns=` keyword: passing the axis positionally
# (`drop(cols, 1)`) is rejected by pandas >= 2.0.

leads_df = leads_df.drop(columns=unique_cols)


In [None]:

leads_df.shape



#### Finding : Now we are left with 32 columns


In [None]:
# Convert every string cell to lower case so that values differing only in case count as the same category.
# Cells that are not of type `str` (numbers, NaN) are returned unchanged.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in favour of DataFrame.map — confirm against the target pandas version.

leads_df = leads_df.applymap(lambda x:x.lower() if type(x) == str else x)


In [None]:
# There are a number of columns entries with value as 'Select' which is nothing but null values, replace 'Select' with NaN

leads_df = leads_df.replace('select', np.nan)



### Let's inspect each column in details for a better understanding


In [24]:
leads_df['What is your current occupation'].value_counts()

Unemployed              5600
Working Professional     706
Student                  210
Other                     16
Housewife                 10
Businessman                8
Name: What is your current occupation, dtype: int64

In [22]:
list(leads_df['What is your current occupation'].value_counts().index)

['Unemployed',
 'Working Professional',
 'Student',
 'Other',
 'Housewife',
 'Businessman']

In [None]:

for i in leads_df.columns:
    print(i,'----------------------------------------------------------------------------------------------------------')
    print(leads_df[i].value_counts(dropna=False))
    print('-------------------------------------------------------------------------------------------------------------')
    


#### Finding : There are a couple of columns whose values do not have enough variability


In [None]:
# Drop the identifier columns: 'Prospect ID' and 'Lead Number' are unique per row
# (9240 distinct values each), so they cannot help predict conversion.
# `columns=` is used because the positional axis argument is rejected by pandas >= 2.0.

leads_df = leads_df.drop(columns=['Prospect ID', 'Lead Number'])



## Missing value treatment


In [None]:
# Look for columns which have missing values

sum(leads_df.isnull().mean()>0)



### There are 17 out of 30 columns with null values


In [None]:
# Look for the percentage  null values in each column

null_cols=round(leads_df.isnull().mean()*100,2)

null_cols


In [None]:
# We will drop all the columns with Null values Greater than 35%

drop_cols=null_cols[null_cols.values>35].index.to_list()

drop_cols


In [None]:

leads_df.drop(drop_cols, axis = 1, inplace = True)


In [None]:
# Remaining list of categorical columns

cat_cols=leads_df.select_dtypes(exclude=np.number).columns

cat_cols


In [None]:
# Lets inspect the variability of all the categorical columns.

# This comes under EDA analysis, however this step is essential here as part of missing value treatment.


def autopct(pct):
    """Format a pie-slice percentage label.

    Returns the percentage with two decimals when it is strictly greater
    than 10, and an empty string otherwise so that small slices stay unlabelled.
    """
    if pct > 10:
        return '%.2f' % pct
    return ''

# Draw one pie chart per categorical column on a 4x4 grid to see how dominant
# the most frequent category is in each column.
N_GRID_ROWS, N_GRID_COLS = 4, 4

# The first six panels also show the share of missing values (dropna=False);
# the remaining panels plot non-null values only. This split is kept exactly as
# in the original hand-written panels.
N_PANELS_WITH_NAN = 6

# Titles that need a manual line break to fit above their panel, keyed by panel position.
TITLE_OVERRIDES = {7: 'What matters most to you \n in choosing a course'}

fig, ax = plt.subplots(N_GRID_ROWS, N_GRID_COLS, figsize=(12, 12))

# ax.flat walks the grid row by row, i.e. ax[0, 0], ax[0, 1], ..., ax[3, 3].
for panel_idx, panel in enumerate(ax.flat):

    # Fewer categorical columns than panels: blank the spare panel instead of raising IndexError.
    if panel_idx >= len(cat_cols):
        panel.axis('off')
        continue

    col = cat_cols[panel_idx]
    counts = leads_df[col].value_counts(dropna=panel_idx >= N_PANELS_WITH_NAN)

    panel.title.set_text(TITLE_OVERRIDES.get(panel_idx, col))
    panel.pie(counts, startangle=90, autopct=autopct)

fig.tight_layout()

# plt.show() is the call used by the other Matplotlib cells of this notebook;
# Figure.show() needs a GUI backend and only warns under the inline backend.
plt.show()

In [None]:
# As per the above pie charts, we will remove columns which do not have much variability , keeping a cut off at 95%

drop_cols = ['Do Not Call','Country','Search','Newspaper Article','What matters most to you in choosing a course','X Education Forums','Newspaper','Digital Advertisement','Through Recommendations']

leads_df.drop(drop_cols, axis = 1, inplace = True)


In [None]:

null_cols=round(leads_df.isnull().mean()*100,2)

null_cols




### Now there are 5 more columns with null values, we will go ahead and impute those with the most appropriate values


In [None]:
# Let's inspect the variable 'Lead Source': percentage share of each value.
# Missing values are counted as their own category AND included in the denominator,
# so the shares add up to 100% (the earlier form divided by the non-null total only).

leads_df['Lead Source'].value_counts(dropna=False, normalize=True).mul(100).round(1)



#### 'Google' is the most frequent value of the categorical variable 'Lead Source' (about 31% of the leads), hence we will go ahead and impute the missing values with this value

In [None]:

leads_df['Lead Source']=leads_df['Lead Source'].fillna(leads_df['Lead Source'].mode()[0])


In [None]:
# Let's examine 'TotalVisits'

leads_df.TotalVisits.isnull().sum()                                     


In [None]:

leads_df.TotalVisits.describe()


### There seems to be an outlier issue

In [None]:

fig = px.box(leads_df.TotalVisits,width=400, height=300,color_discrete_sequence=['seagreen'])

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),paper_bgcolor="Aquamarine")

fig.show()


In [None]:
# Outlier treatment for 'TotalVisits': this step filters rows, it does not clip values.

# Only rows with 'TotalVisits' below 100 are kept. Rows where 'TotalVisits' is NaN are removed too,
# because the comparison NaN < 100 evaluates to False.

leads_df=leads_df[leads_df['TotalVisits']<100]


In [None]:
# Recheck the box plot

fig = px.box(leads_df.TotalVisits,width=400, height=300,color_discrete_sequence=['seagreen'])

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),paper_bgcolor="Aquamarine")

fig.show()



#### Findings : There are still a few outliers, but the upper range has come down to an acceptable range


In [None]:
# We will also inspect the target variable

leads_df.Converted.value_counts()


In [None]:
# Target imbalance: share of converted leads (label 1) among all leads

leads_df.Converted.value_counts()[1]/leads_df.Converted.value_counts().sum()



#### We can see there is also an imbalance in the target variable 'Converted'


In [None]:
#Recheck the missing values

null_cols=leads_df.isnull().sum()

null_cols[null_cols.values>0]


In [None]:
# We need impute the missing values of the variable 'What is your current occupation' since occupation of an individual is significant information


leads_df['What is your current occupation'].value_counts(dropna=False)


In [None]:
# Imputing with mode is  not the right choice here since the missing count is quite high

# Rather convert NaN values as 'not provided', instead of dropping the column or imputing with mode

leads_df['What is your current occupation'] = leads_df['What is your current occupation'].fillna('not provided')


In [None]:

leads_df.isnull().sum()


In [None]:

print(len(leads_df.index))

print(round(len(leads_df.index)/9240*100,1))



#### All the missing values have been treated. We were able to retain 98.5% of the rows of the original dataset



## Exploratory Data Analysis



### Univariate analysis for Numeric Variables


In [None]:

# Filter out numeric column for EDA

num_cols=leads_df.select_dtypes(include=np.number).columns

num_cols= num_cols.drop('Converted') # Target variable not needed

num_cols


In [None]:

# Lets analyze numeric variables visually using plotly

for y in list(num_cols):
    
    fig = px.box(leads_df[y],width=400, height=300,color_discrete_sequence=['seagreen'])
       
    fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),paper_bgcolor="aquamarine")    
    
    fig.show()   
    
    print('\n')
    
for y in list(num_cols):
    
    fig = px.histogram(leads_df, x=y,color_discrete_sequence=['seagreen'], width=900, height=500)
    
    fig.show()
    



#### Finding : There seem to be outliers in the column 'Page Views Per Visit'

In [None]:
# Box plots of the numeric predictors, one panel per column.
# The columns come from `num_cols` (numeric columns without the target, built at the
# start of this section); the previously referenced `uni_numeric` is not defined
# anywhere in the notebook and raised NameError on a fresh kernel.
fig, axes = plt.subplots(1, len(num_cols), figsize=(18, 4), squeeze=False)

for panel, col in zip(axes.flat, num_cols):

    # y=col draws a vertical box in every panel (the original mixed x= with orient='v'/'h').
    sns.boxplot(data=leads_df, y=col, palette="Accent", ax=panel)

    panel.set_title('Client ' + col, fontsize=16, loc='right')

plt.show()


In [None]:

leads_df['Page Views Per Visit'].describe()


In [None]:
# Outlier treatment for 'Page Views Per Visit': keep only rows with a value below 15.
# This removes the rows (it does not clip the values); rows with NaN in this column are removed as well.

leads_df=leads_df[leads_df['Page Views Per Visit']<15]


In [None]:

# Re-check the box plot of 'Page Views Per Visit' after the outlier filter.
# No plt.figure() call here: the chart is a Plotly figure, so a Matplotlib figure
# would only add an empty figure to the cell output.
fig = px.box(y='Page Views Per Visit', data_frame=leads_df, width=400, height=300, color_discrete_sequence=['seagreen'])

fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), paper_bgcolor="aquamarine")

fig.show()




## Uni variate Analysis - Categorical Variable


In [None]:

# Filter out categorical  column for EDA

cat_cols=leads_df.select_dtypes(exclude=np.number).columns

for col in cat_cols:
    
    plt.figure(figsize = (10,5))
    sns.countplot(x=col,data= leads_df,palette='gist_ncar').tick_params(axis='x', rotation = 90)
    plt.title(col+'\n')
    plt.show()
    print('\n\n')


#### Finding : 'Last Notable Activity' and 'Last Activity' exhibit a similar pattern of values.

#### We will analyze them further with a heatmap to find any possible correlation and take the necessary action.



### Trying the same plot with Plotly, for learning purposes only

cat_cols=leads_df.select_dtypes(exclude=np.number).columns

for col in cat_cols:
    
    fig = px.bar(leads_df[col].value_counts(),height=500,width=700,color=leads_df[col].value_counts(),color_continuous_scale='armyrose',title=col)
    
    fig.show()


## Bivariate Analysis Categorical Variable w.r.t Target


In [None]:

for col in cat_cols:
    
    plt.figure(figsize = (15,5))
    sns.countplot(x=col, hue='Converted', data= leads_df,palette='Accent').tick_params(axis='x', rotation = 90)
    plt.title(col)
    plt.show()
    print('\n\n')


#### Finding : As per the bivariate analysis as well, 'Last Notable Activity' and 'Last Activity' exhibits similar pattern of distribution with the target variable.



## Multivariate Analysis


In [None]:
# Check the correlation among numeric variables.
# The frame still holds string columns at this point (they are dummy-encoded later),
# and pandas >= 2.0 raises on them instead of silently skipping them, so the numeric
# columns are selected explicitly.

leads_df.select_dtypes(include=np.number).corr()


In [None]:
# Heatmap for numeric variables.
# Numeric columns are selected explicitly because DataFrame.corr() in pandas >= 2.0
# raises when the frame contains string columns.

sns.heatmap(leads_df.select_dtypes(include=np.number).corr(), annot=True, cmap='GnBu')

plt.show()

#### Findings:

1. 'TotalVisits' and 'Page Views Per Visit' have a correlation of 0.65, which indicates possible multicollinearity, hence we will drop one of the two variables.

2. 'Total Time Spent on Website' and the target variable 'Converted' have a correlation of 0.35, which indicates that 'Total Time Spent on Website' could be a possible predictor of successful leads.

3. 'Total Time Spent on Website' and 'Page Views Per Visit' have a correlation coefficient of 0.34.

#### We can further analyse these variables with VIF treatment as part of model building.


In [None]:

# Dropping 'Page Views Per Visit' since this variable is least correlated with the target.
# `columns=` is used because the positional axis argument is rejected by pandas >= 2.0.

leads_df = leads_df.drop(columns='Page Views Per Visit')



### Trying the same correlation heatmap with Plotly, for learning purposes only

# Correlation heatmap of the numeric variables, drawn with Plotly.
# Numeric columns are selected explicitly because DataFrame.corr() in pandas >= 2.0
# raises when the frame contains string columns.
fig = px.imshow(leads_df.select_dtypes(include=np.number).corr(), width=800, height=500, title='Correlation Heatmap', color_continuous_scale='armyrose')

#fig.update_layout(margin=dict(l=50, r=20, t=20, b=20),paper_bgcolor="Aquamarine")

fig.show()


In [None]:

# Let's analyze the pairplot for numeric variables

sns.pairplot(leads_df,diag_kind='kde',hue='Converted')

plt.show()



#### Findings : 

1. Only the variables 'TotalVisits' and 'Page Views Per Visit' show a positive correlation as per the scatter plot.

2. It also shows that even when the page views are comparatively low, a person who visits the website frequently is likely to be a hot lead.

3. If a lead spends more time on the website, it is an indication of a possible conversion.

4. More time spent on the website, as well as more pages viewed per visit, is a good indication of a successful conversion.


In [None]:
# Craete a dataframe of only numeric variables

num_df=leads_df.select_dtypes(include=np.number)


# Trying the same scatter plot with Plotly only for learning purpose

fig = px.scatter_matrix(data_frame=num_df,color_continuous_scale='armyrose',color ='Converted')

fig.show()


In [None]:
# Once again check null values and confirm

leads_df.isnull().sum()



## Dummy Variable Creation


In [None]:

leads_df.head()


In [None]:

# Convert the yes/no flag 'A free copy of Mastering The Interview' to 1/0.
# The values were lower-cased earlier in the notebook, hence the 'yes'/'no' keys.
# NOTE(review): other yes/no columns that may still be present (e.g. 'Do Not Email') are not
# touched here — presumably they are handled by the dummy encoding; confirm.

# The explicit astype(int) makes the column numeric on every pandas version instead of
# relying on replace() to downcast the object column implicitly (deprecated behaviour).
leads_df['A free copy of Mastering The Interview'] = (
    leads_df['A free copy of Mastering The Interview']
    .replace({'yes': 1, 'no': 0})
    .astype(int)
)


In [None]:
# Filter out remaining categorical columns

cat_cols=list(leads_df.select_dtypes(exclude=np.number).columns)

cat_cols


In [None]:
# Create dummies for the categorical variables.
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 would otherwise emit
# booleans, and a frame mixing bool and float columns converts to an object array,
# which the statsmodels fit later in the notebook cannot treat as numeric data.
dummies = pd.get_dummies(leads_df[cat_cols], drop_first=True, dtype=int)

# Replace the original categorical columns with their dummy encodings
# (remaining columns first, dummies appended — same column order as before).
leads_df = pd.concat([leads_df.drop(columns=cat_cols), dummies], axis=1)


###  Create the correlation heatmap


In [None]:

# Now all the variables are being converted to numeric

leads_df.head()

In [None]:

plt.figure(figsize=(30,20))

sns.heatmap(leads_df.corr(),cmap='Greens')


# As per the heatmap, the variables which are highly correlated with the target are:

1. 'Last Notable Activity_sms sent'


2. 'Last Notable Activity_modified'


3. 'What is your current occupation_working professional'


4.  'What is your current occupation_not provided'


5.  'Last Activity_sms sent'


6.  'Last Activity_olark chat conversation'


7. 'Last Activity_page visited on website'


8. 'Total Time Spent on Website'


9. 'Lead Origin_lead add form'


10. 'Lead Source_olark chat'


11. 'Lead Source_direct traffic'

       
12. 'Lead Source_reference'


13. 'Last Activity_converted to lead'

 

## The below variables are highly correlated with each other, hence we will go ahead and drop one variable from each pair



1. 'What is your current occupation_not provided' Vs 'What is your current occupation_unemployed'


2. 'Last Activity_email opened' Vs 'Last Activity_sms sent' 


3. 'Lead Source_reference' Vs 'Lead Origin_lead add form'


4. 'Lead Source_facebook' Vs 'Lead Origin_lead import'


5.   'Last Notable Activity_olark chat conversation' Vs 'Last Activity_olark chat conversation'


6.  'Last Activity_sms sent' Vs 'Last Notable Activity_sms sent'


7.	'Last Notable Activity_had a phone conversation' Vs 'Last Activity_had a phone conversation'


8.	'Last Activity_unreachable' Vs 'Last Notable Activity_unreachable'




In [None]:
# Drop one dummy variable from each highly correlated pair identified in the heatmap.
# `columns=` is used because the positional axis argument is rejected by pandas >= 2.0.
correlated_dummies = [
    'What is your current occupation_unemployed',
    'Last Activity_email opened',
    'Lead Source_reference',
    'Lead Source_facebook',
    'Last Notable Activity_olark chat conversation',
    'Last Activity_sms sent',
    'Last Notable Activity_had a phone conversation',
    'Last Activity_unreachable',
]

leads_df = leads_df.drop(columns=correlated_dummies)


In [None]:
# Recheck the heatmap

plt.figure(figsize=(30,20))

sns.heatmap(leads_df.corr(),cmap='Greens')



# Trying the same correlation heatmap with Plotly, for learning purposes only

fig = px.imshow(leads_df.corr(),width=1000, height=1000,title ='Correlation Heatmap',color_continuous_scale='armyrose')

fig.update_layout(margin=dict(l=50, r=20, t=20, b=20),paper_bgcolor="gainsboro")

fig.show()


In [None]:
# Split predictors and target variable for model building.

# The target is kept as a single-column DataFrame because later cells read it as `y_train.Converted`.
y = leads_df[['Converted']]

# `columns=` is used because the positional axis argument is rejected by pandas >= 2.0.
X = leads_df.drop(columns='Converted')


In [None]:

X.shape


### Test Train Split

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)


In [None]:

X_train.shape


In [None]:

X_test.shape


In [None]:
# Instantiate the MinMaxScaler to scale numeric variables

scaler=MinMaxScaler()


In [None]:
# 'Page Views Per Visit' was dropped from the data in an earlier step, so remove it from the list of numeric columns to scale.
# errors='ignore' keeps this cell re-runnable: a second run no longer raises KeyError once the label is gone.

num_cols = num_cols.drop('Page Views Per Visit', errors='ignore')


In [None]:
# Scale the training dataset

X_train[num_cols]=scaler.fit_transform(X_train[num_cols])


In [None]:

X_train.head()


In [None]:
#Import statsmodel library

import statsmodels.api as sm



###  Model Building and Evaluation


In [None]:

# Logistic regression model - GLM(Generalized Linear Model)

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())

res=logm1.fit()

res.summary()


In [None]:
# Apply Recursive Feature Elimination (RFE) for an initial screening of variables

from sklearn.feature_selection import RFE

# scikit-learn expects a 1-D target; y_train is a single-column DataFrame, so flatten it once.
y_train_1d = y_train.values.ravel()

logreg = LogisticRegression()      # Logistic regression estimator used as the input for RFE

logreg.fit(X_train, y_train_1d)    # Fit the model with the training data

# Keep 15 variables. The count must be passed by keyword:
# scikit-learn >= 1.0 rejects `RFE(estimator, 15)`.
rfe = RFE(logreg, n_features_to_select=15)

rfe = rfe.fit(X_train, y_train_1d)


In [None]:
# Create a datframe to view the RFE output with the selected variables and their respective ranking

rfe_leads_df = pd.DataFrame({'Predictor': X_train.columns, 'Select Status': rfe.support_, 'Ranking': rfe.ranking_})

rfe_leads_df.sort_values(by='Ranking')


In [None]:
# List of variables selected by RFE

rfe_cols = X_train.columns[rfe.support_]

rfe_cols


In [None]:
# Apply GLM model to the selected features

X_train_sm = sm.add_constant(X_train[rfe_cols])

logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())

res=logm2.fit()

res.summary()


In [None]:
# Function to calculate VIF for multicollinearity among variables

def fetch_vif_df(local_df):
    """Build a table of variance inflation factors for the columns of `local_df`.

    Parameters
    ----------
    local_df : DataFrame holding the predictor columns to assess.

    Returns
    -------
    DataFrame with the columns 'Features' and 'VIF' (rounded to 2 decimals),
    sorted by VIF in descending order and re-indexed from 0.
    """
    design_matrix = local_df.values

    # One VIF per column position of the design matrix.
    vif_values = [
        variance_inflation_factor(design_matrix, position)
        for position in range(local_df.shape[1])
    ]

    vif_table = pd.DataFrame({'Features': local_df.columns, 'VIF': vif_values})

    vif_table['VIF'] = round(vif_table['VIF'], 2)

    return vif_table.sort_values(by='VIF', ascending=False).reset_index(drop=True)


In [None]:

# VIF for Feature elimination

fetch_vif_df(X_train[rfe_cols])


In [None]:
# Predict the target variable for the training data

y_train_pred = res.predict(sm.add_constant(X_train[rfe_cols]))


In [None]:
# Creating a dataframe with the actual Conversion and the predicted probabilities

y_train_pred_df = pd.DataFrame({'Original_Conver':y_train.Converted, 'Conver_Prob':y_train_pred,'ID': y_train.index})

y_train_pred_df.head()


In [None]:
# Create a new column for the predictions

y_train_pred_df['predicted'] = y_train_pred_df.Conver_Prob.map(lambda x: 1 if x > 0.5 else 0)

y_train_pred_df.head()


In [None]:
# Confusion matrix 

confusion = metrics.confusion_matrix(y_train_pred_df.Original_Conver, y_train_pred_df.predicted )

print(confusion)


In [None]:
# Let's check the model accuracy.
# Scale to a percentage before rounding: rounding first and then multiplying by 100
# can print float artefacts such as 56.99999999999999.

print("Accuracy of the model is :  ", round(metrics.accuracy_score(y_train_pred_df.Original_Conver, y_train_pred_df.predicted) * 100, 2), '%')


In [None]:
# 'What is your current occupation_housewife' is having a high p-value - makes it an insignificant variable

# hence will drop this variable

rfe_cols=rfe_cols.drop('What is your current occupation_housewife')


In [None]:
# Rerun the model with new set of variables

X_train_sm = sm.add_constant(X_train[rfe_cols])

logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())

res=logm3.fit()

res.summary()


In [None]:
# VIF for Feature elimination

fetch_vif_df(X_train[rfe_cols])


In [None]:
# Predict the target with the current model

y_train_pred = res.predict(sm.add_constant(X_train[rfe_cols]))

y_train_pred_df['Conver_Prob']=y_train_pred


In [None]:
# Update the 'predicted' column with the new predictions

y_train_pred_df['predicted'] = y_train_pred_df.Conver_Prob.map(lambda x: 1 if x > 0.5 else 0)

y_train_pred_df.head()


In [None]:
# Confusion matrix 

confusion = metrics.confusion_matrix(y_train_pred_df.Original_Conver, y_train_pred_df.predicted )

print(confusion)

In [None]:

# Let's check the model accuracy.
# Scale to a percentage before rounding: rounding first and then multiplying by 100
# can print float artefacts such as 56.99999999999999.

print("Accuracy of the model is :  ", round(metrics.accuracy_score(y_train_pred_df.Original_Conver, y_train_pred_df.predicted) * 100, 2), '%')


In [None]:
# 'Last Activity_email bounced' got a comparatively high p-value, hence dropping

rfe_cols=rfe_cols.drop('Last Activity_email bounced')


In [None]:
# Rerun the model with new set of variables

X_train_sm = sm.add_constant(X_train[rfe_cols])

logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())

res=logm4.fit()

res.summary()


In [None]:
# VIF for Feature elimination

fetch_vif_df(X_train[rfe_cols])


In [None]:
# Predict the target with the current model

y_train_pred = res.predict(sm.add_constant(X_train[rfe_cols]))

y_train_pred_df['Conver_Prob']=y_train_pred


In [None]:
# Update the 'predicted' column with the new predictions

y_train_pred_df['predicted'] = y_train_pred_df.Conver_Prob.map(lambda x: 1 if x > 0.5 else 0)

y_train_pred_df.head()

In [None]:
# Confusion matrix 

confusion = metrics.confusion_matrix(y_train_pred_df.Original_Conver, y_train_pred_df.predicted )

print(confusion)

In [None]:

# Let's check the model accuracy.
# Scale to a percentage before rounding: rounding first and then multiplying by 100
# can print float artefacts such as 56.99999999999999.

print("Accuracy of the model is :  ", round(metrics.accuracy_score(y_train_pred_df.Original_Conver, y_train_pred_df.predicted) * 100, 2), '%')


In [None]:
# 'Lead Source_welingak website' got a comparatively high p-value, hence dropping

rfe_cols=rfe_cols.drop('Lead Source_welingak website')


In [None]:
# Rerun the model with new set of variables

X_train_sm = sm.add_constant(X_train[rfe_cols])

logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())

res=logm5.fit()

res.summary()


In [None]:
# VIF for Feature elimination

fetch_vif_df(X_train[rfe_cols])


In [None]:
# Predict the target with the current model

y_train_pred = res.predict(sm.add_constant(X_train[rfe_cols]))

y_train_pred_df['Conver_Prob']=y_train_pred


In [None]:
# Update the 'predicted' column with the new predictions

y_train_pred_df['predicted'] = y_train_pred_df.Conver_Prob.map(lambda x: 1 if x > 0.5 else 0)

y_train_pred_df.head()

In [None]:
# Confusion matrix 

confusion = metrics.confusion_matrix(y_train_pred_df.Original_Conver, y_train_pred_df.predicted )

print(confusion)

In [None]:

# Let's check the overall accuracy.
# Scale to a percentage before rounding: rounding first and then multiplying by 100
# can produce float artefacts such as 56.99999999999999.

accuracy = round(metrics.accuracy_score(y_train_pred_df.Original_Conver, y_train_pred_df.predicted) * 100, 2)

print("Accuracy of the model is :  ", accuracy, '%')


# 'Lead Source_welingak website' is already removed from rfe_cols in an earlier cell;
# errors='ignore' makes this repeated drop a no-op instead of raising KeyError.
rfe_cols = rfe_cols.drop('Lead Source_welingak website', errors='ignore')

In [None]:
# Rerun the model with new set of variables

X_train_sm = sm.add_constant(X_train[rfe_cols])

logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())

res=logm5.fit()

res.summary()

In [None]:
# VIF for Feature elimination

fetch_vif_df(X_train[rfe_cols])


## The remaining 12 columns are statistically significant, hence this will be our final model

All p-values are negligibly small (less than 0.05) and all features have VIF values below 5, hence there is no multicollinearity issue among the selected features.


In [None]:

confusion = confusion_matrix(y_train_pred_df.Original_Conver, y_train_pred_df.predicted)

confusion


In [None]:

# Unpack the 2x2 confusion matrix (rows = actual class, columns = predicted class).
# Row-major order of the flattened matrix:
# true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = confusion.ravel()


In [None]:
# Let's calculate the sensitivity

sensitivity= round(TP / float(TP+FN)*100,2)

print("Sensitivity of the model is :", sensitivity,'%')



#### A sensitivity of 70% is comparatively low for this analysis and is not what we are aiming for.

#### However we will try to improve it with an optimum cut off.


In [None]:
# Let us calculate specificity

specificity= round(TN / float(TN+FP)*100,2)

print("Specificity of the model is :", specificity,'%')


In [None]:
precision = round(precision_score(y_train_pred_df.Original_Conver, y_train_pred_df.predicted)*100,2)

print("Precision of the model is :", precision,'%')


In [None]:
recall=round(recall_score(y_train_pred_df.Original_Conver, y_train_pred_df.predicted)*100,2)

print("Recall of the model is :", recall,'%')


In [None]:
# Function to draw the Receiver Operating Characteristic curve - true positive rate (TPR) vs false positive rate (FPR)

def draw_roc( actual, probs ):
    """Plot the ROC curve for a binary classifier and show the AUC in the legend.

    Parameters
    ----------
    actual : array-like of the true binary labels.
    probs : array-like of the predicted scores/probabilities for the positive class.

    Returns
    -------
    None. The figure is displayed with plt.show().
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(actual, probs, drop_intermediate=False)
    auc_score = roc_auc_score(actual, probs)

    plt.figure(figsize=(6, 6))

    plt.plot(fpr, tpr, 'g-', label='ROC curve (area = %0.2f)' % auc_score)

    # Chance diagonal as a red dashed line. The colour is given once, in the format string:
    # the earlier 'k--' combined with c='r' defined the colour twice.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic Curve \n', fontdict={'fontsize': 15})
    plt.legend(loc="lower right")
    plt.show()

    return None

In [None]:
# Define false positive rate (FPR), true positive rate (TPR) and Threshold values using roc_curve
# roc_curve needs continuous scores: pass the predicted probabilities (Conver_Prob), not the
# already-thresholded 0/1 `predicted` column, which would collapse the curve to a single point.

fpr, tpr, thresholds = roc_curve( y_train_pred_df.Original_Conver, y_train_pred_df.Conver_Prob, drop_intermediate = False )



## Plot ROC curve and find the AUC


In [None]:

# Plot the training-set ROC curve from the actual labels and the predicted conversion probabilities
draw_roc(y_train_pred_df.Original_Conver, y_train_pred_df.Conver_Prob)


## Points to be concluded from above roc curve

1. The curve lies close to the top-left corner of the plot, which is a sign of good accuracy.

2. Here the area under the curve is 89 % of the total area.


In [None]:
# Add one 0/1 prediction column per probability cut-off (0.0, 0.1, ..., 0.9);
# each new column is labelled with the cut-off value itself.

numbers = [x / 10 for x in range(10)]

for cutoff in numbers:
    # 1 when the predicted probability is strictly above the cut-off, otherwise 0
    y_train_pred_df[cutoff] = y_train_pred_df.Conver_Prob.map(lambda prob: int(prob > cutoff))

y_train_pred_df.head()


In [None]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# Confusion-matrix layout: cm1[0,0] = TN, cm1[0,1] = FP, cm1[1,0] = FN, cm1[1,1] = TP
cutoff_rows = []

for i in num:

    cm1 = confusion_matrix(y_train_pred_df.Original_Conver, y_train_pred_df[i] )
    total1 = cm1.sum()
    # Named cutoff_accuracy so the loop does not overwrite the notebook-level `accuracy` value
    cutoff_accuracy = (cm1[0,0]+cm1[1,1])/total1

    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_rows.append([i, cutoff_accuracy, sensi, speci])

# Build the frame once from the collected rows (instead of enlarging an empty,
# object-typed frame row by row) so the columns are numeric and plot cleanly.
# The index carries the cut-off values, as it did with the row-by-row .loc assignment.
cutoff_leads_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num, dtype=float)

cutoff_leads_df


In [None]:
# Let's draw Accuracy Vs Sensitivity Vs Specificity to see the optimal cut off

ax = cutoff_leads_df.plot.line(x='prob', y=['accuracy','sensi','speci'])

# Label the figure so it can be read on its own when the notebook is skimmed
ax.set(
    xlabel='Probability cut-off',
    ylabel='Score',
    title='Accuracy, sensitivity and specificity by probability cut-off',
)

ax.grid()

plt.show()



#### From the curve above, the optimal cut-off lies at roughly 0.32–0.34; a threshold of 0.34 is the value applied in the cells below.


In [None]:
# Apply the chosen threshold of 0.34: a lead is predicted to convert (1) when its
# probability is strictly above the cut-off, otherwise 0

y_train_pred_df['final_predicted'] = y_train_pred_df.Conver_Prob.map(lambda prob: int(prob > .34))

y_train_pred_df.head()


In [None]:

# Training accuracy with the revised-threshold predictions, as a percentage rounded to 2 decimals

train_accuracy_fraction = metrics.accuracy_score(
    y_true=y_train_pred_df.Original_Conver,
    y_pred=y_train_pred_df.final_predicted,
)
train_accuracy = round(train_accuracy_fraction*100, 2)

print("Accuracy :", train_accuracy,'%')


In [None]:
# Confusion matrix for the training data at the revised threshold
# (rows = actual class, columns = predicted class)

confusion2 = confusion_matrix(
    y_true=y_train_pred_df.Original_Conver,
    y_pred=y_train_pred_df.final_predicted,
)
confusion2


In [None]:

# Flatten the 2x2 matrix row by row: [[TN, FP], [FN, TP]]
# (true negatives, false positives, false negatives, true positives)
TN, FP, FN, TP = confusion2.ravel()


In [None]:

# Evaluation scores for the training data at the revised threshold,
# each expressed as a percentage rounded to 2 decimals

train_actual = y_train_pred_df.Original_Conver
train_final_pred = y_train_pred_df.final_predicted

# Rates derived from the confusion-matrix counts
train_sensitivity = round(TP / float(TP+FN) * 100, 2)
train_specificity = round(TN / float(TN+FP) * 100, 2)

# Scores computed by scikit-learn from the label vectors
train_precision = round(precision_score(train_actual, train_final_pred) * 100, 2)
train_recall = round(recall_score(train_actual, train_final_pred) * 100, 2)
train_f1_Score = round(f1_score(train_actual, train_final_pred) * 100, 2)


In [None]:

# Print the training-set scores, one metric per line followed by a blank line
print('SCORES FOR THE TRAINING DATA SET \n')

for label, value in [
    ('Accuracy  : ', train_accuracy),
    ('Sensitivity : ', train_sensitivity),
    ('Specificity : ', train_specificity),
    ('Precision : ', train_precision),
    ('Recall : ', train_recall),
    ('F1 score : ', train_f1_Score),
]:
    print(label, value, '%\n')



## Let's draw the Precision Vs Recall 


In [None]:

# Precision and recall of the training predictions at every candidate probability threshold
p, r, thresholds = precision_recall_curve(y_train_pred_df.Original_Conver, y_train_pred_df.Conver_Prob)

# p and r are sliced with [:-1] so they line up one-to-one with `thresholds` on the x-axis
plt.plot(thresholds, p[:-1], "g-", label='Precision')
plt.plot(thresholds, r[:-1], "r-", label='Recall')

# Label the figure so the two curves can be told apart without reading the code
plt.xlabel('Probability threshold')
plt.ylabel('Score')
plt.title('Precision vs Recall')
plt.legend()

plt.grid()

plt.show()


### It looks like the Precision-Recall cut-off value is slightly higher than the Sensitivity-Specificity cut-off



### Step 11: Making predictions on the test set

In [None]:
# Scaling numeric variables for the validation dataset
# Only transform() is called here, so the scaler's existing parameters are reused
# (presumably fitted on the training data earlier in the notebook — confirm).
# NOTE(review): X_test is overwritten in place, so running this cell twice would scale the columns twice.

X_test[num_cols]=scaler.transform(X_test[num_cols])


In [None]:

# Preview the first rows of the test features after the numeric columns were scaled
X_test.head()


In [None]:

# (rows, columns) of the test feature set
X_test.shape


In [None]:

# Preview the test features restricted to the final model's columns.
# Only the first rows are shown; dumping the whole frame floods the notebook output.
X_test[rfe_cols].head()


In [None]:

# Predict conversion probabilities on the test dataset with the fitted model

# Same design-matrix preparation as for training: selected columns plus an intercept column
X_test_sm = sm.add_constant(X_test[rfe_cols])

y_test_pred = res.predict(X_test_sm)



## Lead Scoring


In [None]:
# Assemble the test-set results: the actual conversion flag, the predicted probability,
# a Lead_Score (probability x 100, so a higher score means a higher chance of conversion)
# and the lead's index as ID

y_test_pred_df = pd.DataFrame(
    data={
        'Original_Conver': y_test.Converted,
        'Conver_Prob': y_test_pred,
        'Lead_Score': y_test_pred*100,
        'ID': y_test.index,
    }
)

y_test_pred_df.head()


In [None]:
# Label each test lead as a predicted conversion (1) when its probability is strictly
# above the 0.34 cut-off, otherwise 0

y_test_pred_df['predicted'] = y_test_pred_df.Conver_Prob.map(lambda prob: int(prob > 0.34))

y_test_pred_df.head()


In [None]:
# Confusion matrix for the test data (rows = actual class, columns = predicted class)

test_confusion = confusion_matrix(
    y_true=y_test_pred_df.Original_Conver,
    y_pred=y_test_pred_df.predicted,
)

test_confusion


In [None]:

# Flatten the 2x2 matrix row by row: [[TN, FP], [FN, TP]]
# (true negatives, false positives, false negatives, true positives)
TN, FP, FN, TP = test_confusion.ravel()


In [None]:

# Evaluation scores for the validation data, each expressed as a percentage rounded to 2 decimals

test_actual = y_test_pred_df.Original_Conver
test_pred = y_test_pred_df.predicted

test_accuracy = round(accuracy_score(test_actual, test_pred) * 100, 2)

# Rates derived from the confusion-matrix counts
test_sensitivity = round(TP / float(TP+FN) * 100, 2)
test_specificity = round(TN / float(TN+FP) * 100, 2)

# Scores computed by scikit-learn from the label vectors
test_precision = round(precision_score(test_actual, test_pred) * 100, 2)
test_recall = round(recall_score(test_actual, test_pred) * 100, 2)
test_f1_Score = round(f1_score(test_actual, test_pred) * 100, 2)

#F1 Score = 2*((precision*recall)/(precision+recall))

In [None]:

# Print the validation-set scores, one metric per line followed by a blank line.
# The heading previously read "DATATSET"; the typo in the printed text is corrected here.
print('SCORES FOR THE VALIDATION DATASET \n')      

print( 'Accuracy  : ', test_accuracy,'%\n')

print( 'Sensitivity : ', test_sensitivity,'%\n')

print( 'Specificity : ', test_specificity,'%\n')

print( 'Precision : ', test_precision,'%\n')

print( 'Recall : ', test_recall,'%\n')

print('F1 score : ',test_f1_Score,'%\n')



## Conclusion:

The most significant features that can contribute towards a better conversion rate are:

1. Total Time Spent on Website
2. Lead Origin_lead add form
3. What is your current occupation_working professional
4. Last Activity_had a phone conversation
5. TotalVisits

X Education can use this model to screen potential candidates and target those who have a higher probability of enrolling in their programs.