In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [3]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [4]:
#checking for null 
df.isnull().any()

lead_source                  True
industry                     True
number_of_courses_viewed    False
annual_income                True
employment_status            True
location                     True
interaction_count           False
lead_score                  False
converted                   False
dtype: bool

In [5]:
# Fill numerical columns with 0
df[df.select_dtypes(include ='number').columns]=df.select_dtypes(include ='number').fillna(0)

# Fill categorical columns with 'NA'
df[df.select_dtypes(exclude='number').columns] = df.select_dtypes(exclude='number').fillna('NA')



## Question 1 

In [6]:

df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [7]:
df.columns


Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [8]:
category =['lead_source', 'industry','employment_status', 'location']
numerical = [ 'number_of_courses_viewed', 'annual_income','interaction_count', 'lead_score',
       'converted']

In [9]:
df_numerical= df[numerical]

In [10]:
df_numerical

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.80,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1
...,...,...,...,...,...
1457,1,0.0,4,0.53,1
1458,3,65259.0,2,0.24,1
1459,1,45688.0,3,0.02,1
1460,5,71016.0,0,0.25,1


## question -2 -correlation

In [11]:

corr_matrix = df_numerical.corr()
print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


In [12]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
y_train = train_df['converted'].values
y_val = val_df['converted'].values

In [13]:
X_train = train_df.drop(columns=['converted'])
X_val = val_df.drop(columns=['converted'])

In [14]:
train_df_categorical = train_df[category]

In [15]:
x= train_df_categorical
x

Unnamed: 0,lead_source,industry,employment_status,location
1066,social_media,manufacturing,self_employed,australia
638,events,retail,student,north_america
799,social_media,education,,europe
380,referral,education,employed,australia
303,paid_ads,healthcare,employed,europe
...,...,...,...,...
1130,events,manufacturing,employed,south_america
1294,events,healthcare,student,south_america
860,paid_ads,manufacturing,student,north_america
1459,paid_ads,technology,student,north_america


## Question - 3

In [16]:
from sklearn.metrics import mutual_info_score
print(mutual_info_score(x.lead_source, y_train))

0.025665373935054955


In [17]:
print(mutual_info_score(x.industry, y_train))

0.011684562750165564


In [18]:
print(mutual_info_score(x.employment_status, y_train))

0.013258496589914293


In [19]:
print(mutual_info_score(x.location, y_train))

0.0022530354195563346


In [20]:
dv = DictVectorizer(sparse=False)

train_dict = train_df[category + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = val_df[category + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

## question 4 

In [21]:

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [22]:

acc = model.score(X_val, y_val)
print("Validation Accuracy:", round(acc, 2))

Validation Accuracy: 0.78


## question 5- dropping columns one by one 

In [23]:

category =['lead_source','employment_status', 'location']
numerical = [ 'number_of_courses_viewed', 'annual_income','interaction_count', 'lead_score',
       'converted']
df1 =df.drop(columns=['industry'])
df1

Unnamed: 0,lead_source,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,1,46992.0,employed,south_america,1,0.80,0
2,events,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,2,83843.0,,australia,1,0.87,0
4,referral,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...
1457,referral,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,1,45688.0,student,north_america,3,0.02,1
1460,referral,5,71016.0,self_employed,north_america,0,0.25,1


In [24]:
train_df1, val_df1 = train_test_split(df1, test_size=0.2, random_state=42)
y1_train = train_df1['converted'].values
y1_val = val_df1['converted'].values
X1_train = train_df1.drop(columns=['converted'])
X1_val = val_df1.drop(columns=['converted'])
dv = DictVectorizer(sparse=False)

train_dict = train_df1[category + numerical].to_dict(orient='records')
X1_train = dv.fit_transform(train_dict)

val_dict = val_df1[category + numerical].to_dict(orient='records')
X1_val = dv.transform(val_dict)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X1_train, y1_train)
acc1 = model.score(X1_val, y1_val)
print("Validation Accuracy:", round(acc1, 2))


Validation Accuracy: 0.78


In [25]:
#Question 5 
category =['lead_source','industry', 'location']
numerical = [ 'number_of_courses_viewed', 'annual_income','interaction_count', 'lead_score',
       'converted']
df2 =df.drop(columns=['employment_status'])
df2

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,south_america,4,0.94,1
1,social_media,retail,1,46992.0,south_america,1,0.80,0
2,events,healthcare,5,78796.0,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,australia,1,0.87,0
4,referral,education,3,85012.0,europe,3,0.62,1
...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,north_america,4,0.53,1
1458,referral,technology,3,65259.0,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,north_america,3,0.02,1
1460,referral,,5,71016.0,north_america,0,0.25,1


In [26]:
train_df2, val_df2 = train_test_split(df2, test_size=0.2, random_state=42)
y2_train = train_df2['converted'].values
y2_val = val_df2['converted'].values
X2_train = train_df2.drop(columns=['converted'])
X2_val = val_df2.drop(columns=['converted'])
dv = DictVectorizer(sparse=False)

train_dict = train_df2[category + numerical].to_dict(orient='records')
X2_train = dv.fit_transform(train_dict)

val_dict = val_df2[category + numerical].to_dict(orient='records')
X2_val = dv.transform(val_dict)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X2_train, y2_train)
acc2 = model.score(X2_val, y2_val)
print("Validation Accuracy:", round(acc2, 2))


Validation Accuracy: 0.77


In [27]:
category =['lead_source', 'industry','employment_status', 'location']
numerical = [ 'number_of_courses_viewed', 'annual_income','interaction_count',
       'converted']
df3 =df.drop(columns=['lead_score'])
df3

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,1
1,social_media,retail,1,46992.0,employed,south_america,1,0
2,events,healthcare,5,78796.0,unemployed,australia,3,1
3,paid_ads,retail,2,83843.0,,australia,1,0
4,referral,education,3,85012.0,self_employed,europe,3,1
...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,1
1458,referral,technology,3,65259.0,student,europe,2,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,1
1460,referral,,5,71016.0,self_employed,north_america,0,1


In [28]:
train_df3, val_df3 = train_test_split(df3, test_size=0.2, random_state=42)
y3_train = train_df3['converted'].values
y3_val = val_df3['converted'].values
X3_train = train_df3.drop(columns=['converted'])
X3_val = val_df3.drop(columns=['converted'])
dv = DictVectorizer(sparse=False)

train_dict = train_df3[category + numerical].to_dict(orient='records')
X3_train = dv.fit_transform(train_dict)

val_dict = val_df3[category + numerical].to_dict(orient='records')
X3_val = dv.transform(val_dict)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X3_train, y3_train)
acc3 = model.score(X3_val, y3_val)
print("Validation Accuracy:", round(acc3, 2))

Validation Accuracy: 0.78


In [29]:
print("Difference without Industry ",np.diff([acc, acc1]))
print("Difference without Employment status ",np.diff([acc, acc2]))
print("Difference without lead score ",np.diff([acc, acc3]))

Difference without Industry  [0.]
Difference without Employment status  [-0.00341297]
Difference without lead score  [0.]


## Question 6

In [30]:

from sklearn.metrics import accuracy_score
C_values = [0.01, 0.1, 1, 10, 100]
results = []

for C in C_values:
    model = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    )
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))
    results.append((C, round(acc, 3)))

In [31]:
for C, acc3 in results:
    print(f"C={C:<6}  val_acc={acc3:.3f}")
best_C, best_acc = max(results, key=lambda x: x[1])
print(f"\nBest C: {best_C}  |  Best val accuracy: {best_acc:.3f}")

C=0.01    val_acc=0.860
C=0.1     val_acc=0.778
C=1       val_acc=0.778
C=10      val_acc=0.778
C=100     val_acc=0.778

Best C: 0.01  |  Best val accuracy: 0.860
