In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('course_lead_scoring.csv')

In [None]:
df.head()

In [None]:
df['industry'].mode()

In [None]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [None]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)
categorical

In [None]:
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [None]:
df[numerical]

In [None]:
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [None]:
df.head().T

In [None]:
df[numerical]=df[numerical].fillna(0)

In [None]:
df[categorical]=df[categorical].fillna('No Value')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [None]:
len(df_train), len(df_val), len(df_test)

In [None]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [None]:
del df_train['lead_score']
del df_val['lead_score']
del df_test['lead_score']

In [None]:
df_full_train.isnull().sum()

In [None]:
df_full_train.converted.value_counts(normalize=True)

In [None]:
from IPython.display import display

In [None]:
global_conversion_count = df['converted'].mean()

In [None]:
global_conversion_count

In [None]:
combined_list = numerical + categorical
for c in combined_list:
    print(c)
    df_group = df.groupby(c).converted.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_conversion_count
    df_group['risk'] = df_group['mean'] / global_conversion_count
    display(df_group)
    print()

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
def mutual_info_lead_score(series):
    """Return the mutual information between `series` and `df_train.converted`.

    Parameters
    ----------
    series : array-like
        Label values to compare with the `converted` column of the
        module-level `df_train` frame.

    Returns
    -------
    The score computed by `sklearn.metrics.mutual_info_score`.
    """
    target = df_train.converted
    return mutual_info_score(series, target)

In [None]:
mi = df_train[categorical].apply(mutual_info_lead_score)
mi.sort_values(ascending=False)

In [None]:
corr = df[numerical].corrwith(df.lead_score).abs()
corr.sort_values(ascending=False)

In [None]:
corr = df[numerical].corrwith(df.interaction_count).abs()
corr.sort_values(ascending=False)

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Turn the categorical columns into dense feature matrices with DictVectorizer.
# The vectorizer is fitted on the training rows only and then reused for validation.
train_dict = df_train.loc[:, categorical].to_dict('records')
val_dict = df_val.loc[:, categorical].to_dict('records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.coef_[0].round(2)

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
convert_decision = (y_pred >= 0.5)

In [None]:
(y_val == convert_decision).mean()

In [None]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = convert_decision.astype(int)
df_pred['actual'] = y_val

In [None]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [None]:
df_pred.correct.mean()

In [None]:
accuracy = accuracy_score(y_val, convert_decision)
accuracy

In [None]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(2)))

In [None]:
categorical

In [None]:
categorical_minus_industry = ['lead_source', 'employment_status', 'location']

In [None]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_minus_industry].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_minus_industry].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
convert_decision = (y_pred >= 0.5)

In [None]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = convert_decision.astype(int)
df_pred['actual'] = y_val

In [None]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [None]:
df_pred.correct.mean()

In [None]:
categorical_minus_empStatus = ['lead_source', 'industry', 'location']

In [None]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_minus_empStatus].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_minus_empStatus].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
convert_decision = (y_pred >= 0.5)

In [None]:
(y_val == convert_decision).mean()

In [None]:
# Rebuild the feature matrices from the full categorical feature list.
# The vectorizer is fitted on the training rows only and then reused for validation.
train_dict = df_train.loc[:, categorical].to_dict('records')
val_dict = df_val.loc[:, categorical].to_dict('records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
model.coef_[0].round(2)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

In [None]:
model = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
model.coef_[0].round(2)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

In [None]:
model = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
model.coef_[0].round(2)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

In [None]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
model.coef_[0].round(2)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()

In [None]:
model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
model.coef_[0].round(2)
y_pred = model.predict_proba(X_val)[:, 1]
convert_decision = (y_pred >= 0.5)
(y_val == convert_decision).mean()