In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the lead-scoring dataset and replace missing values: numeric columns
# get 0, every other column gets the literal string 'NA'.
# The numeric test uses pandas' dtype helper instead of comparing against
# 'object', so text columns stored with a dedicated string dtype are still
# filled with 'NA' rather than with the numeric placeholder.
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df = df.fillna({
    col: 0 if pd.api.types.is_numeric_dtype(df[col]) else 'NA'
    for col in df.columns
})

# Split: hold out 20% for test, then take 25% of the remaining 80% for
# validation, giving a 60/20/20 train/validation/test partition.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Target
y_train = df_train.converted.values
y_val = df_val.converted.values

# Features (the target column `converted` is deliberately not listed here)
categorical = ['location', 'industry', 'employment_status', 'lead_source']
numeric = ['annual_income', 'number_of_courses_viewed', 'interaction_count', 'lead_score']
features = categorical + numeric

# DictVectorizer: fitted on the training records only, then reused unchanged
# to transform the validation records.
dv = DictVectorizer(sparse=False)
train_dicts = df_train[features].to_dict(orient='records')
val_dicts = df_val[features].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Logistic Regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predictions: take the second probability column and threshold it at 0.5
# to obtain hard labels for the accuracy computation.
y_pred = model.predict_proba(X_val)[:, 1]
acc = accuracy_score(y_val, y_pred >= 0.5)
print(f"Validation accuracy: {acc:.2f}")

Validation accuracy: 0.70


In [2]:
# Show the dataset dimensions and the class balance of the target.
# A notebook only echoes the final expression of a cell, so the shape is
# printed explicitly; as a bare expression on a non-final line it would be
# evaluated and silently discarded.
print(df.shape)
df.converted.value_counts(normalize=True)

converted
1    0.619015
0    0.380985
Name: proportion, dtype: float64

In [3]:
# Report the frame's shape, then its column names as a plain Python list,
# each on its own output line.
print(df.shape, df.columns.tolist(), sep="\n")

(1462, 9)
['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']


In [4]:
import pandas as pd

# Fetch the lead-scoring CSV and bind the freshly read frame to `df`
# (this replaces whatever `df` referred to before the cell ran).
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(filepath_or_buffer=url)

# The shape goes to stdout; the five-row preview is left as the cell's final
# expression so the notebook displays it.
print(df.shape)
df.head(n=5)

(1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
import pandas as pd

# Read the lead-scoring CSV into `df`, then print its shape followed by its
# column index, one per output line.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(filepath_or_buffer=url)

print(df.shape, df.columns, sep="\n")

(1462, 9)
Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')
