In [1]:
import os
import hashlib
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV
)

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures
)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import (
    XGBClassifier,
    XGBRegressor,
    DMatrix,
    train as xgb_train,
    cv as xgb_cv,
    plot_importance
)

from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    LassoCV,
    RidgeCV
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score
)

from typing import Tuple

In [2]:
# Seed passed as `random_state` to the models trained below so results are reproducible.
RANDOM_STATE = 42

In [3]:
def hash_columns(columns):
    """
    Return the SHA-256 hex digest of a collection of column names.

    The names are sorted first and joined with commas, so the digest does not
    depend on the order in which the names are supplied.
    """
    ordered_names = sorted(columns)
    payload = ",".join(ordered_names).encode()
    return hashlib.sha256(payload).hexdigest()

In [4]:
def hash_series(series):
    """
    Return the SHA-256 hex digest of a series' values.

    Each value in `series.values` is converted with `str` and the results are
    joined with commas, in their existing order, before hashing. The index is
    not part of the digest.
    """
    joined_values = ",".join(str(value) for value in series.values)
    digest = hashlib.sha256(joined_values.encode())
    return digest.hexdigest()

In [5]:
def hash_data_frame(df):
    """
    Return a SHA-256 hex digest describing the contents of a DataFrame.

    The columns are put in label order and the rows are sorted by value (using
    the frame's original column order as sort keys) before every row, together
    with its index label, is hashed with `pd.util.hash_pandas_object`. The
    per-row hashes are then fed to SHA-256.
    """
    sort_keys = list(df.columns)
    canonical = df.sort_index(axis=1).sort_values(by=sort_keys)
    row_hashes = pd.util.hash_pandas_object(canonical, index=True)
    return hashlib.sha256(row_hashes.values).hexdigest()

In [6]:
def check_signature(expected, actual):
    """
    Compare a computed signature with the expected one and report the outcome.

    Prints a pass or fail marker. On a mismatch an `AssertionError` is raised
    whose message contains both values, so the failure can be diagnosed from
    the traceback alone.

    The comparison is an explicit check rather than an `assert` statement, so
    it is still enforced when Python runs with assertions disabled (`-O`).

    Args:
        expected: The reference signature.
        actual: The signature computed from the data under test.

    Raises:
        AssertionError: If `actual` is not equal to `expected`.
    """
    # print(actual)  # uncomment to obtain the signature of a new reference result
    if not actual == expected:
        print("✘ Test failed.")
        raise AssertionError(
            f"Signature mismatch: expected {expected!r}, got {actual!r}"
        )
    print("✔ Test passed!")

In [7]:
def test_data(func, signature):
    """
    Run a DataFrame transformation on the raw dataset and verify its signature.

    The dataset is read from "churn-bigml-20.csv", passed through `func`, and
    the hash of the returned frame is compared with `signature`.
    """
    raw_df = pd.read_csv("churn-bigml-20.csv")
    transformed_df = func(raw_df)
    actual_signature = hash_data_frame(transformed_df)
    check_signature(signature, actual_signature)

In [8]:
def test_partition(func, signature_X, signature_y):
    """
    Verify a features/target partitioning function against known signatures.

    `func` receives the raw dataset read from "churn-bigml-20.csv" and must
    return `(X, y)`. The hash of `X` is compared with `signature_X` and the
    hash of `y` with `signature_y`; an `AssertionError` propagates when either
    comparison fails.
    """
    raw_df = pd.read_csv("churn-bigml-20.csv")
    features, target = func(raw_df)
    # Uncomment to print the signatures of a new reference result:
    # print(hash_data_frame(features))
    # print(hash_series(target))
    try:
        assert hash_data_frame(features) == signature_X
        assert hash_series(target) == signature_y
    except AssertionError:
        print("✘ Test failed.")
        raise
    else:
        print("✔ Test passed!")

In [9]:
def test_split(func, signature_X_train, signature_X_test, signature_y_train, signature_y_test):
    """
    Verify a train/test splitting function against known signatures.

    The raw dataset is read from "churn-bigml-20.csv"; `func` is called with
    every column except "Churn" as features and the "Churn" column as target,
    and must return `(X_train, X_test, y_train, y_test)`. Each part is hashed
    and compared with its expected signature; an `AssertionError` propagates
    on the first mismatch.
    """
    raw_df = pd.read_csv("churn-bigml-20.csv")
    features = raw_df.drop(columns=["Churn"])
    target = raw_df["Churn"]
    train_X, test_X, train_y, test_y = func(features, target)
    # Uncomment to print the signatures of a new reference result:
    # print(hash_data_frame(train_X))
    # print(hash_data_frame(test_X))
    # print(hash_series(train_y))
    # print(hash_series(test_y))
    try:
        assert hash_data_frame(train_X) == signature_X_train
        assert hash_data_frame(test_X) == signature_X_test
        assert hash_series(train_y) == signature_y_train
        assert hash_series(test_y) == signature_y_test
    except AssertionError:
        print("✘ Test failed.")
        raise
    else:
        print("✔ Test passed!")

In [10]:
def test_evaluate(accuracy):
    try:
        assert accuracy > 0.75
        print("✔ Test passed!")
    except AssertionError:
        print("✘ Test failed.")
        raise

In [11]:
def test_validate(scores_1, mean_1, std_1, scores_2, mean_2, std_2):
    try:
        assert len(scores_1) == 7
        assert len(scores_2) == 7
        
        assert mean_1 > 0.75
        assert mean_2 > 0.75
    
        assert std_1 < 0.3
        assert std_2 < 0.3
        print("✔ Test passed!")
    except AssertionError:
        print("✘ Test failed.")
        raise

In [12]:
# Load the raw churn dataset; the cells below rebind `df` after each preprocessing step.
df = pd.read_csv("churn-bigml-20.csv")

In [14]:
# Preview 10 random rows; no `random_state` is given, so the rows differ on every run.
df.sample(10)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
146,MN,13,510,No,Yes,21,315.6,105,53.65,208.9,71,17.76,260.1,123,11.7,12.1,3,3.27,3,False
605,IL,131,510,No,No,0,263.4,123,44.78,151.9,74,12.91,218.5,101,9.83,10.7,2,2.89,2,False
322,MS,70,408,No,No,0,148.4,110,25.23,267.1,90,22.7,151.5,101,6.82,8.9,4,2.4,0,False
429,TX,126,415,No,No,0,190.9,143,32.45,149.7,72,12.72,191.4,87,8.61,13.0,3,3.51,1,False
303,NM,55,510,No,No,0,119.7,148,20.35,231.8,96,19.7,222.3,113,10.0,4.6,2,1.24,2,False
469,GA,131,408,No,No,0,197.0,79,33.49,201.0,114,17.09,151.2,111,6.8,11.6,5,3.13,1,False
411,MT,124,415,No,Yes,30,144.5,35,24.57,262.3,101,22.3,226.5,82,10.19,12.0,7,3.24,2,False
491,TN,95,510,No,No,0,174.0,57,29.58,281.1,118,23.89,197.2,94,8.87,9.7,2,2.62,0,False
543,KS,166,415,Yes,Yes,28,175.8,126,29.89,253.6,76,21.56,128.5,72,5.78,11.4,5,3.08,1,False
368,WY,62,415,No,No,0,172.4,132,29.31,230.5,100,19.59,228.2,109,10.27,11.0,5,2.97,0,False


In [15]:
### AUTOMATICALLY GRADED TASK

#    cols = ["State", "Total day charge", "Total eve charge", "Total night charge", "Total intl charge"]

def drop_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop features that will affect model performance in a negative manner.

    Return a `pd.DataFrame` with all remaining columns. The input frame is
    left unmodified, so the function can be called repeatedly on the same
    frame.

    Note:
        Be careful to analyze each feature and understand which features need to be removed.
        Do not remove a feature unless you need to, based on information from the data.
    Hint:
        Be aware of multicollinearity.
        Total day/eve/night/intl charge - these are directly correlated with the
        corresponding total minutes (i.e. Total day minutes x rate = Total day charge).

        Total day/eve/night/intl minutes, calls, charge - these are strongly correlated
        (multicollinearity), so only one of them should be kept, or an aggregate
        feature should be used instead.

    Args:
        df: Frame containing at least the "State" column and the four
            "Total ... charge" columns.

    Returns:
        A new frame without "State" and the four charge columns.

    Raises:
        KeyError: If any of the columns to drop is missing from `df`.
    """

    ### BEGIN SOLUTION
    cols = ["State", "Total day charge", "Total eve charge", "Total night charge", "Total intl charge"]
    # Not in place: `drop` returns a new frame, so the caller's frame keeps its
    # columns and re-running on the same input does not raise.
    df = df.drop(columns=cols)
    ### END SOLUTION

    return df

In [16]:
# Rebinds `df` to the reduced frame; reload the dataset before running this cell again.
df = drop_features(df)

In [17]:
### TEST
test_data(drop_features, "abe2aed5b19936cddbd8d9483994f33f555d56e3c05fd6e2aac4046671cbce7e")
### BEGIN HIDDEN TESTS
def test_drop_features():
    """Compare `drop_features` applied to the raw dataset with the stored reference frame."""
    # The reference file was produced with: df.to_csv("drop_features.csv")
    reference_df = pd.read_csv("drop_features.csv", index_col=0)
    raw_df = pd.read_csv("churn-bigml-20.csv")
    result_df = drop_features(raw_df)
    pd.testing.assert_frame_equal(reference_df, result_df)
test_drop_features()
### END HIDDEN TESTS

✔ Test passed!


In [18]:
### AUTOMATICALLY GRADED TASK
def encode_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode categorical columns.

    Return a new `pd.DataFrame` with applied changes. The input frame is not
    modified.

    Encoding applied:
        * "Churn" is cast to int.
        * "International plan" and "Voice mail plan" are mapped "Yes" -> 1, "No" -> 0
          (any other value becomes NaN, which is how `Series.map` treats unmapped keys).
        * "Area code" is converted to string and one-hot encoded; the original
          column is replaced by one "Area code_<value>" column per distinct code,
          appended at the end of the frame.

    Note:
        Inspect the dataset carefully and encode all features that represent a category.
        Use the best possible encoder for each feature.
    Hint:
        If you need to use one-hot encoding, use `sklearn.preprocessing.OneHotEncoder`.

    Args:
        df: Frame containing the "Churn", "International plan",
            "Voice mail plan" and "Area code" columns.

    Returns:
        A new frame with the encoded columns.
    """

    ### BEGIN SOLUTION
    # Work on a copy: the column assignments below would otherwise write into
    # the caller's frame, contradicting the "return a new DataFrame" contract.
    df = df.copy()

    df["Churn"] = df["Churn"].astype(int)

    binary_cols = ["International plan", "Voice mail plan"]
    for col in binary_cols:
        df[col] = df[col].map({"Yes": 1, "No": 0})

    # Area codes are numeric labels, not quantities, so treat them as categories.
    df["Area code"] = df["Area code"].astype(str)

    enc = OneHotEncoder(sparse_output=False)
    encoded = enc.fit_transform(df[["Area code"]])

    encoded_df = pd.DataFrame(
        encoded,
        columns=enc.get_feature_names_out(["Area code"]),
        index=df.index,  # keep row alignment for the concat below
    )

    df = df.drop(columns=["Area code"])
    df = pd.concat([df, encoded_df], axis=1)
    ### END SOLUTION

    return df

In [19]:
# Rebinds `df` to the encoded frame; reload and re-run the earlier steps before running this cell again.
df = encode_categorical(df)

In [20]:
### TEST
test_data(encode_categorical, "f89aa815551a9a213de273b88cb0cc8df9d337eec167ef2374cd33a60a45bffc")
### BEGIN HIDDEN TESTS
def test_encode_categorical():
    """Compare `encode_categorical` applied to the stored input frame with the stored reference frame."""
    # The reference file was produced with: df.to_csv("encode_categorical.csv")
    reference_df = pd.read_csv("encode_categorical.csv", index_col=0)
    input_df = pd.read_csv("drop_features.csv", index_col=0)
    result_df = encode_categorical(input_df)
    pd.testing.assert_frame_equal(reference_df, result_df)
test_encode_categorical()
### END HIDDEN TESTS

✔ Test passed!


In [21]:
### AUTOMATICALLY GRADED TASK
def create_features_and_target(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Separate the input features from the target column.

    X holds every column except "Churn"; y is the "Churn" column.

    Return a tuple containing X and y.
    """

    ### BEGIN SOLUTION
    target_column = "Churn"
    y = df[target_column]
    X = df.drop(columns=[target_column])
    ### END SOLUTION

    return X, y

In [22]:
# Separate the encoded frame into the feature matrix X and the "Churn" target y.
X, y = create_features_and_target(df)

In [23]:
### TEST
test_partition(
    create_features_and_target,
    "6a9659138ebbb161112b90ebf046548f3cb17c35fc4461f107581a702fb3cb43",
    "aaffac0438ef4c1559e8e9b9f0444fd41e80ead3243dcaba404e0aeb69320c90",
)
### BEGIN HIDDEN TESTS
def test_create_features_and_target():
    """Compare the X / y partition of the stored encoded frame with the stored references."""
    # The reference files were produced with:
    # X.to_csv("X.csv")
    # pd.DataFrame(y).to_csv("y.csv")
    reference_X = pd.read_csv("X.csv", index_col=0)
    reference_y = pd.read_csv("y.csv", index_col=0)["Churn"]
    input_df = pd.read_csv("encode_categorical.csv", index_col=0)
    result_X, result_y = create_features_and_target(input_df)
    pd.testing.assert_frame_equal(reference_X, result_X)
    pd.testing.assert_series_equal(reference_y, result_y)
test_create_features_and_target()
### END HIDDEN TESTS

✔ Test passed!


In [24]:
### AUTOMATICALLY GRADED TASK
def split_train_test(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float = 0.2,
    random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: 
    """
    Partition the features and target into a training part and a test part.

    `test_size` and `random_state` are forwarded to `train_test_split`.

    Return X_train, X_test, y_train, y_test.

    Note:
        Use stratify=y to keep class balance.
    """

    # BEGIN SOLUTION
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y,  # stratified on the target
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
    # END SOLUTION

    return X_train, X_test, y_train, y_test

In [25]:
# Stratified split using the function's defaults (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = split_train_test(X, y)

In [26]:
### TEST
test_split(
    split_train_test,
    "fe0a778aa6452117d04940b72e7a1438503cbdf8bfdfe49e02c4af41c03b87ec",
    "f0d3ce74e6eaac3828fb35789514804f71736cff9137afd04187b69aaf6e116f",
    "5c5ddb1078e687d311959981b0112426ca7880512c69a1e06ce199b59e5d624a",
    "137481d87a985df7f64f7696351f438f9fc5e70b6d056df8f11345d4c7ee7967"
)
### BEGIN HIDDEN TESTS
def test_split_train_test():
    """Compare the split of the stored X / y with the stored train and test references."""
    # The reference files were produced with:
    # X_train.to_csv("X_train.csv")
    # X_test.to_csv("X_test.csv")
    # pd.DataFrame(y_train).to_csv("y_train.csv")
    # pd.DataFrame(y_test).to_csv("y_test.csv")
    reference_X_train = pd.read_csv("X_train.csv", index_col=0)
    reference_X_test = pd.read_csv("X_test.csv", index_col=0)
    reference_y_train = pd.read_csv("y_train.csv", index_col=0)["Churn"]
    reference_y_test = pd.read_csv("y_test.csv", index_col=0)["Churn"]

    input_X = pd.read_csv("X.csv", index_col=0)
    input_y = pd.read_csv("y.csv", index_col=0)["Churn"]
    result_X_train, result_X_test, result_y_train, result_y_test = split_train_test(input_X, input_y)

    pd.testing.assert_frame_equal(reference_X_train, result_X_train)
    pd.testing.assert_frame_equal(reference_X_test, result_X_test)
    pd.testing.assert_series_equal(reference_y_train, result_y_train)
    pd.testing.assert_series_equal(reference_y_test, result_y_test)
test_split_train_test()
### END HIDDEN TESTS

✔ Test passed!


In [27]:
### AUTOMATICALLY GRADED TASK
def train_model_1(X_train, y_train):
    """
    Fit the first model, an `XGBClassifier`, on the provided training data.

    Return the trained model.

    Note:
        Use `random_state=RANDOM_STATE` to ensure reproducibility.
    """

    ### BEGIN SOLUTION
    hyperparameters = dict(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=4,
        random_state=RANDOM_STATE,
        eval_metric="logloss",
    )
    model = XGBClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    ### END SOLUTION

    return model

In [28]:
# Fit the first model (XGBClassifier) on the training split.
model_1 = train_model_1(X_train, y_train)

In [35]:
### AUTOMATICALLY GRADED TASK
def train_model_2(X_train, y_train):
    """
    Fit the second model, a `DecisionTreeClassifier`, on the provided training data.

    Return the trained model.

    Note:
        Use `random_state=RANDOM_STATE` to ensure reproducibility.

        The allowed model classes are listed further down, in the test cell;
        that test also requires this model's class to differ from model 1's.
    """

    ### BEGIN SOLUTION
    tree = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
    model = tree.fit(X_train, y_train)  # fit() returns the estimator itself
    ### END SOLUTION

    return model

In [36]:
# Fit the second model (DecisionTreeClassifier) on the same training split.
model_2 = train_model_2(X_train, y_train)

In [37]:
### AUTOMATICALLY GRADED TASK
def evaluate_accuracy(model, X_test, y_test) -> float:
    """
    Score a fitted model on the test set.

    The model's predictions for `X_test` are compared with `y_test` using
    `accuracy_score`.

    Return a float representing the accuracy score.
    """

    ### BEGIN SOLUTION
    predictions = model.predict(X_test)
    score = float(accuracy_score(y_test, predictions))
    ### END SOLUTION
    return score

In [38]:
# Test-set accuracy of the first model.
accuracy_1 = evaluate_accuracy(model_1, X_test, y_test)

In [39]:
### TEST
test_evaluate(accuracy_1)
### BEGIN HIDDEN TESTS
def test_evaluate_model_1():
    """
    Retrain model 1 on the stored split and check its accuracy and class.

    The accuracy must exceed 0.75, `evaluate_accuracy` must agree with a direct
    `accuracy_score` computation to two decimals, and the model's class name
    must be one of the allowed ones.
    """
    fit_X = pd.read_csv("X_train.csv", index_col=0)
    holdout_X = pd.read_csv("X_test.csv", index_col=0)
    fit_y = pd.read_csv("y_train.csv", index_col=0)["Churn"]
    holdout_y = pd.read_csv("y_test.csv", index_col=0)["Churn"]

    fitted_model = train_model_1(fit_X, fit_y)
    predictions = fitted_model.predict(holdout_X)
    reference_accuracy = accuracy_score(holdout_y, predictions)
    assert reference_accuracy > 0.75

    reported_accuracy = evaluate_accuracy(fitted_model, holdout_X, holdout_y)
    assert round(reference_accuracy, 2) == round(reported_accuracy, 2)

    allowed_models = {
        "DecisionTreeClassifier",
        "RandomForestClassifier",
        "XGBClassifier",
        "LGBMClassifier",
        "CatBoostClassifier",
        "LogisticRegression",
    }
    model_class_name = fitted_model.__class__.__name__
    assert model_class_name in allowed_models
test_evaluate_model_1()
### END HIDDEN TESTS

✔ Test passed!


In [40]:
# Test-set accuracy of the second model.
accuracy_2 = evaluate_accuracy(model_2, X_test, y_test)

In [34]:
### TEST
test_evaluate(accuracy_2)
### BEGIN HIDDEN TESTS
def test_evaluate_model_2():
    """
    Retrain both models on the stored split and check model 2.

    Model 2's accuracy must exceed 0.75, `evaluate_accuracy` must agree with a
    direct `accuracy_score` computation to two decimals, its class name must be
    one of the allowed ones, and it must differ from model 1's class name.
    """
    fit_X = pd.read_csv("X_train.csv", index_col=0)
    holdout_X = pd.read_csv("X_test.csv", index_col=0)
    fit_y = pd.read_csv("y_train.csv", index_col=0)["Churn"]
    holdout_y = pd.read_csv("y_test.csv", index_col=0)["Churn"]

    first_model = train_model_1(fit_X, fit_y)
    second_model = train_model_2(fit_X, fit_y)

    predictions = second_model.predict(holdout_X)
    reference_accuracy = accuracy_score(holdout_y, predictions)
    assert reference_accuracy > 0.75

    reported_accuracy = evaluate_accuracy(second_model, holdout_X, holdout_y)
    assert round(reference_accuracy, 2) == round(reported_accuracy, 2)

    allowed_models = {
        "DecisionTreeClassifier",
        "RandomForestClassifier",
        "XGBClassifier",
        "LGBMClassifier",
        "CatBoostClassifier",
        "LogisticRegression",
    }

    first_class_name = first_model.__class__.__name__
    second_class_name = second_model.__class__.__name__

    assert second_class_name in allowed_models
    # The two models must come from different classes.
    assert first_class_name != second_class_name
test_evaluate_model_2()
### END HIDDEN TESTS

✔ Test passed!


In [35]:
### AUTOMATICALLY GRADED TASK
def cross_validate(model, X_train, y_train, cv: int = 7) -> Tuple[np.ndarray, float, float]:
    """
    Perform K-fold cross-validation on a model using the provided dataset.

    Return a tuple containing all accuracy scores, mean_accuracy and std_accuracy.

    Note:
        This definition shadows `sklearn.model_selection.cross_validate`, which
        is imported at the top of the notebook. Once this cell has run, the
        name `cross_validate` refers to this helper.

    Args:
        model: Estimator to evaluate; it is handed to `cross_val_score`.
        X_train: Feature matrix used for cross-validation.
        y_train: Target values aligned with `X_train`.
        cv: Value forwarded as `cv` to `cross_val_score` (7 by default).

    Returns:
        `(accuracy_scores, mean_accuracy, std_accuracy)`: the array of scores
        returned by `cross_val_score`, followed by its mean and standard
        deviation as plain floats.
    """

    ### BEGIN SOLUTION
    accuracy_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1  # use all available cores
    )
    mean_accuracy, std_accuracy = float(accuracy_scores.mean()), float(accuracy_scores.std())
    ### END SOLUTION

    return accuracy_scores, mean_accuracy, std_accuracy

In [36]:
# Cross-validate both models on the training split with the default cv=7.
scores_1, mean_1, std_1 = cross_validate(model_1, X_train, y_train)
scores_2, mean_2, std_2 = cross_validate(model_2, X_train, y_train)

In [37]:
### TEST
test_validate(scores_1, mean_1, std_1, scores_2, mean_2, std_2)
### BEGIN HIDDEN TESTS
def test_cross_validate():
    """
    Cross-validate both models on the stored training split and compare the
    fold scores with the stored reference scores (absolute tolerance 0.3).
    """
    initial_X_train = pd.read_csv("X_train.csv", index_col=0)
    initial_y_train = pd.read_csv("y_train.csv", index_col=0)["Churn"]
    actual_model_1 = train_model_1(initial_X_train, initial_y_train)
    actual_model_2 = train_model_2(initial_X_train, initial_y_train)
    # Cross-validate on the fixture data loaded above rather than on the
    # notebook-level `X_train` / `y_train`, so the test does not depend on
    # whatever state earlier cells left in the kernel.
    actual_scores_1, actual_mean_1, actual_std_1 = cross_validate(actual_model_1, initial_X_train, initial_y_train)
    # pd.DataFrame(actual_scores_1, columns=["scores_1"]).to_csv("scores_1.csv")
    expected_scores_1 = pd.read_csv("scores_1.csv", index_col=0)["scores_1"]
    actual_scores_2, actual_mean_2, actual_std_2 = cross_validate(actual_model_2, initial_X_train, initial_y_train)
    # pd.DataFrame(actual_scores_2, columns=["scores_2"]).to_csv("scores_2.csv")
    # NOTE(review): this fixture line previously wrote `actual_scores_1`; if
    # "scores_2.csv" was generated that way it holds model 1's scores - confirm
    # and regenerate if needed.
    expected_scores_2 = pd.read_csv("scores_2.csv", index_col=0)["scores_2"]
    np.testing.assert_allclose(actual_scores_1, expected_scores_1, atol=0.3)
    np.testing.assert_allclose(actual_scores_2, expected_scores_2, atol=0.3)
test_cross_validate()
### END HIDDEN TESTS

✔ Test passed!


In [38]:
# Gather the metrics of each model in turn, then print a side-by-side summary.
# Note that the cross-validation here runs on the full X / y, not only on the
# training split.

# Model 1
name_1 = model_1.__class__.__name__
acc_1 = evaluate_accuracy(model_1, X_test, y_test)
_, cv_mean1, cv_std1 = cross_validate(model_1, X, y)

# Model 2
name_2 = model_2.__class__.__name__
acc_2 = evaluate_accuracy(model_2, X_test, y_test)
_, cv_mean2, cv_std2 = cross_validate(model_2, X, y)

print(f"Model 1: {name_1}")
print(f"  Test accuracy:    {acc_1:.4f}")
print(f"  CV mean accuracy: {cv_mean1:.4f}")
print(f"  CV std deviation: {cv_std1:.4f}")

print(f"\nModel 2: {name_2}")
print(f"  Test accuracy:    {acc_2:.4f}")
print(f"  CV mean accuracy: {cv_mean2:.4f}")
print(f"  CV std deviation: {cv_std2:.4f}")

Model 1: XGBClassifier
  Test accuracy:    0.9328
  CV mean accuracy: 0.9132
  CV std deviation: 0.0355

Model 2: DecisionTreeClassifier
  Test accuracy:    0.9403
  CV mean accuracy: 0.9115
  CV std deviation: 0.0231


### MANUALLY GRADED TASK

##### ENTER YOUR EXPLANATION OF THE RESULTS HERE