In [1]:
%run ../utils/configuration.ipynb

In [2]:
%run ../03_Models/0300_common_functions.ipynb

In [3]:
import polars as pl

# 1 - Reading Dataset and  Merge files

In [4]:
def readAndMergeFiles():
    try:
        df_demographics = pl.read_excel(f"{raw_path}/Telco_customer_churn_demographics.xlsx")
        df_location = pl.read_excel(f"{raw_path}/Telco_customer_churn_location.xlsx")
        df_population = pl.read_excel(f"{raw_path}/Telco_customer_churn_population.xlsx")
        df_services = pl.read_excel(f"{raw_path}/Telco_customer_churn_services.xlsx")
        df_status = pl.read_excel(f"{raw_path}/Telco_customer_churn_status.xlsx")

        #Rename columns to snake_case
        df_demographics = df_demographics.rename({c: to_snake_case(c) for c in df_demographics.columns})
        df_location = df_location.rename({c: to_snake_case(c) for c in df_location.columns})
        df_population = df_population.rename({c: to_snake_case(c) for c in df_population.columns})
        df_services = df_services.rename({c: to_snake_case(c) for c in df_services.columns})
        df_status = df_status.rename({c: to_snake_case(c) for c in df_status.columns})

        #Drop duplicate or unnecessary columns
        df_demographics = df_demographics.drop(['count'])
        df_location = df_location.drop(["count", "country", "state"])

        df_population = df_population.select([pl.col('zip_code'), pl.col('population')])
        
        #Merge dataframes
        df_merge = df_services.join(df_demographics, on="customer_id", how="left")\
                        .join(df_location, on="customer_id", how="left")\
                        .join(df_status, on="customer_id", how="left")\
                        .join(df_population, on="zip_code", how="left")
        return df_merge
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

In [5]:
def splitMergedData(df):
    df_train = df.sample(
        fraction=0.8,
        with_replacement=False,
        shuffle=True
    )

    df_test = df.join(df_train, on="customer_id", how="anti")
    return df_train, df_test

# 2 - Pre Processing and Feature Engineering

In [6]:
def solvingOutliers(df):
    q1, q3 = df.select([
        pl.col("number_of_referrals").quantile(0.25).alias("q1"),
        pl.col("number_of_referrals").quantile(0.75).alias("q3")
    ]).row(0)

    upper_bound = q3 + (q3 - q1) * 1.5

    df = df.with_columns(
        pl.col("number_of_referrals")
        .clip(upper_bound=upper_bound)
    )

    q1, q3 = df.select([
        pl.col("avg_monthly_gb_download").quantile(0.25).alias("q1"),
        pl.col("avg_monthly_gb_download").quantile(0.75).alias("q3")
    ]).row(0)

    upper_bound = q3 + (q3 - q1) * 1.5

    df = df.with_columns(
        pl.col("avg_monthly_gb_download")
        .clip(upper_bound=upper_bound)
    )

    q1, q3 = df.select([
        pl.col("total_long_distance_charges").quantile(0.25).alias("q1"),
        pl.col("total_long_distance_charges").quantile(0.75).alias("q3")
    ]).row(0)

    upper_bound = q3 + (q3 - q1) * 1.5

    df = df.with_columns(
        pl.col("total_long_distance_charges")
        .clip(upper_bound=upper_bound)
    )

    q1, q3 = df.select([
        pl.col("total_revenue").quantile(0.25).alias("q1"),
        pl.col("total_revenue").quantile(0.75).alias("q3")
    ]).row(0)

    upper_bound = q3 + (q3 - q1) * 1.5

    df = df.with_columns(
        pl.col("total_revenue")
        .clip(upper_bound=upper_bound)
    )

    q1, q3 = df.select([
        pl.col("satisfaction_score").quantile(0.25).alias("q1"),
        pl.col("satisfaction_score").quantile(0.75).alias("q3")
    ]).row(0)

    lower_bound = q1 - (q3 - q1) * 1.5

    df = df.with_columns(
        pl.when(pl.col("satisfaction_score") < lower_bound)
        .then(pl.lit(lower_bound))
        .otherwise(pl.col("satisfaction_score"))
        .alias("satisfaction_score")
    )

    return df

In [7]:
valid_vals = ["yes", "no", "male", "female"]

bivalue_columns = [
    'referred_a_friend'
    ,'phone_service'
    ,'multiple_lines'
    ,'internet_service'
    ,'online_security'
    ,'online_backup'
    ,'device_protection_plan'
    ,'premium_tech_support'
    ,'streaming_tv'
    ,'streaming_movies'
    ,'streaming_music'
    ,'unlimited_data'
    ,'paperless_billing'
    ,'gender'
    ,'churn_label'
    ,'under_30'	
    ,'senior_citizen'	
    ,'married'	
    ,'dependents'
]

def encodeBivariateVariables(df, bivalue_columns, valid_vals):
    for col in bivalue_columns:
        df = df.with_columns(
            pl.col(col).str.to_lowercase().str.strip_chars()
        )

        mode_val = (
            df
            .filter(pl.col(col).is_in(valid_vals))
            .select(pl.col(col).mode())
            .item()
        )

        df = df.with_columns(
            pl.when(pl.col(col).is_in(valid_vals))
                .then(pl.col(col))
                .otherwise(pl.lit(mode_val))
                .alias(col)
        )

        df = df.with_columns(
            pl.when(pl.col(col).is_in(["yes", "male"]))
                .then(1)
            .when(pl.col(col).is_in(["no", "female"]))
                .then(0)
            .otherwise(None)
            .cast(pl.Int32)
            .alias(col)
        )
    return df

In [8]:
def encodeMuitivariateVariables(df):
    valid_vals = ["Offer A", "Offer B", "Offer C","Offer D", "Offer E"]
    col = "offer"
    mode_val = (
        df
        .filter(pl.col(col).is_in(valid_vals))
        .select(pl.col(col).mode())
        .item()
    )

    df = df.with_columns(
        pl.when(pl.col(col).is_in(valid_vals))
            .then(pl.col(col))
            .otherwise(pl.lit(mode_val))
            .alias(col)
    )

    df = df.with_columns(
        pl.when(pl.col(col) == "Offer A")
            .then(1)
            .when(pl.col(col) == "Offer B")
            .then(2)
            .when(pl.col(col) == "Offer C")
            .then(3)
            .when(pl.col(col) == "Offer D")
            .then(4)
            .when(pl.col(col) == "Offer E")
            .then(5)
            .otherwise(None)
            .cast(pl.Int32)
            .alias(col)
    )

    valid_vals = ["DSL", "Cable", "Fiber Optic"]
    col = "internet_type"
    mode_val = (
        df
        .filter(pl.col(col).is_in(valid_vals))
        .select(pl.col(col).mode())
        .item()
    )

    df = df.with_columns(
        pl.when(pl.col(col).is_in(valid_vals))
            .then(pl.col(col))
            .otherwise(pl.lit(mode_val))
            .alias(col)
    )

    df = df.with_columns(
        pl.when(pl.col(col) == "DSL")
            .then(1)
            .when(pl.col(col) == "Cable")
            .then(2)
            .when(pl.col(col) == "Fiber Optic")
            .then(3)
            .otherwise(None)
            .cast(pl.Int32)
            .alias(col)
    )

    valid_vals = ["Month-to-Month", "One Year", "Two Year"]
    col = "contract"
    mode_val = (
        df
        .filter(pl.col(col).is_in(valid_vals))
        .select(pl.col(col).mode())
        .item()
    )

    df = df.with_columns(
        pl.when(pl.col(col).is_in(valid_vals))
            .then(pl.col(col))
            .otherwise(pl.lit(mode_val))
            .alias(col)
    )

    df = df.with_columns(
        pl.when(pl.col(col) == "Month-to-Month")
            .then(1)
            .when(pl.col(col) == "One Year")
            .then(2)
            .when(pl.col(col) == "Two Year")
            .then(3)
            .otherwise(None)
            .cast(pl.Int32)
            .alias(col)
    )

    return df

In [9]:
def featureEngineering(df):
    valid_vals = ["Churned", "Joined", "Stayed"]
    col = "customer_status"
    mode_val = (
        df
        .filter(pl.col(col).is_in(valid_vals))
        .select(pl.col(col).mode())
        .item()
    )

    df = df.with_columns(
        pl.when(pl.col(col).is_in(valid_vals))
            .then(pl.col(col))
            .otherwise(pl.lit(mode_val))
            .alias(col)
    )

    df = df.with_columns(
        pl.when(pl.col(col) == "Churned")
            .then(0)
            .otherwise(1)
            .cast(pl.Int32)
            .alias(col)
    )

    col = "churn_category"

    df = df.with_columns(
        pl.when((pl.col(col).is_in(["NO_LABEL", None,"", "null"])) | (pl.col(col).is_null()))
            .then(pl.lit("Other"))
            .otherwise(pl.col(col))
            .alias(col)
    )

    col = "total_extra_data_charges"

    df = df.with_columns(
        pl.when(pl.col(col) > 0)
            .then(pl.lit(1))
            .otherwise(pl.col(col))
            .alias(col)
    )

    col = "total_refunds"

    df = df.with_columns(
        pl.when(pl.col(col) > 0)
            .then(pl.lit(1))
            .otherwise(pl.col(col))
            .alias(col)
    )

    col = "number_of_dependents"

    df = df.with_columns(
        pl.when(pl.col(col) > 0)
            .then(pl.lit(1))
            .otherwise(pl.col(col))
            .alias(col)
    )

    df = df.to_dummies(columns=["payment_method"])
    df = df.to_dummies(columns=["churn_category"])

    return df

In [10]:
def dropColumns(df):
    df = df.drop(['customer_id',
                'under_30', 
                'count', 
                'quarter', 
                'lat_long',	
                'latitude',	
                'longitude', 
                'married', 
                'total_long_distance_charges', 
                'tenure_in_months', 
                'city', 
                'churn_reason',
                'senior_citizen',
                'referred_a_friend',
                'customer_status',
                'zip_code',
                'churn_label',
                'churn_category_Other',
                'churn_category_Competitor',
                'churn_category_Dissatisfaction',
                'churn_category_Attitude',
                'churn_category_Price',
                'quarter_right',
                'count_right'])
    return df

# Train

In [11]:
env = 'dev'
df = readAndMergeFiles()
if env == 'dev':
    df_train, df_test = splitMergedData(df)
    df = df_train
df = solvingOutliers(df)
df = encodeBivariateVariables(df, bivalue_columns, valid_vals)
df = encodeMuitivariateVariables(df)
df = featureEngineering(df)
df = dropColumns(df)
df = apply_feature_selection(df, 2)

In [12]:

model_NN = MLPClassifier(max_iter = 2000, random_state = 99)
df_all = pd.DataFrame(columns = ['Train','Validation'], index = ['NN'])
print('------------------------------------------')
print('f1_score')
print('------------------------------------------')
display(show_results(df_all, df.to_pandas(),f1_score, model_NN))

------------------------------------------
f1_score
------------------------------------------


Unnamed: 0,Train,Validation
NN,0.723+/-0.009,0.665+/-0.02


# Test

In [13]:
df_test = solvingOutliers(df_test)
df_test = encodeBivariateVariables(df_test, bivalue_columns, valid_vals)
df_test = encodeMuitivariateVariables(df_test)
df_test = featureEngineering(df_test)
df_test = dropColumns(df_test)
df_test = apply_feature_selection(df_test, 2)

In [14]:
# Final Training Data (from df_1 based on previous cell)
data = df.to_pandas().copy()
X_train = data.drop(['churn_value'], axis=1)
y_train = data['churn_value'].copy()

df_test = df_test.drop(['churn_value'])
# Final Test Data (df_test is already fully processed)
X_test = df_test.to_pandas().copy()

# Train final model
final_model = MLPClassifier(max_iter = 2000, random_state = 99)
final_model.fit(X_train, y_train)

# Predict probabilities
predict_proba_test = final_model.predict_proba(X_test)

# Predict classes if needed
predict_class_test = final_model.predict(X_test)

In [15]:
# Show output
print("Predicted class probabilities:")
print(predict_proba_test[:5])

Predicted class probabilities:
[[9.99999774e-01 2.25886083e-07]
 [1.00000000e+00 1.62948822e-12]
 [1.00000000e+00 5.06176071e-16]
 [1.00000000e+00 7.15245623e-18]
 [1.09050924e-07 9.99999891e-01]]


In [16]:
print("\nPredicted class labels:")
for x in predict_class_test:
    print(x)


Predicted class labels:
0
0
0
0
1
0
1
0
0
1
0
0
0
1
1
1
1
1
0
0
1
0
0
0
0
0
0
1
0
0
1
1
1
1
1
0
0
1
0
0
0
0
1
1
0
1
0
0
1
1
1
0
1
1
1
1
1
0
1
0
1
0
0
0
1
0
1
1
0
0
0
0
0
0
0
1
1
1
0
1
0
0
0
0
0
0
0
0
0
0
1
1
0
0
1
0
1
0
0
0
1
1
1
1
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
1
0
1
0
1
1
0
0
0
0
0
1
0
1
1
1
0
1
0
1
0
0
0
0
1
0
0
1
0
0
0
0
1
0
1
0
0
0
0
0
1
0
1
0
1
1
0
0
0
1
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
1
1
1
0
1
1
0
0
0
0
1
0
0
1
0
1
0
1
0
0
1
0
1
0
0
0
0
0
1
1
0
1
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
0
0
0
1
0
1
0
0
0
1
0
1
1
1
0
0
0
0
1
1
0
0
0
0
1
1
0
0
0
1
1
1
0
0
1
0
0
1
0
1
1
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
0
0
1
1
1
0
1
1
0
0
0
0
1
1
0
0
0
0
1
1
0
1
1
0
1
0
0
1
1
1
0
0
0
0
1
0
0
1
1
1
0
1
1
1
1
1
0
1
0
0
0
0
0
0
0
1
0
1
0
0
0
1
0
0
0
0
1
0
0
0
1
1
0
1
1
0
0
0
0
1
0
1
0
1
1
0
0
0
1
0
1
1
0
0
0
0
0
1
0
1
1
1
0
1
1
0
1
1
0
0
1
1
0
1
0
1
0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
1
0
0
0
0
1
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
1
1
1