# Imports and definitions

In [1]:
import pickle
from pathlib import Path

import polars as pl
import polars.selectors as cs

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import plotly.express as px


_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [2]:
base_dir = Path('/workspaces/data-scientist-at-magenta/')
code_dir = base_dir / 'notebooks'
dagster_storage_dir = base_dir / 'dagster_home' / 'storage'

In [3]:
def load_artifact(targ_file:str):
    targ_path = dagster_storage_dir / targ_file
    
    if not targ_path.exists():
        raise FileNotFoundError(f'Artifact {targ_file} not found in {dagster_storage_dir}')

    with open(targ_path,'rb') as fp:
        test_artifact = pickle.load(fp)

    return pl.from_pandas(test_artifact)

`core_data` <br><br>

| Feature Name           | Description                                                  |
|------------------------|--------------------------------------------------------------|
| rating_account_id      | Unique identifier for the contract account                    |
| customer_id            | Unique identifier for the customer                           |
| age                    | Age of the customer **in years**                                       |
| contract_lifetime_days | Total duration of the customer contract in days              |
| remaining_binding_days | Number of days left in the contract binding period - usual binding period is 2 years - **if it's positive it means that the customer is still in the binding period**       |
| has_special_offer      | Indicates if the customer has a special offer      |
| is_magenta1_customer   | Indicates if the customer is part of the Magenta1 program - fedelty program    |
| available_gb           | Amount of mobile data included in the current tariff         |
| gross_mrc              | Gross monthly recurring charge (in euros)                    |
| smartphone_brand       | Brand of the customer’s smartphone                           |
| has_done_upselling     | Whether the customer has already done an upsell in the last 3 years      |


`usage_info`

| Feature Name           | Description                                                  |
|------------------------|--------------------------------------------------------------|
| rating_account_id      | Unique identifier for the contract account                    |
| billed_period_month_d  | Billing period (monthly)                                     |
| has_used_roaming       | Indicates if roaming was used during the period            |
| used_gb                | Amount of mobile data used in the billing period (in GB)     |


`customer_interactions`

| Feature Name   | Description                                                              |
|----------------|--------------------------------------------------------------------------|
| customer_id    | Unique identifier for the customer                                       |
| type_subtype   | Category and subtype of the interaction (e.g., tariff change, billing)   |
| n              | Number of interactions of this type in the last 6 months                                |
| days_since_last| Number of days since the last interaction of this type                   |


# Read data

In [4]:
%%time

core_data = load_artifact('core_data')
customer_interactions = load_artifact('customer_interactions')
usage_info = load_artifact('usage_info')

CPU times: user 119 ms, sys: 31.1 ms, total: 150 ms
Wall time: 197 ms


---

# Features computation

## `core_data`

In [5]:
%%time

core_data = core_data.with_columns(
    pl.col('rating_account_id').cast(pl.Utf8),
    pl.col("has_done_upselling").cast(pl.Boolean),
    pl.col("has_special_offer").cast(pl.Boolean),
    pl.col("is_magenta1_customer").cast(pl.Boolean)
)

# Manipulating binding days
core_data = core_data.with_columns(
    (pl.col('contract_lifetime_days') + pl.col('remaining_binding_days')).alias('contract_binding_days'),
    (pl.col('contract_lifetime_days') / (pl.col('contract_lifetime_days') + pl.col('remaining_binding_days'))).round(2).alias('completion_rate'),
    pl.when(pl.col('remaining_binding_days') > 0)
        .then(True)
        .otherwise(False)
        .alias('is_bounded')
)


# One-hot-encoding smartphone brands - extracting the values in order to keep the same order for the columns
# The number of unique values is not too high, so one-hot-encoding is not affecting the dimensionality too much
smartphone_brands_list = core_data.select(pl.col('smartphone_brand')).unique().to_series().sort().to_list()
core_data = core_data.with_columns(
    [
        pl.when(pl.col("smartphone_brand") == brand)
        .then(True)
        .otherwise(False)
        .alias(f"is_{brand.lower()}")
        for brand in smartphone_brands_list
    ]
)
core_data = core_data.drop("smartphone_brand")


# Add how many contract has the customer - including the current one
n_contract_per_customer = core_data.group_by("customer_id").agg(
    pl.col("rating_account_id").count().alias("n_contracts_per_customer")
)
core_data = core_data.join(n_contract_per_customer, on="customer_id", how="left")

CPU times: user 38.1 ms, sys: 8.92 ms, total: 47 ms
Wall time: 61.9 ms


In [6]:
core_data

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32
"""289094""","""4.161115""",36,878,325,false,false,20,70,false,1203,0.73,true,false,false,false,false,true,1
"""677626""","""2.429976""",34,998,614,false,false,0,5,false,1612,0.62,true,false,false,true,false,false,1
"""769928""","""3.875044""",36,37,-26,false,true,50,16.94,false,11,3.36,false,false,false,true,false,false,2
"""873260""","""4.649933""",50,503,-149,false,true,20,30.2,true,354,1.42,false,false,false,false,false,true,1
"""109774""","""3.851059""",47,331,-328,true,true,,46.12,false,3,110.33,false,false,false,true,false,false,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""502283""","""5.605022""",88,1573,-576,false,false,10,34.18,false,997,1.58,false,false,false,true,false,false,4
"""618421""","""2.862063""",85,1138,412,true,false,40,50.1,false,1550,0.73,true,false,false,false,false,true,1
"""104422""","""2.414264""",79,1709,-494,false,false,10,12.96,false,1215,1.41,false,false,false,true,false,false,3
"""642380""","""3.619106""",84,1592,403,false,false,10,56.73,false,1995,0.8,true,false,false,true,false,false,2


In [7]:
core_data.shape

(100000, 19)

---

## `usage_info`

In [8]:
%%time

usage_info = usage_info.with_columns([
    pl.col('rating_account_id').cast(pl.Utf8),
    pl.col('billed_period_month_d').cast(pl.Date),
    pl.col('has_used_roaming').cast(pl.Boolean),
    pl.col('used_gb').cast(pl.Float64)
]).sort(['rating_account_id', 'billed_period_month_d'])

CPU times: user 219 ms, sys: 6.43 ms, total: 226 ms
Wall time: 196 ms


In [9]:
%%time

month_usage = usage_info.group_by('rating_account_id').agg([
    pl.col('used_gb')
])

month_usage = month_usage.with_columns([
    pl.col('used_gb').list.get(0).alias('last_1_month_usage_gb'),
    pl.col('used_gb').list.get(1).alias('last_2_month_usage_gb'),
    pl.col('used_gb').list.get(2).alias('last_3_month_usage_gb'),
    pl.col('used_gb').list.get(3).alias('last_4_month_usage_gb'),

]).drop('used_gb')


CPU times: user 7.22 ms, sys: 1.73 ms, total: 8.95 ms
Wall time: 7.52 ms


In [10]:
month_usage

rating_account_id,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,f64,f64,f64,f64
"""100010""",0.9,0.8,0.4,0.9
"""100017""",0.7,0.5,0.8,0.3
"""100036""",0.9,0.2,0.2,1
"""100047""",35.6,48.9,35.4,50.7
"""100064""",0.7,0.7,0.3,0.8
…,…,…,…,…
"""999922""",2.3,1.4,3.6,2.1
"""999934""",12.8,13.8,14.5,6.5
"""999940""",9.8,11.6,5.4,14.4
"""999956""",13,10.4,13.7,14.1


In [11]:
%%time

aggregated_features = usage_info.group_by('rating_account_id').agg([
    
    # BASIC USAGE STATISTICS
    pl.col('used_gb').mean().round(2).alias('avg_monthly_usage_gb'),
    pl.col('used_gb').median().round(2).alias('median_monthly_usage_gb'),
    pl.col('used_gb').sum().round(2).alias('total_usage_gb'),
    pl.col('used_gb').std().round(2).alias('usage_std_gb'),
    pl.col('used_gb').min().round(2).alias('min_monthly_usage_gb'),
    pl.col('used_gb').max().round(2).alias('max_monthly_usage_gb'),
    pl.col('used_gb').quantile(0.25).round(2).alias('usage_q25_gb'),
    pl.col('used_gb').quantile(0.75).round(2).alias('usage_q75_gb'),
    
    # ROAMING STATISTICS
    pl.col('has_used_roaming').sum().alias('months_with_roaming'),
    pl.col('has_used_roaming').any().alias('ever_used_roaming'),
    pl.col('has_used_roaming').all().alias('always_used_roaming'),
    
    # # USAGE INTENSITY CATEGORIES
    (pl.col('used_gb') == 0).sum().alias('zero_usage_months'),
    (pl.col('used_gb') > 0).sum().alias('active_usage_months'),
])

CPU times: user 377 ms, sys: 15.3 ms, total: 393 ms
Wall time: 330 ms


In [12]:
aggregated_features

rating_account_id,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months
str,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32
"""100010""",0.75,0.85,3,0.24,0.4,0.9,0.8,0.9,0,false,false,0,4
"""100017""",0.57,0.6,2.3,0.22,0.3,0.8,0.5,0.7,2,true,false,0,4
"""100036""",0.57,0.55,2.3,0.43,0.2,1,0.2,0.9,1,true,false,0,4
"""100047""",42.65,42.25,170.6,8.29,35.4,50.7,35.6,48.9,0,false,false,0,4
"""100064""",0.62,0.7,2.5,0.22,0.3,0.8,0.7,0.7,1,true,false,0,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",2.35,2.2,9.4,0.92,1.4,3.6,2.1,2.3,0,false,false,0,4
"""999934""",11.9,13.3,47.6,3.67,6.5,14.5,12.8,13.8,3,true,false,0,4
"""999940""",10.3,10.7,41.2,3.78,5.4,14.4,9.8,11.6,1,true,false,0,4
"""999956""",12.8,13.35,51.2,1.66,10.4,14.1,13,13.7,0,false,false,0,4


In [13]:
%%time

# CALCULATE TRENDS AND ROLLING METRICS
trend_features = usage_info.group_by('rating_account_id').agg([
    # ROLLING AVERAGES
    # 2-month rolling average
    pl.col('used_gb').rolling_mean_by(
        'billed_period_month_d', window_size='2mo'
    ).alias('avg_2month_rolling_usage_gb'),
    
    # 3-month rolling average
        pl.col('used_gb').rolling_mean_by(
        'billed_period_month_d', window_size='3mo'
    ).alias('avg_3month_rolling_usage_gb'),

    
    # PERIOD-OVER-PERIOD DELTAS
    (pl.col('used_gb') - pl.col('used_gb').shift(1)).alias('delta_1mo'),
    (pl.col('used_gb') - pl.col('used_gb').shift(2)).alias('delta_2mo'),
    (pl.col('used_gb') - pl.col('used_gb').shift(3)).alias('delta_3mo'),

    # VOLATILITY METRICS
    # Rolling standard deviation
    pl.col('used_gb').rolling_std_by(
        'billed_period_month_d', window_size='2mo'
    ).alias('std_2month_rolling_usage_gb')
])

trend_features = trend_features.with_columns([
    # delta_1mo statistics
    pl.col('delta_1mo').list.mean().round(2).alias('avg_delta_1mo'),
    pl.col('delta_1mo').list.std().round(2).alias('delta_1mo_volatility'),
    pl.col('delta_1mo').list.max().round(2).alias('max_delta_1mo_increase'),
    pl.col('delta_1mo').list.min().round(2).alias('max_delta_1mo_decrease'),
    pl.col('delta_1mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_1mo_increase'),
    pl.col('delta_1mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_1mo_decrease'),
    pl.col('delta_1mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_1mo_change'),

    # delta_2mo statistics
    pl.col('delta_2mo').list.mean().round(2).alias('avg_delta_2mo'),
    pl.col('delta_2mo').list.std().round(2).alias('delta_2mo_volatility'),
    pl.col('delta_2mo').list.max().round(2).alias('max_delta_2mo_increase'),
    pl.col('delta_2mo').list.min().round(2).alias('max_delta_2mo_decrease'),
    pl.col('delta_2mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_2mo_increase'),
    pl.col('delta_2mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_2mo_decrease'),
    pl.col('delta_2mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_2mo_change'),

    # delta_3mo statistics
    pl.col('delta_3mo').list.mean().round(2).alias('avg_delta_3mo'),
    pl.col('delta_3mo').list.max().round(2).alias('max_delta_3mo_increase'),
    pl.col('delta_3mo').list.min().round(2).alias('max_delta_3mo_decrease'),
    pl.col('delta_3mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_3mo_increase'),
    pl.col('delta_3mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_3mo_decrease'),
    pl.col('delta_3mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_3mo_change'),
])

# The following block extracts the last N values from the rolling/statistical lists for each account.
# Each column contains the most recent, second most recent, etc. value from the corresponding list

trend_features = trend_features.with_columns([
    pl.col('avg_2month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_2mo_rolling_avg')
    for i in range(3)
]).drop('avg_2month_rolling_usage_gb')

trend_features = trend_features.with_columns([
    pl.col('avg_3month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_3mo_rolling_avg')
    for i in range(2)
]).drop('avg_3month_rolling_usage_gb')

trend_features = trend_features.with_columns([
    pl.col('delta_1mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_1mo')
    for i in range(3)
]).drop('delta_1mo')

trend_features = trend_features.with_columns([
    pl.col('delta_2mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_2mo')
    for i in range(2)
]).drop('delta_2mo')

trend_features = trend_features.with_columns([
    pl.col('delta_3mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_3mo')
    for i in range(1)
]).drop('delta_3mo')

trend_features = trend_features.with_columns([
    pl.col('std_2month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_2mo_rolling_stdev')
    for i in range(3)
]).drop('std_2month_rolling_usage_gb')

CPU times: user 2.83 s, sys: 13.7 ms, total: 2.84 s
Wall time: 1.87 s


In [14]:
trend_features

rating_account_id,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev
str,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""100010""",0,0.46,0.5,-0.4,1,2,0,-0.2,0.42,0.1,-0.5,1,1,0,0,0,0,0,0,1,0.65,0.6,0.85,0.7,0.7,0.5,-0.4,-0.1,0.1,-0.5,0,0.35,0.28,0.07
"""100017""",-0.13,0.4,0.3,-0.5,1,2,0,-0.05,0.21,0.1,-0.2,1,1,0,-0.4,-0.4,-0.4,0,1,0,0.55,0.65,0.6,0.53,0.67,-0.5,0.3,-0.2,-0.2,0.1,-0.4,0.35,0.21,0.14
"""100036""",0.03,0.75,0.8,-0.7,1,1,1,0.05,1.06,0.8,-0.7,1,1,0,0.1,0.1,0.1,1,0,0,0.6,0.2,0.55,0.47,0.43,0.8,0,-0.7,0.8,-0.7,0.1,0.57,0,0.49
"""100047""",5.03,16.08,15.3,-13.5,2,1,0,0.8,1.41,1.8,-0.2,1,1,0,15.1,15.1,15.1,1,0,0,43.05,42.15,42.25,45,39.97,15.3,-13.5,13.3,1.8,-0.2,15.1,10.82,9.55,9.4
"""100064""",0.03,0.45,0.5,-0.4,1,1,1,-0.15,0.35,0.1,-0.4,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.5,0.7,0.6,0.57,0.5,-0.4,0,0.1,-0.4,0.1,0.35,0.28,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",-0.07,1.99,2.2,-1.5,1,2,0,1,0.42,1.3,0.7,2,0,0,-0.2,-0.2,-0.2,0,1,0,2.85,2.5,1.85,2.37,2.43,-1.5,2.2,-0.9,0.7,1.3,-0.2,1.06,1.56,0.64
"""999934""",-2.1,5.11,1,-8,2,1,0,-2.8,6.36,1.7,-7.3,1,1,0,-6.3,-6.3,-6.3,0,1,0,10.5,14.15,13.3,11.6,13.7,-8,0.7,1,-7.3,1.7,-6.3,5.66,0.49,0.71
"""999940""",1.53,7.6,9,-6.2,2,1,0,-0.8,5.09,2.8,-4.4,1,1,0,4.6,4.6,4.6,1,0,0,9.9,8.5,10.7,10.47,8.93,9,-6.2,1.8,2.8,-4.4,4.6,6.36,4.38,1.27
"""999956""",0.37,2.95,3.3,-2.6,2,1,0,2.2,2.12,3.7,0.7,2,0,0,1.1,1.1,1.1,1,0,0,13.9,12.05,11.7,12.73,12.37,0.4,3.3,-2.6,3.7,0.7,1.1,0.28,2.33,1.84


In [15]:
usage_features = aggregated_features.join(
    trend_features, 
    on='rating_account_id', 
    how='left'
).join(
    month_usage,
    on='rating_account_id',
    how='left'
)

In [16]:
usage_features

rating_account_id,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""100010""",0.75,0.85,3,0.24,0.4,0.9,0.8,0.9,0,false,false,0,4,0,0.46,0.5,-0.4,1,2,0,-0.2,0.42,0.1,-0.5,1,1,0,0,0,0,0,0,1,0.65,0.6,0.85,0.7,0.7,0.5,-0.4,-0.1,0.1,-0.5,0,0.35,0.28,0.07,0.9,0.8,0.4,0.9
"""100017""",0.57,0.6,2.3,0.22,0.3,0.8,0.5,0.7,2,true,false,0,4,-0.13,0.4,0.3,-0.5,1,2,0,-0.05,0.21,0.1,-0.2,1,1,0,-0.4,-0.4,-0.4,0,1,0,0.55,0.65,0.6,0.53,0.67,-0.5,0.3,-0.2,-0.2,0.1,-0.4,0.35,0.21,0.14,0.7,0.5,0.8,0.3
"""100036""",0.57,0.55,2.3,0.43,0.2,1,0.2,0.9,1,true,false,0,4,0.03,0.75,0.8,-0.7,1,1,1,0.05,1.06,0.8,-0.7,1,1,0,0.1,0.1,0.1,1,0,0,0.6,0.2,0.55,0.47,0.43,0.8,0,-0.7,0.8,-0.7,0.1,0.57,0,0.49,0.9,0.2,0.2,1
"""100047""",42.65,42.25,170.6,8.29,35.4,50.7,35.6,48.9,0,false,false,0,4,5.03,16.08,15.3,-13.5,2,1,0,0.8,1.41,1.8,-0.2,1,1,0,15.1,15.1,15.1,1,0,0,43.05,42.15,42.25,45,39.97,15.3,-13.5,13.3,1.8,-0.2,15.1,10.82,9.55,9.4,35.6,48.9,35.4,50.7
"""100064""",0.62,0.7,2.5,0.22,0.3,0.8,0.7,0.7,1,true,false,0,4,0.03,0.45,0.5,-0.4,1,1,1,-0.15,0.35,0.1,-0.4,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.5,0.7,0.6,0.57,0.5,-0.4,0,0.1,-0.4,0.1,0.35,0.28,0,0.7,0.7,0.3,0.8
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",2.35,2.2,9.4,0.92,1.4,3.6,2.1,2.3,0,false,false,0,4,-0.07,1.99,2.2,-1.5,1,2,0,1,0.42,1.3,0.7,2,0,0,-0.2,-0.2,-0.2,0,1,0,2.85,2.5,1.85,2.37,2.43,-1.5,2.2,-0.9,0.7,1.3,-0.2,1.06,1.56,0.64,2.3,1.4,3.6,2.1
"""999934""",11.9,13.3,47.6,3.67,6.5,14.5,12.8,13.8,3,true,false,0,4,-2.1,5.11,1,-8,2,1,0,-2.8,6.36,1.7,-7.3,1,1,0,-6.3,-6.3,-6.3,0,1,0,10.5,14.15,13.3,11.6,13.7,-8,0.7,1,-7.3,1.7,-6.3,5.66,0.49,0.71,12.8,13.8,14.5,6.5
"""999940""",10.3,10.7,41.2,3.78,5.4,14.4,9.8,11.6,1,true,false,0,4,1.53,7.6,9,-6.2,2,1,0,-0.8,5.09,2.8,-4.4,1,1,0,4.6,4.6,4.6,1,0,0,9.9,8.5,10.7,10.47,8.93,9,-6.2,1.8,2.8,-4.4,4.6,6.36,4.38,1.27,9.8,11.6,5.4,14.4
"""999956""",12.8,13.35,51.2,1.66,10.4,14.1,13,13.7,0,false,false,0,4,0.37,2.95,3.3,-2.6,2,1,0,2.2,2.12,3.7,0.7,2,0,0,1.1,1.1,1.1,1,0,0,13.9,12.05,11.7,12.73,12.37,0.4,3.3,-2.6,3.7,0.7,1.1,0.28,2.33,1.84,13,10.4,13.7,14.1


In [17]:
usage_features.shape

(100000, 52)

---

## `customer_interactions`

In [18]:
%%time

interactions_features = customer_interactions.pivot(
    index='customer_id',
    on='type_subtype', 
    values=['n', 'days_since_last'],
    aggregate_function='first' # There is only one value per customer
)

CPU times: user 29.4 ms, sys: 5.88 ms, total: 35.3 ms
Wall time: 69 ms


In [19]:
interactions_features.shape

(42095, 9)

---

## `combined_features`

In [20]:
features = core_data.join(
    usage_features,
    on='rating_account_id',
    how='left'
).join(
    interactions_features,
    on='customer_id',
    how='left'
)

In [21]:
features.head()

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64
"""289094""","""4.161115""",36,878,325,False,False,20.0,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.2,-0.7,0,2,0,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,,,,,,,,
"""677626""","""2.429976""",34,998,614,False,False,0.0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.5,-0.5,1,1,0,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,,,1.0,1.0,,,87.0,118.0
"""769928""","""3.875044""",36,37,-26,False,True,50.0,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,0.1,-0.3,1,1,0,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,,,,,,,,
"""873260""","""4.649933""",50,503,-149,False,True,20.0,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.8,-0.1,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,,,,,,,,
"""109774""","""3.851059""",47,331,-328,True,True,,46.12,False,3,110.33,False,False,False,True,False,False,3,0.35,0.35,1.4,0.29,0.1,0.6,0.1,0.6,1,True,False,0,4,0.0,0.5,0.5,-0.5,1,…,0.5,-0.5,1,1,0,0.0,0.0,0.0,0,0,1,0.35,0.1,0.35,0.27,0.27,0.5,0.0,-0.5,0.5,-0.5,0.0,0.35,0.0,0.35,0.6,0.1,0.1,0.6,,,,,,,,


In [22]:
%%time

# Filling null values from intereactions features
features = features.with_columns([
    pl.when(pl.col(col).is_null())
    .then(0)
    .otherwise(pl.col(col))
    .alias(col)
    for col in features.columns if col.startswith('n_')
] + [
    pl.when(pl.col(col).is_null())
    .then(-1)
    .otherwise(pl.col(col))
    .alias(col)
    for col in features.columns if col.startswith('days_since_last')
])

CPU times: user 2.99 ms, sys: 1.04 ms, total: 4.03 ms
Wall time: 8.44 ms


### Dealing with null values in `available_gb`

In [23]:
features.filter(pl.col('available_gb').is_null()).select('rating_account_id', 'available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')

rating_account_id,available_gb,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,i64,f64,f64,f64,f64
"""109774""",,0.6,0.1,0.1,0.6
"""781755""",,0.6,0.8,0.3,0
"""827238""",,0,0.7,0.9,0.1
"""330581""",,0,0.2,0.2,0.9
"""416121""",,0.8,0.8,0.3,0.5
…,…,…,…,…,…
"""662172""",,54.7,62.5,24.7,37.4
"""556788""",,19.6,32.6,31,65.2
"""283647""",,51.3,18.1,53.3,39.2
"""581854""",,49.5,21.5,27.7,18.6


In [24]:
%%time

# Option A1: predict the value with a linear model

# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)

X = df[['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb']]
y = df['available_gb']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Ridge regression (L2)
ridge = Ridge()
ridge.fit(X_train, y_train)

# Predict and evaluate
y_pred = ridge.predict(X_test)
mae_linear_model = mean_absolute_error(y_test, y_pred)
print(f'MAE Ridge: {mae_linear_model:.4f}')

MAE Ridge: 15.0033
CPU times: user 29.2 ms, sys: 16.2 ms, total: 45.4 ms
Wall time: 84.8 ms


In [25]:
%%time

# Option A2: predict the value with a linear model

# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)

X = df[['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb']]
y = df['available_gb']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Lasso regression (L1)
lasso = Lasso()
lasso.fit(X_train, y_train)

# Predict and evaluate
y_pred = lasso.predict(X_test)
mae_lasso = mean_absolute_error(y_test, y_pred)
print(f'MAE Lasso: {mae_lasso:.4f}')

MAE Lasso: 14.9996
CPU times: user 59.4 ms, sys: 6.5 ms, total: 65.9 ms
Wall time: 83.2 ms


In [26]:
%%time

# Option B: mean of the previous contract activities

columns_to_average = ['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb']

# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)


# Compute horizontal mean and find closest value
result_df = df.with_columns([
    # Compute horizontal mean
    pl.mean_horizontal(columns_to_average).alias('prediction')
])

mae_horizontal_mean = mean_absolute_error(result_df['available_gb'], result_df['prediction'])
print(f'MAE horizontal_mean: {mae_horizontal_mean:.4f}')

MAE horizontal_mean: 21.8625
CPU times: user 15.3 ms, sys: 2.9 ms, total: 18.2 ms
Wall time: 19.1 ms


The error of the linear models is lower compared to the mean of the activities. **Winning option A2**

In [27]:
# APPLY THE PREDICTION TO MISSING ROWS
# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)

X = df.select(['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb'])
y = df.select('available_gb').to_series()

df_missing = (
        features
            .filter(pl.col('available_gb').is_null())
            .select(pl.exclude("available_gb"))
)

X_missing = df_missing.select(['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb'])


# Fit Lasso regression (L1)
lasso = Lasso()
lasso.fit(X, y)

# Predict
df_missing = df_missing.with_columns(available_gb=lasso.predict(X_missing))

# Since the value cannot be float, mapping the predicted values to similar options from data
available_values = features.filter(pl.col('available_gb').is_not_null()).select(pl.col('available_gb')).unique().to_series().to_list()

def find_closest(val, avail_list):
    'Find closest value from available_values list'
    if val is None:
        return None
    return min(avail_list, key=lambda x: abs(x - val))

# Compute horizontal mean and find closest value
df_missing = df_missing.with_columns([
    # Find closest available value
    pl.col('available_gb').map_elements(
        lambda x: find_closest(x, available_values), 
        return_dtype=pl.Int64
    ).alias('available_gb')
])

In [28]:
%%time

features = pl.concat(
    [
        features.filter(pl.col('available_gb').is_not_null()),
        df_missing
    ],
    how='diagonal'
)

features.head()

CPU times: user 10.1 ms, sys: 995 μs, total: 11.1 ms
Wall time: 12.9 ms


rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64
"""289094""","""4.161115""",36,878,325,False,False,20,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.2,-0.7,0,2,0,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,0,0,0,0,-1,-1,-1,-1
"""677626""","""2.429976""",34,998,614,False,False,0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.5,-0.5,1,1,0,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,0,0,1,1,-1,-1,87,118
"""769928""","""3.875044""",36,37,-26,False,True,50,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,0.1,-0.3,1,1,0,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,0,0,0,0,-1,-1,-1,-1
"""873260""","""4.649933""",50,503,-149,False,True,20,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.8,-0.1,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,0,0,0,0,-1,-1,-1,-1
"""692379""","""4.382165""",46,80,-25,False,True,40,60.71,False,55,1.45,False,True,False,False,False,False,1,0.55,0.6,2.2,0.3,0.2,0.8,0.4,0.8,2,True,False,0,4,0.0,0.53,0.6,-0.4,1,…,0.4,-0.6,1,1,0,0.0,0.0,0.0,0,0,1,0.5,0.3,0.6,0.47,0.47,0.6,-0.2,-0.4,0.4,-0.6,0.0,0.42,0.14,0.28,0.8,0.4,0.2,0.8,0,0,0,0,-1,-1,-1,-1


In [29]:
# Checking if there are other columns to fill

null_counts_features = features.select([
    pl.col(col).is_null().sum().alias(f'{col}_nulls') for col in features.columns
])

null_counts_features.select([
    col for col in null_counts_features.columns if null_counts_features[0, col] != 0
])

### Computing additional features

Computing features based on `available_gb`, since there are no missing values

In [30]:
%%time

# Compute how many times the contract required more than available data, baed on definde threshold of the current plan.

# Compute 20%, 50%, and 80% for available_gb per rating_account_id
thresholds_available_gb = features.group_by('rating_account_id').agg([
    (pl.col('available_gb') / 100 * 25).get(0).round(2).alias('p25'),
    (pl.col('available_gb') / 100 * 50).get(0).round(2).alias('p50'),
    (pl.col('available_gb') / 100 * 70).get(0).round(2).alias('p75'),
])

# Compute, for each month, in which range the usage falls
percentile_exprs = []
for i in range(1, 5):
    percentile_expr = (
        pl.when(pl.col(f'last_{i}_month_usage_gb').is_between(-1, pl.col('p25'), closed='right'))  # -1 otherwise 0 is not counted
        .then(pl.lit('P1'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p25'), pl.col('p50'), closed='right'))
        .then(pl.lit('P2'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p50'), pl.col('p75'), closed='right'))
        .then(pl.lit('P3'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p75'), pl.col('available_gb'), closed='right'))
        .then(pl.lit('P4'))
        .when(pl.col(f'last_{i}_month_usage_gb') > pl.col('available_gb'))
        .then(pl.lit('P5'))  # how many times has exceeded the available data
        .otherwise(pl.lit(None))
        .alias(f'month_{i}_threshold')
    )
    percentile_exprs.append(percentile_expr)

# Compute how many times, in the past 4 months, the usage felt in specific ranges
count_exprs = []
for p in range(1, 6):
    count_expr = sum(
        (pl.col(f'month_{i}_threshold') == f'P{p}').cast(pl.Int32)
        for i in range(1, 5)
    ).alias(f'times_in_p{p}')
    count_exprs.append(count_expr)

# Final computation
features = (
    features
    .join(
        thresholds_available_gb,
        on='rating_account_id',
        how='left')
    .with_columns(percentile_exprs)
    .with_columns(count_exprs)
).drop(['p25', 'p50', 'p75', 'month_1_threshold', 'month_2_threshold', 'month_3_threshold', 'month_4_threshold'])

CPU times: user 66.6 ms, sys: 16.5 ms, total: 83 ms
Wall time: 119 ms


In [31]:
# Check that the sum of times_in_p1 to times_in_p5 is always 4 (number of billing months) for each row
check_sum = features.select(
    (pl.col('times_in_p1') + pl.col('times_in_p2') + pl.col('times_in_p3') + pl.col('times_in_p4') + pl.col('times_in_p5')).alias('sum_p')
)

# Count how many rows do not have sum == 4
invalid_rows = check_sum.filter(pl.col('sum_p') != 4).height
print(f"Number of invalid rows: {invalid_rows}")

Number of invalid rows: 0


---

## `final features dataframe`

In [32]:
features.shape

(100000, 83)

In [33]:
features.head()

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel,times_in_p1,times_in_p2,times_in_p3,times_in_p4,times_in_p5
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i32,i32,i32,i32,i32
"""289094""","""4.161115""",36,878,325,False,False,20,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""677626""","""2.429976""",34,998,614,False,False,0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,0,0,1,1,-1,-1,87,118,0,0,0,0,4
"""769928""","""3.875044""",36,37,-26,False,True,50,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""873260""","""4.649933""",50,503,-149,False,True,20,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""692379""","""4.382165""",46,80,-25,False,True,40,60.71,False,55,1.45,False,True,False,False,False,False,1,0.55,0.6,2.2,0.3,0.2,0.8,0.4,0.8,2,True,False,0,4,0.0,0.53,0.6,-0.4,1,…,0.0,0.0,0.0,0,0,1,0.5,0.3,0.6,0.47,0.47,0.6,-0.2,-0.4,0.4,-0.6,0.0,0.42,0.14,0.28,0.8,0.4,0.2,0.8,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0


# Features cleaning

## Correlation analysis

In [67]:
numeric_and_booleans_features = features.select(cs.numeric())

In [68]:
%%time

correlation_matrix = numeric_and_booleans_features.corr()

CPU times: user 122 ms, sys: 9.49 ms, total: 132 ms
Wall time: 117 ms


In [69]:
correlation_matrix

age,contract_lifetime_days,remaining_binding_days,available_gb,gross_mrc,contract_binding_days,completion_rate,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel,times_in_p1,times_in_p2,times_in_p3,times_in_p4,times_in_p5
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.0629525879053476,0.007917425991875568,0.0020015308181241475,-0.0014248647679732413,-0.04689189462413721,0.00353602431532941,-0.0033925108458075066,-0.07313075894941969,-0.07117159510219147,-0.07313072441998948,-0.0752667069178047,-0.06588505936769457,-0.07627248976389259,-0.06852675138912509,-0.07211471686542024,0.0024213213370002985,0.010187406343503983,-0.0101874063435039,0.0007023898149709626,-0.0698694468472535,-0.06434228889805703,0.06525763675673277,-0.0027109579596412622,-0.003923417078656138,0.013449505856104643,0.0038771650238448078,-0.05982538993845558,-0.03588147143189146,0.0418930823776148,0.002696058456064011,-0.007157296164963452,0.012284435206751758,0.0007018860866903546,0.0007018860866903546,0.0007018860866903546,-0.0027964731629029245,-0.00016122916901736825,0.008278462206978325,-0.07023640985754045,-0.07118943304127716,-0.07216078833847767,-0.07239798432566195,-0.07251675544933449,-0.0022278083988556082,0.004810231937738154,-0.0018765816242892104,0.002577746601875384,0.0029294087217661374,0.0007018860866903546,-0.05818506841796149,-0.06188408762322952,-0.05992545507252074,-0.06797684008539309,-0.06941992923036817,-0.06614590372886091,-0.067636313903008,-0.0029619271116923386,-0.0036624528884935327,-0.0010602562815131568,0.0024569564673974044,-0.002173552477055877,-0.0017217017286030646,-0.004082781479784842,0.0007222265215127118,0.047705646519236894,0.012602850196615879,-0.011356125642459285,-0.02461459789404415,-0.04884294998721446
-0.06295258790534759,1,-0.0003783910010728704,-0.032126065687675204,0.0030451716731358623,0.8171535046016242,-0.07300261199879411,0.0023650517704186607,0.7632866265204388,0.7508626128730621,0.7632872022690604,0.7272201981256826,0.710920257111116,0.7732354631860344,0.7318593393860774,0.7537621693882587,-0.005097468591353552,-0.09475926466371004,0.09475926466370997,-0.0015123138415764018,0.6811245720092849,0.630724779896917,-0.6291921881077874,0.03264938171411658,0.03404249035232747,-0.1352208870762016,0.0048604813883570365,0.5801060874511151,0.3801496092665654,-0.3739104433991196,0.02229034000445165,0.017848040212429837,-0.1103229664401533,-0.0015110509232823767,-0.0015110509232823767,-0.0015110509232823767,0.01553886015196863,0.01351762142620585,-0.08134120648691907,0.7434205175805239,0.743278014254858,0.7427992699446373,0.7563778588356003,0.7563208014062932,-0.003962065315546003,0.008440296747651647,-0.006002610013797874,0.004470249234288799,0.002431251194703502,-0.0015110509232823767,0.5862943391397315,0.5878593834322187,0.5894401962885459,0.7074652280240008,0.7068184459534311,0.7085158996998934,0.7074916297772239,0.00421444660602776,0.0021811147714451817,0.002424094037175166,0.0038562618871112587,0.004366695774608397,0.0014775262795114102,-0.0016269760313566877,0.002949492743458491,-0.4651370824557613,-0.15697385908485523,0.08043653464478628,0.26470369022922924,0.4983430708639573
0.007917425991875568,-0.0003783910010728704,0.9999999999999999,0.0007421669104084503,0.002179619852898867,0.5761108710174765,-0.1507495214901689,0.0031184382252098324,-0.003368056021030941,-0.0033786218444621825,-0.0033670204985767243,-0.002278291324361265,-0.003511932081388142,-0.003135261219195083,-0.0038810688474674535,-0.0029258975348432593,-0.0033505124922018543,-0.0010846215558273022,0.0010846215558273072,0.004231075366245402,-0.0012913676246216438,-0.0005253656943313309,0.0012444482226628166,0.0010398710334209135,-0.0009434670746600751,-0.0001991602516028179,0.0006135525006577703,-0.0036709594401793722,-0.0019173180886053083,0.0028571672810733012,0.0016059206003933085,-0.001983000044914771,0.001044298351067821,0.004228160955008833,0.004228160955008833,0.004228160955008833,0.008375570520982954,-0.006714190636222069,-0.004642390731298755,-0.003134501641543642,-0.001615083942391306,-0.003421838758769516,-0.002308263865614657,-0.0032358684666985455,-0.0013602383904919337,-0.003388871642487854,0.009003476899453246,-0.00473342809016704,0.005614739084775269,0.004228160955008833,-0.0037692239748397215,-0.001050758085637673,0.0012077114722154902,-0.0060008342919767674,-0.0005025521403265127,-0.0025678096237829566,-0.003403362532700231,0.0042893103455309885,-0.0045450803839230955,-0.000550729968203646,0.008401502292391047,0.0041148704343960145,-0.004200652333154999,-0.0014239168372262169,0.002532071796620933,0.003595027248980564,0.0005717954093538758,-0.002259417399267059,0.00031672611610409875,-0.0036916032443004695
0.0020015308181241475,-0.032126065687675204,0.0007421669104084504,0.9999999999999999,-0.0024587734096769894,-0.02583113428554061,0.006971121919043611,0.0002684735229871767,-0.04011757756460991,-0.03933747541803906,-0.040116928543826175,-0.04024613006208369,-0.03617021038246265,-0.04139055484207002,-0.03782438878577749,-0.03989934830826546,0.0007200633236605356,0.008177975369587902,-0.008177975369587918,-0.0089571980632671,-0.03691136786859041,-0.03785421524287581,0.029866491049181588,-0.0007914361364533618,-0.0028488766446646786,0.00737721491273247,-0.007199502882120817,-0.03512391586661177,-0.028257023438348194,0.017383569734294932,-0.005250118537681683,0.0034678803596330595,0.004880126414490814,-0.008958884761091974,-0.008958884761091974,-0.008958884761091974,-0.007460334315593718,0.0048201060755382686,0.007384229549775811,-0.04065715615556799,-0.03882817397170189,-0.03745232561192221,-0.04066676061305669,-0.03867803535082624,-0.0047058938957302435,-0.0012152153721237326,-0.003113646249159127,-0.005897651701941139,-0.004326456852194825,-0.008958884761091974,-0.03248632327135266,-0.0286038506515081,-0.034649920815514205,-0.034627216699059306,-0.03668618027259971,-0.03724845166090696,-0.040194977494247656,-0.0027753592056530617,0.001975370864724089,-0.0024030418946654358,-0.0036484670334102904,-0.001828261199771774,-0.00010865678076617748,-0.0009845477696042433,-0.003399587000315279,0.44300923575092677,0.12012203595677905,0.026597859735423855,0.07138108632869992,-0.5836747528828375
-0.0014248647679732413,0.0030451716731358623,0.002179619852898867,-0.0024587734096769894,1,0.003745413711897003,0.001448901652406299,0.0016938968378855258,0.0034662775264801956,0.0036649796886498833,0.0034670413042216385,0.001983447006780046,0.003559935212115557,0.0029799401592150577,0.004232483813729707,0.003156082446484031,-0.002621823061911647,0.00416868698384991,-0.004168686983849909,0.0016857292450166749,0.002357003944395102,0.0033169574760850613,-0.0015639842171657377,-0.0011157641239080813,0.0014068762332894742,-0.0005855575777674823,-0.0027569414013037955,0.001878543120285064,-0.0008726892860535902,-0.003321487870409262,-0.0018806672781231143,0.0005494721132352117,0.003653928234913019,0.0016859505374580168,0.0016859505374580168,0.0016859505374580168,0.0032038550078330964,-0.00031892126693167317,-0.008074245652902642,0.002742729071843708,0.0036344999378973724,0.0040094187222853165,0.0037084812711037705,0.0033371612082357233,0.002865308322562273,-0.005622969378124299,0.004457926510055416,-0.0027532977838048696,-0.001160926028117356,0.0016859505374580168,0.0027105335786805603,0.0036034334811532757,-0.0009023286321955589,0.0024470740183862895,0.0051927124225158204,0.001736507540913687,0.0034893492438268013,-0.0014842179605258324,-0.0070461841673263425,0.0014512163470149944,0.004541296831139777,-0.0030816234859641096,-0.0035406766558893866,0.002945215744682074,0.0028001963669227344,-0.001521995787355854,-0.0011358179365463976,0.0027629290569683796,-0.0013717817552831178,0.001947958861748606
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.047705646519236894,-0.4651370824557613,0.0035950272489805634,0.44300923575092677,-0.001521995787355854,-0.37811760294981384,0.03603245876904223,-0.0010580614604684586,-0.6574356700261336,-0.6477335996820826,-0.6574351617633541,-0.5921330567575226,-0.6324734010954444,-0.6550171232472066,-0.6366293036345562,-0.6460450821056,-0.0022202435618355145,0.19473479366783358,-0.1947347936678337,0.001326906786667687,-0.555236245379734,-0.5138604773898797,0.5131083533985022,-0.05017190324173619,-0.05086622658141281,0.20486216667920032,-0.003294884422366589,-0.47356767486786105,-0.30981429961732015,0.30574381631026665,-0.03577022579577623,-0.026718973160469936,0.17175102633102984,0.001327230036611494,0.001327230036611494,0.001327230036611494,-0.023392744802626767,-0.02100348798291842,0.12428396274545907,-0.6401201528361139,-0.6404085652111067,-0.6399923369406615,-0.6515520568287523,-0.6515088247418852,0.0034509210628460986,-0.006026941760208073,0.003915808807681568,-0.0025729495726785224,-0.002106399879094899,0.001327230036611494,-0.4787230044872635,-0.47902096232931146,-0.4795624677691726,-0.6091650305015999,-0.6093767387773618,-0.6100776324444828,-0.6091716580776606,0.000059637575997904895,-0.0012191470648258765,-0.001453135944898726,-0.002648473788040453,0.00007632894699609086,-0.003910193406857043,0.0009721214954402584,-0.0018419967704396566,1,-0.25830911226021347,-0.3079286742066061,-0.35295243923345915,-0.7354253793324923
0.01260285019661588,-0.15697385908485523,0.0005717954093538758,0.12012203595677905,-0.0011358179365463974,-0.12797638259614352,0.010083043683954946,-0.00404546325199369,-0.08053488635503672,-0.07978097346055228,-0.08053626632882388,-0.08836353485322143,-0.06601591086657327,-0.08501006100996686,-0.07663393330576229,-0.08098256481617847,0.004011439195300704,-0.10143640503833477,0.10143640503833465,-0.005175368001069828,-0.08260255851680684,-0.07864889497623406,0.07415262192460485,0.01909079448633512,0.02651005626675375,-0.09244581478282995,-0.004948752389538511,-0.07057830152343202,-0.04956675320168578,0.04218459133318309,0.007545440873567659,0.02054964434387215,-0.0772561138065041,-0.00517598980275904,-0.00517598980275904,-0.00517598980275904,-0.0009713468603610833,0.0209298352498439,-0.055885357074821077,-0.07945220110661956,-0.07822605127716974,-0.07736023862530608,-0.08032589540486917,-0.0791475167054129,-0.0024269018307169327,-0.0018252937563690555,-0.000967793660771512,-0.004236585771994782,-0.002790853651064619,-0.00517598980275904,-0.07256802452778705,-0.07229930076095765,-0.07123041550751416,-0.07319372401378046,-0.07410161945810369,-0.07485337732906021,-0.07648248613966893,-0.004008765254458228,-0.0033501395087788067,-0.002247015924201317,-0.00035153602710774243,-0.006087907692970569,-0.005219717915631191,-0.0000971813050819011,-0.0007624493507186986,-0.2583091122602135,1,0.06638363851405626,-0.06603850435498461,-0.32501826130391254
-0.011356125642459285,0.08043653464478628,-0.002259417399267059,0.026597859735423855,0.0027629290569683796,0.06444416664035231,-0.0035336615281290557,0.0022099155654710734,0.1277148971901287,0.11951849314969573,0.12771508845407165,0.143799347234508,0.11589738196130095,0.1398686648806849,0.11073242930943984,0.12454400153033612,0.007081831567542339,-0.06486624231444324,0.06486624231444316,0.005930550562054738,0.1355873536708041,0.12767042540258938,-0.12355130756745797,0.029447727913750426,0.006673307821842664,-0.07328089869697282,0.008038226591646828,0.10932843104513819,0.07704456886249,-0.06504319484404679,0.02590299565436248,-0.002915527235662338,-0.06312560264259107,0.005929657650460138,0.005929657650460138,0.005929657650460138,0.023371756848562675,-0.0076998405133813785,-0.04385520067360781,0.12603942337353619,0.12382557674145965,0.12263496873381974,0.127055143248569,0.1256875386113633,0.002117545597359137,0.005462335963983286,-0.001598904504421387,0.007554921578262583,0.0038584171305832875,0.005929657650460138,0.1151001103839581,0.12127235937567242,0.11240751451406249,0.11698705992275005,0.11650811063655493,0.11927187169755994,0.12079954414966645,0.002155692056575203,0.002197531356652983,-0.0024947330179143673,0.00012364639706145917,0.004345671329991873,0.0005387009612483223,-0.003744099075050931,0.002376749352079732,-0.3079286742066061,0.06638363851405626,1,0.23716708742306913,-0.06804662056045266
-0.02461459789404415,0.26470369022922924,0.00031672611610409875,0.07138108632869991,-0.0013717817552831178,0.21654385057369152,-0.02055105593527674,0.00006538944256845367,0.31465248046091876,0.30125243815779473,0.31465223603044434,0.30811558755180074,0.2988633998709326,0.3282575316302787,0.29329312440993455,0.3026811792336865,-0.001462540001162611,-0.07417531216240263,0.07417531216240258,-0.005430670448569713,0.28960525934322573,0.26515322560233184,-0.27062384118898314,0.025420292851836217,0.02089319317763292,-0.09391298792028537,-0.003406036778495155,0.2429900764871216,0.1551007619977368,-0.16075892658304858,0.01309907483983647,0.014208605512967968,-0.07506590488344689,-0.005431806709049364,-0.005431806709049364,-0.005431806709049364,0.01082428069133967,0.009214047973624372,-0.05609551368176083,0.3052282339316725,0.30749281511218685,0.30744338894873346,0.31164617985217347,0.3126778851338343,-0.006027484428002244,0.0006280708297526537,-0.00008370792167350667,-0.005375769999349068,0.0005437491386759282,-0.005431806709049364,0.24635443777408542,0.25479741091474256,0.25249114916240734,0.2920853471576891,0.29328747285143614,0.2922381597706154,0.2891333587223048,-0.000739556777484845,-0.00007880956307549504,-0.0012508651192542877,0.0018811956779389762,0.002209165637533562,0.0011698713406282468,-0.002237722383573684,-0.0008157446393455923,-0.3529524392334592,-0.06603850435498461,0.23716708742306913,1,0.055590376547614756


In [70]:
fig = px.imshow(
    correlation_matrix.to_numpy(),
    labels=dict(x='Features', y='Features', color='Correlation'),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    aspect='auto'
)
fig.update_layout(
    width=1800,
    height=1800,
    title='Correlation Matrix Heatmap'
)
fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

# Blue means that variable X and variable Y follow the same behaviour (both increasing or decreasing)
# Red means that variable X has the opposite behaviour of variable Y

In [71]:
%%time 

highly_correlated_features = (
    correlation_matrix
    .unpivot(index=None, variable_name='col1', value_name='correlation')
    .with_columns([
        pl.repeat(correlation_matrix.columns, correlation_matrix.height).flatten().alias('col2'),
        pl.col('correlation').abs()
    ])
    .filter(pl.col('col1') != pl.col('col2'))
    .filter(pl.col('col1') < pl.col('col2'))  # Keep only one pair per combination
    .with_columns([
        pl.when(pl.col('correlation') > 0.99)
            .then(pl.lit('identical'))
        .when(pl.col('correlation') > 0.8)
            .then(pl.lit('high'))
        .when(pl.col('correlation') > 0.7)
            .then(pl.lit('medium'))
        .otherwise(pl.lit('ok'))
        .alias('analysis')
    ])
    .sort('correlation', descending=True)
)

CPU times: user 2.45 ms, sys: 945 μs, total: 3.39 ms
Wall time: 3.44 ms


In [45]:
highly_correlated_features

col1,correlation,col2,analysis
str,f64,str,str
"""active_usage_months""",1,"""zero_usage_months""","""identical"""
"""avg_delta_3mo""",1,"""max_delta_3mo_increase""","""identical"""
"""avg_delta_3mo""",1,"""max_delta_3mo_decrease""","""identical"""
"""avg_delta_3mo""",1,"""last_1_delta_3mo""","""identical"""
"""max_delta_3mo_decrease""",1,"""max_delta_3mo_increase""","""identical"""
…,…,…,…
"""last_4_month_usage_gb""",0.000030343105303516372,"""n_prolongation""","""ok"""
"""last_1_delta_1mo""",0.000022458070975042498,"""n_rechnungsanfragen""","""ok"""
"""last_2_delta_1mo""",0.000004109764369784097,"""months_with_delta_3mo_decrease""","""ok"""
"""active_usage_months""",0.00000014146788088847202,"""n_rechnungsanfragen""","""ok"""


In [47]:
highly_correlated_features.group_by('analysis', maintain_order=True).len()

analysis,len
str,u32
"""identical""",19
"""high""",158
"""medium""",99
"""ok""",2139


## Variance analysis

In [72]:
only_numeric_features = features.select(cs.numeric() & ~cs.boolean())

In [None]:
%%time

# Compute the standard deviation
variance_analysis = (
    features
    .select([
        pl.col(col).std()
        for col in numeric_and_booleans_features.columns
    ])
    .unpivot(variable_name='feature', value_name='stdev')
    .sort('stdev', descending=True)
)

CPU times: user 16.2 ms, sys: 878 μs, total: 17.1 ms
Wall time: 21.7 ms


In [104]:
variance_analysis

feature,stdev
str,f64
"""months_with_no_delta_3mo_change""",0.17850662773735168
"""active_usage_months""",0.23464950586759398
"""zero_usage_months""",0.23464950586759403
"""months_with_no_delta_2mo_change""",0.25738762612818217
"""months_with_no_delta_1mo_change""",0.31996979811650533
…,…
"""days_since_last_produkte&services-tarifwechsel""",42.254754177537265
"""total_usage_gb""",69.21684430203868
"""remaining_binding_days""",345.64345852492
"""contract_lifetime_days""",490.1271217887317


In [99]:
%%time

# Compute variance, std, and unique ratio for each numeric/boolean feature
variance_analysis = (
    features
    .select([
        pl.col(col).std().alias('std_dev')
        for col in numeric_and_booleans_features.columns
    ])
)


DuplicateError: the name 'std_dev' is duplicate

It's possible that multiple expressions are returning the same default column name. If this is the case, try renaming the columns with `.alias("new_name")` to avoid duplicate column names.

In [98]:
variance_analysis

<generator object <genexpr> at 0x73d11d5dbca0>

In [92]:
%%time

# Unique count dataframe (excluding boolean columns)
unique_count = (
    features
    .select([
        pl.col(col).n_unique().alias(col)
        for col in only_numeric_features.columns
    ])
    .unpivot(variable_name="feature", value_name="unique_count")
    .sort("unique_count")
)




CPU times: user 99.1 ms, sys: 2.46 ms, total: 102 ms
Wall time: 211 ms


In [93]:
unique_count

feature,unique_count
str,u32
"""months_with_delta_3mo_increase""",2
"""months_with_delta_3mo_decrease""",2
"""months_with_no_delta_3mo_change""",2
"""months_with_delta_2mo_increase""",3
"""months_with_delta_2mo_decrease""",3
…,…
"""completion_rate""",2487
"""contract_binding_days""",2516
"""usage_std_gb""",2671
"""avg_monthly_usage_gb""",2748


In [None]:
# Do not remove this column because they can have only few values by design - based on number of billing months for example
do_not_remove_numeric = [
    'months_with_delta_3mo_increase',
    'months_with_delta_3mo_decrease',
    'months_with_no_delta_3mo_change',
    'months_with_delta_2mo_increase',
    'months_with_delta_2mo_decrease',
    'months_with_no_delta_2mo_change',
    'months_with_delta_1mo_increase',
    'months_with_delta_1mo_decrease',
    'months_with_no_delta_1mo_change',
    'zero_usage_months',
    'active_usage_months',
    'max_delta_1mo_increase',
    'max_delta_1mo_decrease',
    'max_delta_2mo_increase',
    'max_delta_2mo_decrease',
    'max_delta_3mo_increase',
    'max_delta_3mo_decrease',
    'times_in_p1',
    'times_in_p2',
    'times_in_p3',
    'times_in_p4',
    'times_in_p5',
]

do_not_remove_target = ['has_done_upselling']

do_not_remove = do_not_remove_numeric + do_not_remove_target


# Storing