# Table of Contents

1. [Imports and definitions](#imports-and-definitions)
2. [Read data](#read-data)
3. [Features computation](#features-computation)
   - [core_data](#core_data)
   - [usage_info](#usage_info)
   - [customer_interactions](#customer_interactions)
   - [combined_features](#combined_features)
     - [Dealing with null values in available_gb](#dealing-with-null-values-in-available_gb)
     - [Computing additional features](#computing-additional-features)
   - [final features dataframe](#final-features-dataframe)
4. [Features cleaning](#features-cleaning)
   - [Correlation analysis](#correlation-analysis)
     - [Analysis correlated networks](#analysis-correlated-networks)
   - [Variance analysis](#variance-analysis)
   - [Cleaning](#cleaning)
5. [Storing](#storing)

---

# Imports and definitions

In [1]:
import pickle
from pathlib import Path

import polars as pl
import polars.selectors as cs

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import plotly.express as px
import plotly.graph_objects as go

import networkx as nx


_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [None]:
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
raw_dir = data_dir / "raw"
train_dir = data_dir / 'train'
output_dir = code_dir / 'data'
features_dir = output_dir / 'features'
test_dir = output_dir / 'test'
artifacts_dir = output_dir / 'models' / 'artifacts'

In [3]:
def load_artifact(targ_file:str):
    targ_path = raw_dir / targ_file
    
    if not targ_path.exists():
        raise FileNotFoundError(f'Artifact {targ_file} not found in {raw_dir}')

    with open(targ_path,'rb') as fp:
        test_artifact = pickle.load(fp)

    return pl.from_pandas(test_artifact)


def plot_correlation_network(G, pos):
    edge_x = []
    edge_y = []
    for edge in G.edges(data=True):
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='#888'),
        hoverinfo='none',  # Disable edge hover
        mode='lines'
    )

    node_x = []
    node_y = []
    node_text = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        # Gather all neighbors and their correlations
        neighbors = []
        for nbr in G.neighbors(node):
            corr = G.get_edge_data(node, nbr).get('correlation', None)
            if corr is not None:
                neighbors.append(f"{nbr}: {corr:.2f}")
            else:
                neighbors.append(f"{nbr}")
        if neighbors:
            hover = f"{node}<br>Correlations:<br>" + "<br>".join(neighbors)
        else:
            hover = node
        node_text.append(hover)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=list(G.nodes),
        textposition="top center",
        hoverinfo='text',
        hovertext=node_text,
        marker=dict(
            showscale=False,
            color='#87ceeb',
            size=20,
            line_width=2
        )
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title='Clusters of Highly Correlated Features',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        width=1200,
                        height=800
                    ))
    fig.show()


def correlation_graph_analysis(correlation_analysis, threshold):
    # Filter for high correlation
    high_corr_df = correlation_analysis.filter(pl.col("correlation").abs() > threshold)

    # Extract columns and correlation values as lists
    col1_list = high_corr_df["col1"].to_list()
    col2_list = high_corr_df["col2"].to_list()
    corr_list = high_corr_df["correlation"].to_list()

    # Build a graph of correlated columns, adding correlation as edge attribute
    G = nx.Graph()
    for c1, c2, corr in zip(col1_list, col2_list, corr_list):
        G.add_edge(c1, c2, correlation=corr)

    # Find groups of correlated columns (connected components)
    clusters = list(nx.connected_components(G))

    # Select one column from each cluster to keep
    columns_to_keep = [list(cluster)[0] for cluster in clusters]

    # Identify all involved columns
    all_involved = set(col1_list).union(set(col2_list))

    # Find all unique columns from original data
    all_columns = set(correlation_analysis["col1"].to_list()) | set(correlation_analysis["col2"].to_list())

    # Include columns that are not highly correlated with others
    non_clustered = all_columns - all_involved
    columns_to_keep.extend(non_clustered)

    # Final sorted list of columns to keep
    columns_to_keep = set(columns_to_keep)

    # Columns to remove: all columns involved in high correlation minus those selected to keep
    columns_to_remove = sorted(all_involved - columns_to_keep)

    return G, columns_to_remove, clusters

`core_data` <br><br>

| Feature Name           | Description                                                  |
|------------------------|--------------------------------------------------------------|
| rating_account_id      | Unique identifier for the contract account                    |
| customer_id            | Unique identifier for the customer                           |
| age                    | Age of the customer **in years**                                       |
| contract_lifetime_days | Total duration of the customer contract in days              |
| remaining_binding_days | Number of days left in the contract binding period - usual binding period is 2 years - **if it's positive it means that the customer is still in the binding period**       |
| has_special_offer      | Indicates if the customer has a special offer      |
| is_magenta1_customer   | Indicates if the customer is part of the Magenta1 program - fedelty program    |
| available_gb           | Amount of mobile data included in the current tariff         |
| gross_mrc              | Gross monthly recurring charge (in euros)                    |
| smartphone_brand       | Brand of the customer’s smartphone                           |
| has_done_upselling     | Whether the customer has already done an upsell in the last 3 years      |


`usage_info`

| Feature Name           | Description                                                  |
|------------------------|--------------------------------------------------------------|
| rating_account_id      | Unique identifier for the contract account                    |
| billed_period_month_d  | Billing period (monthly)                                     |
| has_used_roaming       | Indicates if roaming was used during the period            |
| used_gb                | Amount of mobile data used in the billing period (in GB)     |


`customer_interactions`

| Feature Name   | Description                                                              |
|----------------|--------------------------------------------------------------------------|
| customer_id    | Unique identifier for the customer                                       |
| type_subtype   | Category and subtype of the interaction (e.g., tariff change, billing)   |
| n              | Number of interactions of this type in the last 6 months                                |
| days_since_last| Number of days since the last interaction of this type                   |


# Read data

In [4]:
%%time

core_data = load_artifact('core_data')
customer_interactions = load_artifact('customer_interactions')
usage_info = load_artifact('usage_info')

CPU times: user 45.6 ms, sys: 14.7 ms, total: 60.3 ms
Wall time: 79.8 ms


---

# Features computation

## `core_data`

In [5]:
%%time

core_data = core_data.with_columns(
    pl.col('rating_account_id').cast(pl.Utf8),
    pl.col("has_done_upselling").cast(pl.Boolean),
    pl.col("has_special_offer").cast(pl.Boolean),
    pl.col("is_magenta1_customer").cast(pl.Boolean)
)

# Manipulating binding days
core_data = core_data.with_columns(
    (pl.col('contract_lifetime_days') + pl.col('remaining_binding_days')).alias('contract_binding_days'),
    (pl.col('contract_lifetime_days') / (pl.col('contract_lifetime_days') + pl.col('remaining_binding_days'))).round(2).alias('completion_rate'),
    pl.when(pl.col('remaining_binding_days') > 0)
        .then(True)
        .otherwise(False)
        .alias('is_bounded')
)


# One-hot-encoding smartphone brands - extracting the values in order to keep the same order for the columns
# The number of unique values is not too high, so one-hot-encoding is not affecting the dimensionality too much
smartphone_brands_list = core_data.select(pl.col('smartphone_brand')).unique().to_series().sort().to_list()
core_data = core_data.with_columns(
    [
        pl.when(pl.col("smartphone_brand") == brand)
        .then(True)
        .otherwise(False)
        .alias(f"is_{brand.lower()}")
        for brand in smartphone_brands_list
    ]
)
core_data = core_data.drop("smartphone_brand")


# Add how many contract has the customer - including the current one
n_contract_per_customer = core_data.group_by("customer_id").agg(
    pl.col("rating_account_id").count().alias("n_contracts_per_customer")
)
core_data = core_data.join(n_contract_per_customer, on="customer_id", how="left")

CPU times: user 19.3 ms, sys: 17.8 ms, total: 37.2 ms
Wall time: 37.5 ms


In [6]:
core_data

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32
"""289094""","""4.161115""",36,878,325,false,false,20,70,false,1203,0.73,true,false,false,false,false,true,1
"""677626""","""2.429976""",34,998,614,false,false,0,5,false,1612,0.62,true,false,false,true,false,false,1
"""769928""","""3.875044""",36,37,-26,false,true,50,16.94,false,11,3.36,false,false,false,true,false,false,2
"""873260""","""4.649933""",50,503,-149,false,true,20,30.2,true,354,1.42,false,false,false,false,false,true,1
"""109774""","""3.851059""",47,331,-328,true,true,,46.12,false,3,110.33,false,false,false,true,false,false,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""502283""","""5.605022""",88,1573,-576,false,false,10,34.18,false,997,1.58,false,false,false,true,false,false,4
"""618421""","""2.862063""",85,1138,412,true,false,40,50.1,false,1550,0.73,true,false,false,false,false,true,1
"""104422""","""2.414264""",79,1709,-494,false,false,10,12.96,false,1215,1.41,false,false,false,true,false,false,3
"""642380""","""3.619106""",84,1592,403,false,false,10,56.73,false,1995,0.8,true,false,false,true,false,false,2


In [7]:
core_data.shape

(100000, 19)

---

## `usage_info`

In [8]:
%%time

usage_info = usage_info.with_columns([
    pl.col('rating_account_id').cast(pl.Utf8),
    pl.col('billed_period_month_d').cast(pl.Date),
    pl.col('has_used_roaming').cast(pl.Boolean),
    pl.col('used_gb').cast(pl.Float64)
]).sort(['rating_account_id', 'billed_period_month_d'])

CPU times: user 96.6 ms, sys: 11.1 ms, total: 108 ms
Wall time: 32.8 ms


In [9]:
%%time

month_usage = usage_info.group_by('rating_account_id').agg([
    pl.col('used_gb')
])

month_usage = month_usage.with_columns([
    pl.col('used_gb').list.get(0).alias('last_1_month_usage_gb'),
    pl.col('used_gb').list.get(1).alias('last_2_month_usage_gb'),
    pl.col('used_gb').list.get(2).alias('last_3_month_usage_gb'),
    pl.col('used_gb').list.get(3).alias('last_4_month_usage_gb'),

]).drop('used_gb')


CPU times: user 4.13 ms, sys: 2.88 ms, total: 7.01 ms
Wall time: 4.85 ms


In [10]:
month_usage

rating_account_id,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,f64,f64,f64,f64
"""100010""",0.9,0.8,0.4,0.9
"""100017""",0.7,0.5,0.8,0.3
"""100036""",0.9,0.2,0.2,1
"""100047""",35.6,48.9,35.4,50.7
"""100064""",0.7,0.7,0.3,0.8
…,…,…,…,…
"""999922""",2.3,1.4,3.6,2.1
"""999934""",12.8,13.8,14.5,6.5
"""999940""",9.8,11.6,5.4,14.4
"""999956""",13,10.4,13.7,14.1


In [11]:
%%time

aggregated_features = usage_info.group_by('rating_account_id').agg([
    
    # BASIC USAGE STATISTICS
    pl.col('used_gb').mean().round(2).alias('avg_monthly_usage_gb'),
    pl.col('used_gb').median().round(2).alias('median_monthly_usage_gb'),
    pl.col('used_gb').sum().round(2).alias('total_usage_gb'),
    pl.col('used_gb').std().round(2).alias('usage_std_gb'),
    pl.col('used_gb').min().round(2).alias('min_monthly_usage_gb'),
    pl.col('used_gb').max().round(2).alias('max_monthly_usage_gb'),
    pl.col('used_gb').quantile(0.25).round(2).alias('usage_q25_gb'),
    pl.col('used_gb').quantile(0.75).round(2).alias('usage_q75_gb'),
    
    # ROAMING STATISTICS
    pl.col('has_used_roaming').sum().alias('months_with_roaming'),
    pl.col('has_used_roaming').any().alias('ever_used_roaming'),
    pl.col('has_used_roaming').all().alias('always_used_roaming'),
    
    # # USAGE INTENSITY CATEGORIES
    (pl.col('used_gb') == 0).sum().alias('zero_usage_months'),
    (pl.col('used_gb') > 0).sum().alias('active_usage_months'),
])

CPU times: user 905 ms, sys: 15.6 ms, total: 921 ms
Wall time: 103 ms


In [12]:
aggregated_features

rating_account_id,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months
str,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32
"""100010""",0.75,0.85,3,0.24,0.4,0.9,0.8,0.9,0,false,false,0,4
"""100017""",0.57,0.6,2.3,0.22,0.3,0.8,0.5,0.7,2,true,false,0,4
"""100036""",0.57,0.55,2.3,0.43,0.2,1,0.2,0.9,1,true,false,0,4
"""100047""",42.65,42.25,170.6,8.29,35.4,50.7,35.6,48.9,0,false,false,0,4
"""100064""",0.62,0.7,2.5,0.22,0.3,0.8,0.7,0.7,1,true,false,0,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",2.35,2.2,9.4,0.92,1.4,3.6,2.1,2.3,0,false,false,0,4
"""999934""",11.9,13.3,47.6,3.67,6.5,14.5,12.8,13.8,3,true,false,0,4
"""999940""",10.3,10.7,41.2,3.78,5.4,14.4,9.8,11.6,1,true,false,0,4
"""999956""",12.8,13.35,51.2,1.66,10.4,14.1,13,13.7,0,false,false,0,4


In [13]:
%%time

# CALCULATE TRENDS AND ROLLING METRICS
trend_features = usage_info.group_by('rating_account_id').agg([
    # ROLLING AVERAGES
    # 2-month rolling average
    pl.col('used_gb').rolling_mean_by(
        'billed_period_month_d', window_size='2mo'
    ).alias('avg_2month_rolling_usage_gb'),
    
    # 3-month rolling average
        pl.col('used_gb').rolling_mean_by(
        'billed_period_month_d', window_size='3mo'
    ).alias('avg_3month_rolling_usage_gb'),

    
    # PERIOD-OVER-PERIOD DELTAS
    (pl.col('used_gb') - pl.col('used_gb').shift(1)).alias('delta_1mo'),
    (pl.col('used_gb') - pl.col('used_gb').shift(2)).alias('delta_2mo'),
    (pl.col('used_gb') - pl.col('used_gb').shift(3)).alias('delta_3mo'),

    # VOLATILITY METRICS
    # Rolling standard deviation
    pl.col('used_gb').rolling_std_by(
        'billed_period_month_d', window_size='2mo'
    ).alias('std_2month_rolling_usage_gb')
])

trend_features = trend_features.with_columns([
    # delta_1mo statistics
    pl.col('delta_1mo').list.mean().round(2).alias('avg_delta_1mo'),
    pl.col('delta_1mo').list.std().round(2).alias('delta_1mo_volatility'),
    pl.col('delta_1mo').list.max().round(2).alias('max_delta_1mo_increase'),
    pl.col('delta_1mo').list.min().round(2).alias('max_delta_1mo_decrease'),
    pl.col('delta_1mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_1mo_increase'),
    pl.col('delta_1mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_1mo_decrease'),
    pl.col('delta_1mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_1mo_change'),

    # delta_2mo statistics
    pl.col('delta_2mo').list.mean().round(2).alias('avg_delta_2mo'),
    pl.col('delta_2mo').list.std().round(2).alias('delta_2mo_volatility'),
    pl.col('delta_2mo').list.max().round(2).alias('max_delta_2mo_increase'),
    pl.col('delta_2mo').list.min().round(2).alias('max_delta_2mo_decrease'),
    pl.col('delta_2mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_2mo_increase'),
    pl.col('delta_2mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_2mo_decrease'),
    pl.col('delta_2mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_2mo_change'),

    # delta_3mo statistics
    pl.col('delta_3mo').list.mean().round(2).alias('avg_delta_3mo'),
    pl.col('delta_3mo').list.max().round(2).alias('max_delta_3mo_increase'),
    pl.col('delta_3mo').list.min().round(2).alias('max_delta_3mo_decrease'),
    pl.col('delta_3mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_3mo_increase'),
    pl.col('delta_3mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_3mo_decrease'),
    pl.col('delta_3mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_3mo_change'),
])

# The following block extracts the last N values from the rolling/statistical lists for each account.
# Each column contains the most recent, second most recent, etc. value from the corresponding list

trend_features = trend_features.with_columns([
    pl.col('avg_2month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_2mo_rolling_avg')
    for i in range(3)
]).drop('avg_2month_rolling_usage_gb')

trend_features = trend_features.with_columns([
    pl.col('avg_3month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_3mo_rolling_avg')
    for i in range(2)
]).drop('avg_3month_rolling_usage_gb')

trend_features = trend_features.with_columns([
    pl.col('delta_1mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_1mo')
    for i in range(3)
]).drop('delta_1mo')

trend_features = trend_features.with_columns([
    pl.col('delta_2mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_2mo')
    for i in range(2)
]).drop('delta_2mo')

trend_features = trend_features.with_columns([
    pl.col('delta_3mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_3mo')
    for i in range(1)
]).drop('delta_3mo')

trend_features = trend_features.with_columns([
    pl.col('std_2month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_2mo_rolling_stdev')
    for i in range(3)
]).drop('std_2month_rolling_usage_gb')

CPU times: user 1.34 s, sys: 26.6 ms, total: 1.37 s
Wall time: 234 ms


In [14]:
trend_features

rating_account_id,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev
str,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""100010""",0,0.46,0.5,-0.4,1,2,0,-0.2,0.42,0.1,-0.5,1,1,0,0,0,0,0,0,1,0.65,0.6,0.85,0.7,0.7,0.5,-0.4,-0.1,0.1,-0.5,0,0.35,0.28,0.07
"""100017""",-0.13,0.4,0.3,-0.5,1,2,0,-0.05,0.21,0.1,-0.2,1,1,0,-0.4,-0.4,-0.4,0,1,0,0.55,0.65,0.6,0.53,0.67,-0.5,0.3,-0.2,-0.2,0.1,-0.4,0.35,0.21,0.14
"""100036""",0.03,0.75,0.8,-0.7,1,1,1,0.05,1.06,0.8,-0.7,1,1,0,0.1,0.1,0.1,1,0,0,0.6,0.2,0.55,0.47,0.43,0.8,0,-0.7,0.8,-0.7,0.1,0.57,0,0.49
"""100047""",5.03,16.08,15.3,-13.5,2,1,0,0.8,1.41,1.8,-0.2,1,1,0,15.1,15.1,15.1,1,0,0,43.05,42.15,42.25,45,39.97,15.3,-13.5,13.3,1.8,-0.2,15.1,10.82,9.55,9.4
"""100064""",0.03,0.45,0.5,-0.4,1,1,1,-0.15,0.35,0.1,-0.4,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.5,0.7,0.6,0.57,0.5,-0.4,0,0.1,-0.4,0.1,0.35,0.28,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",-0.07,1.99,2.2,-1.5,1,2,0,1,0.42,1.3,0.7,2,0,0,-0.2,-0.2,-0.2,0,1,0,2.85,2.5,1.85,2.37,2.43,-1.5,2.2,-0.9,0.7,1.3,-0.2,1.06,1.56,0.64
"""999934""",-2.1,5.11,1,-8,2,1,0,-2.8,6.36,1.7,-7.3,1,1,0,-6.3,-6.3,-6.3,0,1,0,10.5,14.15,13.3,11.6,13.7,-8,0.7,1,-7.3,1.7,-6.3,5.66,0.49,0.71
"""999940""",1.53,7.6,9,-6.2,2,1,0,-0.8,5.09,2.8,-4.4,1,1,0,4.6,4.6,4.6,1,0,0,9.9,8.5,10.7,10.47,8.93,9,-6.2,1.8,2.8,-4.4,4.6,6.36,4.38,1.27
"""999956""",0.37,2.95,3.3,-2.6,2,1,0,2.2,2.12,3.7,0.7,2,0,0,1.1,1.1,1.1,1,0,0,13.9,12.05,11.7,12.73,12.37,0.4,3.3,-2.6,3.7,0.7,1.1,0.28,2.33,1.84


In [15]:
usage_features = aggregated_features.join(
    trend_features, 
    on='rating_account_id', 
    how='left'
).join(
    month_usage,
    on='rating_account_id',
    how='left'
)

In [16]:
usage_features

rating_account_id,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""100010""",0.75,0.85,3,0.24,0.4,0.9,0.8,0.9,0,false,false,0,4,0,0.46,0.5,-0.4,1,2,0,-0.2,0.42,0.1,-0.5,1,1,0,0,0,0,0,0,1,0.65,0.6,0.85,0.7,0.7,0.5,-0.4,-0.1,0.1,-0.5,0,0.35,0.28,0.07,0.9,0.8,0.4,0.9
"""100017""",0.57,0.6,2.3,0.22,0.3,0.8,0.5,0.7,2,true,false,0,4,-0.13,0.4,0.3,-0.5,1,2,0,-0.05,0.21,0.1,-0.2,1,1,0,-0.4,-0.4,-0.4,0,1,0,0.55,0.65,0.6,0.53,0.67,-0.5,0.3,-0.2,-0.2,0.1,-0.4,0.35,0.21,0.14,0.7,0.5,0.8,0.3
"""100036""",0.57,0.55,2.3,0.43,0.2,1,0.2,0.9,1,true,false,0,4,0.03,0.75,0.8,-0.7,1,1,1,0.05,1.06,0.8,-0.7,1,1,0,0.1,0.1,0.1,1,0,0,0.6,0.2,0.55,0.47,0.43,0.8,0,-0.7,0.8,-0.7,0.1,0.57,0,0.49,0.9,0.2,0.2,1
"""100047""",42.65,42.25,170.6,8.29,35.4,50.7,35.6,48.9,0,false,false,0,4,5.03,16.08,15.3,-13.5,2,1,0,0.8,1.41,1.8,-0.2,1,1,0,15.1,15.1,15.1,1,0,0,43.05,42.15,42.25,45,39.97,15.3,-13.5,13.3,1.8,-0.2,15.1,10.82,9.55,9.4,35.6,48.9,35.4,50.7
"""100064""",0.62,0.7,2.5,0.22,0.3,0.8,0.7,0.7,1,true,false,0,4,0.03,0.45,0.5,-0.4,1,1,1,-0.15,0.35,0.1,-0.4,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.5,0.7,0.6,0.57,0.5,-0.4,0,0.1,-0.4,0.1,0.35,0.28,0,0.7,0.7,0.3,0.8
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",2.35,2.2,9.4,0.92,1.4,3.6,2.1,2.3,0,false,false,0,4,-0.07,1.99,2.2,-1.5,1,2,0,1,0.42,1.3,0.7,2,0,0,-0.2,-0.2,-0.2,0,1,0,2.85,2.5,1.85,2.37,2.43,-1.5,2.2,-0.9,0.7,1.3,-0.2,1.06,1.56,0.64,2.3,1.4,3.6,2.1
"""999934""",11.9,13.3,47.6,3.67,6.5,14.5,12.8,13.8,3,true,false,0,4,-2.1,5.11,1,-8,2,1,0,-2.8,6.36,1.7,-7.3,1,1,0,-6.3,-6.3,-6.3,0,1,0,10.5,14.15,13.3,11.6,13.7,-8,0.7,1,-7.3,1.7,-6.3,5.66,0.49,0.71,12.8,13.8,14.5,6.5
"""999940""",10.3,10.7,41.2,3.78,5.4,14.4,9.8,11.6,1,true,false,0,4,1.53,7.6,9,-6.2,2,1,0,-0.8,5.09,2.8,-4.4,1,1,0,4.6,4.6,4.6,1,0,0,9.9,8.5,10.7,10.47,8.93,9,-6.2,1.8,2.8,-4.4,4.6,6.36,4.38,1.27,9.8,11.6,5.4,14.4
"""999956""",12.8,13.35,51.2,1.66,10.4,14.1,13,13.7,0,false,false,0,4,0.37,2.95,3.3,-2.6,2,1,0,2.2,2.12,3.7,0.7,2,0,0,1.1,1.1,1.1,1,0,0,13.9,12.05,11.7,12.73,12.37,0.4,3.3,-2.6,3.7,0.7,1.1,0.28,2.33,1.84,13,10.4,13.7,14.1


In [17]:
usage_features.shape

(100000, 52)

---

## `customer_interactions`

In [18]:
%%time

interactions_features = customer_interactions.pivot(
    index='customer_id',
    on='type_subtype', 
    values=['n', 'days_since_last'],
    aggregate_function='first' # There is only one value per customer
)

CPU times: user 13.5 ms, sys: 11 ms, total: 24.5 ms
Wall time: 11.1 ms


In [19]:
interactions_features.shape

(42095, 9)

---

## `combined_features`

In [20]:
features = core_data.join(
    usage_features,
    on='rating_account_id',
    how='left'
).join(
    interactions_features,
    on='customer_id',
    how='left'
)

In [21]:
features.head()

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64
"""289094""","""4.161115""",36,878,325,False,False,20.0,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.2,-0.7,0,2,0,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,,,,,,,,
"""677626""","""2.429976""",34,998,614,False,False,0.0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.5,-0.5,1,1,0,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,,,1.0,1.0,,,87.0,118.0
"""769928""","""3.875044""",36,37,-26,False,True,50.0,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,0.1,-0.3,1,1,0,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,,,,,,,,
"""873260""","""4.649933""",50,503,-149,False,True,20.0,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.8,-0.1,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,,,,,,,,
"""109774""","""3.851059""",47,331,-328,True,True,,46.12,False,3,110.33,False,False,False,True,False,False,3,0.35,0.35,1.4,0.29,0.1,0.6,0.1,0.6,1,True,False,0,4,0.0,0.5,0.5,-0.5,1,…,0.5,-0.5,1,1,0,0.0,0.0,0.0,0,0,1,0.35,0.1,0.35,0.27,0.27,0.5,0.0,-0.5,0.5,-0.5,0.0,0.35,0.0,0.35,0.6,0.1,0.1,0.6,,,,,,,,


In [22]:
%%time

# Filling null values from intereactions features
features = features.with_columns([
    pl.when(pl.col(col).is_null())
    .then(0)
    .otherwise(pl.col(col))
    .alias(col)
    for col in features.columns if col.startswith('n_')
] + [
    pl.when(pl.col(col).is_null())
    .then(-1)
    .otherwise(pl.col(col))
    .alias(col)
    for col in features.columns if col.startswith('days_since_last')
])

CPU times: user 1.29 ms, sys: 2.27 ms, total: 3.55 ms
Wall time: 1.15 ms


### Dealing with null values in `available_gb`

In [23]:
features.filter(pl.col('available_gb').is_null()).select('rating_account_id', 'available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')

rating_account_id,available_gb,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,i64,f64,f64,f64,f64
"""109774""",,0.6,0.1,0.1,0.6
"""781755""",,0.6,0.8,0.3,0
"""827238""",,0,0.7,0.9,0.1
"""330581""",,0,0.2,0.2,0.9
"""416121""",,0.8,0.8,0.3,0.5
…,…,…,…,…,…
"""662172""",,54.7,62.5,24.7,37.4
"""556788""",,19.6,32.6,31,65.2
"""283647""",,51.3,18.1,53.3,39.2
"""581854""",,49.5,21.5,27.7,18.6


In [24]:
%%time

# Option A1: predict the value with a linear model

# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)

X = df[['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb']]
y = df['available_gb']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Ridge regression (L2)
ridge = Ridge()
ridge.fit(X_train, y_train)

# Predict and evaluate
y_pred = ridge.predict(X_test)
mae_linear_model = mean_absolute_error(y_test, y_pred)
print(f'MAE Ridge: {mae_linear_model:.4f}')

MAE Ridge: 15.0033
CPU times: user 17.3 ms, sys: 17.3 ms, total: 34.5 ms
Wall time: 13.3 ms


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [25]:
%%time

# Option A2: predict the value with a linear model

# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)

X = df[['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb']]
y = df['available_gb']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Lasso regression (L1)
lasso = Lasso()
lasso.fit(X_train, y_train)

# Predict and evaluate
y_pred = lasso.predict(X_test)
mae_lasso = mean_absolute_error(y_test, y_pred)
print(f'MAE Lasso: {mae_lasso:.4f}')

MAE Lasso: 14.9996
CPU times: user 19.2 ms, sys: 12.6 ms, total: 31.8 ms
Wall time: 12.4 ms


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [26]:
%%time

# Option B: mean of the previous contract activities

columns_to_average = ['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb']

# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)


# Compute horizontal mean and find closest value
result_df = df.with_columns([
    # Compute horizontal mean
    pl.mean_horizontal(columns_to_average).alias('prediction')
])

mae_horizontal_mean = mean_absolute_error(result_df['available_gb'], result_df['prediction'])
print(f'MAE horizontal_mean: {mae_horizontal_mean:.4f}')

MAE horizontal_mean: 21.8625
CPU times: user 14.8 ms, sys: 6.35 ms, total: 21.1 ms
Wall time: 5.78 ms


The error of the linear models is lower compared to the mean of the activities. **Winning option A2**

In [27]:
# APPLY THE PREDICTION TO MISSING ROWS
# Prepare the data: drop rows with null available_gb and select numeric features
df = (
        features
            .filter(pl.col('available_gb').is_not_null())
            .select('available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')
)

X = df.select(['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb'])
y = df.select('available_gb').to_series()

df_missing = (
        features
            .filter(pl.col('available_gb').is_null())
            .select(pl.exclude("available_gb"))
)

X_missing = df_missing.select(['last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb'])


# Fit Lasso regression (L1)
lasso = Lasso()
lasso.fit(X, y)

# Predict
df_missing = df_missing.with_columns(available_gb=lasso.predict(X_missing))

# Since the value cannot be float, mapping the predicted values to similar options from data
available_values = features.filter(pl.col('available_gb').is_not_null()).select(pl.col('available_gb')).unique().to_series().to_list()

def find_closest(val, avail_list):
    'Find closest value from available_values list'
    if val is None:
        return None
    return min(avail_list, key=lambda x: abs(x - val))

# Compute horizontal mean and find closest value
df_missing = df_missing.with_columns([
    # Find closest available value
    pl.col('available_gb').map_elements(
        lambda x: find_closest(x, available_values), 
        return_dtype=pl.Int64
    ).alias('available_gb')
])

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [28]:
%%time

features = pl.concat(
    [
        features.filter(pl.col('available_gb').is_not_null()),
        df_missing
    ],
    how='diagonal'
)

features.head()

CPU times: user 13.4 ms, sys: 2.95 ms, total: 16.4 ms
Wall time: 2.56 ms


rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64
"""289094""","""4.161115""",36,878,325,False,False,20,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.2,-0.7,0,2,0,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,0,0,0,0,-1,-1,-1,-1
"""677626""","""2.429976""",34,998,614,False,False,0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.5,-0.5,1,1,0,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,0,0,1,1,-1,-1,87,118
"""769928""","""3.875044""",36,37,-26,False,True,50,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,0.1,-0.3,1,1,0,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,0,0,0,0,-1,-1,-1,-1
"""873260""","""4.649933""",50,503,-149,False,True,20,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.8,-0.1,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,0,0,0,0,-1,-1,-1,-1
"""692379""","""4.382165""",46,80,-25,False,True,40,60.71,False,55,1.45,False,True,False,False,False,False,1,0.55,0.6,2.2,0.3,0.2,0.8,0.4,0.8,2,True,False,0,4,0.0,0.53,0.6,-0.4,1,…,0.4,-0.6,1,1,0,0.0,0.0,0.0,0,0,1,0.5,0.3,0.6,0.47,0.47,0.6,-0.2,-0.4,0.4,-0.6,0.0,0.42,0.14,0.28,0.8,0.4,0.2,0.8,0,0,0,0,-1,-1,-1,-1


In [29]:
# Checking if there are other columns to fill

null_counts_features = features.select([
    pl.col(col).is_null().sum().alias(f'{col}_nulls') for col in features.columns
])

null_counts_features.select([
    col for col in null_counts_features.columns if null_counts_features[0, col] != 0
])

In [68]:
import pickle
with open(artifacts_dir / 'imputer_model.pkl', 'wb') as f:
    pickle.dump(lasso, f)

### Computing additional features

Computing features based on `available_gb`, since there are no missing values

In [30]:
%%time

# Compute how many times the contract required more than available data, baed on definde threshold of the current plan.

# Compute 20%, 50%, and 80% for available_gb per rating_account_id
thresholds_available_gb = features.group_by('rating_account_id').agg([
    (pl.col('available_gb') / 100 * 25).get(0).round(2).alias('p25'),
    (pl.col('available_gb') / 100 * 50).get(0).round(2).alias('p50'),
    (pl.col('available_gb') / 100 * 70).get(0).round(2).alias('p75'),
])

# Compute, for each month, in which range the usage falls
percentile_exprs = []
for i in range(1, 5):
    percentile_expr = (
        pl.when(pl.col(f'last_{i}_month_usage_gb').is_between(-1, pl.col('p25'), closed='right'))  # -1 otherwise 0 is not counted
        .then(pl.lit('P1'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p25'), pl.col('p50'), closed='right'))
        .then(pl.lit('P2'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p50'), pl.col('p75'), closed='right'))
        .then(pl.lit('P3'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p75'), pl.col('available_gb'), closed='right'))
        .then(pl.lit('P4'))
        .when(pl.col(f'last_{i}_month_usage_gb') > pl.col('available_gb'))
        .then(pl.lit('P5'))  # how many times has exceeded the available data
        .otherwise(pl.lit(None))
        .alias(f'month_{i}_threshold')
    )
    percentile_exprs.append(percentile_expr)

# Compute how many times, in the past 4 months, the usage felt in specific ranges
count_exprs = []
for p in range(1, 6):
    count_expr = sum(
        (pl.col(f'month_{i}_threshold') == f'P{p}').cast(pl.Int32)
        for i in range(1, 5)
    ).alias(f'times_in_p{p}')
    count_exprs.append(count_expr)

# Final computation
features = (
    features
    .join(
        thresholds_available_gb,
        on='rating_account_id',
        how='left')
    .with_columns(percentile_exprs)
    .with_columns(count_exprs)
).drop(['p25', 'p50', 'p75', 'month_1_threshold', 'month_2_threshold', 'month_3_threshold', 'month_4_threshold'])

CPU times: user 35 ms, sys: 28.7 ms, total: 63.7 ms
Wall time: 17.3 ms


In [31]:
# Check that the sum of times_in_p1 to times_in_p5 is always 4 (number of billing months) for each row
check_sum = features.select(
    (pl.col('times_in_p1') + pl.col('times_in_p2') + pl.col('times_in_p3') + pl.col('times_in_p4') + pl.col('times_in_p5')).alias('sum_p')
)

# Count how many rows do not have sum == 4
invalid_rows = check_sum.filter(pl.col('sum_p') != 4).height
print(f"Number of invalid rows: {invalid_rows}")

Number of invalid rows: 0


---

## `final features dataframe`

In [32]:
features.shape

(100000, 83)

In [33]:
features.head()

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel,times_in_p1,times_in_p2,times_in_p3,times_in_p4,times_in_p5
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i32,i32,i32,i32,i32
"""289094""","""4.161115""",36,878,325,False,False,20,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""677626""","""2.429976""",34,998,614,False,False,0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,0,0,1,1,-1,-1,87,118,0,0,0,0,4
"""769928""","""3.875044""",36,37,-26,False,True,50,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""873260""","""4.649933""",50,503,-149,False,True,20,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""692379""","""4.382165""",46,80,-25,False,True,40,60.71,False,55,1.45,False,True,False,False,False,False,1,0.55,0.6,2.2,0.3,0.2,0.8,0.4,0.8,2,True,False,0,4,0.0,0.53,0.6,-0.4,1,…,0.0,0.0,0.0,0,0,1,0.5,0.3,0.6,0.47,0.47,0.6,-0.2,-0.4,0.4,-0.6,0.0,0.42,0.14,0.28,0.8,0.4,0.2,0.8,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0


# Features cleaning

## Correlation analysis

In [34]:
numeric_and_booleans_features = features.select(cs.numeric() | cs.boolean())

In [35]:
'has_done_upselling' in numeric_and_booleans_features.columns

True

In [36]:
%%time

correlation_matrix = numeric_and_booleans_features.corr()

CPU times: user 20.6 ms, sys: 40.7 ms, total: 61.3 ms
Wall time: 40.9 ms


In [37]:
correlation_matrix

age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,…,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel,times_in_p1,times_in_p2,times_in_p3,times_in_p4,times_in_p5
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.0629525879053477,0.007917425991875747,-0.004591802863005269,-0.0005704522106472497,0.002001530818124203,-0.001424864767973105,-0.04153326724227131,-0.04689189462413822,0.0035360243153290272,0.0039747041463997265,-0.002902167339478833,0.001961146967666313,0.0009508412788796461,0.0017371282874024821,0.0002704501086987437,-0.003392510845809081,-0.0731307589494203,-0.07117159510219447,-0.07313072441999216,-0.07526670691780543,-0.06588505936769493,-0.07627248976390334,-0.06852675138912731,-0.0721147168654201,0.0024213213370010357,0.0028884406207485866,0.0027643128541414304,0.010187406343517207,-0.010187406343517258,0.000702389814970992,-0.069869446847254,-0.0643422888980605,0.06525763675673314,-0.0027109579596410914,-0.003923417078655652,0.013449505856102882,…,0.0007018860866903778,0.0007018860866903778,0.0007018860866903778,-0.002796473162905392,-0.00016122916901735,0.008278462206984744,-0.07023640985753887,-0.07118943304127713,-0.0721607883384821,-0.0723979843256634,-0.07251675544933986,-0.0022278083988556217,0.00481023193773842,-0.001876581624289159,0.0025777466018754532,0.0029294087217664184,0.0007018860866903778,-0.05818506841796055,-0.06188408762322747,-0.05992545507252038,-0.0679768400853932,-0.06941992923036822,-0.06614590372886239,-0.06763631390301206,-0.002961927111691673,-0.0036624528884945783,-0.00106025628151353,0.002456956467397652,-0.0021735524770572197,-0.001721701728603328,-0.004082781479783177,0.0007222265215129422,0.04770564651922236,0.012602850196628627,-0.011356125642465338,-0.024614597894047057,-0.0488429499872255
-0.0629525879053477,1,-0.0003783910010728898,-0.0023025678524813092,0.002191272514161916,-0.032126065687679374,0.0030451716731356077,0.004379550151761059,0.8171535046016165,-0.07300261199878585,0.002635506103491189,0.00028419739566267453,-0.004675920594287974,-0.0026743838921719247,-0.005180140865737749,0.0054936273735810524,0.002365051770419589,0.7632866265204288,0.7508626128730687,0.7632872022690763,0.7272201981256863,0.7109202571111088,0.7732354631860756,0.7318593393861054,0.7537621693882375,-0.005097468591355103,-0.0019697892938416246,-0.004880964016446549,-0.09475926466383468,0.0947592646638343,-0.0015123138415764153,0.6811245720092708,0.6307247798969505,-0.629192188107774,0.03264938171410441,0.03404249035232137,-0.13522088707618482,…,-0.0015110509232824144,-0.0015110509232824144,-0.0015110509232824144,0.015538860151976431,0.013517621426203845,-0.08134120648698065,0.743420517580498,0.7432780142548623,0.7427992699446642,0.7563778588355978,0.7563208014063192,-0.003962065315546012,0.008440296747652006,-0.006002610013797714,0.0044702492342888625,0.0024312511947036956,-0.0015110509232824144,0.5862943391396993,0.5878593834322124,0.5894401962885205,0.7074652280240051,0.7068184459534343,0.7085158996998931,0.7074916297772587,0.004214446606026726,0.002181114771445765,0.0024240940371759866,0.0038562618871116,0.00436669577461136,0.0014775262795115821,-0.001626976031355502,0.0029494927434599453,-0.4651370824554912,-0.1569738590849741,0.08043653464481942,0.26470369022925355,0.4983430708639657
0.007917425991875747,-0.00037839100107288977,1,0.002836621622797751,-0.0017270762444642996,0.0007421669104085086,0.002179619852898708,-0.02270540324045503,0.5761108710174909,-0.15074952149015686,0.7966607385345115,0.0022402108438379946,0.001793313269977867,-0.002214456294861528,-0.00019307999874908915,-0.00018573325638275372,0.0031184382252110996,-0.0033680560210308285,-0.003378621844462236,-0.0033670204985768964,-0.0022782913243613277,-0.003511932081388124,-0.0031352612191952586,-0.003881068847467549,-0.0029258975348432146,-0.0033505124922029133,-0.002559247357206389,-0.0032729940735209944,-0.001084621555828734,0.00108462155582873,0.004231075366245346,-0.0012913676246216865,-0.0005253656943313156,0.0012444482226628506,0.0010398710334205306,-0.0009434670746599267,-0.00019916025160278175,…,0.0042281609550089815,0.0042281609550089815,0.0042281609550089815,0.008375570520987039,-0.00671419063622088,-0.004642390731302243,-0.0031345016415436,-0.0016150839423913234,-0.0034218387587696517,-0.0023082638656146664,-0.003235868466698707,-0.0013602383904919408,-0.0033888716424879377,0.009003476899453028,-0.004733428090167229,0.005614739084775776,0.0042281609550089815,-0.0037692239748395094,-0.0010507580856376688,0.0012077114722154607,-0.006000834291976932,-0.0005025521403264842,-0.0025678096237829354,-0.0034033625327004667,0.004289310345529904,-0.004545080383924308,-0.000550729968203837,0.008401502292391755,0.004114870434398843,-0.004200652333155454,-0.0014239168372252664,0.0025320717966222225,0.003595027248978508,0.000571795409354263,-0.002259417399267957,0.00031672611610411816,-0.0036916032443004864
-0.0045918028630052695,-0.002302567852481309,0.002836621622797751,1,0.00044653260013443886,-0.0007265925023272958,0.003530696955821945,-0.0007153183529009393,-0.0002469677269840507,0.005304590702156782,0.0031468431797214984,-0.0018501656333439196,-0.0019855200479597743,0.004639694472270228,-0.0006994313105863596,-0.002148628346855223,-0.007399778373458924,-0.00036387406011787306,0.00032367384072212334,-0.0003643486087252424,-0.0001387047513497202,-0.0016886485359202952,-0.0007408673071813585,0.0005407046809743232,0.0001465050228351968,0.00266922216023411,0.004323862588912817,-0.0033479916949822326,-0.00043438304413660273,0.0004343830441365108,0.0013515627002632896,-0.000005663737023073465,0.0001335174116249571,-0.00025821578625268786,0.0030049416348714515,-0.0015036729066844846,-0.003052345920329059,…,0.0013499325064134057,0.0013499325064134057,0.0013499325064134057,0.0009211738308017993,-0.0010976404787036002,0.000495157232524441,-0.0007434587933056248,-0.0006425903034163827,0.00003486163028747551,-0.0003106479234465315,-0.0006073192648943126,0.0034742949891955117,-0.003787841108221961,0.0016777168352904751,-0.0003171353616405199,-0.0021068585284736647,0.0013499325064134057,-0.000149254235854357,-0.0007267262013227692,0.0007879608197672401,-0.0004793605118667085,0.0005479364355261209,-0.0017658823294663898,0.0003518833513482501,0.0018968546922944463,-0.004064966673513941,0.0006671059750851179,-0.0013340506894135374,0.0028737862974820244,-0.00520698772590246,0.000801703793485195,-0.001116240707747035,0.004908176055550588,-0.0015720367501634246,-0.004296473840202138,-0.004526820461447913,-0.0017769477542906008
-0.0005704522106472497,0.002191272514161916,-0.0017270762444642998,0.00044653260013443886,1,0.003973339897180192,-0.002199416869560693,0.03414953052150721,0.0007955623976277197,0.0008558560142034082,0.0005701964969026256,-0.0004967342566144464,-0.0026001613540908948,0.004291713228338531,-0.00033392055525764675,-0.0028338386441610765,0.004800126165530288,0.0004998491989980647,0.00015949929910605688,0.0005009475564805672,0.002714904968511883,-0.0002496032915360592,0.0013383993034231333,-0.0006532016362665572,0.000800729729297818,-0.0019588304706103054,-0.0019456597562547344,0.0023120308307689255,0.0008860887027977192,-0.0008860887027979678,-0.0006902612895723598,0.0033592135326741034,0.003963317145546357,-0.0026316178119499553,-0.00514027086683799,0.0035799532097934026,0.0031799146862987473,…,-0.0006910313027378197,-0.0006910313027378197,-0.0006910313027378197,-0.0020711702709586854,0.002820055022146893,-0.002099248096349946,0.0004149714165760444,-0.0002638984891702955,0.0005605959718353852,0.0001651411895967431,0.0003169151237531785,0.0018720020621991486,0.0002461717794627941,-0.002810463135843339,0.0021095376520570003,-0.0025634997389916443,-0.0006910313027378197,0.003465434573746271,0.0030783972205729575,0.000518131104137173,0.0013910333642018313,-0.00032734152750675175,-0.0001755415273111775,0.0009670697878180921,-0.001350850083372308,-0.0008705444981992911,0.0024878267534031977,0.0038602315018424495,0.0009160733249261185,-0.0036879796846018566,0.002072694952002621,0.002538730639622824,0.002204021963765657,-0.003010371970463974,-0.0008420598791021157,0.0017026248514040226,-0.0008350944593033289
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.04770564651922235,-0.4651370824554912,0.003595027248978508,0.004908176055550588,0.002204021963765657,0.4430092357502365,-0.0015219957873547272,-0.016437533277407306,-0.3781176029495978,0.03603245876901593,0.0026703476137642378,-0.006586134299671042,0.00461998062928485,0.003203557312516652,0.003962926803507999,-0.00046385275416894773,-0.0010580614604689812,-0.6574356700257723,-0.6477335996817698,-0.6574351617629731,-0.5921330567571854,-0.6324734010951043,-0.655017123246803,-0.636629303634189,-0.646045082105221,-0.0022202435618332707,-0.002877214880773874,0.0002603893665824224,0.19473479366796784,-0.19473479366796784,0.0013269067866670428,-0.5552362453794027,-0.513860477389615,0.5131083533982003,-0.05017190324175761,-0.05086622658137377,0.20486216667898915,…,0.0013272300366106815,0.0013272300366106815,0.0013272300366106815,-0.023392744802639868,-0.021003487982898188,0.1242839627454494,-0.6401201528357924,-0.6404085652107104,-0.6399923369403572,-0.6515520568283352,-0.6515088247415651,0.0034509210628444922,-0.006026941760205114,0.003915808807679355,-0.0025729495726770513,-0.0021063998790940668,0.0013272300366106815,-0.47872300448702526,-0.47902096232906094,-0.47956246776888367,-0.609165030501231,-0.60937673877697,-0.610077632444146,-0.609171658077355,0.000059637575998672714,-0.0012191470648234394,-0.0014531359448973823,-0.0026484737880406043,0.00007632894699679356,-0.003910193406858348,0.0009721214954387538,-0.0018419967704415702,0.9999999999999999,-0.25830911225992387,-0.30792867420658687,-0.35295243923304315,-0.735425379332541
0.012602850196628627,-0.1569738590849741,0.0005717954093542631,-0.0015720367501634244,-0.0030103719704639743,0.1201220359568861,-0.0011358179365472587,-0.008374254196067613,-0.12797638259623856,0.010083043683961538,-0.0037605243972385857,0.00571890715742588,-0.001222864892321493,-0.0023102559203584338,0.004922708952292458,-0.003581813816815024,-0.004045463252001213,-0.0805348863550965,-0.07978097346060944,-0.08053626632888904,-0.08836353485328555,-0.06601591086663423,-0.08501006101004556,-0.07663393330581635,-0.08098256481623868,0.004011439195302195,-0.0013450173125470343,0.001554746358504374,-0.10143640503854286,0.10143640503854286,-0.00517536800107382,-0.08260255851686844,-0.07864889497629995,0.07415262192466193,0.01909079448633173,0.026510056266762654,-0.0924458147830231,…,-0.005175989802762897,-0.005175989802762897,-0.005175989802762897,-0.000971346860364211,0.020929835249853847,-0.05588535707487825,-0.07945220110667435,-0.07822605127722324,-0.07736023862536541,-0.08032589540493207,-0.07914751670547175,-0.002426901830718723,-0.0018252937563705144,-0.000967793660772168,-0.004236585771998165,-0.0027908536510668565,-0.005175989802762897,-0.07256802452784236,-0.07229930076101733,-0.07123041550754901,-0.07319372401384083,-0.07410161945816272,-0.0748533773291152,-0.07648248613972104,-0.0040087652544593885,-0.0033501395087819326,-0.0022470159242042164,-0.0003515360271087144,-0.006087907692973942,-0.005219717915631802,-0.00009718130508292959,-0.000762449350718894,-0.25830911225992387,0.9999999999999998,0.06638363851403532,-0.06603850435504893,-0.32501826130446215
-0.011356125642465338,0.08043653464481942,-0.002259417399267957,-0.004296473840202139,-0.0008420598791021157,0.02659785973540774,0.002762929056969013,0.004138773679560465,0.06444416664037725,-0.0035336615281299534,-0.0033516336189887584,0.002592873468881893,0.0011823862420105719,-0.004517082988481224,-0.0013179389639532302,0.002315383633963025,0.0022099155654734513,0.12771489719017373,0.11951849314974672,0.12771508845412483,0.14379934723455692,0.11589738196132676,0.13986866488074107,0.11073242930949861,0.12454400153038282,0.007081831567543983,0.006931307039372357,0.002053291609496798,-0.0648662423144849,0.0648662423144849,0.005930550562057041,0.13558735367085836,0.12767042540264215,-0.12355130756750196,0.02944772791370152,0.006673307821826225,-0.07328089869685903,…,0.005929657650462606,0.005929657650462606,0.005929657650462606,0.023371756848586,-0.007699840513374086,-0.04385520067362952,0.1260394233735784,0.1238255767415046,0.12263496873387547,0.1270551432486157,0.12568753861141202,0.002117545597359983,0.005462335963985856,-0.0015989045044219935,0.007554921578265701,0.0038584171305849234,0.005929657650462606,0.11510011038398762,0.12127235937571822,0.11240751451407237,0.11698705992280377,0.11650811063659408,0.11927187169760539,0.12079954414972321,0.0021556920565726353,0.0021975313566573347,-0.0024947330179182444,0.00012364639706061308,0.004345671330002707,0.0005387009612486942,-0.0037440990750542275,0.0023767493520834536,-0.3079286742065868,0.06638363851403531,1,0.2371670874232743,-0.06804662056053945
-0.024614597894047054,0.26470369022925355,0.00031672611610411816,-0.004526820461447913,0.0017026248514040228,0.07138108632862197,-0.0013717817552834875,-0.004721940363697447,0.2165438505737108,-0.02055105593527626,-0.000997789067152584,-0.002061289632341357,-0.0047675891075099515,0.001963607921332731,-0.0034396984324922184,0.002377627935435075,0.00006538944256797002,0.3146524804609388,0.30125243815782193,0.31465223603047016,0.30811558755183915,0.298863399870991,0.32825753163035903,0.29329312440995947,0.3026811792337131,-0.0014625400011626585,0.00021636773445934235,0.0027606280589521684,-0.07417531216266414,0.07417531216266414,-0.0054306704485699115,0.2896052593432555,0.26515322560237997,-0.2706238411890056,0.025420292851836286,0.020893193177627927,-0.09391298792010366,…,-0.00543180670905028,-0.00543180670905028,-0.00543180670905028,0.010824280691351369,0.009214047973634914,-0.05609551368190936,0.30522823393168946,0.307492815112208,0.3074433889487827,0.31164617985221,0.3126778851338649,-0.006027484428002714,0.0006280708297527726,-0.00008370792167351961,-0.005375769999349422,0.0005437491386760237,-0.00543180670905028,0.24635443777407748,0.2547974109147824,0.2524911491623876,0.29208534715771983,0.29328747285145385,0.2922381597706389,0.28913335872234,-0.0007395567774847541,-0.00007880956307668456,-0.001250865119252942,0.0018811956779391586,0.0022091656375357147,0.0011698713406287345,-0.0022377223835694953,-0.000815744639347737,-0.35295243923304315,-0.06603850435504893,0.23716708742327428,1,0.05559037654764091


In [38]:
fig = px.imshow(
    correlation_matrix.to_numpy(),
    labels=dict(x='Features', y='Features', color='Correlation'),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    aspect='auto'
)
fig.update_layout(
    width=1800,
    height=1800,
    title='Correlation Matrix Heatmap'
)
fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

# Blue means that variable X and variable Y follow the same behaviour (both increasing or decreasing)
# Red means that variable X has the opposite behaviour of variable Y

From the plot, it seems that the target `has_done_upselling` is **not correlated** with any other column

In [39]:
%%time 

correlation_analysis = (
    correlation_matrix
    .unpivot(index=None, variable_name='col1', value_name='correlation')
    .with_columns([
        pl.repeat(correlation_matrix.columns, correlation_matrix.height).flatten().alias('col2'),
        pl.col('correlation').abs()
    ])
    .filter(pl.col('col1') != pl.col('col2'))
    .filter(pl.col('col1') < pl.col('col2'))  # Keep only one pair per combination
    .with_columns([
        pl.when(pl.col('correlation') > 0.99)
            .then(pl.lit('identical'))
        .when(pl.col('correlation') > 0.9)
            .then(pl.lit('high'))
        .when(pl.col('correlation') > 0.8)
            .then(pl.lit('high-medium'))
        .when(pl.col('correlation') > 0.7)
            .then(pl.lit('medium'))
        .otherwise(pl.lit('ok'))
        .alias('analysis')
    ])
    .sort('correlation', descending=True)
)

CPU times: user 2 ms, sys: 3.57 ms, total: 5.56 ms
Wall time: 2.84 ms


In [40]:
correlation_analysis.group_by('analysis', maintain_order=True).len()

analysis,len
str,u32
"""identical""",19
"""high""",100
"""high-medium""",58
"""medium""",101
"""ok""",2962


In [41]:
has_done_upselling_corr = correlation_analysis.filter(
    (pl.col('col1') == 'has_done_upselling') | (pl.col('col2') == 'has_done_upselling')
)
has_done_upselling_corr

col1,correlation,col2,analysis
str,f64,str,str
"""available_gb""",0.052664786297940996,"""has_done_upselling""","""ok"""
"""age""",0.04153326724227132,"""has_done_upselling""","""ok"""
"""has_done_upselling""",0.03414953052150721,"""is_magenta1_customer""","""ok"""
"""gross_mrc""",0.02619134627808244,"""has_done_upselling""","""ok"""
"""has_done_upselling""",0.023156656135708362,"""times_in_p5""","""ok"""
…,…,…,…
"""has_done_upselling""",0.0005116889862998758,"""months_with_roaming""","""ok"""
"""has_done_upselling""",0.00048133868409945736,"""last_3_2mo_rolling_stdev""","""ok"""
"""has_done_upselling""",0.00040420281720261844,"""max_delta_1mo_decrease""","""ok"""
"""ever_used_roaming""",0.0002376346832045547,"""has_done_upselling""","""ok"""


### Anlysis correlated networks

Instead of removing all the features that exceed a certain threshold. Using this approach of showing the connected features will help to keep some features that retain important business knowldge.

In [42]:
G_70, remove_70, clusters_70 = correlation_graph_analysis(correlation_analysis, 0.7)

In [43]:
plot_correlation_network(G_70, pos=nx.spring_layout(G_70, seed=42))

In [44]:
G_80, remove_80, clusters_80 = correlation_graph_analysis(correlation_analysis, 0.8)

In [45]:
plot_correlation_network(G_80, pos=nx.spring_layout(G_80, seed=42))

In [46]:
G_90, remove_90, clusters_90 = correlation_graph_analysis(correlation_analysis, 0.9)

In [47]:
plot_correlation_network(G_90, pos=nx.spring_layout(G_90, seed=42))

In [48]:
G_95, remove_95, clusters_95 = correlation_graph_analysis(correlation_analysis, 0.95)

In [49]:
plot_correlation_network(G_95, pos=nx.spring_layout(G_95, seed=42))

## Variance analysis

In [50]:
only_numeric_features = features.select(cs.numeric() & ~cs.boolean())

In [51]:
%%time

# Compute the standard deviation
variance_analysis = (
    features
    .select([
        pl.col(col).std()
        for col in numeric_and_booleans_features.columns
    ])
    .unpivot(variable_name='feature', value_name='stdev')
    .sort('stdev', descending=True)
)

CPU times: user 8.26 ms, sys: 2.79 ms, total: 11.1 ms
Wall time: 1.91 ms


In [52]:
variance_analysis

feature,stdev
str,f64
"""contract_binding_days""",599.638049222254
"""contract_lifetime_days""",490.1271217887317
"""remaining_binding_days""",345.64345852492
"""total_usage_gb""",69.21684430203867
"""days_since_last_produkte&services-tarifwechsel""",42.254754177537265
…,…
"""active_usage_months""",0.23464950586759398
"""months_with_no_delta_3mo_change""",0.17850662773735168
"""is_oneplus""",0.15636891670783276
"""is_xiaomi""",0.1545039087355984


In [53]:
%%time

# Unique count dataframe (excluding boolean columns)
unique_count = (
    features
    .select([
        pl.col(col).n_unique().alias(col)
        for col in only_numeric_features.columns 
    ])
    .unpivot(variable_name="feature", value_name="unique_count")
    .sort("unique_count")
)


CPU times: user 55 ms, sys: 7.45 ms, total: 62.4 ms
Wall time: 9.03 ms


In [54]:
unique_count

feature,unique_count
str,u32
"""months_with_delta_3mo_increase""",2
"""months_with_delta_3mo_decrease""",2
"""months_with_no_delta_3mo_change""",2
"""months_with_delta_2mo_increase""",3
"""months_with_delta_2mo_decrease""",3
…,…
"""completion_rate""",2487
"""contract_binding_days""",2516
"""usage_std_gb""",2671
"""avg_monthly_usage_gb""",2748


## Cleaning

In [55]:
do_not_remove = ['has_done_upselling']

In [56]:
stdev_threshold = 0.1
stdev_1quantile = variance_analysis['stdev'].quantile(0.1)

In [57]:
low_variance_features = variance_analysis.filter(
    pl.col('stdev') <= stdev_threshold
)['feature'].unique()

low_variance_features

feature
str
"""always_used_roaming"""


In [58]:
highly_correlated_keep = [
    'active_usage_months', 'total_usage_gb', 'avg_monthly_usage_gb', 'max_monthly_usage_gb',
    'contract_lifetime_days', 'last_1_delta_3mo', 'months_with_delta_3mo_increase', 'months_with_delta_2mo_increase', 'months_with_delta_3mo_increase'
]

features_from_clusters = sorted(set().union(*clusters_80))
highly_correlated_remove = list(set(features_from_clusters) - set(highly_correlated_keep))

In [62]:
# Cleaning the dataset
features_to_remove = (
    set(low_variance_features.to_list()) | set(highly_correlated_remove)
) - set(do_not_remove)

print(f'Dimensions before cleaning: {features.shape}')

features_cleaned = features.drop(list(features_to_remove))

print(f'Dimensions after cleaning: {features_cleaned.shape}')

Dimensions before cleaning: (100000, 83)
Dimensions after cleaning: (100000, 52)


In [60]:
features_cleaned.columns

['rating_account_id',
 'customer_id',
 'age',
 'contract_lifetime_days',
 'remaining_binding_days',
 'has_special_offer',
 'is_magenta1_customer',
 'available_gb',
 'gross_mrc',
 'has_done_upselling',
 'completion_rate',
 'is_bounded',
 'is_huawei',
 'is_oneplus',
 'is_samsung',
 'is_xiaomi',
 'is_iphone',
 'n_contracts_per_customer',
 'avg_monthly_usage_gb',
 'total_usage_gb',
 'max_monthly_usage_gb',
 'months_with_roaming',
 'ever_used_roaming',
 'active_usage_months',
 'months_with_no_delta_1mo_change',
 'avg_delta_2mo',
 'delta_2mo_volatility',
 'max_delta_2mo_increase',
 'max_delta_2mo_decrease',
 'months_with_delta_2mo_increase',
 'months_with_no_delta_2mo_change',
 'months_with_delta_3mo_increase',
 'months_with_no_delta_3mo_change',
 'last_1_delta_1mo',
 'last_2_delta_1mo',
 'last_3_delta_1mo',
 'last_1_delta_2mo',
 'last_2_delta_2mo',
 'last_1_delta_3mo',
 'n_rechnungsanfragen',
 'n_produkte&services-tarifdetails',
 'n_prolongation',
 'n_produkte&services-tarifwechsel',
 'days

# Storing

In [None]:
%%time

for directory in [features_dir, train_dir, test_dir, artifacts_dir]:
    directory.mkdir(parents=True, exist_ok=True)

# Save features
features.write_parquet(features_dir / 'features_cleaned-v0.parquet')
features_cleaned.write_parquet(features_dir / 'features-v0.parquet')

# Prepare features and target for splitting
X = features_cleaned.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y = features_cleaned.select('has_done_upselling')

# Convert to numpy for sklearn
X_np = X.to_numpy()
y_np = y.to_numpy().ravel()

# Split into train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_np, y_np, test_size=0.2, random_state=42, stratify=y_np
)

# Convert back to Polars and save main split
train_df = pl.concat([
    pl.DataFrame(X_train, schema=X.columns),
    pl.DataFrame({'has_done_upselling': y_train})
], how='horizontal')

test_df = pl.concat([
    pl.DataFrame(X_test, schema=X.columns),
    pl.DataFrame({'has_done_upselling': y_test})
], how='horizontal')

train_df.write_parquet(train_dir / 'data-v0-80.parquet')
test_df.write_parquet(test_dir / 'data-v0-20.parquet')

# Split test data for meta learners (50/50)
X_meta_train, X_meta_test, y_meta_train, y_meta_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42, stratify=y_test
)

# Convert back to Polars and save meta splits
train_meta_df = pl.concat([
    pl.DataFrame(X_meta_train, schema=X.columns),
    pl.DataFrame({'has_done_upselling': y_meta_train})
], how='horizontal')

test_meta_df = pl.concat([
    pl.DataFrame(X_meta_test, schema=X.columns),
    pl.DataFrame({'has_done_upselling': y_meta_test})
], how='horizontal')

train_meta_df.write_parquet(train_dir / 'data-meta-v0-50.parquet')
test_meta_df.write_parquet(test_dir / 'data-meta-v0-50.parquet')

**Features version 0**: 

- age imputed with Ridge regression
    
- stdev_threshold = 0.1
    
- using netowrk analysis for removing correlated features