# Imports and definitions

In [66]:
import pickle
from pathlib import Path

import polars as pl
import polars.selectors as cs

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import plotly.express as px
import plotly.graph_objects as go

import networkx as nx


_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [3]:
base_dir = Path('/Users/danlab/code/magenta-task/')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
raw_dir = data_dir / "raw"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'


In [4]:
def load_artifact(targ_file:str):
    targ_path = raw_dir / targ_file
    
    if not targ_path.exists():
        raise FileNotFoundError(f'Artifact {targ_file} not found in {raw_dir}')

    with open(targ_path,'rb') as fp:
        test_artifact = pickle.load(fp)

    return pl.from_pandas(test_artifact)


def plot_correlation_network(G, pos):
    edge_x = []
    edge_y = []
    for edge in G.edges(data=True):
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='#888'),
        hoverinfo='none',  # Disable edge hover
        mode='lines'
    )

    node_x = []
    node_y = []
    node_text = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        # Gather all neighbors and their correlations
        neighbors = []
        for nbr in G.neighbors(node):
            corr = G.get_edge_data(node, nbr).get('correlation', None)
            if corr is not None:
                neighbors.append(f"{nbr}: {corr:.2f}")
            else:
                neighbors.append(f"{nbr}")
        if neighbors:
            hover = f"{node}<br>Correlations:<br>" + "<br>".join(neighbors)
        else:
            hover = node
        node_text.append(hover)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=list(G.nodes),
        textposition="top center",
        hoverinfo='text',
        hovertext=node_text,
        marker=dict(
            showscale=False,
            color='#87ceeb',
            size=20,
            line_width=2
        )
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title='Clusters of Highly Correlated Features',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        width=1200,
                        height=800
                    ))
    fig.show()


def correlation_graph_analysis(correlation_analysis, threshold):
    # Filter for high correlation
    high_corr_df = correlation_analysis.filter(pl.col("correlation").abs() > threshold)

    # Extract columns and correlation values as lists
    col1_list = high_corr_df["col1"].to_list()
    col2_list = high_corr_df["col2"].to_list()
    corr_list = high_corr_df["correlation"].to_list()

    # Build a graph of correlated columns, adding correlation as edge attribute
    G = nx.Graph()
    for c1, c2, corr in zip(col1_list, col2_list, corr_list):
        G.add_edge(c1, c2, correlation=corr)

    # Find groups of correlated columns (connected components)
    clusters = list(nx.connected_components(G))

    # Select one column from each cluster to keep
    columns_to_keep = [list(cluster)[0] for cluster in clusters]

    # Identify all involved columns
    all_involved = set(col1_list).union(set(col2_list))

    # Find all unique columns from original data
    all_columns = set(correlation_analysis["col1"].to_list()) | set(correlation_analysis["col2"].to_list())

    # Include columns that are not highly correlated with others
    non_clustered = all_columns - all_involved
    columns_to_keep.extend(non_clustered)

    # Final sorted list of columns to keep
    columns_to_keep = set(columns_to_keep)

    # Columns to remove: all columns involved in high correlation minus those selected to keep
    columns_to_remove = sorted(all_involved - columns_to_keep)

    return G, columns_to_remove, clusters

`core_data` <br><br>

| Feature Name           | Description                                                  |
|------------------------|--------------------------------------------------------------|
| rating_account_id      | Unique identifier for the contract account                    |
| customer_id            | Unique identifier for the customer                           |
| age                    | Age of the customer **in years**                                       |
| contract_lifetime_days | Total duration of the customer contract in days              |
| remaining_binding_days | Number of days left in the contract binding period - usual binding period is 2 years - **if it's positive it means that the customer is still in the binding period**       |
| has_special_offer      | Indicates if the customer has a special offer      |
| is_magenta1_customer   | Indicates if the customer is part of the Magenta1 program - fedelty program    |
| available_gb           | Amount of mobile data included in the current tariff         |
| gross_mrc              | Gross monthly recurring charge (in euros)                    |
| smartphone_brand       | Brand of the customer’s smartphone                           |
| has_done_upselling     | Whether the customer has already done an upsell in the last 3 years      |


`usage_info`

| Feature Name           | Description                                                  |
|------------------------|--------------------------------------------------------------|
| rating_account_id      | Unique identifier for the contract account                    |
| billed_period_month_d  | Billing period (monthly)                                     |
| has_used_roaming       | Indicates if roaming was used during the period            |
| used_gb                | Amount of mobile data used in the billing period (in GB)     |


`customer_interactions`

| Feature Name   | Description                                                              |
|----------------|--------------------------------------------------------------------------|
| customer_id    | Unique identifier for the customer                                       |
| type_subtype   | Category and subtype of the interaction (e.g., tariff change, billing)   |
| n              | Number of interactions of this type in the last 6 months                                |
| days_since_last| Number of days since the last interaction of this type                   |


# Read data

In [68]:
%%time

core_data = load_artifact('core_data')
customer_interactions = load_artifact('customer_interactions')
usage_info = load_artifact('usage_info')

CPU times: user 55.1 ms, sys: 31.6 ms, total: 86.7 ms
Wall time: 152 ms


---

# Features computation

## `core_data`

In [6]:
%%time

core_data = core_data.with_columns(
    pl.col('rating_account_id').cast(pl.Utf8),
    pl.col("has_done_upselling").cast(pl.Boolean),
    pl.col("has_special_offer").cast(pl.Boolean),
    pl.col("is_magenta1_customer").cast(pl.Boolean)
)

# Manipulating binding days
core_data = core_data.with_columns(
    (pl.col('contract_lifetime_days') + pl.col('remaining_binding_days')).alias('contract_binding_days'),
    (pl.col('contract_lifetime_days') / (pl.col('contract_lifetime_days') + pl.col('remaining_binding_days'))).round(2).alias('completion_rate'),
    pl.when(pl.col('remaining_binding_days') > 0)
        .then(True)
        .otherwise(False)
        .alias('is_bounded')
)


# One-hot-encoding smartphone brands - extracting the values in order to keep the same order for the columns
# The number of unique values is not too high, so one-hot-encoding is not affecting the dimensionality too much
smartphone_brands_list = core_data.select(pl.col('smartphone_brand')).unique().to_series().sort().to_list()
core_data = core_data.with_columns(
    [
        pl.when(pl.col("smartphone_brand") == brand)
        .then(True)
        .otherwise(False)
        .alias(f"is_{brand.lower()}")
        for brand in smartphone_brands_list
    ]
)
core_data = core_data.drop("smartphone_brand")


# Add how many contract has the customer - including the current one
n_contract_per_customer = core_data.group_by("customer_id").agg(
    pl.col("rating_account_id").count().alias("n_contracts_per_customer")
)
core_data = core_data.join(n_contract_per_customer, on="customer_id", how="left")

CPU times: user 24.6 ms, sys: 22.7 ms, total: 47.3 ms
Wall time: 31.8 ms


In [7]:
core_data

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32
"""289094""","""4.161115""",36,878,325,false,false,20,70,false,1203,0.73,true,false,false,false,false,true,1
"""677626""","""2.429976""",34,998,614,false,false,0,5,false,1612,0.62,true,false,false,true,false,false,1
"""769928""","""3.875044""",36,37,-26,false,true,50,16.94,false,11,3.36,false,false,false,true,false,false,2
"""873260""","""4.649933""",50,503,-149,false,true,20,30.2,true,354,1.42,false,false,false,false,false,true,1
"""109774""","""3.851059""",47,331,-328,true,true,,46.12,false,3,110.33,false,false,false,true,false,false,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""502283""","""5.605022""",88,1573,-576,false,false,10,34.18,false,997,1.58,false,false,false,true,false,false,4
"""618421""","""2.862063""",85,1138,412,true,false,40,50.1,false,1550,0.73,true,false,false,false,false,true,1
"""104422""","""2.414264""",79,1709,-494,false,false,10,12.96,false,1215,1.41,false,false,false,true,false,false,3
"""642380""","""3.619106""",84,1592,403,false,false,10,56.73,false,1995,0.8,true,false,false,true,false,false,2


In [8]:
core_data.shape

(100000, 19)

---

## `usage_info`

In [9]:
%%time

usage_info = usage_info.with_columns([
    pl.col('rating_account_id').cast(pl.Utf8),
    pl.col('billed_period_month_d').cast(pl.Date),
    pl.col('has_used_roaming').cast(pl.Boolean),
    pl.col('used_gb').cast(pl.Float64)
]).sort(['rating_account_id', 'billed_period_month_d'])

CPU times: user 102 ms, sys: 10.1 ms, total: 112 ms
Wall time: 39.6 ms


In [10]:
%%time

month_usage = usage_info.group_by('rating_account_id').agg([
    pl.col('used_gb')
])

month_usage = month_usage.with_columns([
    pl.col('used_gb').list.get(0).alias('last_1_month_usage_gb'),
    pl.col('used_gb').list.get(1).alias('last_2_month_usage_gb'),
    pl.col('used_gb').list.get(2).alias('last_3_month_usage_gb'),
    pl.col('used_gb').list.get(3).alias('last_4_month_usage_gb'),

]).drop('used_gb')


CPU times: user 4.55 ms, sys: 2.85 ms, total: 7.41 ms
Wall time: 5.1 ms


In [11]:
month_usage

rating_account_id,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,f64,f64,f64,f64
"""100010""",0.9,0.8,0.4,0.9
"""100017""",0.7,0.5,0.8,0.3
"""100036""",0.9,0.2,0.2,1
"""100047""",35.6,48.9,35.4,50.7
"""100064""",0.7,0.7,0.3,0.8
…,…,…,…,…
"""999922""",2.3,1.4,3.6,2.1
"""999934""",12.8,13.8,14.5,6.5
"""999940""",9.8,11.6,5.4,14.4
"""999956""",13,10.4,13.7,14.1


In [12]:
%%time

aggregated_features = usage_info.group_by('rating_account_id').agg([
    
    # BASIC USAGE STATISTICS
    pl.col('used_gb').mean().round(2).alias('avg_monthly_usage_gb'),
    pl.col('used_gb').median().round(2).alias('median_monthly_usage_gb'),
    pl.col('used_gb').sum().round(2).alias('total_usage_gb'),
    pl.col('used_gb').std().round(2).alias('usage_std_gb'),
    pl.col('used_gb').min().round(2).alias('min_monthly_usage_gb'),
    pl.col('used_gb').max().round(2).alias('max_monthly_usage_gb'),
    pl.col('used_gb').quantile(0.25).round(2).alias('usage_q25_gb'),
    pl.col('used_gb').quantile(0.75).round(2).alias('usage_q75_gb'),
    
    # ROAMING STATISTICS
    pl.col('has_used_roaming').sum().alias('months_with_roaming'),
    pl.col('has_used_roaming').any().alias('ever_used_roaming'),
    pl.col('has_used_roaming').all().alias('always_used_roaming'),
    
    # # USAGE INTENSITY CATEGORIES
    (pl.col('used_gb') == 0).sum().alias('zero_usage_months'),
    (pl.col('used_gb') > 0).sum().alias('active_usage_months'),
])

CPU times: user 755 ms, sys: 30.3 ms, total: 785 ms
Wall time: 116 ms


In [13]:
aggregated_features

rating_account_id,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months
str,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32
"""100010""",0.75,0.85,3,0.24,0.4,0.9,0.8,0.9,0,false,false,0,4
"""100017""",0.57,0.6,2.3,0.22,0.3,0.8,0.5,0.7,2,true,false,0,4
"""100036""",0.57,0.55,2.3,0.43,0.2,1,0.2,0.9,1,true,false,0,4
"""100047""",42.65,42.25,170.6,8.29,35.4,50.7,35.6,48.9,0,false,false,0,4
"""100064""",0.62,0.7,2.5,0.22,0.3,0.8,0.7,0.7,1,true,false,0,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",2.35,2.2,9.4,0.92,1.4,3.6,2.1,2.3,0,false,false,0,4
"""999934""",11.9,13.3,47.6,3.67,6.5,14.5,12.8,13.8,3,true,false,0,4
"""999940""",10.3,10.7,41.2,3.78,5.4,14.4,9.8,11.6,1,true,false,0,4
"""999956""",12.8,13.35,51.2,1.66,10.4,14.1,13,13.7,0,false,false,0,4


In [14]:
%%time

# CALCULATE TRENDS AND ROLLING METRICS
trend_features = usage_info.group_by('rating_account_id').agg([
    # ROLLING AVERAGES
    # 2-month rolling average
    pl.col('used_gb').rolling_mean_by(
        'billed_period_month_d', window_size='2mo'
    ).alias('avg_2month_rolling_usage_gb'),
    
    # 3-month rolling average
        pl.col('used_gb').rolling_mean_by(
        'billed_period_month_d', window_size='3mo'
    ).alias('avg_3month_rolling_usage_gb'),

    
    # PERIOD-OVER-PERIOD DELTAS
    (pl.col('used_gb') - pl.col('used_gb').shift(1)).alias('delta_1mo'),
    (pl.col('used_gb') - pl.col('used_gb').shift(2)).alias('delta_2mo'),
    (pl.col('used_gb') - pl.col('used_gb').shift(3)).alias('delta_3mo'),

    # VOLATILITY METRICS
    # Rolling standard deviation
    pl.col('used_gb').rolling_std_by(
        'billed_period_month_d', window_size='2mo'
    ).alias('std_2month_rolling_usage_gb')
])

trend_features = trend_features.with_columns([
    # delta_1mo statistics
    pl.col('delta_1mo').list.mean().round(2).alias('avg_delta_1mo'),
    pl.col('delta_1mo').list.std().round(2).alias('delta_1mo_volatility'),
    pl.col('delta_1mo').list.max().round(2).alias('max_delta_1mo_increase'),
    pl.col('delta_1mo').list.min().round(2).alias('max_delta_1mo_decrease'),
    pl.col('delta_1mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_1mo_increase'),
    pl.col('delta_1mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_1mo_decrease'),
    pl.col('delta_1mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_1mo_change'),

    # delta_2mo statistics
    pl.col('delta_2mo').list.mean().round(2).alias('avg_delta_2mo'),
    pl.col('delta_2mo').list.std().round(2).alias('delta_2mo_volatility'),
    pl.col('delta_2mo').list.max().round(2).alias('max_delta_2mo_increase'),
    pl.col('delta_2mo').list.min().round(2).alias('max_delta_2mo_decrease'),
    pl.col('delta_2mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_2mo_increase'),
    pl.col('delta_2mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_2mo_decrease'),
    pl.col('delta_2mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_2mo_change'),

    # delta_3mo statistics
    pl.col('delta_3mo').list.mean().round(2).alias('avg_delta_3mo'),
    pl.col('delta_3mo').list.max().round(2).alias('max_delta_3mo_increase'),
    pl.col('delta_3mo').list.min().round(2).alias('max_delta_3mo_decrease'),
    pl.col('delta_3mo').list.eval(pl.element() > 0).list.sum().alias('months_with_delta_3mo_increase'),
    pl.col('delta_3mo').list.eval(pl.element() < 0).list.sum().alias('months_with_delta_3mo_decrease'),
    pl.col('delta_3mo').list.eval(pl.element() == 0).list.sum().alias('months_with_no_delta_3mo_change'),
])

# The following block extracts the last N values from the rolling/statistical lists for each account.
# Each column contains the most recent, second most recent, etc. value from the corresponding list

trend_features = trend_features.with_columns([
    pl.col('avg_2month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_2mo_rolling_avg')
    for i in range(3)
]).drop('avg_2month_rolling_usage_gb')

trend_features = trend_features.with_columns([
    pl.col('avg_3month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_3mo_rolling_avg')
    for i in range(2)
]).drop('avg_3month_rolling_usage_gb')

trend_features = trend_features.with_columns([
    pl.col('delta_1mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_1mo')
    for i in range(3)
]).drop('delta_1mo')

trend_features = trend_features.with_columns([
    pl.col('delta_2mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_2mo')
    for i in range(2)
]).drop('delta_2mo')

trend_features = trend_features.with_columns([
    pl.col('delta_3mo').list.get(-(i+1)).round(2).alias(f'last_{i+1}_delta_3mo')
    for i in range(1)
]).drop('delta_3mo')

trend_features = trend_features.with_columns([
    pl.col('std_2month_rolling_usage_gb').list.get(-(i+1)).round(2).alias(f'last_{i+1}_2mo_rolling_stdev')
    for i in range(3)
]).drop('std_2month_rolling_usage_gb')

CPU times: user 1.36 s, sys: 62.1 ms, total: 1.42 s
Wall time: 261 ms


In [15]:
trend_features

rating_account_id,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev
str,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""100010""",0,0.46,0.5,-0.4,1,2,0,-0.2,0.42,0.1,-0.5,1,1,0,0,0,0,0,0,1,0.65,0.6,0.85,0.7,0.7,0.5,-0.4,-0.1,0.1,-0.5,0,0.35,0.28,0.07
"""100017""",-0.13,0.4,0.3,-0.5,1,2,0,-0.05,0.21,0.1,-0.2,1,1,0,-0.4,-0.4,-0.4,0,1,0,0.55,0.65,0.6,0.53,0.67,-0.5,0.3,-0.2,-0.2,0.1,-0.4,0.35,0.21,0.14
"""100036""",0.03,0.75,0.8,-0.7,1,1,1,0.05,1.06,0.8,-0.7,1,1,0,0.1,0.1,0.1,1,0,0,0.6,0.2,0.55,0.47,0.43,0.8,0,-0.7,0.8,-0.7,0.1,0.57,0,0.49
"""100047""",5.03,16.08,15.3,-13.5,2,1,0,0.8,1.41,1.8,-0.2,1,1,0,15.1,15.1,15.1,1,0,0,43.05,42.15,42.25,45,39.97,15.3,-13.5,13.3,1.8,-0.2,15.1,10.82,9.55,9.4
"""100064""",0.03,0.45,0.5,-0.4,1,1,1,-0.15,0.35,0.1,-0.4,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.5,0.7,0.6,0.57,0.5,-0.4,0,0.1,-0.4,0.1,0.35,0.28,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",-0.07,1.99,2.2,-1.5,1,2,0,1,0.42,1.3,0.7,2,0,0,-0.2,-0.2,-0.2,0,1,0,2.85,2.5,1.85,2.37,2.43,-1.5,2.2,-0.9,0.7,1.3,-0.2,1.06,1.56,0.64
"""999934""",-2.1,5.11,1,-8,2,1,0,-2.8,6.36,1.7,-7.3,1,1,0,-6.3,-6.3,-6.3,0,1,0,10.5,14.15,13.3,11.6,13.7,-8,0.7,1,-7.3,1.7,-6.3,5.66,0.49,0.71
"""999940""",1.53,7.6,9,-6.2,2,1,0,-0.8,5.09,2.8,-4.4,1,1,0,4.6,4.6,4.6,1,0,0,9.9,8.5,10.7,10.47,8.93,9,-6.2,1.8,2.8,-4.4,4.6,6.36,4.38,1.27
"""999956""",0.37,2.95,3.3,-2.6,2,1,0,2.2,2.12,3.7,0.7,2,0,0,1.1,1.1,1.1,1,0,0,13.9,12.05,11.7,12.73,12.37,0.4,3.3,-2.6,3.7,0.7,1.1,0.28,2.33,1.84


In [16]:
usage_features = aggregated_features.join(
    trend_features, 
    on='rating_account_id', 
    how='left'
).join(
    month_usage,
    on='rating_account_id',
    how='left'
)

In [17]:
usage_features

rating_account_id,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,avg_delta_2mo,delta_2mo_volatility,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""100010""",0.75,0.85,3,0.24,0.4,0.9,0.8,0.9,0,false,false,0,4,0,0.46,0.5,-0.4,1,2,0,-0.2,0.42,0.1,-0.5,1,1,0,0,0,0,0,0,1,0.65,0.6,0.85,0.7,0.7,0.5,-0.4,-0.1,0.1,-0.5,0,0.35,0.28,0.07,0.9,0.8,0.4,0.9
"""100017""",0.57,0.6,2.3,0.22,0.3,0.8,0.5,0.7,2,true,false,0,4,-0.13,0.4,0.3,-0.5,1,2,0,-0.05,0.21,0.1,-0.2,1,1,0,-0.4,-0.4,-0.4,0,1,0,0.55,0.65,0.6,0.53,0.67,-0.5,0.3,-0.2,-0.2,0.1,-0.4,0.35,0.21,0.14,0.7,0.5,0.8,0.3
"""100036""",0.57,0.55,2.3,0.43,0.2,1,0.2,0.9,1,true,false,0,4,0.03,0.75,0.8,-0.7,1,1,1,0.05,1.06,0.8,-0.7,1,1,0,0.1,0.1,0.1,1,0,0,0.6,0.2,0.55,0.47,0.43,0.8,0,-0.7,0.8,-0.7,0.1,0.57,0,0.49,0.9,0.2,0.2,1
"""100047""",42.65,42.25,170.6,8.29,35.4,50.7,35.6,48.9,0,false,false,0,4,5.03,16.08,15.3,-13.5,2,1,0,0.8,1.41,1.8,-0.2,1,1,0,15.1,15.1,15.1,1,0,0,43.05,42.15,42.25,45,39.97,15.3,-13.5,13.3,1.8,-0.2,15.1,10.82,9.55,9.4,35.6,48.9,35.4,50.7
"""100064""",0.62,0.7,2.5,0.22,0.3,0.8,0.7,0.7,1,true,false,0,4,0.03,0.45,0.5,-0.4,1,1,1,-0.15,0.35,0.1,-0.4,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.5,0.7,0.6,0.57,0.5,-0.4,0,0.1,-0.4,0.1,0.35,0.28,0,0.7,0.7,0.3,0.8
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""999922""",2.35,2.2,9.4,0.92,1.4,3.6,2.1,2.3,0,false,false,0,4,-0.07,1.99,2.2,-1.5,1,2,0,1,0.42,1.3,0.7,2,0,0,-0.2,-0.2,-0.2,0,1,0,2.85,2.5,1.85,2.37,2.43,-1.5,2.2,-0.9,0.7,1.3,-0.2,1.06,1.56,0.64,2.3,1.4,3.6,2.1
"""999934""",11.9,13.3,47.6,3.67,6.5,14.5,12.8,13.8,3,true,false,0,4,-2.1,5.11,1,-8,2,1,0,-2.8,6.36,1.7,-7.3,1,1,0,-6.3,-6.3,-6.3,0,1,0,10.5,14.15,13.3,11.6,13.7,-8,0.7,1,-7.3,1.7,-6.3,5.66,0.49,0.71,12.8,13.8,14.5,6.5
"""999940""",10.3,10.7,41.2,3.78,5.4,14.4,9.8,11.6,1,true,false,0,4,1.53,7.6,9,-6.2,2,1,0,-0.8,5.09,2.8,-4.4,1,1,0,4.6,4.6,4.6,1,0,0,9.9,8.5,10.7,10.47,8.93,9,-6.2,1.8,2.8,-4.4,4.6,6.36,4.38,1.27,9.8,11.6,5.4,14.4
"""999956""",12.8,13.35,51.2,1.66,10.4,14.1,13,13.7,0,false,false,0,4,0.37,2.95,3.3,-2.6,2,1,0,2.2,2.12,3.7,0.7,2,0,0,1.1,1.1,1.1,1,0,0,13.9,12.05,11.7,12.73,12.37,0.4,3.3,-2.6,3.7,0.7,1.1,0.28,2.33,1.84,13,10.4,13.7,14.1


In [18]:
usage_features.shape

(100000, 52)

---

## `customer_interactions`

In [19]:
%%time

interactions_features = customer_interactions.pivot(
    index='customer_id',
    on='type_subtype', 
    values=['n', 'days_since_last'],
    aggregate_function='first' # There is only one value per customer
)

CPU times: user 21.6 ms, sys: 13.7 ms, total: 35.3 ms
Wall time: 18.6 ms


In [20]:
interactions_features.shape

(42095, 9)

---

## `combined_features`

In [21]:
features = core_data.join(
    usage_features,
    on='rating_account_id',
    how='left'
).join(
    interactions_features,
    on='customer_id',
    how='left'
)

In [22]:
features.head()

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,max_delta_2mo_increase,max_delta_2mo_decrease,months_with_delta_2mo_increase,months_with_delta_2mo_decrease,months_with_no_delta_2mo_change,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel
str,str,i64,i64,i64,bool,bool,i64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64
"""289094""","""4.161115""",36,878,325,False,False,20.0,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.2,-0.7,0,2,0,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,,,,,,,,
"""677626""","""2.429976""",34,998,614,False,False,0.0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.5,-0.5,1,1,0,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,,,1.0,1.0,,,87.0,118.0
"""769928""","""3.875044""",36,37,-26,False,True,50.0,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,0.1,-0.3,1,1,0,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,,,,,,,,
"""873260""","""4.649933""",50,503,-149,False,True,20.0,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.8,-0.1,1,1,0,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,,,,,,,,
"""109774""","""3.851059""",47,331,-328,True,True,,46.12,False,3,110.33,False,False,False,True,False,False,3,0.35,0.35,1.4,0.29,0.1,0.6,0.1,0.6,1,True,False,0,4,0.0,0.5,0.5,-0.5,1,…,0.5,-0.5,1,1,0,0.0,0.0,0.0,0,0,1,0.35,0.1,0.35,0.27,0.27,0.5,0.0,-0.5,0.5,-0.5,0.0,0.35,0.0,0.35,0.6,0.1,0.1,0.6,,,,,,,,


In [23]:
%%time

# Filling null values from intereactions features
features = features.with_columns([
    pl.when(pl.col(col).is_null())
    .then(0)
    .otherwise(pl.col(col))
    .alias(col)
    for col in features.columns if col.startswith('n_')
] + [
    pl.when(pl.col(col).is_null())
    .then(-1)
    .otherwise(pl.col(col))
    .alias(col)
    for col in features.columns if col.startswith('days_since_last')
])

CPU times: user 1.97 ms, sys: 4.38 ms, total: 6.35 ms
Wall time: 2.44 ms


### Dealing with null values in `available_gb`

In [24]:
features.filter(pl.col('available_gb').is_null()).select('rating_account_id', 'available_gb', 'last_1_month_usage_gb', 'last_2_month_usage_gb', 'last_3_month_usage_gb', 'last_4_month_usage_gb')

rating_account_id,available_gb,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb
str,i64,f64,f64,f64,f64
"""109774""",,0.6,0.1,0.1,0.6
"""781755""",,0.6,0.8,0.3,0
"""827238""",,0,0.7,0.9,0.1
"""330581""",,0,0.2,0.2,0.9
"""416121""",,0.8,0.8,0.3,0.5
…,…,…,…,…,…
"""662172""",,54.7,62.5,24.7,37.4
"""556788""",,19.6,32.6,31,65.2
"""283647""",,51.3,18.1,53.3,39.2
"""581854""",,49.5,21.5,27.7,18.6


In [25]:
%%time

available_gb_values = features['available_gb'].to_numpy().reshape(-1, 1)

imputer = SimpleImputer(strategy='mean')
imputed_available_gb = imputer.fit_transform(available_gb_values).flatten()

CPU times: user 2.89 ms, sys: 1.63 ms, total: 4.52 ms
Wall time: 3.78 ms


In [26]:
features = features.with_columns(
    pl.Series('available_gb', imputed_available_gb)
)

In [27]:
# Checking if there are other columns to fill

null_counts_features = features.select([
    pl.col(col).is_null().sum().alias(f'{col}_nulls') for col in features.columns
])

null_counts_features.select([
    col for col in null_counts_features.columns if null_counts_features[0, col] != 0
])

### Computing additional features

Computing features based on `available_gb`, since there are no missing values

In [28]:
%%time

# Compute how many times the contract required more than available data, baed on definde threshold of the current plan.

# Compute 20%, 50%, and 80% for available_gb per rating_account_id
thresholds_available_gb = features.group_by('rating_account_id').agg([
    (pl.col('available_gb') / 100 * 25).get(0).round(2).alias('p25'),
    (pl.col('available_gb') / 100 * 50).get(0).round(2).alias('p50'),
    (pl.col('available_gb') / 100 * 70).get(0).round(2).alias('p75'),
])

# Compute, for each month, in which range the usage falls
percentile_exprs = []
for i in range(1, 5):
    percentile_expr = (
        pl.when(pl.col(f'last_{i}_month_usage_gb').is_between(-1, pl.col('p25'), closed='right'))  # -1 otherwise 0 is not counted
        .then(pl.lit('P1'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p25'), pl.col('p50'), closed='right'))
        .then(pl.lit('P2'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p50'), pl.col('p75'), closed='right'))
        .then(pl.lit('P3'))
        .when(pl.col(f'last_{i}_month_usage_gb').is_between(pl.col('p75'), pl.col('available_gb'), closed='right'))
        .then(pl.lit('P4'))
        .when(pl.col(f'last_{i}_month_usage_gb') > pl.col('available_gb'))
        .then(pl.lit('P5'))  # how many times has exceeded the available data
        .otherwise(pl.lit(None))
        .alias(f'month_{i}_threshold')
    )
    percentile_exprs.append(percentile_expr)

# Compute how many times, in the past 4 months, the usage felt in specific ranges
count_exprs = []
for p in range(1, 6):
    count_expr = sum(
        (pl.col(f'month_{i}_threshold') == f'P{p}').cast(pl.Int32)
        for i in range(1, 5)
    ).alias(f'times_in_p{p}')
    count_exprs.append(count_expr)

# Final computation
features = (
    features
    .join(
        thresholds_available_gb,
        on='rating_account_id',
        how='left')
    .with_columns(percentile_exprs)
    .with_columns(count_exprs)
).drop(['p25', 'p50', 'p75', 'month_1_threshold', 'month_2_threshold', 'month_3_threshold', 'month_4_threshold'])

CPU times: user 34.9 ms, sys: 20.5 ms, total: 55.3 ms
Wall time: 18.9 ms


In [29]:
# Check that the sum of times_in_p1 to times_in_p5 is always 4 (number of billing months) for each row
check_sum = features.select(
    (pl.col('times_in_p1') + pl.col('times_in_p2') + pl.col('times_in_p3') + pl.col('times_in_p4') + pl.col('times_in_p5')).alias('sum_p')
)

# Count how many rows do not have sum == 4
invalid_rows = check_sum.filter(pl.col('sum_p') != 4).height
print(f"Number of invalid rows: {invalid_rows}")

Number of invalid rows: 0


---

## `final features dataframe`

In [30]:
features.shape

(100000, 83)

In [31]:
features.head()

rating_account_id,customer_id,age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,…,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel,times_in_p1,times_in_p2,times_in_p3,times_in_p4,times_in_p5
str,str,i64,i64,i64,bool,bool,f64,f64,bool,i64,f64,bool,bool,bool,bool,bool,bool,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,bool,bool,u32,u32,f64,f64,f64,f64,u32,…,f64,f64,f64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i32,i32,i32,i32,i32
"""289094""","""4.161115""",36,878,325,False,False,20.0,70.0,False,1203,0.73,True,False,False,False,False,True,1,0.28,0.15,1.1,0.36,0.0,0.8,0.1,0.2,1,True,False,1,3,-0.27,0.29,-0.1,-0.6,0,…,-0.8,-0.8,-0.8,0,1,0,0.05,0.15,0.5,0.1,0.37,-0.1,-0.1,-0.6,-0.2,-0.7,-0.8,0.07,0.07,0.42,0.8,0.2,0.1,0.0,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""677626""","""2.429976""",34,998,614,False,False,0.0,5.0,False,1612,0.62,True,False,False,True,False,False,1,0.65,0.65,2.6,0.31,0.3,1.0,0.5,0.8,1,True,False,0,4,0.07,0.55,0.7,-0.3,1,…,0.2,0.2,0.2,1,0,0,0.65,0.4,0.65,0.6,0.53,0.7,-0.2,-0.3,0.5,-0.5,0.2,0.49,0.14,0.21,0.8,0.5,0.3,1.0,0,0,1,1,-1,-1,87,118,0,0,0,0,4
"""769928""","""3.875044""",36,37,-26,False,True,50.0,16.94,False,11,3.36,False,False,False,True,False,False,2,0.6,0.55,2.4,0.32,0.3,1.0,0.4,0.7,0,False,False,0,4,-0.2,0.56,0.4,-0.7,1,…,-0.6,-0.6,-0.6,0,1,0,0.55,0.5,0.65,0.47,0.67,-0.3,0.4,-0.7,0.1,-0.3,-0.6,0.21,0.28,0.49,1.0,0.3,0.7,0.4,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""873260""","""4.649933""",50,503,-149,False,True,20.0,30.2,True,354,1.42,False,False,False,False,False,True,1,0.38,0.25,1.5,0.36,0.1,0.9,0.2,0.3,0,False,False,0,4,0.03,0.67,0.6,-0.7,2,…,0.1,0.1,0.1,1,0,0,0.55,0.6,0.2,0.47,0.43,-0.7,0.6,0.2,-0.1,0.8,0.1,0.49,0.42,0.14,0.1,0.3,0.9,0.2,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0
"""109774""","""3.851059""",47,331,-328,True,True,24.9989516842939,46.12,False,3,110.33,False,False,False,True,False,False,3,0.35,0.35,1.4,0.29,0.1,0.6,0.1,0.6,1,True,False,0,4,0.0,0.5,0.5,-0.5,1,…,0.0,0.0,0.0,0,0,1,0.35,0.1,0.35,0.27,0.27,0.5,0.0,-0.5,0.5,-0.5,0.0,0.35,0.0,0.35,0.6,0.1,0.1,0.6,0,0,0,0,-1,-1,-1,-1,4,0,0,0,0


# Features cleaning

## Correlation analysis

In [32]:
numeric_and_booleans_features = features.select(cs.numeric() | cs.boolean())

In [33]:
'has_done_upselling' in numeric_and_booleans_features.columns

True

In [34]:
%%time

correlation_matrix = numeric_and_booleans_features.corr()

CPU times: user 27.5 ms, sys: 54.4 ms, total: 81.9 ms
Wall time: 54.5 ms


In [35]:
correlation_matrix

age,contract_lifetime_days,remaining_binding_days,has_special_offer,is_magenta1_customer,available_gb,gross_mrc,has_done_upselling,contract_binding_days,completion_rate,is_bounded,is_huawei,is_oneplus,is_samsung,is_xiaomi,is_iphone,n_contracts_per_customer,avg_monthly_usage_gb,median_monthly_usage_gb,total_usage_gb,usage_std_gb,min_monthly_usage_gb,max_monthly_usage_gb,usage_q25_gb,usage_q75_gb,months_with_roaming,ever_used_roaming,always_used_roaming,zero_usage_months,active_usage_months,avg_delta_1mo,delta_1mo_volatility,max_delta_1mo_increase,max_delta_1mo_decrease,months_with_delta_1mo_increase,months_with_delta_1mo_decrease,months_with_no_delta_1mo_change,…,avg_delta_3mo,max_delta_3mo_increase,max_delta_3mo_decrease,months_with_delta_3mo_increase,months_with_delta_3mo_decrease,months_with_no_delta_3mo_change,last_1_2mo_rolling_avg,last_2_2mo_rolling_avg,last_3_2mo_rolling_avg,last_1_3mo_rolling_avg,last_2_3mo_rolling_avg,last_1_delta_1mo,last_2_delta_1mo,last_3_delta_1mo,last_1_delta_2mo,last_2_delta_2mo,last_1_delta_3mo,last_1_2mo_rolling_stdev,last_2_2mo_rolling_stdev,last_3_2mo_rolling_stdev,last_1_month_usage_gb,last_2_month_usage_gb,last_3_month_usage_gb,last_4_month_usage_gb,n_rechnungsanfragen,n_produkte&services-tarifdetails,n_prolongation,n_produkte&services-tarifwechsel,days_since_last_rechnungsanfragen,days_since_last_produkte&services-tarifdetails,days_since_last_prolongation,days_since_last_produkte&services-tarifwechsel,times_in_p1,times_in_p2,times_in_p3,times_in_p4,times_in_p5
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.9999999999999999,-0.06295258790534654,0.007917425991875473,-0.004591802863005674,-0.000570452210647228,-0.0012253075175914954,-0.0014248647679731138,-0.04153326724228018,-0.046891894624137745,0.0035360243153289943,0.003974704146399675,-0.002902167339478592,0.0019611469676663455,0.0009508412788796635,0.0017371282874025114,0.0002704501086987172,-0.0033925108458090444,-0.07313075894941892,-0.07117159510218991,-0.07313072441998787,-0.07526670691780497,-0.06588505936769569,-0.07627248976389654,-0.06852675138912757,-0.07211471686541757,0.002421321337000976,0.0028884406207485515,0.002764312854141388,0.01018740634351683,-0.010187406343516843,0.0007023898149709595,-0.06986944684724943,-0.06434228889805589,0.06525763675673027,-0.002710957959640463,-0.003923417078655525,0.013449505856100885,…,0.0007018860866903407,0.0007018860866903407,0.0007018860866903407,-0.00279647316290522,-0.0001612291690173121,0.008278462206984286,-0.07023640985753905,-0.07118943304127509,-0.07216078833847925,-0.07239798432566148,-0.0725167554493371,-0.0022278083988556117,0.004810231937738107,-0.0018765816242891976,0.0025777466018753643,0.0029294087217661253,0.0007018860866903407,-0.058185068417960825,-0.06188408762322855,-0.05992545507252265,-0.06797684008539366,-0.06941992923036576,-0.06614590372886123,-0.06763631390301164,-0.0029619271116916672,-0.0036624528884943667,-0.0010602562815135994,0.002456956467397597,-0.0021735524770574617,-0.0017217017286032324,-0.004082781479781801,0.0007222265215138496,0.04729548025053901,0.01236204396638056,-0.009869349132019378,-0.026897493871128556,-0.04815135067381582
-0.06295258790534654,1,-0.000378391001072873,-0.002302567852481359,0.002191272514161899,-0.0024923207572409467,0.0030451716731356706,0.0043795501517610745,0.81715350460163,-0.07300261199878698,0.0026355061034911927,0.00028419739566269334,-0.004675920594288189,-0.002674383892171953,-0.005180140865737782,0.00549362737358115,0.0023650517704196113,0.7632866265204388,0.7508626128730529,0.7632872022690684,0.7272201981256766,0.7109202571111353,0.773235463186077,0.731859339386122,0.7537621693882461,-0.005097468591355158,-0.001969789293841624,-0.004880964016446409,-0.09475926466383608,0.09475926466383568,-0.0015123138415764302,0.6811245720092799,0.6307247798969278,-0.6291921881077684,0.032649381714104427,0.03404249035232128,-0.13522088707618493,…,-0.0015110509232823463,-0.0015110509232823463,-0.0015110509232823463,0.015538860151976279,0.013517621426203915,-0.08134120648697997,0.7434205175805317,0.7432780142548713,0.7427992699446556,0.7563778588356058,0.7563208014063175,-0.0039620653155460215,0.008440296747651644,-0.006002610013797876,0.004470249234288837,0.0024312511947035165,-0.0015110509232823463,0.5862943391397317,0.587859383432209,0.5894401962885601,0.7074652280240127,0.7068184459534229,0.708515899699905,0.7074916297772676,0.004214446606026722,0.0021811147714457923,0.002424094037176001,0.0038562618871115376,0.004366695774611266,0.0014775262795115643,-0.0016269760313556325,0.002949492743459896,-0.45995823470542785,-0.15633007601866283,0.0689412525542355,0.28351976205217794,0.4918916103943215
0.007917425991875473,-0.0003783910010728731,1,0.00283662162279771,-0.001727076244464288,0.0008559267718102365,0.0021796198528986886,-0.022705403240455765,0.5761108710174806,-0.15074952149015589,0.7966607385345018,0.00224021084383798,0.0017933132699779603,-0.0022144562948615398,-0.0001930799987491974,-0.00018573325638276183,0.003118438225211096,-0.003368056021030916,-0.003378621844462096,-0.0033670204985767533,-0.0022782913243612774,-0.0035119320813881854,-0.003135261219195255,-0.003881068847467617,-0.002925897534843186,-0.0033505124922028153,-0.002559247357206372,-0.0032729940735208313,-0.0010846215558287027,0.001084621555828698,0.004231075366245361,-0.0012913676246216748,-0.0005253656943313113,0.00124444822266277,0.0010398710334205044,-0.0009434670746599099,-0.00019916025160281116,…,0.004228160955008829,0.004228160955008829,0.004228160955008829,0.008375570520986973,-0.0067141906362208625,-0.004642390731301999,-0.0031345016415436563,-0.001615083942391334,-0.0034218387587696035,-0.002308263865614647,-0.0032358684666986552,-0.0013602383904919354,-0.003388871642487806,0.00900347689945319,-0.004733428090167086,0.005614739084775262,0.004228160955008829,-0.0037692239748396495,-0.001050758085637655,0.0012077114722155188,-0.006000834291976906,-0.0005025521403264938,-0.002567809623782947,-0.0034033625327004194,0.004289310345529879,-0.004545080383924224,-0.0005507299682038369,0.008401502292391555,0.00411487043439879,-0.004200652333155375,-0.001423916837225221,0.002532071796622155,0.0034989853085557075,0.0005367115434312151,-0.0018877810150013526,0.0014601049841854936,-0.004015182071796204
-0.004591802863005675,-0.002302567852481359,0.0028366216227977106,1,0.00044653260013578105,-0.0010260014136004802,0.003530696955822641,-0.0007153183529045534,-0.0002469677269840189,0.005304590702156605,0.0031468431797223263,-0.0018501656333436097,-0.0019855200479594413,0.004639694472271227,-0.0006994313105865521,-0.002148628346854387,-0.00739977837345587,-0.00036387406011781777,0.0003236738407220768,-0.0003643486087252134,-0.00013870475134969456,-0.0016886485359204765,-0.0007408673071812726,0.0005407046809743829,0.00014650502283508978,0.0026692221602342453,0.004323862588915077,-0.0033479916949770957,-0.00043438304413737387,0.0004343830441373694,0.001351562700263275,-0.000005663737023054304,0.00013351741162504746,-0.0002582157862526477,0.003004941634871091,-0.0015036729066845605,-0.003052345920330754,…,0.001349932506413348,0.001349932506413348,0.001349932506413348,0.0009211738308007091,-0.001097640478703321,0.0004951572325244248,-0.0007434587933056054,-0.0006425903034163682,0.00003486163028740664,-0.000310647923446531,-0.0006073192648943468,0.0034742949891955095,-0.003787841108221768,0.0016777168352904977,-0.0003171353616405198,-0.002106858528473486,0.001349932506413348,-0.0001492542358541921,-0.0007267262013228647,0.0007879608197672751,-0.00047936051186666203,0.0005479364355261377,-0.0017658823294664225,0.00035188335134827203,0.0018968546922942604,-0.004064966673514216,0.0006671059750849071,-0.0013340506894134276,0.002873786297482064,-0.005206987725898358,0.0008017037934854721,-0.0011162407077475994,0.005004791888093391,-0.0023282234309633825,-0.0005706454190165219,-0.005875019450550684,-0.002140377251416099
-0.0005704522106472281,0.0021912725141618994,-0.001727076244464288,0.000446532600135781,1,0.0035168351234423317,-0.0021994168695607065,0.034149530521502186,0.0007955623976277207,0.0008558560142030636,0.0005701964969026631,-0.0004967342566142408,-0.0026001613540921685,0.004291713228340905,-0.00033392055525746173,-0.002833838644160845,0.004800126165530714,0.0004998491989981693,0.00015949929910605016,0.000500947556480564,0.0027149049685117914,-0.00024960329153579685,0.0013383993034229867,-0.0006532016362665892,0.0008007297292978834,-0.0019588304706099576,-0.001945659756255781,0.0023120308307685035,0.0008860887027969941,-0.0008860887027970653,-0.0006902612895723765,0.00335921353267419,0.003963317145546036,-0.002631617811949843,-0.005140270866835341,0.0035799532097913114,0.003179914686298588,…,-0.0006910313027378003,-0.0006910313027378003,-0.0006910313027378003,-0.0020711702709546743,0.0028200550221587546,-0.0020992480963502847,0.0004149714165761175,-0.0002638984891702976,0.000560595971835459,0.0001651411895966272,0.00031691512375312295,0.0018720020621992353,0.0002461717794627952,-0.002810463135843386,0.0021095376520569174,-0.0025634997389915566,-0.0006910313027378003,0.0034654345737461906,0.003078397220572932,0.0005181311041372369,0.0013910333642018387,-0.0003273415275068238,-0.00017554152731129045,0.0009670697878181444,-0.001350850083372136,-0.0008705444982001594,0.0024878267534028005,0.0038602315018434323,0.0009160733249253232,-0.0036879796846019343,0.002072694952001781,0.002538730639622578,0.0020845329347505058,-0.0031370281050989582,-0.0012755023013305072,0.002378399061075557,-0.0007234401576974356
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.047295480250539014,-0.4599582347054279,0.003498985308555708,0.00500479188809339,0.0020845329347505058,0.41074311320965634,-0.0015786861049081466,-0.01611225629727956,-0.3739399203013364,0.035459140445478356,0.0031464814696557696,-0.007280092630399234,0.004721126924871832,0.003379246205920885,0.003865214673840889,-0.00006986134206189095,-0.0010109888745741093,-0.6532015094918177,-0.6434330219796277,-0.6532009821649973,-0.5876741807283106,-0.6290316043120352,-0.6506909666431732,-0.6324657577469324,-0.6417055651468832,-0.002339928911034705,-0.002786551247788963,-0.00023237922713744186,0.19577329571117694,-0.19577329571117694,0.0021193464701916576,-0.5511814953449027,-0.5098238848700553,0.5096504295526929,-0.04930234963309957,-0.05212650354847552,0.20565042600436303,…,0.0021196573915745177,0.0021196573915745177,0.0021196573915745177,-0.021970835872872917,-0.02260701316335844,0.1247941077532052,-0.6358571893415916,-0.6364195096921996,-0.6360110996388896,-0.6473137487204913,-0.6474466893113682,0.004221304921340401,-0.005920417786765333,0.0038387312512564637,-0.0016996110747849733,-0.0020770341939769323,0.0021196573915745177,-0.4749221577980604,-0.47554758721969975,-0.47615516812368636,-0.6053594590118944,-0.6056021261973304,-0.6062564611724778,-0.6048726041170314,0.00044736737231605597,-0.0010028211017341958,-0.0013228546515921295,-0.002016528673707046,0.0002502373139193412,-0.0038680425910153967,0.001178934978342407,-0.0014261710575648276,0.9999999999999999,-0.27003863757617325,-0.331974275508078,-0.35037080950102245,-0.7257952054068041
0.01236204396638056,-0.1563300760186628,0.0005367115434312151,-0.0023282234309633825,-0.0031370281050989582,0.10756719282353502,-0.0008442611540020387,-0.007837300879381276,-0.12747039563796564,0.01074090902428416,-0.00436545579924208,0.007211045832619525,-0.0007378019212895178,-0.002400816931445504,0.005234224875406014,-0.004964383510103827,-0.004161767624393396,-0.08181237536883991,-0.08141842955246964,-0.08181378725721433,-0.08892312664229374,-0.0670614995667678,-0.08580932019657628,-0.07831770470321618,-0.0825568240708893,0.004167809783582139,-0.0009295668277054467,0.002139927981488545,-0.10153684290979442,0.10153684290979442,-0.0030455101373349646,-0.08320157994258401,-0.07851366999893133,0.07535400040476069,0.023578692640120788,0.02199823050360579,-0.09241405697044042,…,-0.0030457801244474724,-0.0030457801244474724,-0.0030457801244474724,0.005358876693272951,0.014461123243094566,-0.05549039582309106,-0.08036444989873993,-0.07981113433535152,-0.07893620504338404,-0.08147382735201102,-0.08076256180971711,-0.00024640125486302797,-0.0019132215008516417,-0.0009102872620423625,-0.0021529744001753948,-0.0028212060860127192,-0.0030457801244474724,-0.07293266610317696,-0.07265557481199317,-0.07179805082976284,-0.07470839944552639,-0.07558751237826275,-0.07638566148416939,-0.07668647509758995,-0.004732616231234803,-0.0043673531194245,-0.0023263558988881566,-0.0009730109795919482,-0.006692711323621462,-0.005582105002282609,-0.00016098785851173595,-0.0008166769075911994,-0.27003863757617325,1,0.13511359308253995,-0.07662669191963369,-0.32386239243083964
-0.009869349132019378,0.06894125255423551,-0.0018877810150013526,-0.0005706454190165219,-0.0012755023013305072,0.034594821895855075,0.002295932513771814,0.00012890568655893182,0.05526246803540288,-0.0035533747733877104,-0.004294075449542992,0.0022519244635989823,0.00041258982955683885,-0.004002642892526174,-0.001915376251856022,0.0025271983150237497,0.0014722816962629767,0.12527121993123702,0.11767730563819331,0.125271361646168,0.14613057334841675,0.1099003591441631,0.13831725337910641,0.10824330100118279,0.12324590955920568,0.0061048409105693485,0.004619926962495058,0.0031525000296970464,-0.0733360312776686,0.0733360312776686,0.0017253479148058807,0.13860206988347704,0.12880495142977866,-0.12797482915194966,0.023172612709400024,0.016976907560337737,-0.08141812649817545,…,0.0017240803123286402,0.0017240803123286402,0.0017240803123286402,0.012356788959336042,0.0054245321253456645,-0.04977409676084924,0.12312696091847403,0.12218627936272483,0.12079032299069292,0.12442239007145964,0.12397942066825765,-0.002765041085094363,0.006359078032766285,-0.0018585208136332083,0.003587092627628026,0.004494718628979057,0.0017240803123286402,0.1170701201423162,0.1233349096538844,0.11446164560609781,0.11531401788344127,0.11466860255006307,0.11798849356263658,0.11653256054395876,0.0029864581039968094,0.0034695921455709386,-0.0028471738873987198,0.0000503437677622621,0.004024428496935386,0.00080512165272705,-0.00427177884925051,0.0017648342134535492,-0.33197427550807795,0.13511359308253992,1,0.18687638364941214,-0.08924055510122225
-0.026897493871128556,0.28351976205217794,0.0014601049841854933,-0.005875019450550684,0.0023783990610755567,0.08874231344518639,-0.0014950827643136725,-0.004972239842683682,0.23258264024821926,-0.0220574374591715,-0.00010245889453231445,-0.0008076087426728769,-0.004759380196814446,0.000742118462571893,-0.0038007559488619155,0.002653688672789939,0.00004274946163062313,0.3310746468163069,0.31600409883569264,0.33107461338526695,0.324408831255903,0.3163266900766678,0.3459461160313032,0.30695803445711256,0.3180549581705854,-0.0016422669788970952,0.00034891563647536014,0.0013803172049966447,-0.07441030308164268,0.07441030308164268,-0.010412616977344738,0.3049504934375339,0.27825954194405145,-0.2859461260682165,0.01947570551696185,0.02720064853223305,-0.09462592062673433,…,-0.010413655528002497,-0.010413655528002497,-0.010413655528002497,0.0036034431493937925,0.016523646323257136,-0.05635224792254933,0.3202618858279348,0.3234820968866977,0.32438831963441705,0.32737497316272746,0.3294930059353755,-0.008098305959651815,-0.00019721184168413684,-0.0022106727009925903,-0.008260700359072334,-0.0024068605622494413,-0.010413655528002497,0.2581752957377955,0.26843940541619027,0.26688122282290866,0.30883210635979624,0.30880111299094937,0.30717248101321687,0.30283273286247475,-0.0021037359551696585,0.00008754415074318521,-0.0010313698059023465,0.001204290285092542,0.0019055219036404258,0.002108052317379778,-0.002473512312720446,-0.0023514797594838613,-0.35037080950102245,-0.07662669191963369,0.18687638364941211,1,0.06890945004794564


In [36]:
fig = px.imshow(
    correlation_matrix.to_numpy(),
    labels=dict(x='Features', y='Features', color='Correlation'),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    aspect='auto'
)
fig.update_layout(
    width=1800,
    height=1800,
    title='Correlation Matrix Heatmap'
)
fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

# Blue means that variable X and variable Y follow the same behaviour (both increasing or decreasing)
# Red means that variable X has the opposite behaviour of variable Y

From the plot, it seems that the target `has_done_upselling` is **not correlated** with any other column

In [37]:
%%time 

correlation_analysis = (
    correlation_matrix
    .unpivot(index=None, variable_name='col1', value_name='correlation')
    .with_columns([
        pl.repeat(correlation_matrix.columns, correlation_matrix.height).flatten().alias('col2'),
        pl.col('correlation').abs()
    ])
    .filter(pl.col('col1') != pl.col('col2'))
    .filter(pl.col('col1') < pl.col('col2'))  # Keep only one pair per combination
    .with_columns([
        pl.when(pl.col('correlation') > 0.99)
            .then(pl.lit('identical'))
        .when(pl.col('correlation') > 0.9)
            .then(pl.lit('high'))
        .when(pl.col('correlation') > 0.8)
            .then(pl.lit('high-medium'))
        .when(pl.col('correlation') > 0.7)
            .then(pl.lit('medium'))
        .otherwise(pl.lit('ok'))
        .alias('analysis')
    ])
    .sort('correlation', descending=True)
)

CPU times: user 2.76 ms, sys: 3.71 ms, total: 6.47 ms
Wall time: 5.81 ms


In [38]:
correlation_analysis.group_by('analysis', maintain_order=True).len()

analysis,len
str,u32
"""identical""",19
"""high""",100
"""high-medium""",58
"""medium""",101
"""ok""",2962


In [39]:
has_done_upselling_corr = correlation_analysis.filter(
    (pl.col('col1') == 'has_done_upselling') | (pl.col('col2') == 'has_done_upselling')
)
has_done_upselling_corr

col1,correlation,col2,analysis
str,f64,str,str
"""available_gb""",0.05250459291014327,"""has_done_upselling""","""ok"""
"""age""",0.04153326724228019,"""has_done_upselling""","""ok"""
"""has_done_upselling""",0.034149530521502186,"""is_magenta1_customer""","""ok"""
"""gross_mrc""",0.02619134627808327,"""has_done_upselling""","""ok"""
"""has_done_upselling""",0.02389323675469545,"""times_in_p5""","""ok"""
…,…,…,…
"""has_done_upselling""",0.0004813386841002739,"""last_3_2mo_rolling_stdev""","""ok"""
"""has_done_upselling""",0.0004042028172025814,"""max_delta_1mo_decrease""","""ok"""
"""ever_used_roaming""",0.00023763468320468282,"""has_done_upselling""","""ok"""
"""has_done_upselling""",0.00012890568655893182,"""times_in_p3""","""ok"""


### Anlysis correlated networks

In [40]:
G_70, remove_70, clusters_70 = correlation_graph_analysis(correlation_analysis, 0.7)

In [41]:
plot_correlation_network(G_70, pos=nx.spring_layout(G_70, seed=42))

In [42]:
G_80, remove_80, clusters_80 = correlation_graph_analysis(correlation_analysis, 0.8)

In [43]:
plot_correlation_network(G_80, pos=nx.spring_layout(G_80, seed=42))

In [44]:
G_90, remove_90, clusters_90 = correlation_graph_analysis(correlation_analysis, 0.9)

In [45]:
plot_correlation_network(G_90, pos=nx.spring_layout(G_90, seed=42))

In [46]:
G_95, remove_95, clusters_95 = correlation_graph_analysis(correlation_analysis, 0.95)

In [47]:
plot_correlation_network(G_95, pos=nx.spring_layout(G_95, seed=42))

## Variance analysis

In [48]:
only_numeric_features = features.select(cs.numeric() & ~cs.boolean())

In [49]:
%%time

# Compute the standard deviation
variance_analysis = (
    features
    .select([
        pl.col(col).std()
        for col in numeric_and_booleans_features.columns
    ])
    .unpivot(variable_name='feature', value_name='stdev')
    .sort('stdev', descending=True)
)

CPU times: user 11.2 ms, sys: 2.62 ms, total: 13.8 ms
Wall time: 3.16 ms


In [50]:
variance_analysis

feature,stdev
str,f64
"""contract_binding_days""",599.6380492222537
"""contract_lifetime_days""",490.1271217887316
"""remaining_binding_days""",345.64345852492005
"""total_usage_gb""",69.21684430203867
"""days_since_last_produkte&services-tarifwechsel""",42.254754177537286
…,…
"""active_usage_months""",0.23464950586759353
"""months_with_no_delta_3mo_change""",0.17850662773735188
"""is_oneplus""",0.15636891670783257
"""is_xiaomi""",0.15450390873559847


In [51]:
%%time

# Unique count dataframe (excluding boolean columns)
unique_count = (
    features
    .select([
        pl.col(col).n_unique().alias(col)
        for col in only_numeric_features.columns 
    ])
    .unpivot(variable_name="feature", value_name="unique_count")
    .sort("unique_count")
)


CPU times: user 66.1 ms, sys: 6.94 ms, total: 73 ms
Wall time: 12 ms


In [52]:
unique_count

feature,unique_count
str,u32
"""months_with_delta_3mo_increase""",2
"""months_with_delta_3mo_decrease""",2
"""months_with_no_delta_3mo_change""",2
"""months_with_delta_2mo_increase""",3
"""months_with_delta_2mo_decrease""",3
…,…
"""completion_rate""",2487
"""contract_binding_days""",2516
"""usage_std_gb""",2671
"""avg_monthly_usage_gb""",2748


## Cleaning

In [58]:
do_not_remove = ['has_done_upselling']

In [59]:
stdev_threshold = 0.1
stdev_1quantile = variance_analysis['stdev'].quantile(0.1)

In [60]:
low_variance_features = variance_analysis.filter(
    pl.col('stdev') <= stdev_threshold
)['feature'].unique()

low_variance_features

feature
str
"""always_used_roaming"""


In [61]:
highly_correlated_keep = [
    'active_usage_months', 'total_usage_gb', 'avg_monthly_usage_gb', 'max_monthly_usage_gb',
    'contract_lifetime_days', 'last_1_delta_3mo', 'months_with_delta_3mo_increase', 'months_with_delta_2mo_increase', 'months_with_delta_3mo_increase'
]

features_from_clusters = sorted(set().union(*clusters_80))
highly_correlated_remove = list(set(features_from_clusters) - set(highly_correlated_keep))

In [63]:
# Cleaning the dataset
features_to_remove = (
    set(low_variance_features.to_list()) | set(highly_correlated_remove)
) - set(do_not_remove)

print(f'Dimensions before cleaning: {features.shape}')

features = features.drop(list(features_to_remove))

print(f'Dimensions after cleaning: {features.shape}')

Dimensions before cleaning: (100000, 83)
Dimensions after cleaning: (100000, 52)


In [64]:
features.columns

['rating_account_id',
 'customer_id',
 'age',
 'contract_lifetime_days',
 'remaining_binding_days',
 'has_special_offer',
 'is_magenta1_customer',
 'available_gb',
 'gross_mrc',
 'has_done_upselling',
 'completion_rate',
 'is_bounded',
 'is_huawei',
 'is_oneplus',
 'is_samsung',
 'is_xiaomi',
 'is_iphone',
 'n_contracts_per_customer',
 'avg_monthly_usage_gb',
 'total_usage_gb',
 'max_monthly_usage_gb',
 'months_with_roaming',
 'ever_used_roaming',
 'active_usage_months',
 'months_with_no_delta_1mo_change',
 'avg_delta_2mo',
 'delta_2mo_volatility',
 'max_delta_2mo_increase',
 'max_delta_2mo_decrease',
 'months_with_delta_2mo_increase',
 'months_with_no_delta_2mo_change',
 'months_with_delta_3mo_increase',
 'months_with_no_delta_3mo_change',
 'last_1_delta_1mo',
 'last_2_delta_1mo',
 'last_3_delta_1mo',
 'last_1_delta_2mo',
 'last_2_delta_2mo',
 'last_1_delta_3mo',
 'n_rechnungsanfragen',
 'n_produkte&services-tarifdetails',
 'n_prolongation',
 'n_produkte&services-tarifwechsel',
 'days

# Storing

In [67]:
%%time 

output_dir = code_dir / 'data'
output_dir.mkdir(parents=True, exist_ok=True)
features.write_parquet(output_dir / 'features' / 'features_cleaned-v1.parquet')


X = features.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y = features.select('has_done_upselling')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_df = pl.concat([X_train.with_columns(y_train),], how='horizontal')
test_df = pl.concat([X_test.with_columns(y_test),], how='horizontal')

train_dir = output_dir / 'train'
train_dir.mkdir(parents=True, exist_ok=True)
train_df.write_parquet(train_dir / 'data-v1-80.parquet')


test_dir = output_dir / 'test'
test_dir.mkdir(parents=True, exist_ok=True)
test_df.write_parquet(test_dir / 'data-v1-20.parquet')

CPU times: user 227 ms, sys: 29.6 ms, total: 257 ms
Wall time: 209 ms


**Features version 1**: 

- age imputed with mean value
    
- stdev_threshold = 0.1
    
- explain we choose specif correlated # TODO