In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="darkgrid", palette="pastel")

from zipfile import ZipFile

import warnings
warnings.filterwarnings('ignore')

## Load data

In [4]:
data_zip_path = "data/turkcell/RegionA.zip"
# Zip file reading
def read_table_from_zip(zip_path, table_name):
    with ZipFile(zip_path) as zip_file:
        with zip_file.open(table_name) as file:
            df = pd.read_csv(file, sep="\t", index_col=0)
            if "datetime" in df:
                df["datetime"] = pd.to_datetime(df["datetime"])
            return df

# Example read for RL-KPIS
rl_kpis = read_table_from_zip(data_zip_path, "rl-kpis.tsv")
print(f"rl_kpis.shape: {rl_kpis.shape}")
print(rl_kpis.datetime.min(), rl_kpis.datetime.max())
rl_kpis.head()

rl_kpis.shape: (1992986, 18)
2018-12-31 00:00:00 2020-12-25 00:00:00


Unnamed: 0,type,datetime,tip,mlid,mw_connection_no,site_id,card_type,adaptive_modulation,freq_band,severaly_error_second,error_second,unavail_second,avail_time,bbe,rxlevmax,capacity,modulation,rlf
0,ENK,2018-12-31,FAR,A0BE,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-31.0,456.0,1024QAM,False
1,ENK,2018-12-31,FAR,A0BI,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-30.7,456.0,1024QAM,False
2,ENK,2018-12-31,FAR,A5AB,1344018,RL_;ABDV,cardtype4,Enable,f3,0.0,0.0,0.0,86.4,0.0,-34.4,406.0,512QAM,False
3,NEC,2018-12-31,FAR,A8CQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False
4,NEC,2018-12-31,FAR,A8DQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False


## Prepare labels

In [5]:
# Firstly, only get unique entry identifiers and RLF column.
# For rl-kpis, unique entry identifiers are 'datetime', 'site_id' and 'mlid'.
df_labels = rl_kpis[["datetime", "site_id", "mlid"]]
df_labels.head()

Unnamed: 0,datetime,site_id,mlid
0,2018-12-31,RL_;ABDV,A0BE
1,2018-12-31,RL_;ABDV,A0BI
2,2018-12-31,RL_;ABDV,A5AB
3,2018-12-31,RL_;ABDV,A8CQ
4,2018-12-31,RL_;ABDV,A8DQ


## Prepare target days (prediction days)

In [6]:
#  Prepare columns for the following days. We will join data with these columns to find RLF
prediction_interval = 5

for i in range(prediction_interval):
  df_labels[f"T+{i+1}"] = df_labels["datetime"] + pd.DateOffset(days=i+1)
df_labels.head()

Unnamed: 0,datetime,site_id,mlid,T+1,T+2,T+3,T+4,T+5
0,2018-12-31,RL_;ABDV,A0BE,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05
1,2018-12-31,RL_;ABDV,A0BI,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05
2,2018-12-31,RL_;ABDV,A5AB,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05
3,2018-12-31,RL_;ABDV,A8CQ,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05
4,2018-12-31,RL_;ABDV,A8DQ,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05


## Join dataset to get RLF colunms for the target days

In [7]:
rl_kpis_view = rl_kpis[["datetime", "site_id", "mlid", "rlf"]]
for i in range(prediction_interval):
  target_day_column_name = f"T+{i+1}"

  df_labels = df_labels.merge(rl_kpis_view, 
                  how = "left", 
                  left_on = ("site_id", "mlid", target_day_column_name),
                  right_on = ("site_id", "mlid", "datetime"),
                  suffixes = ("", "_y")
  )
  df_labels.rename(columns={"rlf": f"{target_day_column_name}_rlf"}, inplace=True)
df_labels.drop(columns=["datetime_y"], inplace=True)
df_labels.head()

Unnamed: 0,datetime,site_id,mlid,T+1,T+2,T+3,T+4,T+5,T+1_rlf,T+2_rlf,T+3_rlf,T+4_rlf,T+5_rlf
0,2018-12-31,RL_;ABDV,A0BE,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
1,2018-12-31,RL_;ABDV,A0BI,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
2,2018-12-31,RL_;ABDV,A5AB,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
3,2018-12-31,RL_;ABDV,A8CQ,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False
4,2018-12-31,RL_;ABDV,A8DQ,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,False,False,False,False,False


## Finalize labels for 1-day and 5-day predictions

For each link (site_id, mlid), we found the RLF for the 5-days (T+1.. T+5) following the given day (T). Now, we will finalize labelling

In [8]:
# 1 day predict is equal to T+1 rlf
df_labels["1-day-predict"] = df_labels["T+1_rlf"]

# Interval predict (5-day predict) is based on T+1, T+2, T+3, T+4 and T+5
following_days_rlf_columns = [f"T+{i+1}_rlf" for i in range(prediction_interval)]

df_labels["5-day-predict"] = df_labels[following_days_rlf_columns].any(axis=1)
df_labels = df_labels[["datetime", "site_id", "mlid", "1-day-predict", "5-day-predict"]]

print(f"df_labels.shape: {df_labels.shape}")
print(f"df_labels 1-day rlf sum: {df_labels['1-day-predict'].sum()}")
print(f"df_labels 5-day rlf sum: {df_labels['5-day-predict'].sum()}")
df_labels.head()

df_labels.shape: (1992986, 5)
df_labels 1-day rlf sum: 1204
df_labels 5-day rlf sum: 5159


Unnamed: 0,datetime,site_id,mlid,1-day-predict,5-day-predict
0,2018-12-31,RL_;ABDV,A0BE,False,False
1,2018-12-31,RL_;ABDV,A0BI,False,False
2,2018-12-31,RL_;ABDV,A5AB,False,False
3,2018-12-31,RL_;ABDV,A8CQ,False,False
4,2018-12-31,RL_;ABDV,A8DQ,False,False


In [9]:
# Now join labels with rl-kpis
rl_kpis_with_labels = rl_kpis.merge(df_labels, 
                                    how="left", 
                                    on=["datetime", "site_id", "mlid"])
rl_kpis_with_labels.head()

Unnamed: 0,type,datetime,tip,mlid,mw_connection_no,site_id,card_type,adaptive_modulation,freq_band,severaly_error_second,error_second,unavail_second,avail_time,bbe,rxlevmax,capacity,modulation,rlf,1-day-predict,5-day-predict
0,ENK,2018-12-31,FAR,A0BE,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-31.0,456.0,1024QAM,False,False,False
1,ENK,2018-12-31,FAR,A0BI,1349988,RL_;ABDV,cardtype1,Enable,f3,0.0,0.0,0.0,86.4,0.0,-30.7,456.0,1024QAM,False,False,False
2,ENK,2018-12-31,FAR,A5AB,1344018,RL_;ABDV,cardtype4,Enable,f3,0.0,0.0,0.0,86.4,0.0,-34.4,406.0,512QAM,False,False,False
3,NEC,2018-12-31,FAR,A8CQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False,False,False
4,NEC,2018-12-31,FAR,A8DQ,1351204,RL_;ABDV,cardtype5,Enable,f2,0.0,0.0,0.0,86.4,0.0,-35.3,247.0,2048QAM*,False,False,False


## Example Model Training

In this example, we will only use rl-kpis and train a simple decision tree model. This is a simplified example.

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support

## Future Selection

In [11]:
categorical_features = ["card_type", "freq_band",]
numerical_features = ["severaly_error_second", "error_second", "unavail_second", "bbe", "rxlevmax"]

features = categorical_features + numerical_features

## Under Sampling

In [12]:
# Simple undersampling
np.random.seed(1234)

cond_rlf = rl_kpis_with_labels["5-day-predict"]
rlf_count = cond_rlf.sum()
print("rlf count: ", rlf_count)

# Get sample index from non rlf columns with 1:3 ratio
sampled_non_rlf_indicies = np.random.choice(rl_kpis_with_labels[~cond_rlf].index, size=rlf_count * 2)
rlf_indicies = np.array(rl_kpis_with_labels[cond_rlf].index)

sampled_data_indicies = list(sampled_non_rlf_indicies) + list(rlf_indicies)
sampled_data = rl_kpis_with_labels.loc[sampled_data_indicies]
sampled_data.shape


rlf count:  5159


(15477, 20)

## Test Train Split

In [13]:
df_train, df_test = train_test_split(sampled_data, test_size=0.2)
print(f"df_train.shape: {df_train.shape} | df_test.shape: {df_test.shape}")

df_train.shape: (12381, 20) | df_test.shape: (3096, 20)


## Preprocessing

In [14]:
# Convert categorical columns to one hot vector
# Merge them with numerical columns
# Return X data, column names, and encoder for future usage

def preprocessing(df, numerical_columns=[], categorical_columns=[], one_hot_encoder=None):
  # Handle NA by removing rows with missing values
  df = df.dropna()

  if one_hot_encoder is None:
    print("Creating new one hot encoder")
    # Initialize and fit the OneHotEncoder
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
    one_hot_encoder.fit(df[categorical_columns])
    
    print("one_hot_encoder: ", one_hot_encoder)
    print("*" * 50)
    print("Feature names: ", one_hot_encoder.get_feature_names_out())
    print("*" * 50)
    print("Categories: ", one_hot_encoder.categories_)

  # Extract numerical data
  arr_numerical = df[numerical_columns].values

  # Transform categorical data
  arr_categorical = one_hot_encoder.transform(df[categorical_columns]).toarray()  # Convert sparse to dense

  # Combine both arrays
  feature_names = numerical_columns + list(one_hot_encoder.get_feature_names_out())
  arr_x = np.concatenate((arr_numerical, arr_categorical), axis=1)
  
  return df.copy(), arr_x, feature_names, one_hot_encoder

# Example usage
df_train_dropped, train_x, feature_names, one_hot_encoder = preprocessing(df_train, 
                                                                         numerical_columns=numerical_features, 
                                                                         categorical_columns=categorical_features)

Creating new one hot encoder
one_hot_encoder:  OneHotEncoder(handle_unknown='ignore')
**************************************************
Feature names:  ['card_type_cardtype1' 'card_type_cardtype10' 'card_type_cardtype11'
 'card_type_cardtype2' 'card_type_cardtype4' 'card_type_cardtype5'
 'card_type_cardtype6' 'freq_band_f1' 'freq_band_f2' 'freq_band_f3'
 'freq_band_f4' 'freq_band_f5']
**************************************************
Categories:  [array(['cardtype1', 'cardtype10', 'cardtype11', 'cardtype2', 'cardtype4',
       'cardtype5', 'cardtype6'], dtype=object), array(['f1', 'f2', 'f3', 'f4', 'f5'], dtype=object)]


## Build Model

In [15]:
clf_1_day_pred = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=4)
clf_5_day_pred = tree.DecisionTreeClassifier(min_samples_leaf=5,max_depth=4)

## Train Models

In [16]:
# train_x is prepared at preprocessing step
train_y_1_day_pred = df_train_dropped["1-day-predict"].astype('int').to_numpy()
train_y_5_day_pred = df_train_dropped["5-day-predict"].astype('int').to_numpy()

In [17]:
_ = clf_1_day_pred.fit(train_x, train_y_1_day_pred)
_= clf_5_day_pred.fit(train_x, train_y_5_day_pred)

## Test Models

In [18]:
# Preprocess test data
df_test_dropepd, test_x, _, _ = preprocessing(df_test, numerical_columns=numerical_features, 
                             categorical_columns=categorical_features, 
                             one_hot_encoder=one_hot_encoder)
test_y_1_day_pred = df_test_dropepd["1-day-predict"].astype('int').to_numpy()
test_y_5_day_pred = df_test_dropepd["5-day-predict"].astype('int').to_numpy()

pred_1_day = clf_1_day_pred.predict(test_x)
pred_5_day = clf_5_day_pred.predict(test_x)
pred_1_day.sum(), pred_5_day.sum()

(13, 445)

## Score test results

Reminder: These are undersampled data results, the actual results are probably different from them.

In [19]:
precision, recall, fscore, _ = precision_recall_fscore_support(pred_1_day, 
                                                               test_y_1_day_pred, 
                                                               average="binary", # 
                                                               labels=[0, 1], # labels
                                                               beta=1) # f1 score

print("*********** SCORE for 1-DAY predict")
print(f"precision : {precision:.4f}")
print(f"recall    : {recall:.4f}")
print(f"f-score   : {fscore:.4f}")


precision, recall, fscore, _ = precision_recall_fscore_support(pred_5_day,        # y'
                                                               test_y_5_day_pred, # y
                                                               average="binary",  # focus only True class
                                                               labels=[0, 1],     # labels
                                                               beta=1)            # f1 score
print()
print("*********** SCORE for 5-DAY predict ***********")
print(f"precision : {precision:.4f}")
print(f"recall    : {recall:.4f}")
print(f"f-score   : {fscore:.4f}")

*********** SCORE for 1-DAY predict
precision : 0.0082
recall    : 0.1538
f-score   : 0.0155

*********** SCORE for 5-DAY predict ***********
precision : 0.2867
recall    : 0.6360
f-score   : 0.3953
