In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
import sys, os

# Make ./scripts importable so the local modules imported in the next cell can
# be found. The path is resolved against the current working directory.
scripts_dir = os.path.abspath("./scripts")
# Guard against duplicate entries when this cell is re-run in the same kernel.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [3]:
import cleaning_functions as clnf
import plotting_functions as pltf

Import data as a dataframe

In [4]:
# Read the challenge CSV into a DataFrame; the relative path is resolved
# against the current working directory.
df = pd.read_csv("data/Week1_challenge_data_source.csv")

Before cleaning the dataset, we explore what is stored in it: the column names, the number of data points, the number of columns, etc. All of these questions need to be answered.

In [5]:
# Report the size of the raw frame.
n_rows, n_cols = df.shape
print("The data has {} rows and {} columns".format(n_rows, n_cols))

The data has 150001 rows and 55 columns


In [6]:
# List every column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        148848 non-null  object 
 10  Avg RTT DL (ms)     

# Cleaning The Data

In [8]:
# Summarise missing values per column; the helper also prints how many
# columns are affected (see the output below).
miss_df = clnf.missing_values_table(df)

Your selected dataframe has 55 columns.
There are 41 columns that have missing values.


In [9]:
# Show the missing-values summary table.
miss_df

Unnamed: 0,Missing Values,% of Total Values,Dtype
Nb of sec with 37500B < Vol UL,130254,86.8,float64
Nb of sec with 6250B < Vol UL < 37500B,111843,74.6,float64
Nb of sec with 125000B < Vol DL,97538,65.0,float64
TCP UL Retrans. Vol (Bytes),96649,64.4,float64
Nb of sec with 31250B < Vol DL < 125000B,93586,62.4,float64
Nb of sec with 1250B < Vol UL < 6250B,92894,61.9,float64
Nb of sec with 6250B < Vol DL < 31250B,88317,58.9,float64
TCP DL Retrans. Vol (Bytes),88146,58.8,float64
HTTP UL (Bytes),81810,54.5,float64
HTTP DL (Bytes),81474,54.3,float64


We see that some columns are missing a lot of values. Handling missing values depends on:
1. percentage of missing values
2. type of values
3. distribution of values

In [10]:
# Load the data dictionary that describes each field of the dataset.
fl_descp = pd.read_excel("data/Field_Descriptions.xlsx")

In [11]:
# Preview the first five field descriptions.
fl_descp.head(5)

Unnamed: 0,Fields,Description
0,bearer id,xDr session identifier
1,Dur. (ms),Total Duration of the xDR (in ms)
2,Start,Start time of the xDR (first frame timestamp)
3,Start ms,Milliseconds offset of start time for the xDR ...
4,End,End time of the xDR (last frame timestamp)


1. Columns that hold unique identifiers, like 'Bearer Id', 'IMSI' and 'IMEI'. Rows with missing ids are useless and should be dropped.

In [12]:
# Descriptions of the identifier fields. Field names are matched exactly, which
# is why 'bearer id' is spelled the way it appears in the description sheet.
id_fields = ['bearer id', 'IMSI', 'IMEI']
fl_descp[fl_descp['Fields'].isin(id_fields)]

Unnamed: 0,Fields,Description
0,bearer id,xDr session identifier
7,IMSI,International Mobile Subscriber Identity
9,IMEI,International Mobile Equipment Identity


In [13]:
# Drop every row that lacks any one of the identifier columns.
id_columns = ['Bearer Id', 'IMSI', 'IMEI']
df_clean = df.dropna(subset=id_columns)

# Recompute the missing-values summary for the reduced frame df_clean.
miss_df = clnf.missing_values_table(df_clean)

Your selected dataframe has 55 columns.
There are 24 columns that have missing values.


2. The "Last Location Name", "MSISDN/Number" and "DL TP < 50 Kbps (%)" columns have few missing values, so we can drop the rows where they are missing.

In [14]:
# Descriptions of the two columns whose missing rows are dropped next.
lookup_fields = ["Last Location Name", "MSISDN/Number"]
fl_descp[fl_descp['Fields'].isin(lookup_fields)]

Unnamed: 0,Fields,Description
8,MSISDN/Number,MS International PSTN/ISDN Number of mobile - ...
10,Last Location Name,User location call name (2G/3G/4G) at the end ...


In [15]:
# Full (untruncated) description text of the 'DL TP < 50 Kbps (%)' field.
is_dl_tp_low = fl_descp['Fields'] == 'DL TP < 50 Kbps (%)'
fl_descp.loc[is_dl_tp_low, 'Description'].to_list()

['Duration ratio when Bearer Downlink Throughput < ….']

In [16]:
# Drop rows with a missing location name or MSISDN.
few_missing_columns = ["Last Location Name", "MSISDN/Number"]
df_clean = df_clean.dropna(subset=few_missing_columns)

In [17]:
# Recompute the missing-values summary after the latest row drops.
miss_df = clnf.missing_values_table(df_clean)

Your selected dataframe has 55 columns.
There are 22 columns that have missing values.


In [18]:
# Downlink throughput-bucket columns, highest bucket first.
DL_columns = [
    'DL TP > 1 Mbps (%)',
    '250 Kbps < DL TP < 1 Mbps (%)',
    '50 Kbps < DL TP < 250 Kbps (%)',
    'DL TP < 50 Kbps (%)',
]

# Uplink throughput-bucket columns, highest bucket first.
UL_columns = [
    'UL TP > 300 Kbps (%)',
    '50 Kbps < UL TP < 300 Kbps (%)',
    '10 Kbps < UL TP < 50 Kbps (%)',
    'UL TP < 10 Kbps (%)',
]

3. The DL_columns and UL_columns groups have few missing values, so we can drop the rows where an entire group is missing.

In [19]:
# Look up the descriptions of all throughput-bucket columns at once.
throughput_columns = DL_columns + UL_columns
fl_descp[fl_descp['Fields'].isin(throughput_columns)]

Unnamed: 0,Fields,Description
17,DL TP < 50 Kbps (%),Duration ratio when Bearer Downlink Throughput...
18,50 Kbps < DL TP < 250 Kbps (%),Duration ratio when Bearer Downlink Throughput...
19,250 Kbps < DL TP < 1 Mbps (%),Duration ratio when Bearer Downlink Throughput...
20,DL TP > 1 Mbps (%),Duration ratio when Bearer Downlink Throughput...
21,UL TP < 10 Kbps (%),Duration ratio when Bearer Uplink Throughput < ….
22,10 Kbps < UL TP < 50 Kbps (%),Duration ratio when Bearer Uplink Throughput r...
23,50 Kbps < UL TP < 300 Kbps (%),Duration ratio when Bearer Uplink Throughput r...
24,UL TP > 300 Kbps (%),Duration ratio when Bearer Uplink Throughput > ….


In [20]:
# Drop a row only when every column of a throughput group is missing:
# first the downlink group, then the uplink group.
df_clean = (
    df_clean
    .dropna(how='all', subset=DL_columns)
    .dropna(how='all', subset=UL_columns)
)

# Recompute the missing-values summary on the reduced frame.
miss_df = clnf.missing_values_table(df_clean)

Your selected dataframe has 55 columns.
There are 14 columns that have missing values.


In [23]:
# Drop rows where either of the lowest-volume counters is missing.
low_volume_columns = ['Nb of sec with Vol DL < 6250B', 'Nb of sec with Vol UL < 1250B']
df_clean = df_clean.dropna(subset=low_volume_columns)

# Recompute and display the missing-values summary.
miss_df = clnf.missing_values_table(df_clean)
miss_df

Your selected dataframe has 55 columns.
There are 12 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Dtype
Nb of sec with 37500B < Vol UL,127353,86.7,float64
Nb of sec with 6250B < Vol UL < 37500B,109246,74.4,float64
Nb of sec with 125000B < Vol DL,95243,64.8,float64
TCP UL Retrans. Vol (Bytes),94650,64.4,float64
Nb of sec with 31250B < Vol DL < 125000B,91458,62.3,float64
Nb of sec with 1250B < Vol UL < 6250B,90777,61.8,float64
Nb of sec with 6250B < Vol DL < 31250B,86336,58.8,float64
TCP DL Retrans. Vol (Bytes),86305,58.8,float64
HTTP UL (Bytes),80085,54.5,float64
HTTP DL (Bytes),80031,54.5,float64
