# User Experience Analytics

## Imports

In [1]:
import numpy as np
import pandas as pd
from math import floor
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import zscore
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from df_selector import DfSelector
from df_outlier import DfOutlier
from vis_seaborn import *
from vis_plotly import *

2021-07-16 21:07:04,216 — DfSelector — DEBUG — Loaded successfully!
2021-07-16 21:07:04,223 — DfOutlier — DEBUG — Loaded successfully!


## Data reading

In [4]:
df = pd.read_csv("../data/clean_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146882 entries, 0 to 146881
Data columns (total 54 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   bearer_id                       146882 non-null  int64  
 1   start                           146882 non-null  object 
 2   start_ms                        146882 non-null  float64
 3   end                             146882 non-null  object 
 4   end_ms                          146882 non-null  float64
 5   dur_(ms)                        146882 non-null  float64
 6   imsi                            146882 non-null  int64  
 7   msisdn_number                   146882 non-null  int64  
 8   imei                            146882 non-null  int64  
 9   last_location_name              146882 non-null  object 
 10  avg_rtt_dl_(ms)                 146882 non-null  float64
 11  avg_rtt_ul_(ms)                 146882 non-null  float64
 12  avg_bearer_tp_dl

## Task 3. 1

Aggregate, per customer, the following information(treat missing & outliers by replacing by the mean or the mode of the corresponding variable

- Average TCP retransmission
- Average RTT
- Handset type
- Average throughput

In [5]:
user_engagement_df = df[[
    "msisdn_number",
    "avg_rtt_dl_(ms)",
    "avg_rtt_ul_(ms)",
    "avg_bearer_tp_dl_(kbps)",
    "avg_bearer_tp_ul_(kbps)",
    "tcp_dl_retrans_vol_(bytes)",
    "tcp_ul_retrans_vol_(bytes)",
    "handset_type"]].copy()

In [6]:
user_engagement_df['total_avg_rtt'] = user_engagement_df['avg_rtt_dl_(ms)'] + user_engagement_df['avg_rtt_ul_(ms)']
user_engagement_df['total_avg_tp'] = user_engagement_df['avg_bearer_tp_dl_(kbps)'] + user_engagement_df['avg_bearer_tp_ul_(kbps)']
user_engagement_df['total_avg_tcp'] = user_engagement_df['tcp_dl_retrans_vol_(bytes)'] + user_engagement_df['tcp_ul_retrans_vol_(bytes)']
user_engagement_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146882 entries, 0 to 146881
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   msisdn_number               146882 non-null  int64  
 1   avg_rtt_dl_(ms)             146882 non-null  float64
 2   avg_rtt_ul_(ms)             146882 non-null  float64
 3   avg_bearer_tp_dl_(kbps)     146882 non-null  float64
 4   avg_bearer_tp_ul_(kbps)     146882 non-null  float64
 5   tcp_dl_retrans_vol_(bytes)  146882 non-null  float64
 6   tcp_ul_retrans_vol_(bytes)  146882 non-null  float64
 7   handset_type                146882 non-null  object 
 8   total_avg_rtt               146882 non-null  float64
 9   total_avg_tp                146882 non-null  float64
 10  total_avg_tcp               146882 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 12.3+ MB


In [7]:
_user_engagement_df = user_engagement_df.groupby('msisdn_number').agg({
    'total_avg_rtt': 'sum',
    'total_avg_tp': 'sum',
    'total_avg_tcp': 'sum',
    'handset_type': [lambda x: x.mode()[0]]})

user_engagement_df = pd.DataFrame(columns=[
    "total_avg_rtt",
    "total_avg_tp",
    "total_avg_tcp",
    "handset_type"])

user_engagement_df["total_avg_rtt"] = _user_engagement_df["total_avg_rtt"]['sum']
user_engagement_df["total_avg_tp"] = _user_engagement_df["total_avg_tp"]['sum']
user_engagement_df["total_avg_tcp"] = _user_engagement_df["total_avg_tcp"]['sum']
user_engagement_df["handset_type"] = _user_engagement_df["handset_type"]['<lambda>']
user_engagement_df.head()

Unnamed: 0_level_0,total_avg_rtt,total_avg_tp,total_avg_tcp,handset_type
msisdn_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33601001722,46.0,76.0,729692.0,Huawei P20 Lite Huawei Nova 3E
33601001754,31.0,99.0,15743.0,Apple iPhone 7 (A1778)
33601002511,50.0,97.0,4032874.25,Huawei B528S-23A
33601007832,84.0,248.0,5056.0,Apple iPhone 5S (A1457)
33601008617,119.0,43204.5,8980965.88,Apple iPhone Se (A1723)


## Task 3.2 

Compute & list 10 of the top, bottom and most frequent:

### TCP values in the dataset.


In [20]:
sorted_by_tcp = user_engagement_df.sort_values('total_avg_tcp', ascending=False)
top_10 = sorted_by_tcp.head(10)['total_avg_tcp']
last_10 = sorted_by_tcp.tail(10)['total_avg_tcp']
most_10 = user_engagement_df['total_avg_tcp'].value_counts().head(10)

In [21]:
mult_hist([top_10, last_10, most_10, top_10], 1,
          3, "TCP values in the dataset", ['Top 10', 'Last 10', 'Most 10'])

### RTT values in the dataset.


In [22]:
sorted_by_rtt = user_engagement_df.sort_values(
    'total_avg_rtt', ascending=False)
top_10 = sorted_by_rtt.head(10)['total_avg_rtt']
last_10 = sorted_by_rtt.tail(10)['total_avg_rtt']
most_10 = user_engagement_df['total_avg_rtt'].value_counts().head(10)


In [23]:
mult_hist([top_10, last_10, most_10, top_10], 1,
          3, "TCP values in the dataset", ['Top 10', 'Last 10', 'Most 10'])


### Throughput values in the dataset.


In [29]:
sorted_by_tp = user_engagement_df.sort_values(
    'total_avg_tp', ascending=False)
top_10 = sorted_by_tp.head(10)['total_avg_tp']
last_10 = sorted_by_tp.tail(10)['total_avg_tp']
most_10 = user_engagement_df['total_avg_tp'].value_counts().head(10)

In [30]:
mult_hist([top_10, last_10, most_10, top_10], 1,
          3, "TCP values in the dataset", ['Top 10', 'Last 10', 'Most 10'])

## Task 3.3

### The distribution of the average throughput  per handset type and provide interpretation for your findings.

### The average TCP retransmission view per handset type and provide interpretation for your findings.