# User Behavior Overview

## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from df_selector import DfSelector
from vis_plotly import *

## Data reading

In [5]:
df = pd.read_csv("../data/clean_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146882 entries, 0 to 146881
Data columns (total 46 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   bearer_id                       146882 non-null  int64  
 1   start                           146882 non-null  object 
 2   start_ms                        146882 non-null  float64
 3   end                             146882 non-null  object 
 4   end_ms                          146882 non-null  float64
 5   dur_(ms)                        146882 non-null  float64
 6   imsi                            146882 non-null  int64  
 7   msisdn_number                   146882 non-null  int64  
 8   imei                            146882 non-null  int64  
 9   last_location_name              146882 non-null  object 
 10  avg_rtt_dl_(ms)                 146882 non-null  float64
 11  avg_rtt_ul_(ms)                 146882 non-null  float64
 12  avg_bearer_tp_dl

##  Top 10 handsets used by the customers

In [6]:
selector = DfSelector()

In [7]:
count_df = selector.filter_by_count(df, 'handset_type')
count_df.head(10)

Unnamed: 0,handset_type,count
0,Huawei B528S-23A,28190
1,Apple iPhone 6S (A1688),9369
2,Apple iPhone 6 (A1586),8967
3,Apple iPhone 7 (A1778),6240
4,Apple iPhone Se (A1723),5151
5,Apple iPhone 8 (A1905),4961
6,Apple iPhone Xr (A2105),4542
7,Samsung Galaxy S8 (Sm-G950F),4404
8,Apple iPhone X (A1901),3788
9,Samsung Galaxy A5 Sm-A520F,3672


In [8]:
fig = px.histogram(count_df.nlargest(10, 'count'), x="handset_type", y='count')
fig.show()

## Top 3 handset manufacturers

In [9]:
top_manufacturers = selector.filter_by_count(df, 'handset_manufacturer')
top_manufacturers.head(3)

Unnamed: 0,handset_manufacturer,count
0,Apple,67889
1,Samsung,39897
2,Huawei,33754


##  Top 5 handsets per top 3 handset manufacturer

In [10]:
top3_manufacturers = top_manufacturers.head(3)['handset_manufacturer'].values
top_manufacturers = df[df["handset_manufacturer"].isin(top3_manufacturers)]
top_manufacturers['handset_type'].groupby(
    df['handset_manufacturer']).apply(lambda x: x.value_counts().head(5))

handset_manufacturer                                
Apple                 Apple iPhone 6S (A1688)            9369
                      Apple iPhone 6 (A1586)             8967
                      Huawei B528S-23A                   8817
                      Apple iPhone 7 (A1778)             6240
                      Apple iPhone Se (A1723)            5151
Huawei                Huawei B528S-23A                  19373
                      Huawei E5180                       2062
                      Huawei P20 Lite Huawei Nova 3E     2000
                      Huawei P20                         1471
                      Huawei Y6 2018                      984
Samsung               Samsung Galaxy S8 (Sm-G950F)       4404
                      Samsung Galaxy A5 Sm-A520F         3672
                      Samsung Galaxy J5 (Sm-J530)        3663
                      Samsung Galaxy J3 (Sm-J330)        3431
                      Samsung Galaxy S7 (Sm-G930X)       3106
Name: handset_typ

# Task 1.1

## Number of xDR sessions

In [11]:
sessions = selector.find_agg(df, 'msisdn_number', 'count', 'bearer_id', 10, False)
sessions

Unnamed: 0,msisdn_number,bearer_id
13381,33626320676,18
6353,33614892860,17
13037,33625779332,17
36669,33659725664,16
91945,33760536639,15
75563,33675877202,15
64471,33667163239,13
13845,33627080969,12
91604,33760413819,12
1258,33604515716,12


## Session duration

In [12]:
duration = df.groupby('msisdn_number').agg({'dur_(ms)': 'sum'})
duration = duration.rename(columns={'dur_(ms)': 'total_duration(ms)'})
duration.sort_values(by=['total_duration(ms)'], ascending=False).head(10)

Unnamed: 0_level_0,total_duration(ms)
msisdn_number,Unnamed: 1_level_1
33625779332,4137804.0
33626320676,4006827.0
33614892860,3899805.0
33659725664,3492198.0
33760536639,3396476.0
33675877202,3331851.0
33667163239,3113767.0
33603127838,2855128.0
33627080969,2782060.0
33604515716,2779753.0


In [13]:
duration.describe()

Unnamed: 0,total_duration(ms)
count,105714.0
mean,139190.15
std,129395.17
min,7142.0
25%,72723.5
50%,103107.0
75%,172799.0
max,4137804.0


## The total download (DL) and upload (UL) data

In [14]:
usage = df.groupby('msisdn_number')[['total_dl_(bytes)', 'total_ul_(bytes)']].sum()
usage['total_data'] = usage['total_dl_(bytes)'] + usage['total_ul_(bytes)']
usage.nlargest(20, 'total_data')

Unnamed: 0_level_0,total_dl_(bytes),total_ul_(bytes),total_data
msisdn_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33614892860,8156743493.0,689483001.0,8846226494.0
33760536639,7811295382.0,703478581.0,8514773963.0
33625779332,7770043342.0,729577380.0,8499620722.0
33626320676,7301516540.0,669650721.0,7971167261.0
33675877202,7309541816.0,581568792.0,7891110608.0
33659725664,7081602462.0,624260321.0,7705862783.0
33666464084,6903439962.0,405060976.0,7308500938.0
33760413819,6610851624.0,521518890.0,7132370514.0
33664712899,6400773755.0,471562499.0,6872336254.0
33698792269,6010556021.0,530343105.0,6540899126.0


In [15]:
usage.describe()

Unnamed: 0,total_dl_(bytes),total_ul_(bytes),total_data
count,105714.0,105714.0,105714.0
mean,631711503.63,57135093.16,688846596.79
std,460920510.16,35268950.2,486996120.5
min,8827082.0,9493162.0,33249009.0
25%,314222219.75,36375624.75,357909940.5
50%,569321331.5,46763051.0,616921019.0
75%,806265196.5,65502749.0,856440351.75
max,8156743493.0,729577380.0,8846226494.0


## The total data volume (in Bytes) during this session for each application

In [16]:
df["social_media"] = df["social_media_dl_(bytes)"] + df['social_media_ul_(bytes)']
df["google"] = df["google_dl_(bytes)"] + df["google_ul_(bytes)"]
df['email'] = df["email_dl_(bytes)"] + df["email_ul_(bytes)"]
df['youtube'] = df["youtube_dl_(bytes)"] + df["youtube_ul_(bytes)"]
df['netflix'] = df["netflix_dl_(bytes)"] + df["netflix_ul_(bytes)"]
df["gaming"] = df["gaming_dl_(bytes)"] + df["gaming_ul_(bytes)"]
df['other'] = df["other_dl_(bytes)"]+df["other_ul_(bytes)"]
df['total_data'] = df['total_dl_(bytes)'] + df['total_ul_(bytes)']

In [17]:
apps_df = df.groupby('msisdn_number')[['social_media', 'google', 'email', 'youtube', 'gaming', 'other', 'total_data']].sum().nlargest(20, 'total_data')
apps_df

Unnamed: 0_level_0,social_media,google,email,youtube,gaming,other,total_data
msisdn_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
33614892860,28294544.0,127973787.0,40788634.0,394370218.0,7749432234.0,7639263572.0,8846226494.0
33760536639,39783189.0,123223099.0,33693767.0,396289198.0,7461045228.0,4716134493.0,8514773963.0
33625779332,27135500.0,142307915.0,40633966.0,452958769.0,7326673487.0,6354583086.0,8499620722.0
33626320676,43374779.0,152191852.0,42418782.0,374483047.0,6887572116.0,8167877776.0,7971167261.0
33675877202,19222921.0,109860502.0,31514421.0,317410572.0,6970567597.0,6798515150.0,7891110608.0
33659725664,35412358.0,116516345.0,35999792.0,257991088.0,6725559211.0,6317415487.0,7705862783.0
33666464084,18629986.0,89320737.0,25557139.0,227336012.0,6646303338.0,4349141478.0,7308500938.0
33760413819,20777205.0,82738720.0,25576965.0,303169107.0,6268619592.0,4101645436.0,7132370514.0
33664712899,19390599.0,90389372.0,21426007.0,276834013.0,6103856008.0,3976960308.0,6872336254.0
33698792269,15728161.0,79736125.0,29059042.0,302661958.0,5753743069.0,4689876286.0,6540899126.0


In [20]:
sort_by_total_data = apps_df.sort_values(
    'total_data', ascending=False)['total_data']
hist(sort_by_total_data)

In [None]:
sorted_by_tp = user_experience_df.sort_values(
    'total_avg_tp', ascending=False)
top_10 = sorted_by_tp.head(10)['total_avg_tp']
last_10 = sorted_by_tp.tail(10)['total_avg_tp']
most_10 = user_engagement_df['total_avg_tp'].value_counts().head(10) mult_hist([top_10, last_10, most_10, top_10], 1,
                                                                               3, "TCP values in the dataset", ['Top 10', 'Last 10', 'Most 10'])


In [None]:
df.to_csv('../data/clean_data.csv', index=False)