# User Behavior Overview

## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
# pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from vis_plotly import *

## Data reading

In [4]:
df = pd.read_csv("../data/clean_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146891 entries, 0 to 146890
Data columns (total 34 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   bearer_id                146891 non-null  float64
 1   msisdn_number            146891 non-null  float64
 2   imei                     146891 non-null  float64
 3   imsi                     146891 non-null  float64
 4   last_location_name       146891 non-null  object 
 5   handset_manufacturer     146891 non-null  object 
 6   handset_type             146891 non-null  object 
 7   start                    146891 non-null  object 
 8   end                      146891 non-null  object 
 9   dur_(ms)                 146891 non-null  float64
 10  social_media_dl_(bytes)  146891 non-null  float64
 11  social_media_ul_(bytes)  146891 non-null  float64
 12  google_dl_(bytes)        146891 non-null  float64
 13  google_ul_(bytes)        146891 non-null  float64
 14  emai

##  Top 10 handsets used by the customers

In [5]:
class DfSelector():

    def __init__(self):
        pass

    def filter_by_count(self, df, column):
      new_df = df[column].value_counts(ascending=False).reset_index().copy()
      new_df = new_df.rename(
          columns={'index': column, column: "count"})
      return new_df

    def filter_by_sum(self, df, column):
      new_df = df[column].sum(ascending=False).reset_index().copy()

      new_df = new_df.rename(
          columns={'index': column, column: "sum"})
      return new_df

    
    def find_agg(self, df: pd.DataFrame, agg_column: str, agg_metric: str, col_name: str, top: int, order=False) -> pd.DataFrame:
        new_df = df.groupby(agg_column)[agg_column].agg(agg_metric).reset_index(name=col_name).\
        sort_values(by=col_name, ascending=order)[:top]
        return new_df
    
selector = DfSelector()


In [6]:
count_df = selector.filter_by_count(df, 'handset_type')
count_df.head(10)

Unnamed: 0,handset_type,count
0,Huawei B528S-23A,28193
1,Apple iPhone 6S (A1688),9369
2,Apple iPhone 6 (A1586),8967
3,Apple iPhone 7 (A1778),6240
4,Apple iPhone Se (A1723),5151
5,Apple iPhone 8 (A1905),4961
6,Apple iPhone Xr (A2105),4542
7,Samsung Galaxy S8 (Sm-G950F),4404
8,Apple iPhone X (A1901),3788
9,Samsung Galaxy A5 Sm-A520F,3673


In [7]:
fig = px.histogram(count_df.nlargest(10, 'count'), x="handset_type", y='count')
fig.show()

## Top 3 handset manufacturers

In [8]:
top_manufacturers = selector.filter_by_count(df, 'handset_manufacturer')
top_manufacturers.head(3)

Unnamed: 0,handset_manufacturer,count
0,Apple,67892
1,Samsung,39902
2,Huawei,33755


##  Top 5 handsets per top 3 handset manufacturer

In [9]:
top3_manufacturers = top_manufacturers.head(3)['handset_manufacturer'].values
top_manufacturers = df[df["handset_manufacturer"].isin(top3_manufacturers)]
top_manufacturers['handset_type'].groupby(
    df['handset_manufacturer']).apply(lambda x: x.value_counts().head(5))

handset_manufacturer                                
Apple                 Apple iPhone 6S (A1688)            9369
                      Apple iPhone 6 (A1586)             8967
                      Huawei B528S-23A                   8820
                      Apple iPhone 7 (A1778)             6240
                      Apple iPhone Se (A1723)            5151
Huawei                Huawei B528S-23A                  19373
                      Huawei E5180                       2062
                      Huawei P20 Lite Huawei Nova 3E     2000
                      Huawei P20                         1471
                      Huawei Y6 2018                      984
Samsung               Samsung Galaxy S8 (Sm-G950F)       4404
                      Samsung Galaxy A5 Sm-A520F         3673
                      Samsung Galaxy J5 (Sm-J530)        3664
                      Samsung Galaxy J3 (Sm-J330)        3431
                      Samsung Galaxy S7 (Sm-G930X)       3106
Name: handset_typ

# Task 1.1

## Number of xDR sessions

In [10]:
sessions = selector.find_agg(df, 'msisdn_number', 'count', 'bearer_id', 10, False)
sessions

Unnamed: 0,msisdn_number,bearer_id
13381,33626320000.0,18
6353,33614890000.0,17
13037,33625780000.0,17
36669,33659730000.0,16
75564,33675880000.0,15
91949,33760540000.0,15
64472,33667160000.0,13
653,33603130000.0,12
91608,33760410000.0,12
13845,33627080000.0,12


## Session duration

In [22]:
duration = selector.find_agg(df, 'msisdn_number', 'sum', 'dur_(ms)', -1, False)
duration.nlargest(20, 'dur_(ms)')


Unnamed: 0,msisdn_number,dur_(ms)
105718,337000000000000.0,337000000000000.0
13381,33626320000.0,605273800000.0
13037,33625780000.0,571638200000.0
6353,33614890000.0,571453200000.0
36669,33659730000.0,538555600000.0
91949,33760540000.0,506408000000.0
75564,33675880000.0,505138200000.0
64472,33667160000.0,437673100000.0
91608,33760410000.0,405125000000.0
13845,33627080000.0,403525000000.0


In [25]:
duration.describe()


count    1.057180e+05
mean     4.997656e+10
std      1.036670e+12
min      3.360100e+10
25%      3.365960e+10
50%      3.367059e+10
75%      6.724550e+10
max      3.370000e+14
Name: dur_(ms), dtype: float64

## The total download (DL) and upload (UL) data

In [13]:
usage = df.groupby('msisdn_number')[['total_dl_(bytes)', 'total_ul_(bytes)']].sum()
usage['total_data'] = usage['total_dl_(bytes)'] + usage['total_ul_(bytes)']
usage.nlargest(20, 'total_data')

Unnamed: 0_level_0,total_dl_(bytes),total_ul_(bytes),total_data
msisdn_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33614890000.0,8156743000.0,689483001.0,8846226000.0
33760540000.0,7811295000.0,703478581.0,8514774000.0
33625780000.0,7770043000.0,729577380.0,8499621000.0
33626320000.0,7301517000.0,669650721.0,7971167000.0
33675880000.0,7309542000.0,581568792.0,7891111000.0
33659730000.0,7081602000.0,624260321.0,7705863000.0
33666460000.0,6903440000.0,405060976.0,7308501000.0
33760410000.0,6610852000.0,521518890.0,7132371000.0
33664710000.0,6400774000.0,471244453.0,6872018000.0
33698790000.0,6010556000.0,530343105.0,6540899000.0


In [14]:
usage.describe()

Unnamed: 0,total_dl_(bytes),total_ul_(bytes),total_data
count,105719.0,105719.0,105719.0
mean,631730000.0,57136010.0,688866100.0
std,460920900.0,35271800.0,486996400.0
min,8827082.0,2866892.0,33249010.0
25%,314240600.0,36378490.0,357950100.0
50%,569379700.0,46762980.0,616933500.0
75%,806298100.0,65504100.0,856473800.0
max,8156743000.0,729577400.0,8846226000.0


## The total data volume (in Bytes) during this session for each application

In [15]:
df["social_media"] = df["social_media_dl_(bytes)"] + df['social_media_ul_(bytes)']
df["google"] = df["google_dl_(bytes)"] + df["google_ul_(bytes)"]
df['email'] = df["email_dl_(bytes)"] + df["email_ul_(bytes)"]
df['youtube'] = df["youtube_dl_(bytes)"] + df["youtube_ul_(bytes)"]
df['netflix'] = df["netflix_dl_(bytes)"] + df["netflix_ul_(bytes)"]
df["gaming"] = df["gaming_dl_(bytes)"] + df["gaming_ul_(bytes)"]
df['other'] = df["other_dl_(bytes)"]+df["other_ul_(bytes)"]
df['total_data'] = df['total_dl_(bytes)'] + df['total_ul_(bytes)']

In [16]:
apps_df = df.groupby('msisdn_number')[['social_media', 'google', 'email', 'youtube', 'gaming', 'other', 'total_data']].sum().nlargest(20, 'total_data')
apps_df

Unnamed: 0_level_0,social_media,google,email,youtube,gaming,other,total_data
msisdn_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
33614890000.0,28294544.0,127973787.0,40788634.0,394370218.0,7749432000.0,7639264000.0,8846226000.0
33760540000.0,39783189.0,123223099.0,33693767.0,396289198.0,7461045000.0,4716134000.0,8514774000.0
33625780000.0,27135500.0,142307915.0,40633966.0,452958769.0,7326673000.0,6354583000.0,8499621000.0
33626320000.0,43374779.0,152191852.0,42418782.0,374483047.0,6887572000.0,8167878000.0,7971167000.0
33675880000.0,19222921.0,109860502.0,31514421.0,317410572.0,6970568000.0,6798515000.0,7891111000.0
33659730000.0,35412358.0,116516345.0,35999792.0,257991088.0,6725559000.0,6317415000.0,7705863000.0
33666460000.0,18629986.0,89320737.0,25557139.0,227336012.0,6646303000.0,4349141000.0,7308501000.0
33760410000.0,20777205.0,82738720.0,25576965.0,303169107.0,6268620000.0,4101645000.0,7132371000.0
33664710000.0,19390599.0,90389372.0,21426007.0,276834013.0,6103856000.0,3976960000.0,6872018000.0
33698790000.0,15728161.0,79736125.0,29059042.0,302661958.0,5753743000.0,4689876000.0,6540899000.0


In [17]:
df.to_csv('../data/clean_data.csv', index=False)