# Data Analysis

## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
# pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from df_overview import DfOverview
from df_cleaner import DfCleaner
from vis_seaborn import *

## Data reading

In [4]:
df = pd.read_csv("../data/clean_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146891 entries, 0 to 146890
Data columns (total 34 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   bearer_id                146891 non-null  float64
 1   msisdn_number            146891 non-null  float64
 2   imei                     146891 non-null  float64
 3   imsi                     146891 non-null  float64
 4   last_location_name       146891 non-null  object 
 5   handset_manufacturer     146891 non-null  object 
 6   handset_type             146891 non-null  object 
 7   start                    146891 non-null  object 
 8   end                      146891 non-null  object 
 9   dur_(ms)                 146891 non-null  float64
 10  social_media_dl_(bytes)  146891 non-null  float64
 11  social_media_ul_(bytes)  146891 non-null  float64
 12  google_dl_(bytes)        146891 non-null  float64
 13  google_ul_(bytes)        146891 non-null  float64
 14  emai

In [156]:
cols_all = ['social_media',
        'google',
        'email',
        'youtube',
        'netflix',
        'gaming',
        'other',
        'total_data']
cols_dl = ['social_media_dl_(bytes)',
           'google_dl_(bytes)',
           'email_dl_(bytes)',
           'youtube_dl_(bytes)',
           'netflix_dl_(bytes)',
           'gaming_dl_(bytes)',
           'other_dl_(bytes)',
           'total_dl_(bytes)']
cols_ul = ['social_media_ul_(bytes)',
           'google_ul_(bytes)',
           'email_ul_(bytes)',
           'youtube_ul_(bytes)',
           'netflix_ul_(bytes)',
           'gaming_ul_(bytes)',
           'other_ul_(bytes)',
           'total_ul_(bytes)']


In [131]:
def myLayout(title, x_title, y_title, mode, width, height, margin):
    return go.Layout(
        title=title,
        yaxis=dict(title=x_title),
        xaxis=dict(title=y_title),
        legend=dict(x=0, y=1.0, bgcolor='rgba(255, 255, 255, 0)',
                    bordercolor='rgba(255, 255, 255, 0)'),
        barmode=mode,
        bargap=0.15,
        bargroupgap=0.1,
        width=width,
        height=height,
        margin=margin
    )


def BarTrace(x, y, names):
    trace = []
    for i in range(y.shape[0]):
        trace1 = go.Bar(
            x=x,
            y=y[i],
            name=names[i]
        )
        trace.append(trace1)
    return trace


def barChart(x, y, names, title="", x_title="x", y_title="y", mode='group', full=False):
    width = None
    height = None
    margin = None
    if not (full):
        width = 540
        height = 460
        margin = dict(b=12, l=12, pad=0, r=6, t=54)
    trace = BarTrace(x=x, y=y, names=names)
    fig = go.Figure(data=trace, layout=myLayout(
        title, x_title, y_title, mode, width, height, margin))
    fig.show()

    
def scatter(df, x, y, c=None, s=None, mx=None, my=None, af=None, fit=None):
    fig = px.scatter(df, x=x, y=y, color=c, size=s, marginal_y=my,
                     marginal_x=mx, trendline=fit, animation_frame=af)
    fig.show()


def scatter3D(df, x, y, z, c=None, s=None, mx=None, my=None, af=None, fit=None):
    fig = px.scatter_3d(df, x=x, y=y, z=z, color=c, size=s, animation_frame=af)
    fig.show()


In [37]:
x = df['handset_type'].value_counts().head(10)

In [41]:
top_ten = df['handset_type'].value_counts(ascending=False).head(10).reset_index().copy()
top_ten = top_ten.rename(columns={'index': "handset_type", "handset_type": "count"})
top_ten

Unnamed: 0,handset_type,count
0,Huawei B528S-23A,28193
1,Apple iPhone 6S (A1688),9369
2,Apple iPhone 6 (A1586),8967
3,Apple iPhone 7 (A1778),6240
4,Apple iPhone Se (A1723),5151
5,Apple iPhone 8 (A1905),4961
6,Apple iPhone Xr (A2105),4542
7,Samsung Galaxy S8 (Sm-G950F),4404
8,Apple iPhone X (A1901),3788
9,Samsung Galaxy A5 Sm-A520F,3673


In [45]:
top_ten[['count']]


Unnamed: 0,count
0,28193
1,9369
2,8967
3,6240
4,5151
5,4961
6,4542
7,4404
8,3788
9,3673


In [49]:
np.array(top_ten[['count']]).reshape(1, 10)

array([[28193,  9369,  8967,  6240,  5151,  4961,  4542,  4404,  3788,
         3673]])

In [54]:
barChart(x=top_ten['handset_type'],
         y=np.array(top_ten[['count']]).transpose(),
         names=["Rest of world"],
         )

In [55]:
count_df = df.groupby(['handset_type', 'handset_manufacturer'] ).size().reset_index(name='counts')
count_df

Unnamed: 0,handset_type,handset_manufacturer,counts
0,A-Link Telecom I. Cubot A5,A-Link Telecom International Co Limited,1
1,A-Link Telecom I. Cubot Note Plus,A-Link Telecom International Co Limited,1
2,A-Link Telecom I. Cubot Note S,A-Link Telecom International Co Limited,1
3,A-Link Telecom I. Cubot Nova,A-Link Telecom International Co Limited,1
4,A-Link Telecom I. Cubot Power,A-Link Telecom International Co Limited,1
...,...,...,...
1358,Zte Racer Iii Mini Zte Switch X1 Bouygues Telecom Bs 402 Blade Q Mini Zte Blade G Pro,Zte,5
1359,Zte Zte Blade C2 Smartphone Android By Sfr Startrail 4 Zte Blade Flex T809 Zte T809,Zte,2
1360,Zyxel Communicat. Lte7460,Zyxel Communications Corp,1
1361,Zyxel Communicat. Sbg3600,Zyxel Communications Corp,1


In [56]:
px.histogram(count_df.nlargest(20, "counts"),
             x="handset_manufacturer", y="counts", color="handset_type")


In [57]:
px.histogram(count_df.nlargest(20, "counts"),
             x="handset_type", y="counts", color="handset_manufacturer")


In [68]:
df[cols]


Unnamed: 0,social_media,google,email,youtube,netflix,gaming,other,total_data
0,1570185.0,2905912.0,3701304.0,18355943.0,17855187.0,292426453.0,180558843.0,345629377.0
1,1933278.0,4414096.0,937385.0,39359124.0,35565545.0,609920783.0,541959383.0,707185356.0
2,1726277.0,10229119.0,3363124.0,34425237.0,23751202.0,229980251.0,414908351.0,307690973.0
3,657493.0,11811761.0,2070983.0,36534765.0,15092588.0,810387875.0,761837216.0,889352748.0
4,912788.0,7748843.0,2110349.0,34222253.0,17539799.0,531237049.0,564619822.0,607681403.0
...,...,...,...,...,...,...,...,...
146886,993175.0,1265634.0,4280107.0,24553645.0,29618711.0,797160102.0,336273648.0,872761860.0
146887,3517065.0,12784914.0,690876.0,27955095.0,37561864.0,535806880.0,16751926.0,631804110.0
146888,2352181.0,4415361.0,2121718.0,22165518.0,40643294.0,631628095.0,714638145.0,705783925.0
146889,1260239.0,6585469.0,2058277.0,24516413.0,15029702.0,566933800.0,132415585.0,627698629.0


In [118]:
dl_df = pd.DataFrame(df[cols_dl].sum(axis=0, skipna=True)).reset_index()
dl_df = dl_df.rename(columns={'index': "app", 0: "sum"})
dl_df

Unnamed: 0,app,sum
0,social_media_dl_(bytes),263643600000.0
1,google_dl_(bytes),844832100000.0
2,email_dl_(bytes),263192100000.0
3,youtube_dl_(bytes),1708843000000.0
4,netflix_dl_(bytes),1707828000000.0
5,gaming_dl_(bytes),61997530000000.0
6,other_dl_(bytes),61841670000000.0
7,total_dl_(bytes),66785870000000.0


In [119]:
ul_df = pd.DataFrame(df[cols_ul].sum(axis=0, skipna=True)).reset_index()
ul_df = ul_df.rename(columns={'index': "app", 0: "sum"})
ul_df


Unnamed: 0,app,sum
0,social_media_ul_(bytes),4834995000.0
1,google_ul_(bytes),302089500000.0
2,email_ul_(bytes),68646130000.0
3,youtube_ul_(bytes),1617062000000.0
4,netflix_ul_(bytes),1616222000000.0
5,gaming_ul_(bytes),1217764000000.0
6,other_ul_(bytes),1213744000000.0
7,total_ul_(bytes),6040362000000.0


In [120]:
y = np.array([dl_df['sum'], ul_df['sum']])
y.shape

(2, 8)

In [121]:
y = [dl_df['sum'], ul_df['sum']]
y

[0    2.636436e+11
 1    8.448321e+11
 2    2.631921e+11
 3    1.708843e+12
 4    1.707828e+12
 5    6.199753e+13
 6    6.184167e+13
 7    6.678587e+13
 Name: sum, dtype: float64,
 0    4.834995e+09
 1    3.020895e+11
 2    6.864613e+10
 3    1.617062e+12
 4    1.616222e+12
 5    1.217764e+12
 6    1.213744e+12
 7    6.040362e+12
 Name: sum, dtype: float64]

In [130]:
barChart(np.array([cols, cols]),
         np.array(y),
         ["Rest of world", "China"],
         "US Export of Plastic Scrap",
         "USD (millions)",
         "year",
         mode='stack',
         full=True
         )


In [None]:
x = [1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
         2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]



barChart(np.array([x, x]),
         np.array(),
         ["Rest of world", "China"],
         "US Export of Plastic Scrap",
         "USD (millions)",
         "year",
         mode='group',
         full=True
         )


In [134]:
count_df


Unnamed: 0,handset_type,handset_manufacturer,counts
0,A-Link Telecom I. Cubot A5,A-Link Telecom International Co Limited,1
1,A-Link Telecom I. Cubot Note Plus,A-Link Telecom International Co Limited,1
2,A-Link Telecom I. Cubot Note S,A-Link Telecom International Co Limited,1
3,A-Link Telecom I. Cubot Nova,A-Link Telecom International Co Limited,1
4,A-Link Telecom I. Cubot Power,A-Link Telecom International Co Limited,1
...,...,...,...
1358,Zte Racer Iii Mini Zte Switch X1 Bouygues Telecom Bs 402 Blade Q Mini Zte Blade G Pro,Zte,5
1359,Zte Zte Blade C2 Smartphone Android By Sfr Startrail 4 Zte Blade Flex T809 Zte T809,Zte,2
1360,Zyxel Communicat. Lte7460,Zyxel Communications Corp,1
1361,Zyxel Communicat. Sbg3600,Zyxel Communications Corp,1


In [138]:
scatter(count_df.nlargest(20, "counts"), x="handset_type", y="counts",
        c="handset_manufacturer")


In [140]:
x = df.groupby('handset_manufacturer').count
x

<bound method DataFrameGroupBy.count of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x12020ee80>>

In [144]:
fig = px.scatter(df, x="total_dl_(bytes)", y="total_ul_(bytes)", color="handset_manufacturer",
                 log_x=True, size_max=60)
fig.show()


In [157]:
new_cols = cols_all
new_cols.extend(['handset_manufacturer', 'handset_type'])
new_cols


['social_media',
 'google',
 'email',
 'youtube',
 'netflix',
 'gaming',
 'other',
 'total_data',
 'handset_manufacturer',
 'handset_type']

In [160]:
x = df.groupby(new_cols).size().reset_index(name='counts')
x = x.nlargest(25, "counts")
x

Unnamed: 0,social_media,google,email,youtube,netflix,gaming,other,total_data,handset_manufacturer,handset_type,counts
0,1563.0,3531073.0,2718720.0,20373145.0,37839764.0,354278713.0,331972162.0,424126106.0,Huawei,Huawei P8 Lite,1
1,1753.0,1249552.0,2450344.0,29091215.0,22158460.0,557766077.0,627388258.0,622175722.0,Samsung,Samsung Galaxy S7 (Sm-G930X),1
2,2050.0,8332684.0,2464499.0,28270874.0,19802993.0,580324660.0,345118647.0,641433912.0,Telit,Telit Le910-Eu V2,1
3,3111.0,10195185.0,797024.0,17389313.0,25851145.0,235625749.0,385270297.0,297296850.0,Huawei,Huawei Honor 8X,1
4,4469.0,11311821.0,1637343.0,12137801.0,16569197.0,525103350.0,247972465.0,569284853.0,Samsung,Samsung Galaxy A5 Sm-A520F,1
5,5179.0,8074058.0,640129.0,32638505.0,27686562.0,842659779.0,635166500.0,926734220.0,Samsung,Samsung Galaxy A8 (2018),1
6,5441.0,6415690.0,3219752.0,29030023.0,12457818.0,221855645.0,790884552.0,285666941.0,Apple,Apple iPhone Se (A1723),1
7,6641.0,9200445.0,1454337.0,29254338.0,21709780.0,12585567.0,134330796.0,81563529.0,Apple,Apple iPhone Se (A1723),1
8,6893.0,7874713.0,1897339.0,14124191.0,25076541.0,364074681.0,50680189.0,414366346.0,Apple,Apple iPhone 8 Plus (A1897),1
9,7382.0,7488358.0,1601901.0,28145979.0,17084479.0,98463113.0,803849336.0,153594283.0,Samsung,Samsung Galaxy S7 (Sm-G930X),1


In [164]:
scatter(x, x="gaming", y="total_data",
        c="handset_manufacturer", mx='box', my='violin')


In [165]:
scatter(x, x="netflix", y="youtube",
        c="handset_type", mx='box', my='violin')

In [166]:

def scatter3D(df, x, y, z, c=None, s=None, mx=None, my=None, af=None, fit=None):
    fig = px.scatter_3d(df, x=x, y=y, z=z, color=c, size=s,
                        animation_frame=af, size_max=18)
    fig.show()


In [167]:
scatter3D(x, x="netflix", y="youtube", z="social_media",
          c="handset_manufacturer")
