# Data Analysis

## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Display every column, never truncate cell contents, and do not wrap wide
# frames across several printed blocks.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', None),
    ("expand_frame_repr", False),
):
    pd.set_option(option_name, option_value)

In [3]:
# `os` and `sys` are not imported by the imports cell above, so the path set-up
# below raised NameError on a fresh kernel; import them here.
import os
import sys

# Make the helper modules in ../scripts importable.
# NOTE: the path is resolved relative to the kernel's working directory.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:  # keep sys.path clean when the cell is re-run
    sys.path.append(scripts_dir)

from df_overview import DfOverview
from df_cleaner import DfCleaner
from df_selector import DfSelector
# NOTE(review): wildcard imports hide where helpers come from (presumably the
# `histogram` and `scatter` functions used below) — prefer explicit names once
# the module contents are confirmed.
from vis_seaborn import *
from vis_plotly import *

2021-07-17 10:10:37,277 — DfOverview — DEBUG — Loaded successfully!
2021-07-17 10:10:37,280 — DfCleaner — DEBUG — Loaded successfully!
2021-07-17 10:10:37,284 — DfSelector — DEBUG — Loaded successfully!


## Data reading

In [4]:
# Read the cleaned dataset and print its column / dtype summary.
df = pd.read_csv(filepath_or_buffer="../data/clean_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146882 entries, 0 to 146881
Data columns (total 54 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   bearer_id                       146882 non-null  int64  
 1   start                           146882 non-null  object 
 2   start_ms                        146882 non-null  float64
 3   end                             146882 non-null  object 
 4   end_ms                          146882 non-null  float64
 5   dur_(ms)                        146882 non-null  float64
 6   imsi                            146882 non-null  int64  
 7   msisdn_number                   146882 non-null  int64  
 8   imei                            146882 non-null  int64  
 9   last_location_name              146882 non-null  object 
 10  avg_rtt_dl_(ms)                 146882 non-null  float64
 11  avg_rtt_ul_(ms)                 146882 non-null  float64
 12  avg_bearer_tp_dl

## Task 1.2

### Describe all  relevant variables and associated data types (slide). 

The columns used for this univariate analysis are the ones given in Task 1.1. They are:
- number of xDR sessions
- Session duration
- the total download(DL) and upload(UL) data
- the total data volume for Applications

I chose the following columns based on that:

In [7]:
# Columns inspected in Task 1.2: key columns and duration first, then the
# per-application volumes, then the overall total.
relavant_cols = (
    ["msisdn_number", "handset_manufacturer", "bearer_id", "dur_(ms)"]
    + ['social_media', 'google', 'email', 'youtube', 'netflix', 'gaming', 'other']
    + ['total_data']
)

In [8]:
# Preview the first rows of the selected columns.
df.loc[:, relavant_cols].head()

Unnamed: 0,msisdn_number,handset_manufacturer,bearer_id,dur_(ms),social_media,google,email,youtube,netflix,gaming,other,total_data
0,33659219748,Samsung,-9223372036854775808,245071.0,1585413.0,8778341.0,766046.0,22911539.0,13185350.0,812946425.0,77671250.0,872988322.0
1,33665646348,Apple,7277825610512449536,245071.0,2697600.0,6532777.0,3780918.0,21981135.0,26779600.0,574923841.0,110548061.0,651527183.0
2,33664473872,Huawei,-9223372036854775808,245071.0,839351.0,8822396.0,2348652.0,19623308.0,20124838.0,505668618.0,538369717.0,568809782.0
3,33603291937,Samsung,-9223372036854775808,245071.0,1715492.0,13395961.0,3424203.0,26178247.0,22810379.0,769947925.0,45876928.0,848798988.0
4,33659219748,Samsung,-9223372036854775808,245071.0,2480347.0,7991932.0,3672362.0,25308492.0,26233613.0,255096405.0,50438697.0,331143097.0


## Analyze the basic metrics (mean, median, etc) in the Dataset (explain) & their importance for the global objective.


In [9]:
# Summary statistics for the numeric columns among the selected ones.
df.loc[:, relavant_cols].describe()

Unnamed: 0,msisdn_number,bearer_id,dur_(ms),social_media,google,email,youtube,netflix,gaming,other,total_data
count,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0
mean,35968690000.0,-8.628466e+17,100178.017906,1827749.0,7807995.0,2259053.0,22641630.0,22629430.0,430348500.0,429261200.0,495777100.0
std,879229500000.0,8.256065e+18,58974.02781,1035576.0,3517123.0,1071134.0,9244443.0,9259341.0,244087400.0,243210300.0,244437900.0
min,33601000000.0,-9.223372e+18,7142.0,1563.0,40330.0,8359.0,129655.0,98432.0,306358.0,149045.0,28956110.0
25%,33651280000.0,-9.223372e+18,58526.0,931344.2,4943722.0,1358982.0,15996880.0,15981910.0,218689900.0,218467600.0,284477100.0
50%,33663690000.0,6.917538e+18,86399.0,1825410.0,7815465.0,2263822.0,22662870.0,22635170.0,431660100.0,429796700.0,496930300.0
75%,33683440000.0,7.349883e+18,133144.0,2727156.0,10683150.0,3159540.0,29292210.0,29288160.0,641563200.0,639663200.0,706614700.0
max,337000000000000.0,7.566056e+18,245071.0,3650861.0,15528780.0,4518036.0,45190080.0,45198150.0,859202800.0,859520900.0,953632000.0


## Non-Graphical Univariate Analysis

In [10]:
# Per-application volume columns followed by the overall total.
cols = [
    'social_media', 'google', 'email', 'youtube',
    'netflix', 'gaming', 'other', 'total_data',
]

In [11]:
# One row per column, one column per statistic.
df[cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
social_media,146882.0,1827749.0,1035576.0,1563.0,931344.2,1825410.0,2727156.0,3650861.0
google,146882.0,7807995.0,3517123.0,40330.0,4943722.0,7815465.0,10683150.0,15528782.0
email,146882.0,2259053.0,1071134.0,8359.0,1358982.0,2263821.5,3159540.0,4518036.0
youtube,146882.0,22641630.0,9244443.0,129655.0,15996880.0,22662867.5,29292210.0,45190078.0
netflix,146882.0,22629430.0,9259341.0,98432.0,15981910.0,22635170.0,29288160.0,45198153.0
gaming,146882.0,430348500.0,244087400.0,306358.0,218689900.0,431660065.0,641563200.0,859202784.0
other,146882.0,429261200.0,243210300.0,149045.0,218467600.0,429796668.5,639663200.0,859520934.0
total_data,146882.0,495777100.0,244437900.0,28956107.0,284477100.0,496930298.0,706614700.0,953632035.0


In [12]:
# Fit the scaler on the volume columns, then apply it to the same data.
scaler = StandardScaler().fit(df[cols])
scaled_array = scaler.transform(df[cols])

In [13]:
# Wrap the scaled array in a frame that carries the original column names.
scaled_data = pd.DataFrame(data=scaled_array, columns=cols)
scaled_data.head(n=5)

Unnamed: 0,social_media,google,email,youtube,netflix,gaming,other,total_data
0,-0.234012,0.275893,-1.393861,0.029197,-1.019955,1.567468,-1.445626,1.543183
1,0.839971,-0.362575,1.420803,-0.071448,0.448216,0.592312,-1.310447,0.637179
2,-0.954446,0.288419,0.083649,-0.326502,-0.270495,0.30858,0.44862,0.298779
3,-0.108401,1.588794,1.087777,0.382569,0.019542,1.391308,-1.576354,1.444224
4,0.630181,0.052298,1.319456,0.288484,0.389249,-0.717992,-1.557597,-0.673523


In [14]:
# Summary statistics of the standardised columns.
scaled_data.describe()

Unnamed: 0,social_media,google,email,youtube,netflix,gaming,other,total_data
count,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0,146882.0
mean,8.030262e-17,3.599105e-17,-2.108186e-16,6.724135e-17,4.5279070000000007e-17,9.244476000000001e-17,7.730337e-17,7.401386e-18
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-1.763455,-2.208536,-2.101233,-2.435198,-2.433335,-1.761843,-1.764372,-1.90978
25%,-0.8656124,-0.8143825,-0.8402995,-0.7187847,-0.7179287,-0.8671456,-0.8667159,-0.864435
50%,-0.002258665,0.002123889,0.004452111,0.002297725,0.0006195494,0.005373446,0.002201772,0.00471785
75%,0.8685118,0.8174775,0.8406885,0.7194165,0.7191389,0.8653272,0.8651061,0.8625435
max,1.760486,2.195207,2.108972,2.439144,2.437409,1.756976,1.769091,1.873099


## Graphical Univariate Analysis

In [15]:
# Draw a reproducible random subset of rows for plotting.
sample = scaled_data.sample(5000, random_state=1)
# Build one row per column of `cols` and one column per sampled observation.
# reshape(8, -1) on the (5000, 8) array only re-chunks the row-major buffer, so
# every resulting row mixed values from all eight columns; a transpose is what
# is needed. copy=True keeps the array writable and independent of the frame.
sample = sample[cols].to_numpy(copy=True).T
sample.shape


(8, 5000)

In [16]:
# Plot the rows of `sample`, passing `cols` alongside them.
# NOTE(review): the third argument (0.1) is presumably a bin size — confirm
# against the helper's definition.
histogram(sample, cols, .1)

In [17]:
# Shift each series by its own constant so the eight histograms are spread out
# instead of being drawn on top of each other.
# The previous code assigned `sample[0] - k` to every row, which replaced all
# eight series with shifted copies of the first one. It also reused the offsets
# -2/-4/-6 for the last three rows, which would stack them on rows 3/2/1.
# NOTE(review): the +2/+4/+6 offsets are inferred from that pattern — confirm.
offsets = np.array([-8, -6, -4, -2, 0, 2, 4, 6])
sample = sample + offsets[:, np.newaxis]


In [18]:
# Plot the rows of `sample` again after the cell above has modified them.
histogram(sample, cols, .1)

## Bivariate Analysis – explore the relationship between each application & the total DL+UL data using appropriate methods and interpret your findings. 


In [19]:
# Seeded so the scatter plots in this section are reproducible on re-run.
# NOTE: `sample` is rebound here from the earlier array to a DataFrame; the
# cells below rely on this name.
sample = scaled_data.sample(1000, random_state=1)
scatter(sample, x="google", y="total_data", fit='ols', mx='box', my='violin')

In [20]:
# Gaming volume against total volume on the sampled rows.
scatter(
    sample,
    x="gaming",
    y="total_data",
    fit='ols',
    mx='box',
    my='violin',
)

In [21]:
# Netflix volume against total volume on the sampled rows.
scatter(
    sample,
    x="netflix",
    y="total_data",
    fit='ols',
    mx='box',
    my='violin',
)

In [22]:
# "Other" volume against total volume on the sampled rows.
scatter(
    sample,
    x="other",
    y="total_data",
    fit='ols',
    mx='box',
    my='violin',
)

In [23]:
def corelation(df):
    """Display a scatter matrix of total data and selected application volumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``total_data``, ``netflix``, ``youtube``,
        ``gaming``, ``other`` and ``google``.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``.
    """
    # Plot the frame that was passed in; the previous body ignored `df` and
    # always read the notebook-global `sample`.
    fig = px.scatter_matrix(df, dimensions=[
        "total_data",
        "netflix",
        "youtube",
        "gaming",
        "other",
        "google"
    ])
    fig.update_layout(width=1200, height=600)
    fig.show()


corelation(sample)


## Variable transformations – segment the users into top five decile classes based on the total duration for all sessions and compute the total data (DL+UL) per decile class. 


In [24]:
# Sum the session duration and the total data volume per msisdn_number.
user_dur = df.groupby('msisdn_number')[['dur_(ms)', 'total_data']].sum()
user_dur

Unnamed: 0_level_0,dur_(ms),total_data
msisdn_number,Unnamed: 1_level_1,Unnamed: 2_level_1
33601001722,116720.0,8.786906e+08
33601001754,181230.0,1.568596e+08
33601002511,134969.0,5.959665e+08
33601007832,49878.0,4.223207e+08
33601008617,37104.0,1.457411e+09
...,...,...
33789967113,160461.0,2.081231e+08
33789980299,210389.0,1.094693e+09
33789996170,8810.0,7.146416e+08
33789997247,140988.0,4.803073e+08


In [25]:
# Bin users into five equal-sized classes by total duration (1 = shortest).
# NOTE(review): five quantile bins are quintiles, although the column is named
# `dur_decile` and the section heading asks for the top five deciles — confirm
# the intended segmentation.
user_dur['dur_decile'] = pd.qcut(user_dur['dur_(ms)'], 5, labels=np.arange(1, 6))
# `dur_decile` is categorical: pass `observed` explicitly so the result does not
# depend on the pandas default, which is deprecated and changing in pandas 2.x.
total_data_per_decile = user_dur.groupby('dur_decile', observed=True).agg(
    {'dur_(ms)': 'sum', 'total_data': 'sum'})
total_data_per_decile

Unnamed: 0_level_0,dur_(ms),total_data
dur_decile,Unnamed: 1_level_1,Unnamed: 2_level_1
1,647418800.0,11429980000000.0
2,1778446000.0,12271550000000.0
3,2129877000.0,11565240000000.0
4,3414379000.0,13773620000000.0
5,6744227000.0,23780340000000.0


In [33]:
# Plot total duration and total data volume for each duration class.
# The x values come from the aggregated table's index instead of a hard-coded
# range, so the axis always matches the classes actually present.
duration_classes = list(total_data_per_decile.index)
fig = make_subplots(
    rows=1, cols=2, subplot_titles=("duration", "total_data")
)
fig.add_trace(go.Scatter(x=duration_classes,
              y=total_data_per_decile['dur_(ms)'], name="duration"), row=1, col=1)
fig.add_trace(go.Scatter(x=duration_classes,
              y=total_data_per_decile['total_data'], name="total_data"), row=1, col=2)
# The previous title ("The Elbow Method") was copied from another chart and did
# not describe this figure.
fig.update_layout(
    title_text="Total duration and total data per duration class", height=700)
fig.update_xaxes(title_text="duration class")
fig.show()


## Correlation Analysis 

In [34]:
# Copy the per-application volume columns and compute their pairwise Pearson
# correlation matrix.
data_usage_df = df.loc[
    :,
    ['social_media', 'google', 'email', 'youtube', 'netflix', 'gaming', 'other'],
].copy()
data_usage_corr = data_usage_df.corr(method='pearson')
data_usage_corr


Unnamed: 0,social_media,google,email,youtube,netflix,gaming,other
social_media,1.0,-0.001914,0.004184,0.000593,0.001384,0.001904,0.005532
google,-0.001914,1.0,-4.8e-05,0.002164,-0.00199,-0.000825,-0.001301
email,0.004184,-4.8e-05,1.0,-0.00344,0.001524,5.5e-05,-0.000761
youtube,0.000593,0.002164,-0.00344,1.0,0.003619,-0.003728,0.003225
netflix,0.001384,-0.00199,0.001524,0.003619,1.0,-0.003505,-0.007931
gaming,0.001904,-0.000825,5.5e-05,-0.003728,-0.003505,1.0,-0.002828
other,0.005532,-0.001301,-0.000761,0.003225,-0.007931,-0.002828,1.0


In [35]:
# Heatmap of the correlation matrix. Passing only `.values` drops the row and
# column names, so label both axes explicitly to keep the cells identifiable.
fig = px.imshow(
    data_usage_corr.values,
    x=list(data_usage_corr.columns),
    y=list(data_usage_corr.index),
)
fig.show()

## Dimensionality Reduction – perform a principal component analysis to reduce the dimensions of your data and provide a useful interpretation of the results (Provide your interpretation in four (4) bullet points-maximum). 

In our dataset the original feature space has p = 45 dimensions. PCA projects the data onto a smaller subspace of k dimensions (where k < p) while retaining as much of the variation as possible. These k dimensions are known as the principal components. 

By applying PCA, we lose some of the variance (i.e., information). By reducing the dimensionality of the data, PCA will reduce the size of the data.
- This will improve the performance of machine learning algorithms.
- This will reduce hardware requirements and speed up the training process.
- This will allow us to easily understand the underlying structure of the data.
- This will allow us to visualize the data on a 2d or 3d plot (if we choose the number of principal components as 2 or 3).

We proceed in the following steps:
1. Obtain the feature matrix

In [37]:
# Numeric feature columns fed to PCA, in their original order.
numeric_columns = [
    # session timing
    'start_ms', 'end_ms', 'dur_(ms)',
    # round-trip time, bearer throughput and TCP retransmission volume
    'avg_rtt_dl_(ms)', 'avg_rtt_ul_(ms)',
    'avg_bearer_tp_dl_(kbps)', 'avg_bearer_tp_ul_(kbps)',
    'tcp_dl_retrans_vol_(bytes)', 'tcp_ul_retrans_vol_(bytes)',
    # throughput bucket percentages, downlink then uplink
    'dl_tp_<_50_kbps_(%)', '50_kbps_<_dl_tp_<_250_kbps_(%)',
    '250_kbps_<_dl_tp_<_1_mbps_(%)', 'dl_tp_>_1_mbps_(%)',
    'ul_tp_<_10_kbps_(%)', '10_kbps_<_ul_tp_<_50_kbps_(%)',
    '50_kbps_<_ul_tp_<_300_kbps_(%)', 'ul_tp_>_300_kbps_(%)',
    # activity durations and low-volume second counts
    'activity_duration_dl_(ms)', 'activity_duration_ul_(ms)',
    'nb_of_sec_with_vol_dl_<_6250b', 'nb_of_sec_with_vol_ul_<_1250b',
    # per-application byte counts: <app>_dl_(bytes) followed by <app>_ul_(bytes)
    *[f'{app}_{link}_(bytes)'
      for app in ('social_media', 'google', 'email', 'youtube',
                  'netflix', 'gaming', 'other')
      for link in ('dl', 'ul')],
    # overall byte totals
    'total_ul_(bytes)', 'total_dl_(bytes)',
    # per-application totals and the overall total
    'social_media', 'google', 'email', 'youtube', 'netflix', 'gaming', 'other',
    'total_data',
]


In [38]:
# Extract the selected columns as a plain array (rows x features).
feature_matrix = df[numeric_columns].to_numpy()
feature_matrix.shape

(146882, 45)

2. Standardize the features if necessary

In [39]:
# Fit a fresh scaler on the feature matrix and apply it to the same data.
# NOTE: this rebinds `scaler`, replacing the one fitted on `df[cols]` earlier.
scaler = StandardScaler()
feature_matrix_scaled = scaler.fit(feature_matrix).transform(feature_matrix)

3. Choose the right number of dimensions(k)


In [58]:
# Fit PCA keeping all 45 components, then project the scaled features.
pca_45 = PCA(n_components=45, random_state=1).fit(feature_matrix_scaled)
x_pca_45 = pca_45.transform(feature_matrix_scaled)

In [59]:
# Switch DataFrame float display to two decimals; this pandas option is global.
pd.set_option('display.float_format', '{:.2f}'.format)
# Sum the per-component explained-variance ratios and print the total as a percentage.
print(f"percentage of variance kept: {sum(pca_45.explained_variance_ratio_ * 100):.4}%")

percentage of variance kept: 100.0%


Here we didn't lose any variability because we didn't reduce the feature space.

In [60]:
# Percentage of variance explained by each principal component, in order.
feat_mt_ratio = pca_45.explained_variance_ratio_ * 100
# Iterate over the ratios themselves rather than indexing by
# len(numeric_columns), which is a different object and would raise IndexError
# if the PCA kept fewer components than there are columns.
for component_ratio in feat_mt_ratio:
  print(f'{component_ratio:.2f}')

21.97
9.75
6.22
4.92
4.91
4.88
4.85
4.82
3.55
3.38
2.74
2.49
2.46
2.44
2.44
2.44
2.43
2.41
2.20
2.14
1.79
1.40
1.01
0.96
0.46
0.31
0.27
0.16
0.09
0.06
0.04
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00
0.00


The first component alone captures about 21.97% of the variability in the dataset, the second component alone captures about 9.75%, and so on.


In [65]:
# Cumulative explained-variance ratio against the number of components kept.
cumulative_variance = np.cumsum(pca_45.explained_variance_ratio_)
# Start the x axis at 1: the i-th cumulative value corresponds to keeping i
# components. The previous range(0, 45) shifted every point by one and
# hard-coded the component count.
component_counts = np.arange(1, len(cumulative_variance) + 1)
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Scatter(x=component_counts,
              y=cumulative_variance), row=1, col=1)
fig.update_layout(title_text="The Elbow Method")
fig.update_xaxes(title_text="number of principal components")
fig.update_yaxes(title_text="cumulative explained variance ratio")
fig.show()


We can see that the first 24 principal components keep about 98.61% of the variability in the dataset while dropping 21 (45 − 24) dimensions.

In [70]:
# Fit PCA keeping the first 24 components, then project the scaled features.
pca_24 = PCA(n_components=24, random_state=1).fit(feature_matrix_scaled)
x_pca_24 = pca_24.transform(feature_matrix_scaled)

In [72]:
# Sum the explained-variance ratios of the 24 fitted components and print the
# total as a percentage.
print(
    f"percentage of variance kept: {sum(pca_24.explained_variance_ratio_ * 100):.4}%")


percentage of variance kept: 98.61%
