# Task 1 - User Overview Analysis 


 Extract and Load the xDR Dataset

In [1]:
import os
import sys
# Add the 'scripts' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))


### Load the data from postgresql

In [2]:
# import load_data from scripts/utils
from db_connect import load_data

xdr_df = load_data()

Connection to database successful!


In [3]:
# Explre the first five rows
xdr_df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [4]:
# Let's check the size of the data
xdr_df.shape

(150001, 55)

In [5]:
# check missing data
# Assuming your dataframe is named 'df'
missing_values = xdr_df.isnull().sum()

# Print columns with missing values only
missing_columns = missing_values[missing_values > 0]
print(missing_columns.reset_index())

                                       index       0
0                                  Bearer Id     991
1                                      Start       1
2                                   Start ms       1
3                                        End       1
4                                     End ms       1
5                                  Dur. (ms)       1
6                                       IMSI     570
7                              MSISDN/Number    1066
8                                       IMEI     572
9                         Last Location Name    1153
10                           Avg RTT DL (ms)   27829
11                           Avg RTT UL (ms)   27812
12                   Avg Bearer TP DL (kbps)       1
13                   Avg Bearer TP UL (kbps)       1
14               TCP DL Retrans. Vol (Bytes)   88146
15               TCP UL Retrans. Vol (Bytes)   96649
16                       DL TP < 50 Kbps (%)     754
17            50 Kbps < DL TP < 250 Kbps (%)  

**Approach to Handle Missing Values:**

Handling Missing Values in Key Columns

`IMSI, MSISDN/Number, and IMEI`: These are key identifiers, and missing values here can mean lost user data. I may need to drop rows where these columns are missing, as they are essential for user-level aggregation.

`Avg RTT DL/UL (ms)`: These are performance metrics. If missing values are frequent, I might replace them with the mean/median of their respective columns.

Throughput Metrics `(DL TP and UL TP columns)`: Similarly, these columns can be filled with the mean or median values.

In [6]:
# Drop rows with missing IMSI, MSISDN/Number, IMEI, and Last Location Name (as they are key identifiers)
df_cleaned = xdr_df.dropna(subset=['IMSI', 'MSISDN/Number', 'IMEI', 'Last Location Name'])

# List of throughput-related columns that may contain NaN values
throughput_columns = ['Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
                      'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)', '250 Kbps < DL TP < 1 Mbps (%)', 
                      'DL TP > 1 Mbps (%)', 'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)', 
                      '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)']

# Filling NaN values with the mean for each column
df_cleaned.loc[:, throughput_columns] = df_cleaned[throughput_columns].fillna(df_cleaned[throughput_columns].mean())


In [7]:
# Fill NaN values in 'Total UL (Bytes)' and 'Total DL (Bytes)' with 0
df_cleaned.loc[:, ['Total UL (Bytes)', 'Total DL (Bytes)']] = df_cleaned[['Total UL (Bytes)', 'Total DL (Bytes)']].fillna(0)


In [8]:
# Check and drop duplicates
duplicate_count = df_cleaned.duplicated().sum()
print(f'Number of duplicate rows: {duplicate_count}')

Number of duplicate rows: 0


In [9]:
df_cleaned.dtypes

Bearer Id                                   float64
Start                                        object
Start ms                                    float64
End                                          object
End ms                                      float64
Dur. (ms)                                   float64
IMSI                                        float64
MSISDN/Number                               float64
IMEI                                        float64
Last Location Name                           object
Avg RTT DL (ms)                             float64
Avg RTT UL (ms)                             float64
Avg Bearer TP DL (kbps)                     float64
Avg Bearer TP UL (kbps)                     float64
TCP DL Retrans. Vol (Bytes)                 float64
TCP UL Retrans. Vol (Bytes)                 float64
DL TP < 50 Kbps (%)                         float64
50 Kbps < DL TP < 250 Kbps (%)              float64
250 Kbps < DL TP < 1 Mbps (%)               float64
DL TP > 1 Mb

Handle outliers if any


In [10]:
import numpy as np

# Ensure were are working on a copy of the DataFrame to avoid setting issues
df_cleaned = df_cleaned.copy()

# Select only numeric columns (exclude location and date)
numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns

# Clip outliers for the numeric columns
for column in numeric_columns:
    lower_bound = df_cleaned[column].quantile(0.05)
    upper_bound = df_cleaned[column].quantile(0.95)
    df_cleaned.loc[:, column] = df_cleaned[column].clip(lower=lower_bound, upper=upper_bound)


In [11]:
# Convert 'Start' column to datetime format
import pandas as pd
df_cleaned['Start'] = pd.to_datetime(df_cleaned['Start'], format='%m/%d/%Y %H:%M')


In [12]:
# Check data type of 'Start' column
print(df_cleaned['Start'].dtype)

datetime64[ns]


Start by identifying the top 10 handsets used by the customers.

Then, identify the top 3 handset manufacturers

Next, identify the top 5 handsets per top 3 handset manufacturer

Make a short interpretation and recommendation to marketing teams


In [13]:
from handset_analysis import HandsetAnalysis

ha = HandsetAnalysis(df_cleaned)
# top 10 handsets
print('The top 10 handsets')
ha.top_handsets(top_n=10).reset_index()

The top 10 handsets


Unnamed: 0,Handset Type,count
0,Huawei B528S-23A,19724
1,Apple iPhone 6S (A1688),9391
2,Apple iPhone 6 (A1586),8991
3,undefined,8892
4,Apple iPhone 7 (A1778),6274
5,Apple iPhone Se (A1723),5165
6,Apple iPhone 8 (A1905),4977
7,Apple iPhone Xr (A2105),4555
8,Samsung Galaxy S8 (Sm-G950F),4459
9,Apple iPhone X (A1901),3805


In [14]:
# Top 3 manufactures
print('Top 3 Manufacturers')
top_3_manufacturers = ha.top_manufacturers(top_n=3).reset_index(name='no_of_manufacturer')
top_3_manufacturers.reset_index(drop=True)

Top 3 Manufacturers


Unnamed: 0,Handset Manufacturer,no_of_manufacturer
0,Apple,59304
1,Samsung,40393
2,Huawei,34269


In [15]:
# Top 5 handest type per top 3 manufacturers
top_3_manufacturers = top_3_manufacturers['Handset Manufacturer'].tolist()
top_handsets_by_manufacturer = ha.top_handsets_per_manufacturer(
    top_3_manufacturers,
    top_n_handsets=3)

# Display the results
for manufacturer, handsets in top_handsets_by_manufacturer.items():
    print(f"Top handsets for {manufacturer}:\n{handsets}\n")

Top handsets for Apple:
Handset Type
Apple iPhone 6S (A1688)    9391
Apple iPhone 6 (A1586)     8991
Apple iPhone 7 (A1778)     6274
Name: count, dtype: int64

Top handsets for Samsung:
Handset Type
Samsung Galaxy S8 (Sm-G950F)    4459
Samsung Galaxy A5 Sm-A520F      3699
Samsung Galaxy J5 (Sm-J530)     3674
Name: count, dtype: int64

Top handsets for Huawei:
Handset Type
Huawei B528S-23A                  19724
Huawei E5180                       2073
Huawei P20 Lite Huawei Nova 3E     2011
Name: count, dtype: int64



Interpretation and Recommendations
Interpretation:

Apple: Popular older models like iPhone 6S and 6 indicate strong brand loyalty.
Samsung: Top handsets include both flagship and mid-range models, showing broad market appeal.
Huawei: High demand for the B528S-23A suggests strong interest in connectivity solutions.
Recommendations:

Apple: Promote older iPhones as cost-effective and introduce trade-in programs to drive upgrades.
Samsung: Emphasize the diverse product range and consider bundling offers for value.
Huawei: Focus on marketing connectivity devices and affordable smartphones to attract different customer segments.

In [16]:
df_cleaned.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,2019-04-04 12:01:00,770.0,4/25/2019 14:35,662.0,241983.25,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,2019-04-09 13:04:00,235.0,4/25/2019 8:15,606.0,241983.25,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,2019-04-09 17:42:00,49.0,4/25/2019 11:58,652.0,241983.25,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,833230.75,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,2019-04-10 00:31:00,486.0,4/25/2019 7:36,171.0,241983.25,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1101487.5,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,834328852.0
4,1.311448e+19,2019-04-12 20:10:00,565.0,4/25/2019 10:40,947.0,241983.25,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,1101487.5,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


#### Exploratory Analaysis



Task 1.1 - Your employer wants to have an overview of the users’ behavior on those applications.   
Aggregate per user the following information in the column  
number of xDR sessions
Session duration
the total download (DL) and upload (UL) data
the total data volume (in Bytes) during this session for each application


In [17]:
 # Aggregate the required columns
agg_df = df_cleaned.groupby('Bearer Id').agg({
    'Start': 'count', # Number of xDR sessions
    'Dur. (ms)': 'sum', # Session duration
    'Total UL (Bytes)': 'sum', # Total Upload Data
    'Total DL (Bytes)': 'sum', # Total Download Data
    'Social Media UL (Bytes)': 'sum',
    'Social Media DL (Bytes)': 'sum',
    'Google UL (Bytes)': 'sum',
    'Google DL (Bytes)': 'sum',
    'Email UL (Bytes)': 'sum',
    'Email DL (Bytes)': 'sum',
    'Youtube UL (Bytes)': 'sum',
    'Youtube DL (Bytes)': 'sum',
    'Netflix UL (Bytes)': 'sum',
    'Netflix DL (Bytes)': 'sum',
    'Gaming UL (Bytes)': 'sum',
    'Gaming DL (Bytes)': 'sum',
    'Other UL (Bytes)': 'sum',
    'Other DL (Bytes)': 'sum'
    }).reset_index()

In [18]:
agg_df

Unnamed: 0,Bearer Id,Start,Dur. (ms),Total UL (Bytes),Total DL (Bytes),Social Media UL (Bytes),Social Media DL (Bytes),Google UL (Bytes),Google DL (Bytes),Email UL (Bytes),Email DL (Bytes),Youtube UL (Bytes),Youtube DL (Bytes),Netflix UL (Bytes),Netflix DL (Bytes),Gaming UL (Bytes),Gaming DL (Bytes),Other UL (Bytes),Other DL (Bytes)
0,7.277826e+18,7418,5.306130e+08,3.058524e+11,3.366540e+12,2.475217e+08,1.330394e+10,1.522613e+10,4.311983e+10,3.454899e+09,1.325459e+10,8.155435e+10,8.600894e+10,8.194519e+10,8.623155e+10,6.115664e+10,3.124499e+12,6.229426e+10,3.122192e+12
1,7.277826e+18,1,6.887700e+04,5.707265e+07,4.817962e+08,2.797300e+04,2.269516e+06,1.220528e+06,5.869328e+06,7.725770e+05,3.406560e+06,2.089958e+07,1.122514e+07,1.548653e+07,7.230967e+06,7.423661e+06,4.516869e+08,1.078116e+07,4.244685e+08
2,7.277826e+18,1,7.602100e+04,3.305897e+07,2.250455e+08,4.181500e+04,4.166680e+05,3.109674e+06,4.546531e+06,5.216030e+05,2.607471e+06,1.193405e+07,1.333614e+07,2.206298e+06,2.186236e+07,9.162277e+06,1.822763e+08,6.083246e+06,7.034757e+08
3,7.277826e+18,1,8.014600e+04,2.244628e+07,7.624866e+08,2.717100e+04,3.237444e+06,2.329723e+06,1.032950e+06,3.372180e+05,3.036469e+06,1.496347e+06,1.396068e+07,1.203133e+06,2.182588e+06,6.391838e+06,7.390364e+08,1.028773e+07,4.602590e+08
4,7.277826e+18,1,7.278300e+04,5.906656e+07,5.204721e+08,5.773100e+04,6.886570e+05,3.910386e+06,9.079387e+06,8.900002e+05,1.844014e+06,1.031531e+07,1.251745e+07,1.725614e+07,1.573452e+07,1.383384e+07,4.806081e+08,1.265240e+07,7.854272e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120367,1.311448e+19,1,1.170370e+05,3.977921e+07,3.236875e+08,3.323250e+03,3.604070e+05,1.218224e+06,4.145548e+06,6.918130e+05,3.294860e+06,1.643199e+07,1.522415e+07,1.240651e+06,1.789416e+07,7.897616e+06,2.827684e+08,1.229872e+07,6.983540e+08
120368,1.311448e+19,1,1.074450e+05,3.967894e+07,8.031791e+08,1.576300e+04,1.417853e+06,2.632734e+06,7.756684e+06,1.008780e+05,2.014807e+06,1.091687e+06,1.312379e+07,1.052329e+07,1.398739e+07,1.532348e+07,7.648786e+08,1.063649e+07,4.181899e+07
120369,1.311448e+19,1,1.304440e+05,4.527086e+07,3.491442e+08,4.605000e+04,2.818085e+06,7.767290e+05,6.167330e+06,7.708100e+04,2.555927e+06,1.105803e+07,3.371338e+06,7.033423e+06,2.087226e+07,1.277058e+07,3.133592e+08,1.350897e+07,6.934238e+08
120370,1.311448e+19,1,8.515700e+04,5.524453e+07,5.518549e+08,4.900700e+04,2.120020e+06,2.612447e+06,3.825571e+06,8.696320e+05,2.174372e+06,1.554476e+07,5.772994e+06,1.132260e+07,5.093485e+06,1.287876e+07,5.328685e+08,1.196733e+07,4.765936e+08


In [19]:
agg_df.isnull().sum().reset_index()

Unnamed: 0,index,0
0,Bearer Id,0
1,Start,0
2,Dur. (ms),0
3,Total UL (Bytes),0
4,Total DL (Bytes),0
5,Social Media UL (Bytes),0
6,Social Media DL (Bytes),0
7,Google UL (Bytes),0
8,Google DL (Bytes),0
9,Email UL (Bytes),0




Task 1.2 - Conduct an exploratory data analysis on those data & communicate useful insights. Ensure that you identify and treat all missing values and outliers in the dataset by replacing them with the mean or any possible solution of the corresponding column.

You’re expected to report about the following using Python script and slide  :
- Describe all relevant variables and associated data types (slide). findings. 


In [20]:
# Descriptions of relevant columns
import pandas as pd
descriptions = {
    'Bearer Id': 'Session Id for each user',
    'Start': 'Session start time',
    'Dur. (ms)': 'Session duration (ms)',
    'Total UL (Bytes)': 'Total upload data (bytes)',
    'Total DL (Bytes)': 'Total download data (bytes)',
    'Social Media UL (Bytes)': 'Upload data for social media (bytes)',
    'Social Media DL (Bytes)': 'Download data for social media (bytes)',
    'Google UL (Bytes)': 'Upload data for Google apps (bytes)',
    'Google DL (Bytes)': 'Download data for Google apps (bytes)',
    'Email UL (Bytes)': 'Upload data for email (bytes)',
    'Email DL (Bytes)': 'Download data for email (bytes)',
    'Youtube UL (Bytes)': 'Upload data for YouTube (bytes)',
    'Youtube DL (Bytes)': 'Download data for YouTube (bytes)',
    'Netflix UL (Bytes)': 'Upload data for Netflix (bytes)',
    'Netflix DL (Bytes)': 'Download data for Netflix (bytes)',
    'Gaming UL (Bytes)': 'Upload data for gaming (bytes)',
    'Gaming DL (Bytes)': 'Download data for gaming (bytes)',
    'Other UL (Bytes)': 'Upload data for other apps (bytes)',
    'Other DL (Bytes)': 'Download data for other apps (bytes)'
}
# Convert the descriptions dictionary into a DataFrame
desc = pd.DataFrame(list(descriptions.items()), columns=['Variable', 'Description'])
dtypes = df_cleaned[agg_df.columns].dtypes.reset_index(name='data types')['data types']
desc['data type'] = dtypes
desc

Unnamed: 0,Variable,Description,data type
0,Bearer Id,Session Id for each user,float64
1,Start,Session start time,datetime64[ns]
2,Dur. (ms),Session duration (ms),float64
3,Total UL (Bytes),Total upload data (bytes),float64
4,Total DL (Bytes),Total download data (bytes),float64
5,Social Media UL (Bytes),Upload data for social media (bytes),float64
6,Social Media DL (Bytes),Download data for social media (bytes),float64
7,Google UL (Bytes),Upload data for Google apps (bytes),float64
8,Google DL (Bytes),Download data for Google apps (bytes),float64
9,Email UL (Bytes),Upload data for email (bytes),float64


In [21]:
# Import the eda_pipeline module from scripts
from eda_pipeline import EDA

In [22]:
eda = EDA(df_cleaned)
decile_summary = eda.segment_users_by_decile()
print(decile_summary)

   Decile  Total Data (Bytes)     Dur. (ms)
0       5        7.357698e+12  1.527887e+09
1       6        7.329533e+12  1.971381e+09
2       7        7.377323e+12  2.444427e+09
3       8        7.384645e+12  3.335336e+09


User Engagement Analysis Based on Session Duration and Data Usage

Key Insights:

Decile 5:

Total Data: 7.36 TB
Total Duration: 425 hours
Decile 6:

Total Data: 7.33 TB
Total Duration: 547 hours
Decile 7:

Total Data: 7.38 TB
Total Duration: 678 hours
Decile 8:

Total Data: 7.38 TB
Total Duration: 927 hours

Recommendations:

Target High-Engagement Users:
Focus on users in deciles 7 and 8 for premium service offerings or personalized plans.

Optimize Data-Heavy Applications:
Improve performance for streaming and gaming apps to enhance user experience.

Retention Strategies:
Offer exclusive benefits or enhanced customer support to retain top-tier users.

Analysis of Statistical Metrics in the Dataset

To analyze the basic metrics like mean, median, standard deviation, and percentiles, we calculate these for key columns such as session duration and total data usage (upload + download).


**Quantitative Variables:**

`Dur. (ms)` (Session Duration in milliseconds): Measures the length of user sessions. This is a continuous variable that represents how long a session lasts.

`Total UL (Bytes)` (Total Upload Data): Represents the total amount of data uploaded by users in bytes. This is a continuous variable showing the volume of data uploaded.

`Total DL (Bytes)` (Total Download Data): Represents the total amount of data downloaded by users in bytes. This is also a continuous variable indicating the volume of data downloaded.

In [24]:
# Basic Statistic and despersion metrics
quantitative_vars = ['Dur. (ms)', 'Total UL (Bytes)', 'Total DL (Bytes)']
eda.compute_basic_metrics(quantitative_vars)

Unnamed: 0,Dur. (ms),Total UL (Bytes),Total DL (Bytes)
Mean,100167.1,41117430.0,454704400.0
Median,86399.0,41143750.0,456017700.0
Mode,86399.0,22446280.0,74556220.0
Standard Deviation,58257.19,10474660.0,240556400.0
Variance,3393900000.0,109718500000000.0,5.786738e+16
Range,221721.2,37298780.0,759772600.0
IQR,74591.75,15815750.0,422592900.0


**Basic Metrics Interpretation**

**Mean**:

Session Duration: 100,167 ms (100.2 seconds) indicates the average user session duration.
Total Upload (UL): 41 MB on average per session.
Total Download (DL): 454 MB on average per session.

Importance:

The mean provides insight into typical user behavior and data consumption, serving as a benchmark for assessing normal usage patterns.

**Median**:

Session Duration: 86,399 ms (86.4 seconds) shows the middle value of user session durations.
Total Upload (UL): 41 MB.
Total Download (DL): 456 MB.

Importance:

The median offers a robust measure of central tendency, less affected by outliers, representing the typical session duration and data usage.

**Mode:**

Session Duration: 86,399 ms (86.4 seconds) is the most frequently occurring session duration.
Total Upload (UL): 22 MB.
Total Download (DL): 75 MB.

Importance:

The mode highlights the most common session duration and data sizes, useful for identifying frequent patterns.


**Dispersion Metrics**

**Standard Deviation:**

Session Duration: 58,257 ms shows substantial variability in session lengths.
Total Upload (UL): 10.5 MB reflects moderate variability in upload sizes.
Total Download (DL): 240.6 MB indicates high variability in download sizes.

Importance:

Standard deviation measures the extent of dispersion around the mean, helping to understand the consistency or variability in session durations and data usage.

**Range:**

Session Duration: 221,721 ms indicates the spread between the shortest and longest sessions.
Total Upload (UL): 37.3 MB.
Total Download (DL): 759.8 MB.

Importance:

The range highlights the extent of variability within the data, pointing out the difference between minimum and maximum values.

**IQR (Interquartile Range):**

Session Duration: 74,591 ms represents the spread within the middle 50% of session durations.
Total Upload (UL): 15.8 MB.
Total Download (DL): 422.6 MB.

Importance:

IQR provides insight into the dispersion of the central portion of the data, useful for identifying and addressing outliers or significant spreads.

**Importance for the Global Objective:**

User Segmentation: These metrics allow us to classify users based on behavior, helping in targeting different segments for marketing or retention.

Service Optimization: By understanding the typical and extreme data usage, we can optimize services to cater to both average users and power users.

Product Strategy: Knowing the mean and variability in user engagement helps in designing personalized offerings that match user behavior.

---

**Non-Graphical Univariate Analysis with Dispersion Parameters**

Let's compute key dispersion parameters like variance, range, and interquartile range (IQR) for each quantitative variable, followed by useful interpretations.

In [None]:

dispersion_df = eda.compute_dispersion_params(quantitative_vars)
dispersion_df

Unnamed: 0,mean,median,std_dev,90th_percentile,min,max
Dur. (ms),100167.1,86399.0,58257.19,181786.0,20262.0,241983.2
Total UL (Bytes),41117430.0,41143748.5,10474660.0,55837733.5,22446282.75,59745060.0
Total DL (Bytes),454704400.0,456017654.0,240556400.0,792661990.0,74556220.25,834328900.0


In [None]:
# Import necessary functions
from data_preparation import prepare_data
from eda_pipeline import EDA

from user_analysis import UserAnalysis

# Load and clean data
file_path = "path_to_xdr_data.csv"
df = prepare_data(file_path)

# EDA
basic_statistics(df)
univariate_analysis(df)
bivariate_analysis(df)

# User Analysis
print("Top 10 Handsets:")
print(top_handsets(df))

print("\nTop 3 Manufacturers:")
top_manufacturers_list = top_manufacturers(df)
print(top_manufacturers_list)

for manufacturer in top_manufacturers_list.index:
    print(f"\nTop 5 Handsets for {manufacturer}:")
    print(top_handsets_per_manufacturer(df, manufacturer))

ImportError: cannot import name 'prepare_data' from 'data_preparation' (/home/noh/10Academy/10academy-aim-week1-challenge/scripts/data_preparation.py)

In [None]:
# notebooks/user_overview_analysis.ipynb

import pandas as pd
from scripts.data_aggregation import aggregate_user_data
from scripts.eda import handle_missing_values, handle_outliers, basic_statistics, plot_distribution, plot_correlation_matrix
from scripts.user_overview import top_10_handsets, top_3_manufacturers, segment_users_by_decile

# Load Data
df = pd.read_csv('path_to_your_xdr_dataset.csv')

# Aggregate Data
agg_df = aggregate_user_data(df)

# Handle Missing Values
agg_df = handle_missing_values(agg_df)

# Handle Outliers
agg_df = handle_outliers(agg_df)

# Basic Statistics
print(basic_statistics(agg_df))

# Plot Distributions
plot_distribution(agg_df, 'Total DL (Bytes)')

# Correlation Matrix
plot_correlation_matrix(agg_df)
