In [79]:
import  pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler




In [80]:
import sys
import os

# Make the helper modules under ../scripts importable from this notebook.
# The path is resolved relative to the notebook's current working directory.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:  # avoid piling up duplicate entries when the cell is re-run
    sys.path.append(scripts_dir)

In [81]:
from load_data import connect_to_db


In [None]:
# Pull the whole xdr_data table into a DataFrame through the project's DB helper.
query = "SELECT * FROM xdr_data;"
df = connect_to_db(query)

# A None result is reported as a failed connection; anything else counts as success.
print("failed to connect" if df is None else "successfuly connected")

In [None]:
# Preview the first rows instead of rendering the whole frame as cell output.
df.head()

In [None]:
# Rank handset types by how many rows reference them and keep the ten most frequent.
handset_counts = df['Handset Type'].value_counts()
top_handsets = handset_counts.head(10)
print("Top 10 Handsets:\n", top_handsets)


In [None]:
# Rank manufacturers by row count and keep the three most frequent.
manufacturer_counts = df['Handset Manufacturer'].value_counts()
top_manufacturers = manufacturer_counts.head(3)
print("Top 3 Handset Manufacturers:\n", top_manufacturers)


In [None]:
# For each of the top manufacturers, list its five most frequent handset types.
for manufacturer in top_manufacturers.index:
    from_manufacturer = df['Handset Manufacturer'] == manufacturer
    handset_counts_for_manufacturer = df.loc[from_manufacturer, 'Handset Type'].value_counts()
    top_handsets_per_manufacturer = handset_counts_for_manufacturer.head(5)
    print(f"Top 5 Handsets for {manufacturer}:\n", top_handsets_per_manufacturer)


In [None]:
# Analyze user behavior on applications: aggregate the records per subscriber (IMSI).
user_behavior = df.groupby('IMSI').agg({
    # Bearer Id is an identifier, so it is counted (one per record) rather than summed;
    # this matches how Session_Frequency is derived from it later in this notebook.
    'Bearer Id': 'count',
    'Dur. (ms)': 'sum',
    'Total DL (Bytes)': 'sum',
    'Total UL (Bytes)': 'sum',
})
# Combined download + upload volume per subscriber.
user_behavior['total_data'] = user_behavior['Total DL (Bytes)'] + user_behavior['Total UL (Bytes)']
print(user_behavior.head())


In [None]:
# Check for missing values
print("Missing Values Summary:")
print(df.isnull().sum())

# Identifier columns are numeric in dtype but are labels, not measurements. Filling
# them with the column mean would invent one artificial id shared by every row that
# lacked one, and that fake id would then show up as a "user" in the group-bys below.
# They are therefore left as missing.
# NOTE(review): extend this list if the table holds other numeric identifier columns.
id_cols = ['Bearer Id', 'IMSI', 'MSISDN/Number']

# numeric_cols still lists every numeric column because later cells rely on it;
# only the imputation is restricted to the non-identifier columns.
numeric_cols = df.select_dtypes(include=["number"]).columns
impute_cols = [name for name in numeric_cols if name not in id_cols]
df[impute_cols] = df[impute_cols].fillna(df[impute_cols].mean())

# Fill missing categorical columns with 'Unknown'
categorical_cols = df.select_dtypes(exclude=["number"]).columns
df[categorical_cols] = df[categorical_cols].fillna("Unknown")


In [None]:
# Define a function to handle outliers using the IQR method
def handle_outliers(col, whisker=1.5):
    """Clip a numeric Series to its interquartile-range fences.

    Values below ``Q1 - whisker * IQR`` are raised to that fence and values above
    ``Q3 + whisker * IQR`` are lowered to it; everything in between is unchanged.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to treat.
    whisker : float, default 1.5
        Multiple of the IQR that defines the fences. The default reproduces the
        conventional 1.5 * IQR rule this function originally hard-coded.

    Returns
    -------
    pandas.Series
        A clipped copy of ``col``; the input is not modified.
    """
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    iqr = q3 - q1
    return col.clip(lower=q1 - whisker * iqr, upper=q3 + whisker * iqr)

# Apply outlier treatment for numeric columns.
# Identifier columns are skipped: clipping ids to the IQR fences would overwrite every
# out-of-range id with the fence value and merge distinct users/records into one.
# NOTE(review): extend this set if the table holds other numeric identifier columns.
unclipped_id_cols = {'Bearer Id', 'IMSI', 'MSISDN/Number'}
for col in numeric_cols:
    if col in unclipped_id_cols:
        continue
    df[col] = handle_outliers(df[col])


In [None]:
# Per-column overview: the dtype and the number of distinct values of every column.
column_overview = {"Data Type": df.dtypes, "Unique Values": df.nunique()}
variable_summary = pd.DataFrame(column_overview)
print("Variable Summary:", variable_summary, sep="\n")

# Persist the overview (column names kept as the index) for presentation purposes.
variable_summary.to_csv("variable_summary.csv", index=True)


In [None]:
# Bucket rows into duration deciles based on Dur. (ms) and compute total data for the
# top five decile classes.
# NOTE(review): the ranking is done per row of df; if per-user deciles are intended,
# aggregate by user before calling qcut.
df['total_data'] = df['Total DL (Bytes)'] + df['Total UL (Bytes)']

# Ten quantile bins are deciles (five bins would be quintiles). duplicates='drop'
# keeps qcut from raising when repeated duration values produce identical bin edges,
# which is likely after the clipping applied earlier in this notebook.
df['duration_decile'] = pd.qcut(df['Dur. (ms)'], 10, labels=False, duplicates='drop')

# Compute total data per decile; groupby sorts the labels ascending, so the last
# five rows are the five highest-duration classes.
decile_data = (
    df.groupby('duration_decile')['total_data'].sum()
    .reset_index()
    .tail(5)
    .reset_index(drop=True)
)
print("Decile Data Summary:")
print(decile_data)


In [None]:
# Descriptive statistics as produced by DataFrame.describe().
basic_metrics = df.describe()
print("Basic Metrics Summary:", basic_metrics, sep="\n")

# Keep a copy on disk for the report.
basic_metrics.to_csv("basic_metrics.csv")


In [None]:
# Non-graphical univariate analysis: location and spread of each numeric column.
summary_stats = ['mean', 'median', 'std', 'var', 'min', 'max']
dispersion_params = df[numeric_cols].agg(summary_stats)
print("Dispersion Parameters:", dispersion_params, sep="\n")

# Keep a copy on disk for the report.
dispersion_params.to_csv("dispersion_params.csv")


In [None]:
numeric_frame = df[numeric_cols]

# Histograms: one panel per numeric column, saved together as a single image.
numeric_frame.hist(bins=30, figsize=(20, 15))
hist_fig = plt.gcf()  # the figure DataFrame.hist just created
hist_fig.suptitle("Histograms for Numeric Variables")
hist_fig.savefig("histograms.png")

# Boxplots: all numeric columns side by side on one shared axis.
box_fig, box_ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=numeric_frame, ax=box_ax)
box_ax.set_title("Boxplot for Numeric Variables")
box_fig.savefig("boxplot.png")


In [None]:
import numpy as np

# Scatterplots of total data against each application's download volume, both on a
# log1p scale (log(1 + x) stays defined when a value is zero).
app_cols = ['Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
            'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)', 'Other DL (Bytes)']

# The y-axis values are identical for every application, so compute them once.
log_total_data = np.log1p(df['total_data'])

for col in app_cols:
    fig, ax = plt.subplots(figsize=(8, 6))

    sns.scatterplot(x=np.log1p(df[col]), y=log_total_data, ax=ax)
    ax.set_title(f"Log-Transformed Total Data vs {col}")
    ax.set_xlabel(f"Log {col}")
    ax.set_ylabel("Log Total Data")

    # Display the figure (nothing is written to disk here), then close it so the
    # loop does not keep every figure alive in memory.
    plt.show()
    plt.close(fig)


In [None]:
import numpy as np

# Store log1p-transformed copies of the two columns on df so they can be plotted by name.
df['log_social_media_dl'] = np.log1p(df['Social Media DL (Bytes)'])
df['log_total_data'] = np.log1p(df['total_data'])

# Semi-transparent markers make dense regions of the scatter easier to see.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='log_social_media_dl', y='log_total_data', alpha=0.5, ax=ax)
ax.set_title("Scatter Plot (Log-Transformed): Total Data vs Social Media DL")
ax.set_xlabel("Log(Social Media DL Bytes)")
ax.set_ylabel("Log(Total Data)")
fig.savefig("log_scatter_social_media_dl.png")


In [None]:
# Pairwise correlation between the per-application download columns.
corr_matrix = df[app_cols].corr()
print("Correlation Matrix:", corr_matrix, sep="\n")

# Annotated heatmap of the same matrix, saved to disk.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix Heatmap")
fig.savefig("correlation_heatmap.png")


In [None]:
# Dimensionality reduction (PCA) on the per-application download columns.
# The columns are standardized first so PCA works on comparably scaled features.
app_features = df[app_cols]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(app_features)

# Project the standardized data onto its first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Share of the total variance captured by each of the two components.
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Scatter of the rows in the two-component space.
first_component, second_component = pca_result[:, 0], pca_result[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(first_component, second_component, s=10, alpha=0.6)
ax.set_title("PCA Results")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.grid(alpha=0.3)


In [None]:
# Task 2: user engagement analysis

In [None]:
# Aggregate engagement metrics per user, keyed by MSISDN/Number:
# summed duration, summed download/upload bytes, and the count of Bearer Id values.
engagement_metrics = df.groupby('MSISDN/Number').agg(
    Total_Duration=('Dur. (ms)', 'sum'),
    Total_DL=('Total DL (Bytes)', 'sum'),
    Total_UL=('Total UL (Bytes)', 'sum'),
    Session_Frequency=('Bearer Id', 'count'),
)

# Total traffic (download + upload)
engagement_metrics['Total_Traffic'] = engagement_metrics[['Total_DL', 'Total_UL']].sum(axis=1)

# Ten highest-ranked customers for each engagement metric.
top_10_duration = engagement_metrics.nlargest(10, 'Total_Duration')
top_10_traffic = engagement_metrics.nlargest(10, 'Total_Traffic')
top_10_frequency = engagement_metrics.nlargest(10, 'Session_Frequency')

top_10_reports = (
    ("Top 10 customers by duration:", top_10_duration),
    ("Top 10 customers by traffic:", top_10_traffic),
    ("Top 10 customers by session frequency:", top_10_frequency),
)
for heading, ranking in top_10_reports:
    print(heading)
    print(ranking)


In [None]:
# Normalize Metrics and Perform K-Means Clustering

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Normalize the three engagement metrics with min-max scaling so they share a
# common range before distances are computed.
scaler = MinMaxScaler()
engagement_metrics_normalized = scaler.fit_transform(
    engagement_metrics[['Total_Duration', 'Total_Traffic', 'Session_Frequency']]
)

# Run k-means (k=3). n_init is pinned explicitly because scikit-learn's default for
# it changed between releases; leaving it implicit makes the clustering depend on
# the installed version and triggers a FutureWarning on some versions.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
engagement_metrics['Cluster'] = kmeans.fit_predict(engagement_metrics_normalized)

print("Cluster Assignments:")
print(engagement_metrics['Cluster'].value_counts())


In [None]:
# Minimum, maximum, average, and total of each engagement metric within every cluster.
summary_funcs = ['min', 'max', 'mean', 'sum']
summarized_metrics = ['Total_Duration', 'Total_Traffic', 'Session_Frequency']

cluster_metrics = engagement_metrics.groupby('Cluster').agg(
    {metric: summary_funcs for metric in summarized_metrics}
)

print("Cluster Metrics:", cluster_metrics, sep="\n")


In [None]:
# Aggregate user total traffic per application
app_cols = ['Social Media DL (Bytes)', 'Google DL (Bytes)', 'Email DL (Bytes)',
            'Youtube DL (Bytes)', 'Netflix DL (Bytes)', 'Gaming DL (Bytes)', 'Other DL (Bytes)']

# Group once; every application reuses the same per-user grouping.
sessions_by_user = df.groupby('MSISDN/Number')

per_app_frames = []  # one per-user frame per application

for app_col in app_cols:
    app_name = app_col.replace(' DL (Bytes)', '')  # Clean app name

    # Pair each application's download column with that application's own upload
    # column. Adding 'Total UL (Bytes)' (upload across all applications) to every
    # application inflates each one by an amount unrelated to that application.
    # NOTE(review): assumes upload columns are named '<App> UL (Bytes)' - confirm
    # against the xdr_data schema. When such a column is absent, the overall
    # upload column is used, as before.
    ul_col = f'{app_name} UL (Bytes)'
    if ul_col not in df.columns:
        ul_col = 'Total UL (Bytes)'

    # Per-user download and upload totals for this application.
    app_data = sessions_by_user.agg(
        Total_DL=(app_col, 'sum'),
        Total_UL=(ul_col, 'sum'),
    )

    # Compute total traffic (download + upload)
    app_data['Total_Traffic'] = app_data['Total_DL'] + app_data['Total_UL']

    # Tag the rows so the per-application frames can be stacked and told apart.
    app_data['Application'] = app_name

    per_app_frames.append(app_data)

# Stack all per-application frames into a single DataFrame.
application_metrics = pd.concat(per_app_frames)

# Top 10 users per application
top_users_per_app = application_metrics.groupby('Application').apply(
    lambda x: x.nlargest(10, 'Total_Traffic')
)

print("Top 10 users per application:")
print(top_users_per_app)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Get the top 3 applications by total traffic summed over all users.
top_apps = application_metrics.groupby('Application')['Total_Traffic'].sum().nlargest(3).index

# Filter data for top 3 applications
top_app_data = application_metrics[application_metrics['Application'].isin(top_apps)]

# Plot one bar per application showing its summed traffic.
# errorbar=None replaces the deprecated ci=None; the string estimator already
# requires a seaborn version that provides the errorbar parameter.
sns.barplot(x='Application', y='Total_Traffic', data=top_app_data, estimator='sum', errorbar=None)
plt.title("Top 3 Most Used Applications by Total Traffic")
plt.xlabel("Application")
plt.ylabel("Total Traffic (Bytes)")
plt.show()


In [None]:
# Elbow method: fit k-means for k = 1..9 on the normalized engagement metrics and
# record each model's inertia (within-cluster sum of squared errors).
sse = []
k_range = range(1, 10)

for k in k_range:
    # A separate name is used so the k=3 model held in `kmeans` (which produced the
    # Cluster column) is not overwritten by the last model fitted in this loop.
    # n_init is pinned so the curve does not depend on the installed scikit-learn version.
    elbow_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    elbow_model.fit(engagement_metrics_normalized)
    sse.append(elbow_model.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(k_range, sse, marker='o')
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Sum of Squared Errors (SSE)")
plt.show()
