In [None]:
import warnings

import pandas as pd

# Read the raw broadband dataset from disk and preview its last rows
RAW_BROADBANDS_CSV = '../data/raw/broadbands.csv'
broadbands = pd.read_csv(RAW_BROADBANDS_CSV)
broadbands.tail()

In [None]:
# Inspect the raw column names before normalising them to a common format

broadbands.columns



In [None]:
# Normalise column labels: trim surrounding whitespace, then lowercase them
trimmed_labels = broadbands.columns.str.strip()
broadbands.columns = trimmed_labels.str.lower()
broadbands


In [None]:
# Drop every column that contains only NaN values. One pass is sufficient:
# repeating the identical `dropna` call cannot remove anything further.
# `.copy()` makes the result an independent frame, so column assignments in
# later cells do not trigger pandas' chained-assignment warning.
broadbands_cleaned = broadbands.dropna(axis=1, how='all').copy()
broadbands_cleaned  # per the original author's note, obs_flag and conf_status were all-NaN and are dropped here

In [None]:
# Preview the first 50 rows whose technology is LTE
is_lte = broadbands_cleaned["inet_tec"] == "Long-term evolution (LTE)"
broadbands_cleaned.loc[is_lte].head(50)

In [None]:
# Count how many rows carry each distinct `terrtypo` value
broadbands_cleaned["terrtypo"].value_counts()

In [None]:
# Count how many rows carry each distinct `freq` value
# NOTE(review): presumably checked to decide whether the column is informative — it is dropped in a later cell
broadbands_cleaned["freq"].value_counts()

In [None]:
# Count how many rows carry each distinct `unit` value
# NOTE(review): presumably checked to decide whether the column is informative — it is dropped in a later cell
broadbands_cleaned["unit"].value_counts()

In [None]:
# Count how many rows exist for each `time_period` value
broadbands_cleaned["time_period"].value_counts()

In [None]:
# Number of distinct `geo` values (missing values are not counted by default)
broadbands_cleaned["geo"].nunique()

In [None]:
#broadbands_cleaned = broadbands_cleaned.drop(columns = ["dataflow", "last_update", "freq", "unit", "terrtypo"])

#broadbands_cleaned

# Drop specified columns from the DataFrame (disabled here; the drop is performed after the internet-type mapping further down)
#columns_to_drop = ["dataflow", "last_update", "freq", "unit", "terrtypo"]
#broadbands_cleaned = broadbands_cleaned.drop(columns=columns_to_drop)

# Display the updated DataFrame
#broadbands_cleaned


In [None]:
#broadbands_v2["inet_tec"].value_counts()

In [None]:
# Reshape to wide format: one row per (technology, geo), one column per time period
broadbands_v3 = (
    broadbands_cleaned
    .pivot(index=['inet_tec', 'geo'], columns='time_period', values='obs_value')
    .reset_index()
)

# Show only the LTE rows of the pivoted table
lte_rows = broadbands_v3["inet_tec"] == "Long-term evolution (LTE)"
broadbands_v3.loc[lte_rows]

In [None]:
# Lookup table describing each access technology: (inet_tec, type, speed).
# The first five technologies are categorised as Fixed, the last three as Mobile.
# NOTE(review): this rebinds `broadbands_v3`, replacing the pivoted table built
# in the previous cell with an unrelated frame — consider a distinct name.
technology_profiles = [
    ("Digital subscriber line (DSL)", "Fixed", "Slow"),
    ("Satellite", "Fixed", "Slow"),
    ("Very high-speed digital subscriber line (VDSL)", "Fixed", "Fast"),
    ("Very high-speed digital subscriber line 2 vectoring (VDSL2)", "Fixed", "Fast"),
    ("Cable modem", "Fixed", "Fast"),
    ("High speed packet access (HSPA)", "Mobile", "Slow"),
    ("Long-term evolution (LTE)", "Mobile", "Fast"),
    ("Fifth generation technology standard for broadband cellular networks (5G)", "Mobile", "Fast"),
]
broadbands_v3 = pd.DataFrame(technology_profiles, columns=["inet_tec", "type", "speed"])

# Sort on the speed label in descending order (string comparison, so "Slow" precedes "Fast")
broadbands_ordered = broadbands_v3.sort_values(by="speed", ascending=False)

# Render the sorted table
display(broadbands_ordered)


In [None]:
# Map each access technology (`inet_tec`) to a combined "<Fixed|Mobile> <Slow|Fast>"
# label stored in the new column 'internet type'.
internet_type_mapping = {
    "Digital subscriber line (DSL)": "Fixed Slow",
    "Satellite": "Fixed Slow",
    "Very high-speed digital subscriber line (VDSL)": "Fixed Fast",
    "Very high-speed digital subscriber line 2 vectoring (VDSL2)": "Fixed Fast",
    "Cable modem": "Fixed Fast",
    "High speed packet access (HSPA)": "Mobile Slow",
    "Long-term evolution (LTE)": "Mobile Fast",
    "Fifth generation technology standard for broadband cellular networks (5G)": "Mobile Fast"
}

# The cell rebinds `broadbands_cleaned` and removes `inet_tec`, so a second run
# would otherwise fail with a KeyError. Guarding on the column makes re-running
# the cell a no-op once the transformation has been applied.
if "inet_tec" in broadbands_cleaned.columns:
    internet_type = broadbands_cleaned["inet_tec"].map(internet_type_mapping)

    # Technologies absent from the mapping become NaN and would later be
    # excluded from any groupby on 'internet type'; report them instead of
    # losing those rows silently.
    unmapped_mask = internet_type.isna() & broadbands_cleaned["inet_tec"].notna()
    unmapped_technologies = sorted(
        str(value) for value in broadbands_cleaned.loc[unmapped_mask, "inet_tec"].unique()
    )
    if unmapped_technologies:
        warnings.warn(
            f"inet_tec values without an internet type (left as NaN): {unmapped_technologies}"
        )

    broadbands_cleaned = (
        broadbands_cleaned
        # Add the label before `inet_tec` is dropped
        .assign(**{"internet type": internet_type})
        # Drop columns that are no longer needed; names that do not exist are ignored
        .drop(
            columns=["dataflow", "freq", "terrtypo", "last frequency", "last update",
                     "fixed_slow", "fixed_fast", "mobile_slow", "mobile_fast", "inet_tec", "unit"],
            errors='ignore'
        )
        # Rename column 'obs_value' to 'percentage of households'
        .rename(columns={"obs_value": "percentage of households"})
    )
elif "internet type" not in broadbands_cleaned.columns:
    # Neither the source column nor the derived one exists: unexpected schema
    raise KeyError("inet_tec")


In [None]:
# Region name -> list of `geo` labels that belong to it
regions = {
    'Western Europe': ['Austria', 'Belgium', 'France', 'Germany', 'Netherlands', 'Switzerland', 'Luxembourg'],
    'Central Europe': ['Czechia', 'Hungary', 'Poland', 'Slovakia', 'Slovenia'],
    'Northern Europe': ['Denmark', 'Finland', 'Iceland', 'Ireland', 'Norway', 'Sweden', 'United Kingdom'],
    'Eastern Europe': ['Bulgaria', 'Croatia', 'Estonia', 'Latvia', 'Lithuania', 'Romania', 'Serbia', 'North Macedonia', 'Montenegro', 'Bosnia and Herzegovina', 'Albania', 'Kosovo*'],
    'Southern Europe': ['Greece', 'Italy', 'Malta', 'Portugal', 'Spain', 'Cyprus', 'Türkiye']
}

# Reverse lookup: country -> region
country_to_region = {country: region for region, countries in regions.items() for country in countries}

# The cell rebinds `broadbands_cleaned` and removes `geo`, so a second run would
# otherwise fail with a KeyError. Guarding on the column makes re-running the
# cell a no-op once the transformation has been applied.
if 'geo' in broadbands_cleaned.columns:
    region_per_row = broadbands_cleaned['geo'].map(country_to_region)

    # `geo` labels not listed in `regions` get a NaN region and are therefore
    # left out of later per-region aggregations; list them so that the
    # exclusion is visible rather than silent.
    unassigned_mask = region_per_row.isna() & broadbands_cleaned['geo'].notna()
    unassigned_geo = sorted(
        str(value) for value in broadbands_cleaned.loc[unassigned_mask, 'geo'].unique()
    )
    if unassigned_geo:
        warnings.warn(f"geo values without a region (left as NaN): {unassigned_geo}")

    # Add the 'region' column, then drop `geo`, which is no longer needed
    broadbands_cleaned = (
        broadbands_cleaned
        .assign(region=region_per_row)
        .drop(columns=["geo"], errors='ignore')
    )
elif 'region' not in broadbands_cleaned.columns:
    # Neither the source column nor the derived one exists: unexpected schema
    raise KeyError('geo')


In [None]:
# Mean household percentage per (region, internet type, time period);
# the innermost key (time period) is then spread across the columns.
grouping_keys = ['region', 'internet type', 'time_period']
mean_by_group = broadbands_cleaned.groupby(grouping_keys)['percentage of households'].mean()
df_grouped = mean_by_group.unstack()

# Cast the column labels (time periods) to integers for proper plotting
year_labels = df_grouped.columns.astype(int)
df_grouped.columns = year_labels

# Keep two decimal places
df_grouped = df_grouped.round(2)

df_grouped

In [None]:
# Plot trends over time

import matplotlib.pyplot as plt

# One figure per region, with one line per internet type over the years.
# The figure is created inside the loop and its Axes handed to pandas:
# a single `plt.figure(...)` before the loop only produces an empty, unused
# figure, because `DataFrame.plot` opens its own default-sized figure when no
# `ax` is given, so the requested figsize would never be applied.
for region in df_grouped.index.get_level_values(0).unique():
    subset = df_grouped.loc[region]  # rows: internet type, columns: year
    fig, ax = plt.subplots(figsize=(14, 8))
    # Transpose so that years run along the x-axis
    subset.T.plot(ax=ax, title=f'Trend over Time for {region}', ylabel='Percentage of Households', xlabel='Year')
    ax.legend(title='Internet Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure once it has been rendered

In [None]:
# One figure per internet type, with one line per region over the years.
internet_types = df_grouped.index.levels[1]
region_names = df_grouped.index.get_level_values(0).unique()

for internet_type in internet_types:
    fig, ax = plt.subplots(figsize=(14, 8))
    for region in region_names:
        # Not every region necessarily has rows for every internet type;
        # skip missing combinations instead of failing with a KeyError.
        if (region, internet_type) not in df_grouped.index:
            continue
        subset = df_grouped.loc[(region, internet_type)]  # Series indexed by year
        ax.plot(subset.index, subset.values, label=region)

    ax.set_title(f"Trend over Time for {internet_type}")
    ax.set_ylabel("Percentage of Households")
    ax.set_xlabel("Year")
    ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure once it has been rendered