In [None]:
# Execute the cleaning notebook inside this kernel; import_df is presumably defined there.
# NOTE(review): `plt` and `pd` are used further down but never imported in this notebook —
# presumably they leak in from the %run'd notebook; confirm, or import them explicitly.
%run cleaning_purchases_individuals.ipynb
# Two frames are returned: the cleaned purchases table and a second table that is
# grouped by its 'region' column further down.
df_purchases_cleaned_v3, df_per_region = import_df() # type: ignore

import seaborn as sns

In [None]:
# General view: keep only purchases made "in the 12 months" reported for "All individuals".
is_last_12_months = df_purchases_cleaned_v3['last_online_purchase'] == "in the 12 months"
is_all_individuals = df_purchases_cleaned_v3['grouped_individuals'] == "All individuals"

# reset_index(drop=True) renumbers the rows directly, instead of first turning the old
# index into an "index" column that then has to be dropped again.
df_purchases_global = df_purchases_cleaned_v3[is_last_12_months & is_all_individuals].reset_index(drop=True)

In [None]:
# The two filter columns hold a single value each after the filtering above, so they are removed
df_cleaned = df_purchases_global.drop(
    columns=['last_online_purchase', 'grouped_individuals'],
)

In [None]:
# Build the plotting frame without mutating df_cleaned: with set_index(..., inplace=True)
# the cell only ran once (a second run failed because 'country' was no longer a column).
# Transposing puts the years on the rows and one column per country.
df_transposed = df_cleaned.set_index('country').T

# Convert the index (years) to integers so the x-axis is numeric when plotting
df_transposed.index = df_transposed.index.astype(int)

In [None]:
# One line per country on a single wide figure
plt.figure(figsize=(14, 8))
for country in df_transposed.columns:
    plt.plot(df_transposed.index, df_transposed[country], label=country)

# Readability helpers: background grid and slanted year ticks
plt.grid(True)
plt.xticks(rotation=45)

# Title and axis labels; the legend sits outside the axes because there are many countries
plt.title('% of Individuals realizing Online Purchases by Country (2010-2024)')
plt.ylabel('% of Individuals realizing Online Purchases')
plt.xlabel('Year')
plt.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')

# Fit everything into the figure, then render it
plt.tight_layout()
plt.show()

# This is too messy and we cannot draw conclusions from it

# % of Individuals realizing Online Purchases by European Region (2010-2024)

In [None]:
# Lets try to simplify by grouping countries per regions

# df_per_region still holds several purchase-recency categories and individual breakdowns
# (a later cell filters it on both columns), so apply the same "general view" filter used
# for the per-country chart before averaging; otherwise the regional mean mixes unrelated series.
# NOTE(review): assumes df_per_region carries "All individuals" rows like
# df_purchases_cleaned_v3 does — confirm.
general_view = (
    (df_per_region['last_online_purchase'] == "in the 12 months")
    & (df_per_region['grouped_individuals'] == "All individuals")
)

# Drop the two filter columns, which are constant after the filter
df_cleaned = df_per_region[general_view].drop(columns=['last_online_purchase', 'grouped_individuals'])

# Group by region and calculate the mean for each year; transpose so years are rows
df_grouped = df_cleaned.groupby('region').mean(numeric_only=True).T

# Convert the index (years) to integers for proper plotting
df_grouped.index = df_grouped.index.astype(int)

df_grouped

In [None]:
# One marked line per region
plt.figure(figsize=(14, 8))
for region in df_grouped.columns:
    plt.plot(df_grouped.index, df_grouped[region], marker='o', label=region)

# Readability helpers: background grid and slanted year ticks
plt.grid(True)
plt.xticks(rotation=45)

# Title, axis labels and legend
plt.title('% of Individuals realizing Online Purchases by European Region (2010-2024)')
plt.ylabel('% of Individuals realizing Online Purchases')
plt.xlabel('Year')
plt.legend(title='Region', loc='upper left')

# Fit everything into the figure, then render it
plt.tight_layout()
plt.show()

# This one is looking good, maybe we can cross-check it with connection availability :)

# % Individuals purchasing online on all EU countries - Year Pick

In [None]:
def purchasing_year(year):
    """Draw a horizontal bar chart ranking countries by their value for one year.

    Reads the module-level ``df_purchases_global`` frame (expects a ``country``
    column plus one column per year) and plots with seaborn/matplotlib.
    Nothing is returned.

    Parameters
    ----------
    year : str or int
        Year column to plot, e.g. ``"2020"`` or ``2020``.

    Raises
    ------
    KeyError
        If ``year`` is not one of the frame's columns.
    """
    year = str(year)
    # Work on a relabelled copy instead of rewriting the shared frame's columns in place.
    purchases = df_purchases_global.rename(columns=str)
    if year not in purchases.columns:
        raise KeyError(f"No column for year {year!r} in df_purchases_global")

    # Countries without a value for this year are left out; highest value first.
    df_purchasingyear = purchases[['country', year]].dropna().sort_values(year, ascending=False)

    plt.figure(figsize=(10, 6))
    # Plot the requested year from the frame built above (previously a hard-coded
    # '2020' and an undefined df_2020).
    sns.barplot(x=year, y='country', data=df_purchasingyear) # SEABORN!
    plt.xlabel('Percentage of Individuals purchasing online at least once')
    plt.title(f'% Individuals purchasing online in {year}')
    plt.show()

In [None]:
# Example: bar chart of the 2020 column, one bar per country
purchasing_year("2020")

# Education level

In [None]:
# Detail view: purchases "in the 12 months", restricted to the age-by-education groups
educationlevel = [
    "Individuals aged 16-24 with high formal education",
    # NOTE(review): unlike the other eight labels this one has no "formal" — confirm it
    # matches the spelling used in the data (the value_counts cell below helps to check)
    "Individuals aged 16-24 with low education",
    "Individuals aged 16-24 with medium formal education",
    "Individuals aged 25 to 54 with high formal education",
    "Individuals aged 25 to 54 with low formal education",
    "Individuals aged 25 to 54 with medium formal education",
    "Individuals aged 55 to 74 with high formal education",
    "Individuals aged 55 to 74 with low formal education",
    "Individuals aged 55 to 74 with medium formal education",
]

# Euro-area and EU-27 rows are aggregates rather than countries, so they are removed too
df_purchases_education = (
    df_purchases_cleaned_v3
    .loc[lambda d: d['last_online_purchase'] == "in the 12 months"]
    .loc[lambda d: d['grouped_individuals'].isin(educationlevel)]
    .reset_index(drop=True)
    .loc[lambda d: ~d['country'].isin([
        "Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)",
        "European Union - 27 countries (from 2020)",
    ])]
    .drop(columns="last_online_purchase")  # constant after the first filter
)

df_purchases_education

In [None]:
# Sanity check: number of rows kept for each education group label
df_purchases_education["grouped_individuals"].value_counts()

In [None]:
# Lets try to group by education level low - medium - high
import warnings

from dictionaries import education # importing education dictionary from dictionaries.py

# Reverse mapping from education description to level: the dictionary is iterated as
# level -> collection of descriptions and inverted to description -> level
education_to_level = {
    education_desc: level
    for level, descriptions in education.items()
    for education_desc in descriptions
}

# Work on a copy so df_purchases_education itself is left untouched
df_purchases_edugrouped = df_purchases_education.copy()

# Add a new 'education_level' column using the mapping
df_purchases_edugrouped['education_level'] = df_purchases_edugrouped['grouped_individuals'].map(education_to_level)

# Labels missing from the dictionary map to NaN, and groupby would then drop those rows
# without any notice — surface them instead of silently shrinking the sample.
unmapped_labels = df_purchases_edugrouped.loc[
    df_purchases_edugrouped['education_level'].isna(), 'grouped_individuals'
].unique()
if len(unmapped_labels):
    warnings.warn(f"No education level defined for: {list(unmapped_labels)}")

df_cleaned2 = df_purchases_edugrouped.drop(columns='grouped_individuals') # Drop no needed columns

df_grouped2 = df_cleaned2.groupby('education_level').mean(numeric_only=True).T # Group by the 'education_level' and calculate the mean for each year

df_grouped2.index = df_grouped2.index.astype(int) # Convert the index (years) to integers for proper plotting

df_grouped2

In [None]:
# df_grouped2 comes from the previous cell: years on the index, one column per education level

plt.figure(figsize=(14, 8))

# One marked line per education level
for education_level in df_grouped2.columns:
    plt.plot(df_grouped2.index, df_grouped2[education_level], marker='o', label=education_level)

# Readability helpers: background grid and slanted year ticks
plt.grid(True)
plt.xticks(rotation=45)

# Title and axis labels
plt.title('% of Individuals Realizing Online Purchases by Education Level (2010-2024)')
plt.ylabel('% of Individuals Realizing Online Purchases')
plt.xlabel('Year')

# Legend identifying each education level
plt.legend(title='Education Level', loc='upper left')

# Tighten the layout to prevent clipping, then display
plt.tight_layout()
plt.show()

# Grouping per ages

In [None]:
# Keep only the rows whose group is one of the plain age bands listed here
age_groups = [
    "Individuals, 16 to 24 years old",
    "Individuals, 25 to 34 years old",
    "Individuals, 35 to 44 years old",
    "Individuals, 45 to 54 years old",
    "Individuals, 55 to 64 years old",
    "Individuals, 65 to 74 years old",
]

# Euro-area and EU-27 rows are aggregates rather than countries, so they are removed too
df_purchases_agegrouped = (
    df_purchases_cleaned_v3
    .loc[lambda d: d['last_online_purchase'] == "in the 12 months"]
    .loc[lambda d: d['grouped_individuals'].isin(age_groups)]
    .reset_index(drop=True)
    .drop(columns='last_online_purchase')  # constant after the first filter
    .loc[lambda d: ~d['country'].isin([
        "Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)",
        "European Union - 27 countries (from 2020)",
    ])]
)


df_purchases_agegrouped

In [None]:
# The NaN cells seem to be true zeros in most cases, so they are replaced by 0.
# NOTE(review): wherever a NaN actually means "not reported", zero-filling pulls the group
# means computed in the next cell downwards — worth confirming against the source data.
# Reassign instead of fillna(..., inplace=True): the frame is the result of a boolean
# filter, and in-place edits on such a result can trigger pandas' SettingWithCopyWarning.
df_purchases_agegrouped = df_purchases_agegrouped.fillna(0)

In [None]:
# Average every numeric column across the rows of each age group (one row per group)
df_age_meanvalues = (
    df_purchases_agegrouped
    .groupby('grouped_individuals')
    .mean(numeric_only=True)
)

df_age_meanvalues

In [None]:
# Recompute from the source frame rather than transposing the variable onto itself:
# `x = x.T` flips the orientation back again on every second run of this cell.
# Result: year columns become the rows, age groups become the columns.
df_age_meanvalues = df_purchases_agegrouped.groupby('grouped_individuals').mean(numeric_only=True).T # TRANSPOSING for plotting!

In [None]:
# One marked line per age group
plt.figure(figsize=(12, 8))
for column in df_age_meanvalues.columns:
    plt.plot(df_age_meanvalues.index, df_age_meanvalues[column], marker='o', label=column)

# Same axis wording as the country / region / education charts instead of the placeholder
# 'Value' and 'Line Graph of Values ...' texts, so the figure can be read on its own
plt.xlabel('Year')
plt.ylabel('% of Individuals realizing Online Purchases')
plt.title('% of Individuals realizing Online Purchases by Age Group (2010-2024)')
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()

# Show plot
plt.show()

# Individuals living places
We can use this to compare it with the available connectivity.

In [None]:
# list with the values to keep:
living_area = ["Individuals living in cities", "Individuals living in rural areas", "Individuals living in towns and suburbs"]

# Euro-area and EU-27 rows are aggregates rather than countries. The education and age
# sections exclude them before averaging across countries; do the same here so the
# aggregates are not counted as two extra "countries" in the mean.
# NOTE(review): this changes the averages if such rows exist for the living-area groups — confirm intent.
aggregate_areas = [
    "Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)",
    "European Union - 27 countries (from 2020)",
]

df_purchases_livinggroup = df_purchases_cleaned_v3[df_purchases_cleaned_v3['last_online_purchase'] == "in the 12 months"]
df_purchases_livinggroup = df_purchases_livinggroup[df_purchases_livinggroup['grouped_individuals'].isin(living_area)]
df_purchases_livinggroup = df_purchases_livinggroup[~df_purchases_livinggroup['country'].isin(aggregate_areas)].reset_index(drop=True)

df_purchases_livinggroup = df_purchases_livinggroup.drop(columns='last_online_purchase') # Drop no needed columns

df_purchases_livinggroup

In [None]:
# Average every numeric column across the rows of each living-area group (one row per group)
df_living_meanvalues = (
    df_purchases_livinggroup
    .groupby('grouped_individuals')
    .mean(numeric_only=True)
)

df_living_meanvalues

In [None]:
# Recompute from the source frame rather than transposing the variable onto itself:
# `x = x.T` flips the orientation back again on every second run of this cell.
# Result: year columns become the rows, living-area groups become the columns.
df_living_meanvalues = df_purchases_livinggroup.groupby('grouped_individuals').mean(numeric_only=True).T # TRANSPOSING!

In [None]:
# One marked line per living-area group
plt.figure(figsize=(12, 8))
for column in df_living_meanvalues.columns:
    plt.plot(df_living_meanvalues.index, df_living_meanvalues[column], marker='o', label=column)

# Same axis wording as the country / region / education charts instead of the placeholder
# 'Value' and 'Line Graph of Values ...' texts; "UE" corrected to "EU"
plt.xlabel('Year')
plt.ylabel('% of Individuals realizing Online Purchases')
plt.title('% of Individuals realizing Online Purchases by Living Area (all EU)')
plt.legend(title='Living Area Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()

# Show plot
plt.show()

In [None]:
# Exploratory deep dive on the living-area groups, using the frame that carries a region per country:
# keep the "in the 12 months" rows, drop that now-constant column, keep only the living-area groups
df_per_regionall = (
    df_per_region
    .loc[lambda d: d['last_online_purchase'] == "in the 12 months"]
    .drop(columns="last_online_purchase")
    .loc[lambda d: d['grouped_individuals'].isin(living_area)]
    .reset_index(drop=True)
)

df_per_regionall

# % Individuals purchasing online grouped by living place - Country Pick

In [None]:
def living_country_graphic(country_pick):
    """Plot, for one country, one line per living-area group across the year columns.

    Reads the module-level ``df_per_regionall`` frame; nothing is returned.

    Parameters
    ----------
    country_pick : str
        Value of the ``country`` column to plot, e.g. ``"Spain"``.
    """
    # Keep the picked country's rows, drop the two descriptive columns and index by
    # group label; after transposing, the remaining columns become the x-axis.
    is_picked = df_per_regionall['country'] == country_pick
    filtered_df = (
        df_per_regionall[is_picked]
        .drop(columns=["region", "country"])
        .set_index('grouped_individuals')
        .T # TRANSPOSING!
    )

    plt.figure(figsize=(12, 6))

    for column in filtered_df.columns:
        plt.plot(filtered_df.index, filtered_df[column], marker='o', label=column)

    # Customize the plot. The title uses the function argument: the previous `{country}`
    # referred to an unrelated global (NameError on a fresh kernel, or the stale loop
    # variable of an earlier plotting cell otherwise).
    plt.title(f"% People purchasing online in {country_pick}")
    plt.xlabel('Year')
    plt.ylabel('Percentage (%)')
    plt.xticks(rotation=45)
    plt.legend(title='Grouped Individuals')
    plt.grid(True)
    plt.tight_layout()

    # Show the plot
    plt.show()

In [None]:
# Example: one line per living-area group for Spain
living_country_graphic("Spain")

# % Individuals purchasing online grouped by living place - Region Pick

In [None]:
# lets try doing a function:

def living_region_graphic(region_pick):
    """Line chart of the per-group mean within one region.

    Rows of the module-level ``df_per_regionall`` whose ``region`` equals
    ``region_pick`` are averaged per ``grouped_individuals`` value (numeric
    columns only); each group is then drawn as one line over the column labels.
    Nothing is returned.

    Note: a later cell of this notebook defines another function with this same name.
    """
    in_region = df_per_regionall['region'] == region_pick
    means_by_group = df_per_regionall[in_region].groupby('grouped_individuals').mean(numeric_only=True)
    per_year = means_by_group.T  # numeric (year) columns become the rows

    plt.figure(figsize=(12, 6))
    for group in per_year.columns:
        plt.plot(per_year.index, per_year[group], marker='o', label=group)

    # Title, axes, legend and grid
    plt.title(f"% People purchasing online in {region_pick}")
    plt.xlabel('Year')
    plt.ylabel('Percentage (%)')
    plt.xticks(rotation=45)
    plt.legend(title='Grouped Individuals')
    plt.grid(True)
    plt.tight_layout()

    # Render the figure
    plt.show()

In [None]:
# Example: per-group means over the rows whose region is "Eastern Europe"
living_region_graphic("Eastern Europe")

In [None]:

# Example function definition
def living_region_graphic(region_pick):
    """Grouped bar chart of the per-group mean within one region.

    Rows of the module-level ``df_per_regionall`` whose ``region`` equals
    ``region_pick`` are averaged per ``grouped_individuals`` value (numeric
    columns only); every row label of the transposed result (the year columns)
    gets one cluster of bars, one bar per group. Nothing is returned.

    NOTE(review): this definition reuses the name of the line-chart function defined
    in an earlier cell and replaces it once this cell has run; consider a distinct name.

    Parameters
    ----------
    region_pick : str
        Value of the ``region`` column to plot, e.g. ``'Eastern Europe'``.
    """
    # Filter to the region, average per category, and put years on the index
    filtered_df2 = (
        df_per_regionall[df_per_regionall['region'] == region_pick]
        .groupby('grouped_individuals')
        .mean(numeric_only=True)
        .transpose()
    )

    # Set up the bar plot
    plt.figure(figsize=(12, 6))

    # The range of years from the DataFrame index
    years = filtered_df2.index
    n_groups = len(filtered_df2.columns)

    # 0.25 is kept for up to three groups; with more groups the bars shrink so that a
    # cluster never spills into the next year's slot.
    bar_width = min(0.25, 0.8 / max(n_groups, 1))
    indices = pd.Series(range(len(years)))

    # Plot each category as its own series of bars, shifted by one bar width each
    for i, column in enumerate(filtered_df2.columns):
        plt.bar(indices + i * bar_width, filtered_df2[column], width=bar_width, label=column)

    # Centre each tick under its cluster: halfway between the first and the last bar
    # (this equals the previous fixed offset of one bar width when there are three groups)
    plt.xticks(indices + bar_width * (n_groups - 1) / 2, years, rotation=45)

    # Customize the plot
    plt.title(f"% People purchasing online in {region_pick}")
    plt.xlabel('Year')
    plt.ylabel('Percentage (%)')
    plt.ylim(0, 100)  # Set limit to 100% for y-axis
    plt.legend(title='Grouped Individuals')
    plt.grid(True, axis='y')  # Display grid only for y-axis
    plt.tight_layout()

    # Show the plot
    plt.show()

# Example usage: at this point the name refers to the bar-chart definition in this cell
living_region_graphic('Eastern Europe')

In [None]:
# individuals per living place, same region

# % Individuals purchasing online grouped by living place - Region Pick

In [None]:
# Shorten the group labels by removing the "Individuals living in " text
# (e.g. "Individuals living in cities" becomes "cities")
df_per_regionall['grouped_individuals'] = (
    df_per_regionall['grouped_individuals']
    .str.replace("Individuals living in ", "", regex=False)
)

def region_living_graphic(living_place_pick):
    """Compare regions across the year columns for one living-area group.

    Rows of the module-level ``df_per_regionall`` belonging to the picked living
    area are averaged per ``region`` (numeric columns only) and drawn as one line
    per region. Nothing is returned.

    Parameters
    ----------
    living_place_pick : str
        Living-area label, with or without the "Individuals living in " text,
        e.g. ``"rural areas"``.
    """
    prefix = "Individuals living in "
    # Compare on shortened labels so the function works whether or not the shared
    # frame's labels have already been shortened.
    living_place = living_place_pick.replace(prefix, "")
    labels = df_per_regionall['grouped_individuals'].str.replace(prefix, "", regex=False)

    filtered_df3 = (
        df_per_regionall[labels == living_place] # filtering grouped_individuals
        .groupby('region')
        .mean(numeric_only=True) # mean over all the rows of each region
        .T # TRANSPOSING!
    )
    plt.figure(figsize=(12, 6))
    for column in filtered_df3.columns:
        plt.plot(filtered_df3.index, filtered_df3[column], marker='o', label=column)

    # Customize the plot
    plt.title(f"% People purchasing online in {living_place}")
    plt.xlabel('Year')
    plt.ylabel('Percentage (%)')
    plt.xticks(rotation=45)
    # The lines are regions here, so the legend is titled accordingly
    plt.legend(title='Region')
    plt.grid(True)
    plt.tight_layout()

    # Show the plot
    plt.show()

In [None]:
# Example: one line per region for the "rural areas" group
region_living_graphic("rural areas") # 3 options: cities / rural areas / towns and suburbs