In [1]:
import pandas as pd
import plotly_express as px

# Load the municipality / age-group sheet of the vaccination workbook.
df = pd.read_excel(
    "./Data/Folkhalsomyndigheten_Covid19_Vaccine.xlsx",
    sheet_name="Vaccinerade kommun och ålder",
)

# Preview the first rows to check that the sheet was parsed as expected.
df.head()

Unnamed: 0,Län,Län_namn,Kommun,Kommun_namn,Ålder,Befolkning,Antal minst 1 dos,Antal minst 2 doser,Antal 3 doser,Antal 4 doser,Andel minst 1 dos,Andel minst 2 doser,Andel 3 doser,Andel 4 doser
0,1,Stockholms län,114,Upplands Väsby,12-15,2422,1206,1046,,,0.497936,0.431874,,
1,1,Stockholms län,114,Upplands Väsby,16-17,1203,839,755,,,0.697423,0.627598,,
2,1,Stockholms län,114,Upplands Väsby,18-29,6692,4887,4469,1959.0,,0.730275,0.667812,0.292738,
3,1,Stockholms län,114,Upplands Väsby,30-39,7332,5542,5240,2878.0,,0.755865,0.714675,0.392526,
4,1,Stockholms län,114,Upplands Väsby,40-49,6946,5592,5429,3719.0,,0.805068,0.781601,0.535416,


In [2]:
# Print a summary of the frame: index range, per-column non-null count and dtype, memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Län                  2900 non-null   int64  
 1   Län_namn             2900 non-null   object 
 2   Kommun               2900 non-null   int64  
 3   Kommun_namn          2900 non-null   object 
 4   Ålder                2900 non-null   object 
 5   Befolkning           2900 non-null   int64  
 6   Antal minst 1 dos    2900 non-null   int64  
 7   Antal minst 2 doser  2900 non-null   int64  
 8   Antal 3 doser        2320 non-null   float64
 9   Antal 4 doser        870 non-null    float64
 10  Andel minst 1 dos    2900 non-null   float64
 11  Andel minst 2 doser  2900 non-null   float64
 12  Andel 3 doser        2320 non-null   float64
 13  Andel 4 doser        870 non-null    float64
dtypes: float64(6), int64(5), object(3)
memory usage: 317.3+ KB


In [3]:
# Count the missing values in each column.
df.isna().sum()


Län                       0
Län_namn                  0
Kommun                    0
Kommun_namn               0
Ålder                     0
Befolkning                0
Antal minst 1 dos         0
Antal minst 2 doser       0
Antal 3 doser           580
Antal 4 doser          2030
Andel minst 1 dos         0
Andel minst 2 doser       0
Andel 3 doser           580
Andel 4 doser          2030
dtype: int64

At first glance, we can see that the data set contains some missing values.

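The missing values occur only in the 3-dose and 4-dose columns, and their counts (580 and 2030) are exact multiples of 290, the number of municipalities in the data set, so whole age groups lack these figures in every municipality. We can therefore assume that the 3rd or 4th dose was not given (or not reported) for those age groups.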

This is based on pure assumption, but it is the most likely explanation.

In [4]:
# List the column labels so they can be referenced exactly in the cells below.
df.columns


Index(['Län', 'Län_namn', 'Kommun', 'Kommun_namn', 'Ålder', 'Befolkning',
       'Antal minst 1 dos', 'Antal minst 2 doser', 'Antal 3 doser',
       'Antal 4 doser', 'Andel minst 1 dos', 'Andel minst 2 doser',
       'Andel 3 doser', 'Andel 4 doser'],
      dtype='object')

### Uppgift 2a

In [5]:
# Number of distinct region (län) names; dropna=False mirrors len(unique()), which also counts NaN.
regions = df["Län_namn"].nunique(dropna=False)
print(f"There are {regions} regions represented in the dataset")


There are 21 regions represented in the dataset


### Uppgift 2b

In [6]:
# Number of distinct municipality (kommun) names; dropna=False mirrors len(unique()), which also counts NaN.
municipalities = df["Kommun_namn"].nunique(dropna=False)
print(f"There are {municipalities} municipalities represented in the dataset")


There are 290 municipalities represented in the dataset


### Uppgift 2c

In [7]:
# Total population covered by the data set: the sum of "Befolkning" over every row.
population = df.loc[:, "Befolkning"].sum()
print(f"The population represented in the dataset is {population}")


The population represented in the dataset is 9092790


### Uppgift 2d

In [8]:
# Population of each under-18 age group, selected with a single .loc lookup
# (row mask + column label) instead of chained indexing.
kids_12_15 = df.loc[df["Ålder"].eq("12-15"), "Befolkning"].sum()
kids_16_17 = df.loc[df["Ålder"].eq("16-17"), "Befolkning"].sum()

# Combine the two groups into the under-18 total.
kids_under_18 = kids_12_15 + kids_16_17
print(f"The number of kids under 18 represented in the dataset is {kids_under_18}")


The number of kids under 18 represented in the dataset is 745370


### Uppgift 2e

In [9]:
# Bar chart per age group: with `y` given, px.histogram sums "Befolkning"
# within each "Ålder" category.
# NOTE(review): the title says "Vaccinerade" although the plotted column is the
# population ("Befolkning") — confirm which wording the task expects.
fig = px.histogram(
    df,
    x="Ålder",
    y="Befolkning",
    title="Vaccinerade per ålder",
)
# The file name previously started with a stray space ("/ Vaccinerade ...");
# it now follows the same pattern as the other exported figures.
fig.write_html("./Visualiseringar/Vaccinerade per ålder.html")
fig.show()


### Uppgift 2f

In [10]:
# Collapse the municipality / age-group rows into one row per region (län) by
# summing the population and the dose counts. "Antal 4 doser" is carried along
# because a later cell plots the 4-dose share.
# Each share is then the regional dose count divided by the regional population.
df_region = (
    df.groupby("Län_namn", as_index=False)[
        [
            "Befolkning",
            "Antal minst 1 dos",
            "Antal minst 2 doser",
            "Antal 3 doser",
            "Antal 4 doser",
        ]
    ]
    .sum()
    .assign(
        **{
            "Andel minst 1 dos": lambda t: t["Antal minst 1 dos"] / t["Befolkning"],
            "Andel 2 doser": lambda t: t["Antal minst 2 doser"] / t["Befolkning"],
            "Andel 3 doser": lambda t: t["Antal 3 doser"] / t["Befolkning"],
            "Andel 4 doser": lambda t: t["Antal 4 doser"] / t["Befolkning"],
        }
    )
)
df_region.head(21)

Unnamed: 0,Län_namn,Befolkning,Antal minst 1 dos,Antal minst 2 doser,Antal 3 doser,Antal 4 doser,Andel minst 1 dos,Andel 2 doser,Andel 3 doser,Andel 4 doser
0,Blekinge län,139327,122500,120727,92259.0,25059.0,0.879227,0.866501,0.662176,0.179857
1,Dalarnas län,252075,221420,218009,164296.0,48976.0,0.878389,0.864858,0.651774,0.194291
2,Gotlands län,53924,48785,47930,37423.0,10885.0,0.904699,0.888844,0.693995,0.201858
3,Gävleborgs län,252216,220389,215267,159636.0,45642.0,0.873811,0.853503,0.632934,0.180964
4,Hallands län,295663,259143,255329,191997.0,50583.0,0.876481,0.863581,0.649378,0.171083
5,Jämtlands län,115398,102236,100525,73332.0,19952.0,0.885943,0.871116,0.63547,0.172897
6,Jönköpings län,317355,274960,270266,199488.0,51399.0,0.866411,0.85162,0.628596,0.161961
7,Kalmar län,216763,190931,188522,147192.0,42395.0,0.880828,0.869715,0.679046,0.195582
8,Kronobergs län,175503,149141,146494,103745.0,27664.0,0.849792,0.834709,0.591129,0.157627
9,Norrbottens län,220199,198514,195919,149293.0,40385.0,0.901521,0.889736,0.677991,0.183402


In [11]:
# Grouped bars: one cluster per region, one bar per dose share (doses 1-3).
fig = px.bar(
    data_frame=df_region,
    x="Län_namn",
    y=[
        "Andel minst 1 dos",
        "Andel 2 doser",
        "Andel 3 doser",
    ],
    barmode="group",
    labels={"Län_namn": "Län", "value": "Andel", "variable": "Antal doser"},
    title="Vaccinerade per län",
)
fig.show()
# Export the interactive figure as a standalone HTML file.
fig.write_html("./Visualiseringar/Vaccinerade per län.html")


In [13]:
# Keep only the two regions to compare; isin() builds the row mask
# (filtering approach suggested by a friend).
df_region_2 = df_region.loc[
    df_region["Län_namn"].isin(["Västra Götalands län", "Stockholms län"])
]

# Grouped bars with all four dose shares for the two selected regions.
fig = px.bar(
    data_frame=df_region_2,
    x="Län_namn",
    y=[
        "Andel minst 1 dos",
        "Andel 2 doser",
        "Andel 3 doser",
        "Andel 4 doser",
    ],
    barmode="group",
    labels={"Län_namn": "Län", "value": "Andel", "variable": "Antal doser"},
    title="Andel vaccinerade i Västra Götaland och Stockholms län",
)
fig.show()
# Export the interactive figure as a standalone HTML file.
fig.write_html("./Visualiseringar/Andel vaccinerade i Västra Götaland och Stockholms län.html")