# Data Cleaning and Initial Exploration

## 1. Read Data Files



In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Load the two raw CSV extracts. Both files are decoded as latin1.
Balance_Sheet_Original = pd.read_csv(
    "SQ_Balance_Sheet.csv",
    encoding="latin1",
)
Premiums_Original = pd.read_csv(
    "SQ_Premiums_Claims_Expenses.csv",
    encoding="latin1",
)

In [2]:
# Preview the first rows of the raw premiums/claims/expenses extract.
Premiums_Original.head()

Unnamed: 0,Reporting country,Reference period,Item,Business type,Item code,Value,Date of extraction (yyyymmdd),"Number of submissions (per reporting country, reference date and undertaking type)"
0,AUSTRIA,2016 Q3,Premiums written,Non-Life,R0101,,20250811,41
1,AUSTRIA,2016 Q3,Gross - Direct Business,Non-Life,R0110,7014.674593,20250811,41
2,AUSTRIA,2016 Q3,Gross - Proportional reinsurance accepted,Non-Life,R0120,1227.558933,20250811,41
3,AUSTRIA,2016 Q3,Gross - Non-proportional reinsurance accepted,Non-Life,R0130,30.519672,20250811,41
4,AUSTRIA,2016 Q3,Reinsurers' share,Non-Life,R0140,2577.818515,20250811,41


### Create Dictionary of Codes

In [3]:
# Tag every item code with its source ('_BS' = balance sheet, '_P' = premiums)
# so codes from the two files cannot collide once the data are combined.
# Any suffix already present is stripped first, which makes the cell safe to
# re-run: without that, a second execution would produce 'R0010_BS_BS'.
Balance_Sheet_Original['Item code'] = (
    Balance_Sheet_Original['Item code']
    .astype(str)
    .str.replace(r'_BS$', '', regex=True)
    + '_BS'
)
Premiums_Original['Item code'] = (
    Premiums_Original['Item code']
    .astype(str)
    .str.replace(r'_P$', '', regex=True)
    + '_P'
)

In [4]:
# Lookup table: one row per item code with its descriptive name.
# The premiums file stores the label in 'Item', so it is renamed to line up
# with the balance sheet's 'Item name' before the two are stacked. When a code
# appears more than once, the first occurrence is kept.
Codes = pd.concat(
    [
        Balance_Sheet_Original[['Item code', 'Item name']],
        Premiums_Original[['Item code', 'Item']].rename(columns={'Item': 'Item name'}),
    ],
    ignore_index=True,
).drop_duplicates(subset=['Item code'], ignore_index=True)

In [5]:
# Spot-check the first two entries of the code -> name lookup.
Codes.head(2)

Unnamed: 0,Item code,Item name
0,R0001_BS,Assets
1,R0010_BS,Goodwill


#### Fix the Reference Period

In [6]:
# Convert 'Reference period' labels such as "2016 Q3" into a timestamp column.
# Year and quarter are held in local Series rather than in scratch columns
# that are added to the raw frame and then dropped in place.
bs_reference_period = Balance_Sheet_Original['Reference period']
bs_year = bs_reference_period.str.extract(r'(\d{4})', expand=False).astype(int)
bs_quarter = bs_reference_period.str.extract(r'(Q\d)', expand=False)

# to_timestamp() uses the period start by default, so 2016 Q3 -> 2016-07-01.
Balance_Sheet_Original['Date'] = pd.PeriodIndex(
    bs_year.astype(str) + bs_quarter,
    freq='Q',
).to_timestamp()

In [7]:
# Convert 'Reference period' labels such as "2016 Q3" into a timestamp column.
# Year and quarter are held in local Series rather than in scratch columns
# that are added to the raw frame and then dropped in place.
prem_reference_period = Premiums_Original['Reference period']
prem_year = prem_reference_period.str.extract(r'(\d{4})', expand=False).astype(int)
prem_quarter = prem_reference_period.str.extract(r'(Q\d)', expand=False)

# to_timestamp() uses the period start by default, so 2016 Q3 -> 2016-07-01.
Premiums_Original['Date'] = pd.PeriodIndex(
    prem_year.astype(str) + prem_quarter,
    freq='Q',
).to_timestamp()

## 2. Restructure the data bases

1. Keep only the columns we are interested in

In [8]:
# Reduce both raw frames to the four fields needed for the pivot:
# the country, the parsed date, the reported value and the tagged item code.
Balance_Sheet = Balance_Sheet_Original.loc[
    :, ['Reporting country', 'Date', 'Value', 'Item code']
]
Premiums = Premiums_Original.loc[
    :, ['Reporting country', 'Date', 'Value', 'Item code']
]



2.   Long to Wide format



In [9]:
# Reshape long -> wide: one row per (country, date), one column per item code.
# NOTE(review): rows sharing the same (country, date, item code) are averaged
# ('mean' is pivot_table's default, spelled out here). By default pivot_table
# also leaves out item codes whose values are all missing - presumably why
# header-only items do not appear in the wide frames; confirm this is intended.
Balance_Sheet_wide = pd.pivot_table(
    Balance_Sheet,
    values='Value',
    index=['Reporting country', 'Date'],
    columns='Item code',
    aggfunc='mean',
).reset_index()

Premiums_wide = pd.pivot_table(
    Premiums,
    values='Value',
    index=['Reporting country', 'Date'],
    columns='Item code',
    aggfunc='mean',
).reset_index()


In [10]:
# Check the wide balance-sheet layout: one column per '_BS' item code.
Balance_Sheet_wide.head(2)

Item code,Reporting country,Date,R0030_BS,R0040_BS,R0050_BS,R0060_BS,R0070_BS,R0080_BS,R0090_BS,R0100_BS,...,R0810_BS,R0820_BS,R0830_BS,R0840_BS,R0850_BS,R0860_BS,R0870_BS,R0880_BS,R0900_BS,R1000_BS
0,AUSTRIA,2016-07-01,0.0,635.589342,16.469644,301.632308,34282.686491,2305.173964,8669.315493,301.982881,...,188.809899,198.867469,225.238887,633.932383,1179.240127,0.0,1179.240127,206.062792,37052.865349,10082.801509
1,AUSTRIA,2016-10-01,0.0,413.480323,16.085598,325.261748,34315.147061,2358.720577,9154.292202,373.338027,...,195.603543,284.100581,164.663051,563.849236,1128.268083,86.95179,1041.316293,139.493732,35048.441889,11420.830649


In [11]:
# Check the wide premiums layout: one column per '_P' item code.
Premiums_wide.head(2)

Item code,Reporting country,Date,R0110_P,R0120_P,R0130_P,R0140_P,R0200_P,R0210_P,R0220_P,R0230_P,...,R1700_P,R1710_P,R1720_P,R1800_P,R1900_P,R2500_P,R2510_P,R2600_P,Z0001_P,Z0002_P
0,AUSTRIA,2016-07-01,7014.674593,1227.558933,30.519672,2577.818515,5694.934683,6658.226021,1168.639099,32.661139,...,,,,,,,,,,
1,AUSTRIA,2016-10-01,8820.385891,1614.83465,46.186761,3317.361617,7164.045685,8806.180214,1595.939149,46.382934,...,,,,,,,,,,


3.   Exclude the 'EEA' rows from 'Reporting country' in Premiums



In [12]:
# Drop the rows reported under 'EEA' and keep every other reporting country.
Premiums_wide = Premiums_wide.loc[Premiums_wide['Reporting country'].ne('EEA')]

## 3. Merge the Data Bases

In [13]:
# Outer join on (country, date): a key present in only one of the two frames
# is kept, with the other frame's columns left missing.
df = Balance_Sheet_wide.merge(
    Premiums_wide,
    how="outer",
    on=["Reporting country", "Date"],
)
df.head(3)

Item code,Reporting country,Date,R0030_BS,R0040_BS,R0050_BS,R0060_BS,R0070_BS,R0080_BS,R0090_BS,R0100_BS,...,R1700_P,R1710_P,R1720_P,R1800_P,R1900_P,R2500_P,R2510_P,R2600_P,Z0001_P,Z0002_P
0,AUSTRIA,2016-07-01,0.0,635.589342,16.469644,301.632308,34282.686491,2305.173964,8669.315493,301.982881,...,,,,,,,,,,
1,AUSTRIA,2016-10-01,0.0,413.480323,16.085598,325.261748,34315.147061,2358.720577,9154.292202,373.338027,...,,,,,,,,,,
2,AUSTRIA,2017-01-01,0.0,365.481013,16.044866,306.114805,34096.227432,2395.571873,9241.99341,363.381924,...,,,,,,,,,0.86,0.25


In [14]:
# Compare row/column counts of the two inputs with the merged result.
for shape_label, shaped_frame in (
    ("Balance_Sheet shape:", Balance_Sheet_wide),
    ("Premiums shape:", Premiums_wide),
    ("df shape:", df),
):
    print(shape_label, shaped_frame.shape)

Balance_Sheet shape: (1049, 82)
Premiums shape: (1049, 44)
df shape: (1049, 124)


## 4. Missing Values and format

1. Replace N/A with 0


In [15]:
# Every missing cell becomes 0. After this step a true zero and a value that
# was never reported can no longer be told apart.
df = df.fillna(value=0)

2.   Make sure all columns except 'Reporting country' and 'Date' are float and round to 2 decimals.



In [16]:
# Summarise dtypes to confirm which columns are numeric before casting/rounding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1049 entries, 0 to 1048
Columns: 124 entries, Reporting country to Z0002_P
dtypes: datetime64[ns](1), float64(122), object(1)
memory usage: 1016.3+ KB


In [17]:
# Every column other than the two key columns holds a reported value:
# force those to float and keep two decimals.
num_cols = df.columns.difference(['Reporting country', 'Date'])
df[num_cols] = df[num_cols].astype(float).round(2)

In [18]:
# Inspect the last rows to confirm the fill and rounding were applied.
df.tail(4)

Item code,Reporting country,Date,R0030_BS,R0040_BS,R0050_BS,R0060_BS,R0070_BS,R0080_BS,R0090_BS,R0100_BS,...,R1700_P,R1710_P,R1720_P,R1800_P,R1900_P,R2500_P,R2510_P,R2600_P,Z0001_P,Z0002_P
1045,SWEDEN,2024-04-01,0.09,13.1,12.4,276.85,49066.14,1093.55,5707.14,8181.15,...,10282.34,0.0,0.0,0.0,679.31,0.0,-319.3,360.01,0.95,0.26
1046,SWEDEN,2024-07-01,0.1,11.13,11.21,275.02,50000.56,1099.37,5782.65,8597.19,...,15358.42,0.0,0.0,0.0,994.27,0.0,-423.07,571.2,0.91,0.27
1047,SWEDEN,2024-10-01,0.0,11.67,13.69,275.88,48984.13,1105.1,5782.56,8422.6,...,18759.73,0.0,0.0,0.0,1337.82,0.0,-519.22,818.59,0.92,0.27
1048,SWEDEN,2025-01-01,0.0,17.25,19.9,293.06,50777.99,1171.73,6047.75,8544.76,...,5751.79,0.0,0.0,0.0,359.08,0.0,-202.83,156.25,0.92,0.25


## 5. Check for duplicates

In [19]:
# pivot_table() has already aggregated rows that share a key, so the wide
# frames - and therefore df - have unique (country, date) keys by construction.
# Real duplicates can only be seen in the long inputs, so count them there first.
long_key = ['Reporting country', 'Date', 'Item code']
for long_label, long_frame in (('Balance_Sheet', Balance_Sheet), ('Premiums', Premiums)):
    repeated_rows = int(long_frame.duplicated(subset=long_key, keep=False).sum())
    print(f"{long_label}: {repeated_rows} long-format rows share a (country, date, item code) key")

# Sanity check on the merged frame: the join must not have multiplied any key.
duplicates = df[df.duplicated(subset=['Reporting country', 'Date'], keep=False)]
print(f"\nTotal duplicated rows: {len(duplicates)}")
duplicates


Empty DataFrame
Columns: [Reporting country, Date, R0030_BS, R0040_BS, R0050_BS, R0060_BS, R0070_BS, R0080_BS, R0090_BS, R0100_BS, R0110_BS, R0120_BS, R0130_BS, R0140_BS, R0150_BS, R0160_BS, R0170_BS, R0180_BS, R0190_BS, R0200_BS, R0210_BS, R0220_BS, R0230_BS, R0240_BS, R0250_BS, R0260_BS, R0270_BS, R0280_BS, R0290_BS, R0300_BS, R0310_BS, R0320_BS, R0330_BS, R0340_BS, R0350_BS, R0360_BS, R0370_BS, R0380_BS, R0390_BS, R0400_BS, R0410_BS, R0420_BS, R0500_BS, R0510_BS, R0520_BS, R0530_BS, R0540_BS, R0550_BS, R0560_BS, R0570_BS, R0580_BS, R0590_BS, R0600_BS, R0610_BS, R0620_BS, R0630_BS, R0640_BS, R0650_BS, R0660_BS, R0670_BS, R0680_BS, R0690_BS, R0700_BS, R0710_BS, R0720_BS, R0740_BS, R0750_BS, R0760_BS, R0770_BS, R0780_BS, R0790_BS, R0800_BS, R0810_BS, R0820_BS, R0830_BS, R0840_BS, R0850_BS, R0860_BS, R0870_BS, R0880_BS, R0900_BS, R1000_BS, R0110_P, R0120_P, R0130_P, R0140_P, R0200_P, R0210_P, R0220_P, R0230_P, R0240_P, R0300_P, R0310_P, R0320_P, R0330_P, R0340_P, R0400_P, R0410_P, R0420

## 6. Plot R0500_P Over Time by Country

In [22]:
# Line chart of one premiums item over time, one line per reporting country.
# The title and y-axis label are both derived from `plot_item`, so they cannot
# drift apart from the plotted column (the title previously said R0100 while
# the chart showed R0500). plotly.express is already imported as `px` in the
# notebook's first cell.
plot_item = 'R0500_P'
plot_item_label = plot_item.split('_')[0]  # 'R0500_P' -> 'R0500'

fig = px.line(
    df,
    x='Date',
    y=plot_item,
    color='Reporting country',
    title=f'{plot_item_label} Over Time by Reporting Country',
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title=f'{plot_item_label} Value',
    legend_title='Reporting Country',
)

fig.show()
