# Data Cleaning and Initial Exploration

## 1. Read Data Files



In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Both raw extracts are read with the same single-byte encoding.
RAW_ENCODING = "latin1"

Balance_Sheet_Original = pd.read_csv("SQ_Balance_Sheet.csv", encoding=RAW_ENCODING)
Premiums_Original = pd.read_csv("SQ_Premiums_Claims_Expenses.csv", encoding=RAW_ENCODING)

In [2]:
# Preview the first five rows of the raw premiums/claims/expenses table.
Premiums_Original.head(n=5)

Unnamed: 0,Reporting country,Reference period,Item,Business type,Item code,Value,Date of extraction (yyyymmdd),"Number of submissions (per reporting country, reference date and undertaking type)"
0,AUSTRIA,2016 Q3,Premiums written,Non-Life,R0101,,20250811,41
1,AUSTRIA,2016 Q3,Gross - Direct Business,Non-Life,R0110,7014.674593,20250811,41
2,AUSTRIA,2016 Q3,Gross - Proportional reinsurance accepted,Non-Life,R0120,1227.558933,20250811,41
3,AUSTRIA,2016 Q3,Gross - Non-proportional reinsurance accepted,Non-Life,R0130,30.519672,20250811,41
4,AUSTRIA,2016 Q3,Reinsurers' share,Non-Life,R0140,2577.818515,20250811,41


### Create Dictionary of Codes

In [3]:
# Lookup table: item code -> item description, one row per (source file, code).
#
# The two files name the description column differently ('Item name' in the
# balance sheet, 'Item' in premiums), so the premiums column is renamed before
# stacking; otherwise the result carries two half-empty description columns.
# The same item code can appear in both files with a different meaning, so
# duplicates are removed per source rather than on 'Item code' alone, which
# would silently discard the premiums description of every shared code.
Codes = (
    pd.concat(
        [
            Balance_Sheet_Original[['Item code', 'Item name']]
            .assign(Source='Balance sheet'),
            Premiums_Original[['Item code', 'Item']]
            .rename(columns={'Item': 'Item name'})
            .assign(Source='Premiums'),
        ],
        ignore_index=True,
    )
    .drop_duplicates(subset=['Source', 'Item code'])
    .reset_index(drop=True)
)

#### Fix the Reference Period

In [4]:
# Turn 'Reference period' into a timestamp at the start of the quarter.
# The year (four digits) and the quarter tag ('Q' + digit) are pulled out
# separately and glued together as e.g. "2016Q3" for PeriodIndex to parse.
bs_period = Balance_Sheet_Original['Reference period']
bs_year = bs_period.str.extract(r'(\d{4})')[0].astype(int).astype(str)
bs_quarter = bs_period.str.extract(r'(Q\d)')[0]

Balance_Sheet_Original['Date'] = pd.PeriodIndex(
    bs_year + bs_quarter,
    freq='Q',
).to_timestamp()

In [5]:
# Turn 'Reference period' (e.g. "2016 Q3") into a timestamp at the start of
# the quarter. Year and quarter tag are extracted separately and joined as
# "2016Q3" so PeriodIndex can parse them.
pr_period = Premiums_Original['Reference period']
pr_year = pr_period.str.extract(r'(\d{4})')[0].astype(int).astype(str)
pr_quarter = pr_period.str.extract(r'(Q\d)')[0]

Premiums_Original['Date'] = pd.PeriodIndex(
    pr_year + pr_quarter,
    freq='Q',
).to_timestamp()

## 2. Restructure the datasets

1. Keep only the columns we are interested in

In [6]:
# Both frames are reduced to the same four columns used by the pivot below.
keep_cols = ['Reporting country', 'Date', 'Value', 'Item code']

Balance_Sheet = Balance_Sheet_Original.loc[:, keep_cols]
Premiums = Premiums_Original.loc[:, keep_cols]



2.   Long to Wide format



In [7]:
def _to_wide(long_frame):
    """Reshape a long frame into one row per (country, date), one column per item code.

    Note: pivot_table aggregates with the mean, so if a (country, date, code)
    combination occurs more than once its values are averaged.
    """
    wide = long_frame.pivot_table(
        values='Value',
        index=['Reporting country', 'Date'],
        columns='Item code',
        aggfunc='mean',
    )
    # Bring the two index levels back as ordinary columns.
    return wide.reset_index()


Balance_Sheet_wide = _to_wide(Balance_Sheet)
Premiums_wide = _to_wide(Premiums)

In [8]:
# Peek at the first two rows of the wide balance-sheet table.
Balance_Sheet_wide.head(n=2)

Item code,Reporting country,Date,R0030,R0040,R0050,R0060,R0070,R0080,R0090,R0100,...,R0810,R0820,R0830,R0840,R0850,R0860,R0870,R0880,R0900,R1000
0,AUSTRIA,2016-07-01,0.0,635.589342,16.469644,301.632308,34282.686491,2305.173964,8669.315493,301.982881,...,188.809899,198.867469,225.238887,633.932383,1179.240127,0.0,1179.240127,206.062792,37052.865349,10082.801509
1,AUSTRIA,2016-10-01,0.0,413.480323,16.085598,325.261748,34315.147061,2358.720577,9154.292202,373.338027,...,195.603543,284.100581,164.663051,563.849236,1128.268083,86.95179,1041.316293,139.493732,35048.441889,11420.830649


In [9]:
# Peek at the first two rows of the wide premiums table.
Premiums_wide.head(n=2)

Item code,Reporting country,Date,R0110,R0120,R0130,R0140,R0200,R0210,R0220,R0230,...,R1700,R1710,R1720,R1800,R1900,R2500,R2510,R2600,Z0001,Z0002
0,AUSTRIA,2016-07-01,7014.674593,1227.558933,30.519672,2577.818515,5694.934683,6658.226021,1168.639099,32.661139,...,,,,,,,,,,
1,AUSTRIA,2016-10-01,8820.385891,1614.83465,46.186761,3317.361617,7164.045685,8806.180214,1595.939149,46.382934,...,,,,,,,,,,


3.   Exclude 'EEA' from the 'Reporting country' values in Premiums



In [10]:
# Keep every premiums row whose reporting country is not the 'EEA' entry.
is_eea = Premiums_wide['Reporting country'].eq('EEA')
Premiums_wide = Premiums_wide.loc[~is_eea]

## 3. Merge the Datasets

In [11]:
# Combine the two wide tables on (country, date), keeping keys found in either.
#
# validate="one_to_one" makes pandas raise a MergeError if a (country, date)
# key is repeated on either side, instead of silently multiplying rows.
#
# Item codes present in both tables collide as column names; pandas
# disambiguates them with the suffixes below:
#   "_x" -> column from Balance_Sheet_wide
#   "_y" -> column from Premiums_wide
# The suffixes are spelled out (they equal the pandas defaults) so the
# resulting column names stay exactly as before.
df = pd.merge(
    Balance_Sheet_wide,
    Premiums_wide,
    on=["Reporting country", "Date"],
    how="outer",
    suffixes=("_x", "_y"),
    validate="one_to_one",
)
df.head(3)

Item code,Reporting country,Date,R0030,R0040,R0050,R0060,R0070,R0080,R0090,R0100,...,R1700,R1710,R1720,R1800,R1900,R2500,R2510,R2600,Z0001,Z0002
0,AUSTRIA,2016-07-01,0.0,635.589342,16.469644,301.632308,34282.686491,2305.173964,8669.315493,301.982881,...,,,,,,,,,,
1,AUSTRIA,2016-10-01,0.0,413.480323,16.085598,325.261748,34315.147061,2358.720577,9154.292202,373.338027,...,,,,,,,,,,
2,AUSTRIA,2017-01-01,0.0,365.481013,16.044866,306.114805,34096.227432,2395.571873,9241.99341,363.381924,...,,,,,,,,,0.86,0.25


In [12]:
# Compare row/column counts before and after the merge.
for label, frame in (
    ("Balance_Sheet", Balance_Sheet_wide),
    ("Premiums", Premiums_wide),
    ("df", df),
):
    print(f"{label} shape:", frame.shape)

Balance_Sheet shape: (1049, 82)
Premiums shape: (1049, 44)
df shape: (1049, 124)


## 4. Missing Values and format

1. Replace missing values (NaN) with 0


In [13]:
# Every missing cell in the merged frame becomes 0, in all columns alike.
df = df.fillna(value=0)

2.   Make sure all columns except 'Reporting country' and 'Date' are float and round to 2 decimals.



In [14]:
# Summarise the merged frame: index range, column count, dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1049 entries, 0 to 1048
Columns: 124 entries, Reporting country to Z0002
dtypes: datetime64[ns](1), float64(122), object(1)
memory usage: 1016.3+ KB


In [15]:
# Every column other than the two identifier columns is cast to float and
# rounded to two decimals.
id_cols = ['Reporting country', 'Date']
num_cols = df.columns.difference(id_cols)
df[num_cols] = df[num_cols].astype(float).round(2)

In [16]:
# Inspect the last four rows after filling, casting and rounding.
df.tail(n=4)

Item code,Reporting country,Date,R0030,R0040,R0050,R0060,R0070,R0080,R0090,R0100,...,R1700,R1710,R1720,R1800,R1900,R2500,R2510,R2600,Z0001,Z0002
1045,SWEDEN,2024-04-01,0.09,13.1,12.4,276.85,49066.14,1093.55,5707.14,8181.15,...,10282.34,0.0,0.0,0.0,679.31,0.0,-319.3,360.01,0.95,0.26
1046,SWEDEN,2024-07-01,0.1,11.13,11.21,275.02,50000.56,1099.37,5782.65,8597.19,...,15358.42,0.0,0.0,0.0,994.27,0.0,-423.07,571.2,0.91,0.27
1047,SWEDEN,2024-10-01,0.0,11.67,13.69,275.88,48984.13,1105.1,5782.56,8422.6,...,18759.73,0.0,0.0,0.0,1337.82,0.0,-519.22,818.59,0.92,0.27
1048,SWEDEN,2025-01-01,0.0,17.25,19.9,293.06,50777.99,1171.73,6047.75,8544.76,...,5751.79,0.0,0.0,0.0,359.08,0.0,-202.83,156.25,0.92,0.25


## 5. Check for duplicates

In [17]:
# A (country, date) pair should identify exactly one row; list every row
# whose pair occurs more than once (keep=False flags all occurrences).
key_cols = ['Reporting country', 'Date']
is_repeated = df.duplicated(subset=key_cols, keep=False)
duplicates = df.loc[is_repeated]

print(duplicates)
print(f"\nTotal duplicated rows: {duplicates.shape[0]}")


Empty DataFrame
Columns: [Reporting country, Date, R0030, R0040, R0050, R0060, R0070, R0080, R0090, R0100, R0110_x, R0120_x, R0130_x, R0140_x, R0150, R0160, R0170, R0180, R0190, R0200_x, R0210_x, R0220_x, R0230_x, R0240_x, R0250, R0260, R0270, R0280, R0290, R0300_x, R0310_x, R0320_x, R0330_x, R0340_x, R0350, R0360, R0370, R0380, R0390, R0400_x, R0410_x, R0420_x, R0500_x, R0510, R0520, R0530, R0540, R0550_x, R0560, R0570, R0580, R0590, R0600, R0610, R0620, R0630, R0640, R0650, R0660, R0670, R0680, R0690, R0700, R0710, R0720, R0740, R0750, R0760, R0770, R0780, R0790, R0800, R0810, R0820, R0830, R0840, R0850, R0860, R0870, R0880, R0900, R1000, R0110_y, R0120_y, R0130_y, R0140_y, R0200_y, R0210_y, R0220_y, R0230_y, R0240_y, R0300_y, R0310_y, R0320_y, R0330_y, R0340_y, R0400_y, R0410_y, R0420_y, R0430, ...]
Index: []

[0 rows x 124 columns]

Total duplicated rows: 0
