# Read and clean the data

In [1]:
import pandas as pd
import yaml

In [2]:
# Load the project settings; later cells look up file paths in `config`.
try:
    with open("./../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception as e:
    # Report the underlying cause and stop here: swallowing the error would
    # leave `config` undefined and fail later with an unrelated NameError.
    print(f'Error reading the config file: {e}')
    raise

In [3]:
# Read the raw CSV whose path is stored under data -> undata_raw in the config.
data = pd.read_csv(config["data"]["undata_raw"])

In [4]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,Country or Area,Commodity - Transaction,Year,Unit,Quantity,Quantity Footnotes
0,Germany,Electricity - Gross production,2020.0,"Kilowatt-hours, million",572666.0,
1,Germany,Electricity - Gross production,2019.0,"Kilowatt-hours, million",606917.0,
2,Germany,Electricity - Gross production,2018.0,"Kilowatt-hours, million",640468.0,
3,Germany,Electricity - Gross production,2017.0,"Kilowatt-hours, million",653723.0,
4,Germany,From combustible fuels – Main activity,2020.0,"Kilowatt-hours, million",250205.0,


In [5]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Country or Area          249 non-null    object 
 1   Commodity - Transaction  249 non-null    object 
 2   Year                     247 non-null    float64
 3   Unit                     247 non-null    object 
 4   Quantity                 247 non-null    float64
 5   Quantity Footnotes       0 non-null      float64
dtypes: float64(3), object(3)
memory usage: 11.8+ KB


## Cleaning column names

In [6]:
def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column names.

    Each column name is lower-cased, every ``' - '`` separator is turned
    into a single underscore, and any remaining space becomes an
    underscore. The frame passed in is left untouched.

    Args:
        df: Frame whose column labels should be normalised.

    Returns:
        A new frame with the same data and the cleaned column labels.
    """
    renamed = df.copy()
    lowered = renamed.columns.str.lower()
    # Handle the ' - ' separator first so it collapses to one underscore
    # instead of being split into '_-_' by the plain space replacement.
    without_dashes = lowered.str.replace(' - ', '_')
    renamed.columns = without_dashes.str.replace(' ', '_')
    return renamed
        

In [7]:
# Replace the raw frame with a copy that has normalised column names.
data = clean_cols(data)

## Removing `quantity_footnotes` column
This column has no values, so it can be removed.

In [8]:
# Drop the empty column by reassignment instead of `inplace=True`;
# `errors='ignore'` keeps the cell re-runnable once the column is gone.
data = data.drop(columns='quantity_footnotes', errors='ignore')
data.head()

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity
0,Germany,Electricity - Gross production,2020.0,"Kilowatt-hours, million",572666.0
1,Germany,Electricity - Gross production,2019.0,"Kilowatt-hours, million",606917.0
2,Germany,Electricity - Gross production,2018.0,"Kilowatt-hours, million",640468.0
3,Germany,Electricity - Gross production,2017.0,"Kilowatt-hours, million",653723.0
4,Germany,From combustible fuels – Main activity,2020.0,"Kilowatt-hours, million",250205.0


## Removing `unit` column
The `quantity` column can be renamed to include the unit (million kilowatt-hours = GWh), which makes the separate `unit` column redundant.

In [9]:
# Put the unit into the column name; the renamed frame is bound back to
# `data` rather than mutating it in place.
data = data.rename(columns={'quantity': 'quantity_GWh'})

In [10]:
# The unit is carried by the quantity column name, so the column itself can go.
# Reassigning with `errors='ignore'` keeps the cell safe to re-run.
data = data.drop(columns='unit', errors='ignore')

In [11]:
# Confirm which columns remain after the drops and the rename.
data.columns

Index(['country_or_area', 'commodity_transaction', 'year', 'quantity_GWh'], dtype='object')

## Dealing with null values

In [12]:
# Count missing values per column.
data.isnull().sum()

country_or_area          0
commodity_transaction    0
year                     2
quantity_GWh             2
dtype: int64

In [13]:
# Show every row that contains at least one missing value.
data[data.isna().any(axis=1)]

Unnamed: 0,country_or_area,commodity_transaction,year,quantity_GWh
247,fnSeqID,Footnote,,
248,1,Estimate,,


The rows with NaNs are footnote rows from the export rather than measurements, so they can be removed from the dataset. Keeping only the rows for Germany drops them.

In [14]:
# Keep only the Germany rows, which also removes the trailing footnote rows.
# `.copy()` yields an independent frame so later column assignments do not
# write to a slice of the original.
data = data[data['country_or_area'] == 'Germany'].copy()

In [15]:
# Verify that no missing values are left after filtering.
data.isnull().sum()

country_or_area          0
commodity_transaction    0
year                     0
quantity_GWh             0
dtype: int64

In [16]:
# Check which country values remain in the data.
data['country_or_area'].unique()

array(['Germany'], dtype=object)

## Setting data types
`year` should be an integer

In [17]:
# Cast `year` to an integer dtype through a column -> dtype mapping; the
# other columns keep their dtypes.
data = data.astype({'year': 'int'})

In [18]:
# Re-check dtypes and non-null counts after the cast.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 247 entries, 0 to 246
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country_or_area        247 non-null    object 
 1   commodity_transaction  247 non-null    object 
 2   year                   247 non-null    int64  
 3   quantity_GWh           247 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 9.6+ KB


## Saving cleaned data


In [19]:
# Write the cleaned table to CSV without the index column.
# NOTE(review): the output path is hard-coded while the input path comes from
# `config` — consider moving it into the config as well.
data.to_csv('../data/cleaned/undata_cleaned.csv', index=False)