# Data Cleaning Notebook

This notebook performs data cleaning operations, ensuring that the dataset is prepared for further analysis.

## 1. Importing Libraries

We first import the necessary Python libraries for data handling and analysis.

In [1]:
# Import the required libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Silence UserWarning-category warnings issued from sklearn modules so they do not clutter cell output
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Configure the root logger so that messages at WARNING level and above are shown
import logging
logging.basicConfig(level=logging.WARNING)

## 2. Load Data & Create Functions

Next, we load the combined dataset and define helper functions that summarise the missing values per column and locate the countries and years affected.

In [2]:
# Read the combined dataset from an absolute path on the notebook host.
# NOTE(review): the hard-coded '/work/...' location presumably only exists in the
# original environment - adjust the path before running elsewhere.
df_combined = pd.read_csv('/work/fully-combined.csv')

In [3]:
# Helper that summarises the missing values of every column
def nan_analysis(df):
    """
    Summarise the missing values of a DataFrame, column by column.

    Parameters:
        df (pd.DataFrame): The dataset to inspect.

    Returns:
        pd.DataFrame: One row per column that has at least one NaN, with the
        columns 'NaN Count' and 'Percentage' (share of rows that are NaN,
        in percent), ordered by descending percentage.
    """
    is_missing = df.isna()
    missing_count = is_missing.sum()
    missing_percent = is_missing.mean().mul(100).sort_values(ascending=False)

    # Both Series are aligned on the column names when the frame is built
    summary = pd.DataFrame({'NaN Count': missing_count, 'Percentage': missing_percent})
    summary = summary.sort_values(by='Percentage', ascending=False)

    # Keep only the columns that actually contain missing values
    has_missing = summary['Percentage'] > 0
    return summary.loc[has_missing]

In [4]:
# Perform a NaN analysis on the combined DataFrame: the displayed table lists each
# column that contains NaN values together with its NaN count and percentage
nan_analysis(df_combined)

Unnamed: 0,NaN Count,Percentage
ethnicity_ratio,74,5.882353
totalrefugees,74,5.882353
Human Development Index,38,3.020668
Voter turnout (highest score=1),38,3.020668
Media freedom (highest score=1),38,3.020668
Media bias (highest score=1),38,3.020668
Mean years of schooling (highest score=1),38,3.020668
Life expectancy (highest score=1),38,3.020668
Judicial Independence (highest score=1),38,3.020668
Infant mortality rate (highest score=1),38,3.020668


In [5]:
# Locate the rows (country and year) where a feature is missing
def missing_countries(df, feature):
    """Return the 'country', 'year' and `feature` columns of the rows where `feature` is NaN."""
    is_missing = df[feature].isna()
    wanted_columns = ['country', 'year', feature]
    return df.loc[is_missing, wanted_columns]
def select_country(df, country, feature):
    """Return an independent copy of the 'year' and `feature` columns for the rows of one country."""
    in_country = df['country'] == country
    return df.loc[in_country, ['year', feature]].copy()


## 3. Handling Missing Values

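We handle missing values in two passes. First, for each country and feature, a regression on year imputes the gaps, but only when the best candidate model reaches a hold-out MAPE of at most 10%. Any values that are still missing are then filled from the neighbouring years of the same country (forward fill, then backward fill). No rows are removed.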

In [6]:
# Create a function to impute NaN values with regression
def fill_missing_with_regression(imported_df, ct, f, mape_threshold=0.1, train_fraction=0.8):
    """
    Fill missing values of one feature for one country using a regression on year.

    Three candidate models (linear, polynomial degree 2 and degree 3) are fitted on
    the earliest `train_fraction` of the known observations and scored with MAPE on
    the remaining, most recent ones. The model with the lowest error predicts the
    missing years, provided its error does not exceed `mape_threshold`.
    Predictions are clipped at zero from below.

    Parameters:
        imported_df (pd.DataFrame): The original dataset; must contain 'country' and 'year' columns.
        ct (str): Country name to filter data.
        f (str): Feature (column) for which missing values will be predicted.
        mape_threshold (float): Highest acceptable hold-out MAPE (default 0.1).
        train_fraction (float): Share of the known observations used for training (default 0.8).

    Returns:
        pd.DataFrame: A copy with the missing values filled, or `imported_df` itself
        (unchanged) when there is nothing to fill, too little data to fit and
        evaluate a model, or no model is accurate enough.
    """
    series = select_country(imported_df, ct, f)
    year, value = list(series.columns)[:2]

    missing_mask = series[value].isna()
    known_data = series.loc[~missing_mask].sort_values(by=[year])

    if known_data.empty:
        logging.warning(f"No known data for {ct} - {f}. Cannot perform regression.")
        return imported_df

    if not missing_mask.any():
        # Nothing to impute: skip the model fitting entirely.
        return imported_df

    # Chronological train-test split; both parts need at least one observation,
    # otherwise fitting or scoring would raise.
    split_idx = int(len(known_data) * train_fraction)
    if split_idx < 1 or split_idx >= len(known_data):
        logging.warning(
            f"Not enough known data for {ct} - {f} ({len(known_data)} points). Cannot perform regression."
        )
        return imported_df

    X_train, X_test = known_data[[year]].iloc[:split_idx], known_data[[year]].iloc[split_idx:]
    y_train, y_test = known_data[value].iloc[:split_idx], known_data[value].iloc[split_idx:]

    # Train Models
    models = {
        "Linear Regression": LinearRegression(),
        "Polynomial Degree 2": make_pipeline(PolynomialFeatures(2), LinearRegression()),
        "Polynomial Degree 3": make_pipeline(PolynomialFeatures(3), LinearRegression())
    }

    errors = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        errors[name] = mean_absolute_percentage_error(y_test, model.predict(X_test))

    # Select best model
    best_model_name = min(errors, key=errors.get)
    best_model = models[best_model_name]

    if errors[best_model_name] > mape_threshold:
        logging.warning(f"[WARNING] Best model for {f} in {ct} has MAPE: {errors[best_model_name]:.4f}")
        return imported_df  # Return unchanged

    # Predict missing values. A DataFrame (not a bare array) is passed so the
    # feature name matches the one the models were fitted with.
    X_missing = series.loc[missing_mask, [year]]
    predicted_values = np.clip(best_model.predict(X_missing), 0, None)
    series.loc[missing_mask, value] = predicted_values

    # Update a copy of the original DataFrame; the assignment aligns on the row index
    imported_df = imported_df.copy()
    imported_df.loc[(imported_df['country'] == ct) & (imported_df[value].isna()), value] = series[value]

    return imported_df

# Run the regression imputation once per (feature, country) pair.
# missing_countries() returns one row per missing observation, so a country with
# several missing years would otherwise be processed repeatedly without changing the result.
msng_features = list(nan_analysis(df_combined).index)

for feature in msng_features:
    msng_countries_list = missing_countries(df_combined, feature)['country'].drop_duplicates().tolist()
    for country in msng_countries_list:
        df_combined = fill_missing_with_regression(df_combined, country, feature)



In [7]:
# Perform a NaN analysis after imputation with regression to see which columns still contain NaN values
nan_analysis(df_combined)

Unnamed: 0,NaN Count,Percentage
totalrefugees,36,2.861685
Media freedom (highest score=1),13,1.033386
Fair trial (highest score=1),12,0.953895
Voter turnout (highest score=1),9,0.715421
Election free and fair (highest score=1),8,0.63593
Free Political Parties (highest score=1),8,0.63593
ethnicity_ratio,8,0.63593
Freedom of the Press (highest score=1),7,0.556439
Media bias (highest score=1),7,0.556439
Harassment of journalists (highest score=1),7,0.556439


In [8]:
# Create a function to impute NaN values with the nearest value
def impute_nearest(df, country, col_name):
    """
    Fill the missing values of one feature for one country from neighbouring years.

    Within the country's rows, ordered by year, each gap takes the most recent
    earlier value (forward fill); gaps before the first known value take the
    next later value (backward fill). If the country has no known value at all
    for the feature, its values stay NaN and a warning is logged.

    Parameters:
        df (pd.DataFrame): The dataset; must contain 'country' and 'year' columns.
        country (str): Country whose rows are imputed.
        col_name (str): Feature (column) to impute.

    Returns:
        pd.DataFrame: A copy of `df` with the country's values for `col_name` filled.
    """
    country_rows = select_country(df, country, col_name)
    year, value = list(country_rows.columns)[:2]

    # Fill along the time axis. The original row labels are kept, so the result
    # is aligned back onto `df` by index regardless of the sort.
    filled = country_rows.sort_values(by=year, kind="stable")[value].ffill().bfill()

    # After a forward and a backward fill, NaN can only remain when every value is missing
    if filled.isna().any():
        logging.warning(f"No known data for {country} - {col_name}. Cannot impute nearest value.")

    df = df.copy()
    df.loc[df["country"] == country, value] = filled

    return df

In [9]:
# Collect the features that still have missing values, then display the list
msng_features = nan_analysis(df_combined).index.tolist()
msng_features

['totalrefugees',
 'Media freedom (highest score=1)',
 'Fair trial (highest score=1)',
 'Voter turnout (highest score=1)',
 'Election free and fair (highest score=1)',
 'Free Political Parties (highest score=1)',
 'ethnicity_ratio',
 'Freedom of the Press (highest score=1)',
 'Media bias (highest score=1)',
 'Harassment of journalists (highest score=1)',
 'Election government intimidation (highest score=1)',
 'Predictable Enforcement (highest score=1)',
 'Educational equality (highest score=1)',
 'Freedom of Religion (highest score=1)',
 'Effective Parliament (highest score=1)',
 'Judicial Independence (highest score=1)',
 'Health equality (highest score=1)']

In [10]:
# Collect the country of every row where 'totalrefugees' is missing
# (one entry per row, so a country name can repeat), then display the list
msng_countries_list = missing_countries(df_combined, 'totalrefugees')['country'].tolist()
msng_countries_list

['Austria',
 'Belgium',
 'Canada',
 'Costa Rica',
 'France',
 'Germany',
 'Greece',
 'Hungary',
 'Israel',
 'Italy',
 'Mexico',
 'Norway',
 'Poland',
 'Slovenia',
 'Sweden',
 'Switzerland',
 'Turkey',
 'United States',
 'Austria',
 'Belgium',
 'Canada',
 'Costa Rica',
 'France',
 'Germany',
 'Greece',
 'Hungary',
 'Israel',
 'Italy',
 'Mexico',
 'Norway',
 'Poland',
 'Slovenia',
 'Sweden',
 'Switzerland',
 'Turkey',
 'United States']

In [11]:
# Impute the remaining missing values with the nearest value, once per (feature, country) pair.
# missing_countries() yields one row per missing observation, so repeated country names are
# dropped to avoid re-imputing (and re-copying the DataFrame for) an already filled country.
for feature in msng_features:
    msng_countries_list = missing_countries(df_combined, feature)['country'].drop_duplicates().tolist()
    for country in msng_countries_list:
        df_combined = impute_nearest(df_combined, country, feature)

In [12]:
# Perform a NaN analysis after imputation with the nearest value;
# an empty table means no column contains NaN values any more
nan_analysis(df_combined)

Unnamed: 0,NaN Count,Percentage


## 4. Export the Imputed Dataset

After imputing, we save the final dataset for further analysis.

In [13]:
# Export the imputed dataset to a CSV file in the current working directory, without the index column
df_combined.to_csv('combined-without-engineering.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=63ad4e1b-19bb-4dd7-a997-1fa3d2fd82a1' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>