In [1]:
# ==================
# IMPORT LIBRARIES
# ==================

import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Notebook-wide settings: silence every warning and let pandas render all
# columns of a frame instead of eliding the middle ones.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

In [2]:
# Standardize the number and order of columns 2015.CSV
#
# Dataset/2015.csv is rewritten in place with the nine-column layout below.
# Because the output overwrites the input, the transformation is applied only
# when the file does not already carry that header; running the drop/swap steps
# on an already-converted file would fail on the missing columns.

# Target column names, in target order.
# NOTE(review): 'Fredom' (sic) is kept as-is - it is the header written to the
# CSV, so renaming it here alone would break any code that reads the file.
new_column_names = ['Rank', 'Country', 'Score', 'GDP per Capita', 'Family',
                    'Life Expectancy', 'Fredom', 'Generosity', 'Perceptions of Corruption']

# Read the CSV file
df = pd.read_csv("Dataset/2015.csv")

if list(df.columns) != new_column_names:
    # Drop the columns that are not part of the target layout
    df = df.drop(columns=['Region', 'Standard Error', 'Dystopia Residual'])

    # Reorder by position: swap columns 1/2 and columns 8/9 (1-based)
    cols = list(df.columns)
    cols[0], cols[1] = cols[1], cols[0]
    cols[7], cols[8] = cols[8], cols[7]
    df = df[cols]

    # The rename below is positional, so a column-count mismatch would attach
    # names to the wrong data; fail loudly instead.
    if len(df.columns) != len(new_column_names):
        raise ValueError(
            f"Dataset/2015.csv: expected {len(new_column_names)} columns "
            f"after dropping, found {len(df.columns)}"
        )
    df.columns = new_column_names

    # Save the updated DataFrame back to the CSV file
    df.to_csv("Dataset/2015.csv", index=False)

In [3]:
# Standardize the number and order of columns 2016.CSV
#
# Dataset/2016.csv is rewritten in place with the nine-column layout below.
# Because the output overwrites the input, the transformation is applied only
# when the file does not already carry that header; running the drop/swap steps
# on an already-converted file would fail on the missing columns.

# Target column names, in target order.
# NOTE(review): 'Fredom' (sic) is kept as-is - it is the header written to the
# CSV, so renaming it here alone would break any code that reads the file.
new_column_names = ['Rank', 'Country', 'Score', 'GDP per Capita', 'Family',
                    'Life Expectancy', 'Fredom', 'Generosity', 'Perceptions of Corruption']

# Read the CSV file
df = pd.read_csv("Dataset/2016.csv")

if list(df.columns) != new_column_names:
    # Drop the columns that are not part of the target layout
    df = df.drop(columns=['Region', 'Lower Confidence Interval',
                          'Upper Confidence Interval', 'Dystopia Residual'])

    # Reorder by position: swap columns 1/2 and columns 8/9 (1-based)
    cols = list(df.columns)
    cols[0], cols[1] = cols[1], cols[0]
    cols[7], cols[8] = cols[8], cols[7]
    df = df[cols]

    # The rename below is positional, so a column-count mismatch would attach
    # names to the wrong data; fail loudly instead.
    if len(df.columns) != len(new_column_names):
        raise ValueError(
            f"Dataset/2016.csv: expected {len(new_column_names)} columns "
            f"after dropping, found {len(df.columns)}"
        )
    df.columns = new_column_names

    # Save the updated DataFrame back to the CSV file
    df.to_csv("Dataset/2016.csv", index=False)

In [4]:
# Standardize the number and order of columns 2017.CSV
#
# Dataset/2017.csv is rewritten in place with the nine-column layout below.
# Because the output overwrites the input, the transformation is applied only
# when the file does not already carry that header; running the drop/swap steps
# on an already-converted file would fail on the missing columns.

# Target column names, in target order.
# NOTE(review): 'Fredom' (sic) is kept as-is - it is the header written to the
# CSV, so renaming it here alone would break any code that reads the file.
new_column_names = ['Rank', 'Country', 'Score', 'GDP per Capita', 'Family',
                    'Life Expectancy', 'Fredom', 'Generosity', 'Perceptions of Corruption']

# Read the CSV file
df = pd.read_csv("Dataset/2017.csv")

if list(df.columns) != new_column_names:
    # Drop the columns that are not part of the target layout
    df = df.drop(columns=['Whisker.high', 'Whisker.low', 'Dystopia.Residual'])

    # Reorder by position: only the first two columns are swapped for this year
    cols = list(df.columns)
    cols[0], cols[1] = cols[1], cols[0]
    df = df[cols]

    # The rename below is positional, so a column-count mismatch would attach
    # names to the wrong data; fail loudly instead of truncating the name list.
    if len(df.columns) != len(new_column_names):
        raise ValueError(
            f"Dataset/2017.csv: expected {len(new_column_names)} columns "
            f"after dropping, found {len(df.columns)}"
        )
    df.columns = new_column_names

    # Save the updated DataFrame back to the CSV file
    df.to_csv("Dataset/2017.csv", index=False)

In [5]:
# Standardize the number and order of columns 2018 and 2019.CSV
# Both files get the same treatment: headers are replaced positionally with the
# names below and the file is written back in place. No columns are dropped or
# reordered in this cell.

new_column_names = ['Rank', 'Country', 'Score', 'GDP per Capita', 'Family',
                    'Life Expectancy', 'Fredom', 'Generosity', 'Perceptions of Corruption']

for csv_path in ("Dataset/2018.csv", "Dataset/2019.csv"):
    df = pd.read_csv(csv_path)

    # Take only as many names as the frame has columns before assigning
    df.columns = new_column_names[:df.shape[1]]

    # Overwrite the source file, without the index column
    df.to_csv(csv_path, index=False)

In [6]:
# Read Dataset
# Load every CSV in the dataset directory, tag each row with the year taken
# from its file name, and stack everything into a single DataFrame `df`.

# Dataset directory
directory = "Dataset/"

# List to store each dataframe
df_list = []

# Loop through all CSV files in the directory.
# os.listdir() returns names in arbitrary order, so sort them to make the row
# order (and therefore the index) of the combined frame reproducible.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv"):
        # Read the CSV file
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)

        # Add 'Year' column: the part of the file name before the first dot,
        # stored as a string (e.g. "2015.csv" -> "2015")
        df['Year'] = filename.split(".")[0]

        # Append the dataframe to the list
        df_list.append(df)

# Concatenate all dataframes into one, with a fresh 0..n-1 index
df = pd.concat(df_list, ignore_index=True)
df.info()
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Rank                       782 non-null    int64  
 1   Country                    782 non-null    object 
 2   Score                      782 non-null    float64
 3   GDP per Capita             782 non-null    float64
 4   Family                     782 non-null    float64
 5   Life Expectancy            782 non-null    float64
 6   Fredom                     782 non-null    float64
 7   Generosity                 782 non-null    float64
 8   Perceptions of Corruption  781 non-null    float64
 9   Year                       782 non-null    object 
dtypes: float64(7), int64(1), object(2)
memory usage: 61.2+ KB


Unnamed: 0,Rank,Country,Score,GDP per Capita,Family,Life Expectancy,Fredom,Generosity,Perceptions of Corruption,Year
176,19,Ireland,6.907,1.48341,1.16157,0.81455,0.54008,0.44963,0.29754,2016
59,60,Poland,5.791,1.12555,1.27948,0.77903,0.53122,0.16759,0.04212,2015
518,49,Belize,5.956,0.807,1.101,0.474,0.593,0.183,0.089,2018
612,143,Madagascar,3.774,0.262,0.908,0.402,0.221,0.155,0.049,2018
780,155,Central African Republic,3.083,0.026,0.0,0.105,0.225,0.235,0.035,2019
