In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats

import matplotlib.pyplot as plt

In [2]:
# Directory holding the raw World Development Indicators CSV exports.
# Forward slashes are used throughout so the paths resolve on Windows, macOS
# and Linux alike (a backslash is a literal filename character on POSIX).
WDI_DATA_DIR = '../Datasets/World Development Indicators'

# Indicator values: one row per indicator, one column per year.
# skiprows=4 skips the lines that precede the real header row
# (presumably the export's preamble -- confirm against the raw file).
world_development_indicator_df = pd.read_csv(
    f'{WDI_DATA_DIR}/World_Development_Indicator.csv',
    skiprows=4,
)

# Indicator metadata: code, name, source note and source organization.
Metadata_World_Development_Indicator = pd.read_csv(
    f'{WDI_DATA_DIR}/Metadata_World_Development_Indicator.csv'
)

# World Development Indicator

## Data Understanding

In [24]:
# Preview the first rows to confirm the layout: identifying columns followed
# by one column per year.
world_development_indicator_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
0,United Arab Emirates,ARE,"International migrant stock, total",SM.POP.TOTL,2194.0,,,,,25981.0,...,8095126.0,,,,,,,,,
1,United Arab Emirates,ARE,"Children in employment, wage workers (% of chi...",SL.WAG.0714.ZS,,,,,,,...,,,,,,,,,,
2,United Arab Emirates,ARE,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,,,,,,,...,,1.64,2.462,2.236,2.331,4.294,3.105,2.873,2.151,
3,United Arab Emirates,ARE,"Unemployment, female (% of female labor force)...",SL.UEM.TOTL.FE.NE.ZS,,,,,,,...,,4.02,7.135,5.95,5.348,5.715,6.186,7.066,4.266,
4,United Arab Emirates,ARE,"Share of youth not in education, employment or...",SL.UEM.NEET.MA.ME.ZS,,,,,,,...,5.381,5.313,5.29,4.183,4.647,10.508,6.179,5.253,5.035,


In [25]:
# Summarize column dtypes and non-null counts to see how sparsely each year
# column is populated.
world_development_indicator_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1496 entries, 0 to 1495
Data columns (total 69 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    1496 non-null   object 
 1   Country Code    1496 non-null   object 
 2   Indicator Name  1496 non-null   object 
 3   Indicator Code  1496 non-null   object 
 4   1960            96 non-null     float64
 5   1961            115 non-null    float64
 6   1962            115 non-null    float64
 7   1963            115 non-null    float64
 8   1964            116 non-null    float64
 9   1965            115 non-null    float64
 10  1966            117 non-null    float64
 11  1967            119 non-null    float64
 12  1968            120 non-null    float64
 13  1969            122 non-null    float64
 14  1970            185 non-null    float64
 15  1971            218 non-null    float64
 16  1972            221 non-null    float64
 17  1973            249 non-null    f

In [26]:
# Print the number of distinct non-null values per column, with the column
# name left-aligned in a 30-character field.
for col_name in world_development_indicator_df.columns:
    distinct_count = world_development_indicator_df[col_name].nunique()
    print(f"{col_name:<30}{distinct_count}")

Country Name                  1
Country Code                  1
Indicator Name                1496
Indicator Code                1496
1960                          92
1961                          109
1962                          109
1963                          109
1964                          111
1965                          108
1966                          111
1967                          115
1968                          114
1969                          116
1970                          172
1971                          194
1972                          197
1973                          223
1974                          237
1975                          282
1976                          276
1977                          292
1978                          315
1979                          300
1980                          331
1981                          321
1982                          327
1983                          339
1984                          334
1985             

In [27]:
# Print the frequency table of every column, prefixed by the column name
# left-aligned in a 30-character field. The output is long: one table per column.
for col_name in world_development_indicator_df.columns:
    frequency_table = world_development_indicator_df[col_name].value_counts()
    print(f"{col_name:<30}{frequency_table}")

Country Name                  Country Name
United Arab Emirates    1496
Name: count, dtype: int64
Country Code                  Country Code
ARE    1496
Name: count, dtype: int64
Indicator Name                Indicator Name
International migrant stock, total                                                     1
Primary school age children out-of-school (%)                                          1
Electricity production from nuclear sources (% of total)                               1
Electricity production from renewable sources, excluding hydroelectric (% of total)    1
Energy imports, net (% of energy use)                                                  1
                                                                                      ..
GNI per capita (current LCU)                                                           1
GNI per capita, PPP (current international $)                                          1
Gross savings (% of GNI)                                        

In [28]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
world_development_indicator_df.duplicated().sum()

0

In [29]:
# Number of missing values in each column.
world_development_indicator_df.isna().sum()

Country Name         0
Country Code         0
Indicator Name       0
Indicator Code       0
1960              1400
                  ... 
2020               704
2021               833
2022               964
2023              1182
Unnamed: 68       1496
Length: 69, dtype: int64

## Data Cleaning

In [3]:
# Keep only the indicator label and the year columns: drop the country
# name/code, the indicator code and 'Unnamed: 68'.
#
# The result is reassigned instead of dropped in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell is harmless
# rather than raising KeyError.
# NOTE(review): errors='ignore' also hides a genuinely missing column if the
# source schema changes -- remove it if a strict check is preferred.
world_development_indicator_df = world_development_indicator_df.drop(
    columns=['Country Code', 'Indicator Code', 'Unnamed: 68', 'Country Name'],
    errors='ignore',
)

In [4]:
# Replace every missing value with 0.
# NOTE(review): after this step a year with no reported value is
# indistinguishable from a genuine zero -- confirm downstream consumers
# expect that.
world_development_indicator_df = world_development_indicator_df.fillna(0)

In [8]:
# Reshape from "one row per indicator, one column per year" to
# "one row per year, one column per indicator":
#   1. melt    -> long format (Indicator Name, Year, Value); "Year" holds the
#                 former column labels.
#   2. pivot   -> indicators become columns, indexed by Year.
#   3. reset_index -> Year becomes an ordinary column again.
world_development_indicator_pivot_df = (
    world_development_indicator_df
    .melt(id_vars=["Indicator Name"], var_name="Year", value_name="Value")
    .pivot(index='Year', columns='Indicator Name', values='Value')
    .reset_index()
)

In [10]:
# world_development_indicator_pivot_df.to_csv('../Cleaned Datasets\\World Development Indicators\\World_Development_Indicator_Pivoted.csv', index=False)

In [None]:
# world_development_indicator_df.to_csv('../Cleaned Datasets\\World Development Indicators\\World_Development_Indicator.csv', index=False)

# Metadata World Development Indicator

## Data Understanding

In [33]:
# Preview the first rows of the indicator metadata table.
Metadata_World_Development_Indicator.head()

Unnamed: 0,INDICATOR_CODE,INDICATOR_NAME,SOURCE_NOTE,SOURCE_ORGANIZATION,Unnamed: 4
0,SM.POP.TOTL,"International migrant stock, total",International migrant stock is the number of p...,"United Nations Population Division, Trends in ...",
1,SL.WAG.0714.ZS,"Children in employment, wage workers (% of chi...",Wage workers (also known as employees) are peo...,Understanding Children's Work project based on...,
2,SL.UEM.TOTL.NE.ZS,"Unemployment, total (% of total labor force) (...",Unemployment refers to the share of the labor ...,International Labour Organization. “Labour For...,
3,SL.UEM.TOTL.FE.NE.ZS,"Unemployment, female (% of female labor force)...",Unemployment refers to the share of the labor ...,International Labour Organization. “Labour For...,
4,SL.UEM.NEET.MA.ME.ZS,"Share of youth not in education, employment or...","Share of youth not in education, employment or...",International Labour Organization. “ILO Modell...,


In [34]:
# Summarize the metadata table's column dtypes and non-null counts.
Metadata_World_Development_Indicator.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1496 entries, 0 to 1495
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   INDICATOR_CODE       1496 non-null   object 
 1   INDICATOR_NAME       1496 non-null   object 
 2   SOURCE_NOTE          1496 non-null   object 
 3   SOURCE_ORGANIZATION  1496 non-null   object 
 4   Unnamed: 4           0 non-null      float64
dtypes: float64(1), object(4)
memory usage: 58.6+ KB


# Pumping Data into MongoDB

In [35]:
# from pymongo import MongoClient
# from dotenv import load_dotenv
# import os

# # Connect to MongoDB
# load_dotenv()

# client = MongoClient(os.environ['MONGODB_URI_1'])  # Uses the connection string from your .env file
# db = client['tourism_db']

# # Convert each DataFrame to a list of per-row dictionaries (one dict per record)
# world_development_indicator_df = world_development_indicator_df.to_dict('records')
# Metadata_World_Development_Indicator = Metadata_World_Development_Indicator.to_dict('records')

# # Insert into MongoDB collections
# db['world_development_indicator_df'].insert_many(world_development_indicator_df)
# db['Metadata_World_Development_Indicator'].insert_many(Metadata_World_Development_Indicator)

# print("Data pumped into MongoDB successfully!")