For those (like me) who cant remember how to start the dbs - 

For mac - Open a terminal window and run brew services start mongodb-community@6.0.
To begin using MongoDB open a new terminal window and run mongosh.

For Windows - To start the MongoDB server open a new command line window and run "C:\Program Files\MongoDB\Server\6.0\bin\mongod.exe" --dbpath="c:\data\db".
To begin using MongoDB open a new command line window and run mongosh.exe, or run mongosh.

In [1]:
import pandas as pd
from pymongo import MongoClient
from pprint import pprint
import seaborn as sns
import matplotlib.pyplot as plt

# connect to mdb
client = MongoClient("mongodb://localhost:27017/")


In [2]:
db = client["trust_database"]  # Database name
collection = db["country_trust"]  # Collection name

In [3]:
#loading "cleaned_trust_rate.csv"
csv_file = "../Resources/cleaned_trust_rate.csv"
df = pd.read_csv(csv_file)

In [4]:
# Load the population data CSV file
population_data = pd.read_csv("../Resources/UN_population_data.csv", encoding="utf-8")

# Checked the column names and their data types
print(population_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720210 entries, 0 to 720209
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   SortOrder    468660 non-null  float64
 1   LocID        720210 non-null  int64  
 2   Notes        113880 non-null  object 
 3   ISO3_code    346020 non-null  object 
 4   ISO2_code    344560 non-null  object 
 5   SDMX_code    416100 non-null  float64
 6   LocTypeID    468660 non-null  float64
 7   LocTypeName  468660 non-null  object 
 8   ParentID     468660 non-null  float64
 9   Location     720210 non-null  object 
 10  VarID        720210 non-null  int64  
 11  Variant      720210 non-null  object 
 12  Time         720210 non-null  int64  
 13  MidPeriod    720210 non-null  float64
 14  PopMale      720210 non-null  float64
 15  PopFemale    720210 non-null  float64
 16  PopTotal     720210 non-null  float64
 17  PopDensity   720210 non-null  float64
dtypes: float64(9), int64(3),

  population_data = pd.read_csv("../Resources/UN_population_data.csv", encoding="utf-8")


In [5]:
# Filtered for 2020 and "Medium" projection scenario
population_2020 = population_data[
    (population_data["Time"] == 2020) & 
    (population_data["Variant"] == "Medium")
]

# Verify the filtered data
print(population_2020.head())
print(population_2020.info())

      SortOrder  LocID Notes ISO3_code ISO2_code  SDMX_code  LocTypeID  \
70          NaN   5507   NaN       NaN       NaN        NaN        NaN   
1145        NaN   5512   NaN       NaN       NaN        NaN        NaN   
2220        NaN   5508   NaN       NaN       NaN        NaN        NaN   
3295        NaN   5509   NaN       NaN       NaN        NaN        NaN   
4370        NaN   5510   NaN       NaN       NaN        NaN        NaN   

     LocTypeName  ParentID                           Location  VarID Variant  \
70           NaN       NaN  ADB region: Central and West Asia      2  Medium   
1145         NaN       NaN              ADB region: Developed      2  Medium   
2220         NaN       NaN              ADB region: East Asia      2  Medium   
3295         NaN       NaN             ADB region: South Asia      2  Medium   
4370         NaN       NaN         ADB region: Southeast Asia      2  Medium   

      Time  MidPeriod     PopMale   PopFemale     PopTotal  PopDensity  
7

In [6]:
# Keeping only relevant columns
population_2020 = population_2020[["Location", "PopTotal"]]

# Renamed columns for clarity
population_2020 = population_2020.rename(columns={
    "Location": "country",
    "PopTotal": "population"
})

# Converted population from thousands to actual values
population_2020["population"] = population_2020["population"] * 1000

# Verify
print(population_2020.head())

                                country    population
70    ADB region: Central and West Asia  3.673709e+08
1145              ADB region: Developed  1.571182e+08
2220              ADB region: East Asia  1.512409e+09
3295             ADB region: South Asia  1.621716e+09
4370         ADB region: Southeast Asia  6.749885e+08


In [7]:
# Drop rows that refer to non-country locations if needed
non_countries = ["World", "Africa", "Asia", "Europe", "Oceania", "Latin America and the Caribbean", 
                 "Sub-Saharan Africa", "Northern America"]
population_2020 = population_2020[~population_2020["country"].isin(non_countries)]

# Strip leading/trailing whitespace from country names
population_2020["country"] = population_2020["country"].str.strip()

In [8]:
# converting numeric fields to floats and rounding to 2nd decimal
trust_fields= [
    "neighbourhood", "government", "scientist",
    "journalist", "doctor_and_nurses", "philantropist", "traditional_healers"
]
df[trust_fields] = df[trust_fields].apply(pd.to_numeric, errors="coerce").round(2)

In [9]:
df["country"] = df["country"].str.strip()
print(df["country"].head())

0      Albania
1      Algeria
2    Argentina
3    Australia
4      Austria
Name: country, dtype: object


In [10]:
merged_data = pd.merge(df, population_2020, on="country", how="left")
# Check for missing population data
missing_population = merged_data[merged_data["population"].isnull()]
print("Countries with missing population data:")
print(missing_population["country"])

Countries with missing population data:
9                 Bolivia
10     Bosnia Herzegovina
20      Congo Brazzaville
24         Czech Republic
40              Hong Kong
44                   Iran
49            Ivory Coast
54                 Kosovo
56                   Laos
65                Moldova
84                 Russia
91            South Korea
96                 Taiwan
98               Tanzania
101                Turkey
106         United States
109             Venezuela
110               Vietnam
Name: country, dtype: object


In [11]:
merged_data.to_csv("../Resources/trust_data_with_population.csv", index=False)

In [12]:
# sorting by population in decending order
merged_data = merged_data.sort_values(by="population", ascending=False)

In [13]:
# Extract the top 50 countries
top_50 = merged_data.head(50)

# Extract the bottom 59 countries
bottom_50 = merged_data.tail(50)

In [15]:
# Finding average trust rates for top 50 and bottom 50 countries
top_50_avg = top_50[trust_fields].mean()
bottom_50_avg = bottom_50[trust_fields].mean()

print("Top 50 Average Trust Rates:")
print(top_50_avg)

print("\nBottom 50 Average Trust Rates:")
print(bottom_50_avg)

Top 50 Average Trust Rates:
neighbourhood          72.4558
government             58.4298
scientist              79.7542
journalist             62.4642
doctor_and_nurses      83.9444
philantropist          69.7628
traditional_healers    43.2936
dtype: float64

Bottom 50 Average Trust Rates:
neighbourhood          74.8768
government             55.2438
scientist              78.5320
journalist             57.2936
doctor_and_nurses      84.6708
philantropist          70.8364
traditional_healers    45.5858
dtype: float64


In [None]:
comparison_df = pd.DataFrame({
    "Top 50 (Avg)": top_50_avg,
    "Bottom 50 (Avg)": bottom_50_avg
})

# Plot the bar chart
comparison_df.plot(kind="bar", figsize=(10, 6), color=["skyblue", "salmon"])
plt.title("Comparison of Average Trust Rates")
plt.ylabel("Trust Rate (%)")
plt.xlabel("Trust Categories")
plt.xticks(rotation=45)
plt.legend(title="Country Group")
plt.show()