# First exploration of selected files

In [229]:
import pandas as pd
import numpy as np
import re

In [230]:
# import dtale

We obtain the csv file with our categorization of the files in our dataset.

In [231]:
file_categorization = pd.read_csv("file-categorization.csv", sep = ';')

In [232]:
by_disorder_countries = {}
not_disorder_specific_countries = {}
survey_countries = {}

for i in range (len(file_categorization)):
    row = file_categorization.iloc[i]
    filename, category = row['filename'], row['category']
    if not (category == "unused" or category == "unsure"):
        if category == "by-disorder":
            df = pd.read_csv("mental-health-dataset/" + filename)
            by_disorder_countries[filename] = sorted(list(set(list(df['Entity']))))
        elif category == "not-disorder-specific":
            not_disorder_specific_countries[filename] = sorted(list(set(list(df['Entity']))))
        elif category == "survey":
            survey_countries[filename] = sorted(list(set(list(df['Entity']))))

In [233]:
all_entities = []

### Checking for cohesive country list in the category `by-disorder`

In [234]:
for file in by_disorder_countries:
    country_list = by_disorder_countries[file]
    all_entities = all_entities + country_list
    print(country_list)
    

['Afghanistan', 'African Region (WHO)', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Asia & Pacific (WB)', 'East Timor', 'Eastern Mediterranean Region (WHO)', 'Ecuador', 'Egypt', 'El Salvador', 'England', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Europe & Central Asia (WB)', 'European Region (WHO)', 'Fiji', 'Finland', 'France', 'G20', 'G

Our lists of countries do not match, meaning that we do not have all of the information for all of the countries (we'll have gaps/holes).

### Checking for cohesive country list in the category `not-disorder-specific`

In [235]:
for file in not_disorder_specific_countries:
    country_list = not_disorder_specific_countries[file]
    all_entities = all_entities + country_list
    print(country_list)

['Abkhazia', 'Afghanistan', 'Africa', 'Africa (IHME GBD)', 'Africa (UN)', 'Akrotiri and Dhekelia', 'Aland Islands', 'Albania', 'Algeria', 'America (IHME GBD)', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia', 'Asia (IHME GBD)', 'Asia (UN)', 'Australia', 'Austria', 'Austria-Hungary', 'Azerbaijan', 'Baden', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Bavaria', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Channel Islands', 'Chile', 'China', 'Christmas Island', 'Cocos Islands', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba'

Same lists!

### Checking for cohesive country list in the category `survey`

In [236]:
for file in survey_countries:
    country_list = survey_countries[file]
    all_entities = all_entities + country_list
    print(country_list)

['Abkhazia', 'Afghanistan', 'Africa', 'Africa (IHME GBD)', 'Africa (UN)', 'Akrotiri and Dhekelia', 'Aland Islands', 'Albania', 'Algeria', 'America (IHME GBD)', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia', 'Asia (IHME GBD)', 'Asia (UN)', 'Australia', 'Austria', 'Austria-Hungary', 'Azerbaijan', 'Baden', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Bavaria', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Channel Islands', 'Chile', 'China', 'Christmas Island', 'Cocos Islands', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba'

Same lists!

## Dividing our different entities

We can see from our lists, that we not only have information about each country. There's records for entire continents, for continents but from different sources... Let's try and assess exactly what entites we have:

First, let's build a dataframe with all of the entities we have.

In [237]:
all_entities = sorted(list(set(all_entities)))
entities = pd.DataFrame({'Entity': all_entities})

In [238]:
entities

Unnamed: 0,Entity
0,Abkhazia
1,Afghanistan
2,Africa
3,Africa (IHME GBD)
4,Africa (UN)
...,...
329,Yemen People's Republic
330,Yugoslavia
331,Zambia
332,Zanzibar


Now let's create new columns with the additional information we have seen our entities contain:

In [239]:
entities[['name', 'source']] = entities['Entity'].str.split('(', n=1, expand=True)
entities['source'] = entities['source'].str.replace(')', '')
entities['name'] = entities['name'].str.strip()
entities['source'] = entities['source'].str.strip()
entities

Unnamed: 0,Entity,name,source
0,Abkhazia,Abkhazia,
1,Afghanistan,Afghanistan,
2,Africa,Africa,
3,Africa (IHME GBD),Africa,IHME GBD
4,Africa (UN),Africa,UN
...,...,...,...
329,Yemen People's Republic,Yemen People's Republic,
330,Yugoslavia,Yugoslavia,
331,Zambia,Zambia,
332,Zanzibar,Zanzibar,


Let's try and the records we have that are NOT countries

In [240]:
entities['type'] = 'country'

In [241]:
entities.loc[~entities['source'].isnull(), 'type'] = 'non-country'

In [242]:
entities.loc[entities['type'] == 'non-country', :]

Unnamed: 0,Entity,name,source,type
3,Africa (IHME GBD),Africa,IHME GBD,non-country
4,Africa (UN),Africa,UN,non-country
5,African Region (WHO),African Region,WHO,non-country
10,America (IHME GBD),America,IHME GBD,non-country
21,Asia (IHME GBD),Asia,IHME GBD,non-country
22,Asia (UN),Asia,UN,non-country
81,East Asia & Pacific (WB),East Asia & Pacific,WB,non-country
84,Eastern Mediterranean Region (WHO),Eastern Mediterranean Region,WHO,non-country
94,Ethiopia (former),Ethiopia,former,non-country
96,Europe & Central Asia (WB),Europe & Central Asia,WB,non-country


In [243]:
entities.loc[entities['source'] == "former", "type"] = "former-country"
entities.loc[entities['source'] == "country", "type"] = "country"
entities.loc[entities['source'].fillna("-").str.contains("part"), "type"] = "country"
entities.loc[entities['type'] == 'non-country', :]

Unnamed: 0,Entity,name,source,type
3,Africa (IHME GBD),Africa,IHME GBD,non-country
4,Africa (UN),Africa,UN,non-country
5,African Region (WHO),African Region,WHO,non-country
10,America (IHME GBD),America,IHME GBD,non-country
21,Asia (IHME GBD),Asia,IHME GBD,non-country
22,Asia (UN),Asia,UN,non-country
81,East Asia & Pacific (WB),East Asia & Pacific,WB,non-country
84,Eastern Mediterranean Region (WHO),Eastern Mediterranean Region,WHO,non-country
96,Europe & Central Asia (WB),Europe & Central Asia,WB,non-country
97,Europe (IHME GBD),Europe,IHME GBD,non-country


In [244]:
continents = ['Africa', 'Asia', 'Europe', 'America', 'Oceania']
entities.loc[entities['name'].isin(continents), "type"] = "continent"
entities.loc[entities['name'].str.contains("Region"), "type"] = "region"
entities.loc[entities['name'].str.contains("Income"), "type"] = "income-classification"

In [247]:
display(entities[entities['type'] == "non-country"])
display(entities[entities['type'] == "continent"])
display(entities[entities['type'] == "region"])
display(entities[entities['type'] == "income-classification"])

Unnamed: 0,Entity,name,source,type
81,East Asia & Pacific (WB),East Asia & Pacific,WB,non-country
96,Europe & Central Asia (WB),Europe & Central Asia,WB,non-country
100,European Union (27),European Union,27,non-country
157,Latin America & Caribbean (WB),Latin America & Caribbean,WB,non-country
158,Latin America and the Caribbean (UN),Latin America and the Caribbean,UN,non-country
187,Middle East & North Africa (WB),Middle East & North Africa,WB,non-country
213,North America (WB),North America,WB,non-country
216,Northern America (UN),Northern America,UN,non-country
274,South Asia (WB),South Asia,WB,non-country
282,Sub-Saharan Africa (WB),Sub-Saharan Africa,WB,non-country


Unnamed: 0,Entity,name,source,type
2,Africa,Africa,,continent
3,Africa (IHME GBD),Africa,IHME GBD,continent
4,Africa (UN),Africa,UN,continent
10,America (IHME GBD),America,IHME GBD,continent
20,Asia,Asia,,continent
21,Asia (IHME GBD),Asia,IHME GBD,continent
22,Asia (UN),Asia,UN,continent
95,Europe,Europe,,continent
97,Europe (IHME GBD),Europe,IHME GBD,continent
98,Europe (UN),Europe,UN,continent


Unnamed: 0,Entity,name,source,type
5,African Region (WHO),African Region,WHO,region
84,Eastern Mediterranean Region (WHO),Eastern Mediterranean Region,WHO,region
99,European Region (WHO),European Region,WHO,region
239,Region of the Americas (WHO),Region of the Americas,WHO,region
279,South-East Asia Region (WHO),South-East Asia Region,WHO,region
323,Western Pacific Region (WHO),Western Pacific Region,WHO,region


Unnamed: 0,Entity,name,source,type
131,High Income (WB),High Income,WB,income-classification
166,Low Income (WB),Low Income,WB,income-classification
168,Lower Middle Income (WB),Lower Middle Income,WB,income-classification
188,Middle Income (WB),Middle Income,WB,income-classification


In [250]:
import dtale
dtale.show(entities[entities['type'] == "country"])



2024-09-03 13:33:08,360 - ERROR    - Exception occurred while processing request: object of type 'NoneType' has no len()
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/dtale/views.py", line 120, in _handle_exceptions
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/dtale/views.py", line 1587, in get_processes
    [_load_process(data_id) for data_id in global_state.keys()],
     ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/dtale/views.py", line 1572, in _load_process
    rows=len(data),
         ^^^^^^^^^
TypeError: object of type 'NoneType' has no len()
