In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
DATA_FOLDER = '../Data/'

ETHNICITY_PATH = DATA_FOLDER + 'ethnicities_data.tsv'

In [3]:
ethnicities_original = pd.read_csv(ETHNICITY_PATH, sep='\t',  
                               header=0, names=["Ethnicity ID", "Ethnicity"])

In [4]:
ethnicities = ethnicities_original.copy()
ethnicities = ethnicities[ethnicities['Ethnicity'].notna()]

In [5]:
ethnicities.head()

Unnamed: 0,Ethnicity ID,Ethnicity
1,/m/0x67,African Americans
2,/m/064b9n,Omaha people
3,/m/041rx,Jewish people
4,/m/033tf_,Irish Americans
5,/m/04gfy7,Indian Americans


In [6]:
from collections import defaultdict

asian_ethnicities = ['Aceh','Afghans in India','Agrawal','Asian people',
                     'Bengali Brahmins','Bengali Hindus','Bengali people',
                     'Bhutia','Bihari people','Brahmin caste','Bunt',
                     'Buryats','Chaliyan','Chettiar','Chinese Filipino',
                     'Chinese Indonesians','Chinese Singaporeans','Chitrapur Saraswat Brahmin',
                     'Dalit','Dogra','Ezhava','Filipino people','Filipino people of Spanish ancestry',
                     'Gaud Saraswat Brahmin','Gin people','Goans','Gujarati people','Han Chinese people',
                     'Hazaras','Hindkowans','Hindu','Hongkongers','Ilocano','Indian diaspora','Indian diaspora in France',
                     'Indians','Jaat','Japanese people','Jatt Sikh','Javanese',
                     'Kannada people','Kanyakubja Brahmins','Kapampangan people',
                     'Karnataka Brahmins','Kashmiri Pandit','Kashmiri people',
                     'Kayastha','Khatri','Konkani people','Koreans','Lao people',
                     'Malayali','Malaysian Chinese','Manchu','Marathi people',
                     'Marwari people','Mohyal','Mudaliar','Muhajir','Nair','Niyogi',
                     'Pakistanis','Parsi','Punjabi diaspora','Punjabis','Rohilla',
                     'Romani people','Romani people in Spain','Ryukyuan people','Sherpa',
                     'Sikh','Sindhis','Sinhalese','Sri Lankan Tamil diaspora','Sri Lankan Tamils',
                     'Taiwanese','Tamil','Tamil Brahmin','Telugu Brahmins','Telugu people',
                     'Thai Chinese people','Thai people','Tibetan people',
                     'Tulu people','Vietnamese people','Zhuang people','rajput']
black_ethnicities = ['African Americans','Aboriginal Australians','African people','Afro Trinidadians and Tobagonians',
                     'Akan people','Black British','Black Canadians','Black Hispanic and Latino Americans',
                     'Black Irish','Black people','Cubans','Dinka people','Ghanaians','Indigenous Australians',
                     'Kikuyu','Mandinka people','Sierra Leone Creole people',
                     'Sierra Leoneans in the United Kingdom','Somalis','Wolof people',
                     'Xhosa people','Yoruba people']
white_ethnicities = ['Acadians','Afrikaners','Albanians','American Jews',
                    'Americans','Anglo-Irish people','Argentines',
                    'Ashkenazi Jews','Australian American','Australians',
                    'Austrians','Austrians in the United Kingdom','Basque people',
                    'Belarusians','Belgians','Bosniaks','Bosnians',
                    'Brazilians','British Americans','British Jews','British people',
                    'Bulgarians','Cajun','Canadian Americans','Canadian Australian',
                    'Canadians in the United Kingdom','Castilians','Catalan people',
                    'Chileans','Chileans in the United Kingdom','Colombians',
                    'Corsicans','Criollo people','Croats','Czechs',
                    'Dalmatian Italians','Danes','Dene','Dutch','English Americans',
                    'English Australian','English Canadians','English people','Estonians',
                    'Finns','French','French Canadians','French-speaking Quebecer',
                    'Galicians','Georgians','Germans','Gibraltarian people','Greek Cypriots',
                    'Greeks in South Africa','Greeks in the United Kingdom',
                    'Hispanic and Latino Americans','Hondurans','Hungarians','Hutsuls',
                    'Icelanders','Irish Americans','Irish Australian','Irish Canadians',
                    'Irish migration to Great Britain','Irish people','Israeli Jews',
                    'Israelis','Italian immigration to Mexico','Italians','Italians in the United Kingdom',
                    'Kiwi','Latin American migration to the United Kingdom','Latino',
                    'Latvians','Lebanese','Lebanese immigration to Mexico','Lebanese people in the United Kingdom',
                    'Lithuanian Jews','Manx people','Mexicans','Norwegians','Peruvians in the United Kingdom',
                    'Poles','Poles in the United Kingdom','Portuguese','Quebeckers',
                    'Romanians','Romanichal','Russians','Scotch-Irish Americans','Scottish American',
                    'Scottish Australian','Scottish Canadians','Scottish people','Sephardi Jews',
                    'Serbs in North Macedonia','Serbs in the United Kingdom','Serbs of Bosnia and Herzegovina',
                    'Serbs of Croatia','Slavs','Slovaks','Slovenes','Soviet people',
                    'Spaniards','Spanish immigration to Mexico','Spanish people of Filipino ancestry',
                    'Swedes','Swedish-speaking population of Finland','Swiss','Sámi peoples',
                    'Tatars','Tejano','Transylvanian Saxons','Ukrainians','Uruguayans',
                    'Venezuelans','Welsh American','Welsh people','White Africans of European ancestry',
                    'White Americans','White British','White Latin American','White South Africans',
                    'Yugoslavs','white people']
mixed_ethnicities = ['Afro-Asians','Afro-Cuban',
                     'Afro-Guyanese','Albanian American','Anglo-Celtic Australians',
                     'Anglo-Indian people','Arab Americans','Arab Mexican',
                     'Armenian American','Aromanians','Asian Americans','Austrian Americans',
                     'Bahamian Americans','Baltic Russians','Bangladeshi American','Barbadian American',
                     'Bolivian American','Brazilian Americans','British African Caribbean people',
                     'British Asian','British Chinese','British Indian people','British Jamaican',
                     'British Nigerian','British Pakistanis','Bulgarian Canadians',
                     'Burmese Americans','Cambodian Americans','Chilean American',
                     'Chinese Americans','Chinese Canadians','Chinese Jamaicans','Colombian Americans',
                     'Colombian Australian','Croatian Americans','Croatian Australians',
                     'Croatian Canadians','Cuban American','Czech Americans','Czech Australians',
                     'Danish Americans','Danish Canadians','Dominican Americans','Dutch Americans',
                     'Dutch Australian','Dutch Canadians','Ecuadorian Americans','Eurasian','European Americans',
                     'Filipino Americans','Filipino Australians','Filipino mestizo','Finnish Americans',
                     'French Americans','French Chilean','German Americans','German Brazilians',
                     'German Canadians','Ghanaian American','Greek American','Greek Australian',
                     'Greek Canadians','Guyanese American','Haitian American','Hispanic','Hmong American',
                     'Honduran Americans','Hungarian Americans','Indian Americans','Indian Australian',
                     'Indo Caribbeans','Indo-Canadians','Indo-Guyanese','Indonesian Americans','Iranian Americans',
                     'Iranian Canadians','Iraqi Americans','Israeli Americans','Italian Americans',
                     'Italian Australian','Italian Brazilians','Italian Canadians','Japanese Americans',
                     'Japanese Brazilians','Korean American','Koryo-saram','Latvian American','Lebanese Americans',
                     'Lithuanian American','Louisiana Creole people','Lumbee','Luxembourgish Americans',
                     'Malagasy people','Mangaloreans','Mexican Americans','Moroccan Americans',
                     'Multiracial American','Métis','Nepali Indian','Nigerian American',
                     'Norwegian Americans','Pacific Islander Americans','Pakistani American','Pakistani Canadians',
                     'Palestinian American','Panamanian American','Persians','Polish Americans',
                     'Polish Australian','Polish Canadians','Portuguese Americans','Puerto Ricans',
                     'Romanian Americans','Russian Americans','Russian Canadians','Rusyn American',
                     'Salvadoran Americans','Samoan American','Samoan New Zealanders','Scandinavian Americans',
                     'Serbian Americans','Serbian Australian','Serbian Canadians','Sicilian Americans',
                     'Slovak Americans','Slovene American','South African American','Spanish American',
                     'Sri Lankan Americans','Stateside Puerto Ricans','Sudanese Australians',
                     'Swedish Americans','Swedish Australian','Swedish Canadians','Syrian Americans',
                     'Taiwanese Americans','Tamil Americans','Thai Americans','Turkish Americans',
                     'Ukrainian Americans','Ukrainian Canadians','Venezuelan Americans','Vietnamese Americans',
                     'Welsh Italians','multiracial people','peoples of the Caucasus']

list_ethnicities = np.unique(ethnicities['Ethnicity'])
ethnic_groups = {}  
ethnic_groups.update(dict.fromkeys(asian_ethnicities, 'Asian, Middle East and Tribes'))
ethnic_groups.update(dict.fromkeys(black_ethnicities, 'Black, Caribbean or African'))
ethnic_groups.update(dict.fromkeys(white_ethnicities, 'White'))
ethnic_groups.update(dict.fromkeys(mixed_ethnicities,'Mixed or multiple ethnic groups'))
ethnic_groups = defaultdict(lambda: 'Asian, Middle East and Tribes', ethnic_groups)

for ethnicity in list_ethnicities:
    print(ethnic_groups[ethnicity])

Black, Caribbean or African
White
Asian, Middle East and Tribes
Asian, Middle East and Tribes
Black, Caribbean or African
Black, Caribbean or African
White
Black, Caribbean or African
Mixed or multiple ethnic groups
Mixed or multiple ethnic groups
Mixed or multiple ethnic groups
Asian, Middle East and Tribes
Black, Caribbean or African
Mixed or multiple ethnic groups
White
White
White
Mixed or multiple ethnic groups
Mixed or multiple ethnic groups
White
Asian, Middle East and Tribes
Mixed or multiple ethnic groups
Mixed or multiple ethnic groups
Asian, Middle East and Tribes
Asian, Middle East and Tribes
White
Mixed or multiple ethnic groups
Asian, Middle East and Tribes
Asian, Middle East and Tribes
Asian, Middle East and Tribes
Mixed or multiple ethnic groups
White
Mixed or multiple ethnic groups
Asian, Middle East and Tribes
Asian, Middle East and Tribes
White
White
Mixed or multiple ethnic groups
White
White
Asian, Middle East and Tribes
Asian, Middle East and Tribes
Mixed or multi

In [7]:
# check if file already exists
if False:#os.path.exists(os.path.join(os.getcwd(), '../Data/','ethnicity_group_data.tsv')):
      print('File ethnicity_group_data.tsv already exists')
else:
      # get labels and save to csv file
      ethnicity_group_data = pd.DataFrame(ethnic_groups.items(),columns=['Ethnicity','Ethnic Group'])
      ethnicity_group_data.to_csv('../Data/ethnicity_group_data.tsv', sep='\t', index=False)
      print('Saved file ethnicities_data.tsv')

Saved file ethnicities_data.tsv


In [8]:
ethnicity_group_data.head()

Unnamed: 0,Ethnicity,Ethnic Group
0,Aceh,"Asian, Middle East and Tribes"
1,Afghans in India,"Asian, Middle East and Tribes"
2,Agrawal,"Asian, Middle East and Tribes"
3,Asian people,"Asian, Middle East and Tribes"
4,Bengali Brahmins,"Asian, Middle East and Tribes"
