In [1]:
import pandas as pd
import numpy as np
import re
from functools import reduce
import seaborn as sns
import missingno as msno
%matplotlib inline

from dask.dataframe import from_pandas
from dataprep.clean import clean_country

# functions to be used

In [2]:
from dataprep.clean import clean_country
def country_clean(df):

    # https://www.freethesaurus.com/
    # https://en.wikipedia.org/wiki/list_of_countries_and_dependencies_and_their_capitals_in_native_languages
    # https://en.wikipedia.org/wiki/list_of_alternative_country_names#
    #https://unstats.un.org/unsd/geoinfo/ungegn/docs/26th-gegn-docs/wp/wp54_ungegn%20wg%20country%20names%20document%202011.pdf
    # mesh terms ncbi

    # abkhazia = georgia 
    abkhazia = ['georgia','abkhazia','nagorno karabakh republic','nagorno karabakh','nagorno-karabakh','republic of artsakh','artsakh',
               'abkhaziya','abkhasian','abkhazian','sakartvelo','sakartvelos','sakartvelos respublika','abkhaz','abkhas',
               'republic of abkhazia','aphsny axwynthkharra','respublika abkhaziya','autonomous republic of abkhazia']
    afghanistan = ['afghanistan','kabul','afghanestan','islamic republic of afghanistan','da afġānistān islāmī jumhoryat',
                   'jomhūrīyyeh eslāmīyyeh afġānestān','islamic emirate of afghanistan','da afġānistān islāmī amārāt',
                  'islamic republic of afghanistan','da afġānistān islāmī jumhoryat','jomhūrīyyeh eslāmīyyeh afġānestān',
      'islamic emirate of afghanistan','da afġānistān islāmī amārāt']
    albania = ['albania','tirana','shqipëria','albanian','republic of albania','republika e shqipërisë',' arnavutluk','arbanon',
               "people's socialist republic of albania",'republika popullore socialiste e shqipërisë','albanian kingdom',
               'mbretnija shqiptare','durazzo','durres','republic of albania','republika e shqipërisë','arnavutluk','arbanon',"people's socialist republic of albania",
      'republika popullore socialiste e shqipërisë','albanian kingdom','mbretnija shqiptare']
    algeria = ['algeria','algerie','democratic and popular republic of algeria','algiers','annaba','oran','blida','timgad','dzayer',
               "people's democratic republic of algeria",'al-jazā’ir',"al jumhuriyya al jazaa'iriyya ad-dīmuqrāţiyya ash sha'biyya",
               'algerie','algiers',"people's democratic republic of algeria",'al-jazā’ir',
      "al jumhuriyya al jazaa'iriyya ad-dīmuqrāţiyya ash sha'biyya"]
    andorra = ['andorra','andorra la vella','principality of andorra',"principat d'andorra",'principality of the valleys of andorra']
    angola = ['angola','luanda','ngola','lwanda','republic of angola', 'república de angola']
    antigua_and_barbuda  = ['antigua and barbuda','antigua & barbu','wadadli',"saint john's","st. john's",'antigua','barbuda','barbuda and antigua','st. johns']
    argentina = ['argentina','buenos aires','rosario','argentine republic','la argentina','the argentine','argentine nation','united provinces of the río de la plata',
      'argentine confederation']
    armenia = ['armenia','yerevan','hayastán','armenian ssr','armenian s.s.r.',
              'republic of armenia','hayastan','հայաստանի հանրապետություն','hayastani hanrapetut’yun']
    australia = ['australia','canberra','norfolk island',"norf'k ailen",'kingston','christmas island','canton and enderbury islands',
                 'australian capital territory','new south wales','northern territory','queensland','south australia',
                 'tasmania','victoria','western australia','commonwealth of australia', 'new holland']
    austria = ['austria','österreich','wien','vienna','republic of austria','republik österreich','österreich',
      'federal republic of austria','cisleithania','cisleithanien']
    azerbaijan = ['azerbaijan','azərbaycan','bakı','baku','azerbaijan ssr','azerbaijan s.s.r.',
                 'republic of azerbaijan','azərbaycan respublikası','azerbaijan soviet socialist republic','azerbaijan ssr',
      'азәрбајҹан совет сосиалист республикасы']
    the_bahamas = ['the bahamas','nassau','bahamas','commonwealth of the bahamas','commonwealth of the bahama islands','bahama islands']
    bahrain = ['bahrain','manama','al-baḥrayn','al-manāmah','kingdom of bahrain', 'mamlakat al-baḥrayn', 'state of bahrain']
    bangladesh = ['bangladesh','dhaka','dhaka',"people's republic of bangladesh",'বাংলাদেশ','bangladesh','গণপ্রজাতন্ত্রী বাংলাদেশ','gônôprôjatôntri bangladesh',
      'বাংলা','bangla','বঙ্গ','bônggô','গঙ্গাঋদ্ধি','gônggarriddhi','bengal presidency','পূর্ববঙ্গ','purbô bônggô',
      'east bengal','পূর্ব পাকিস্তান','purbô pakistan','east pakistan']
    barbados = ['barbados','bridgetown','bim', 'bimshire', 'little england', 'the rock']
    belarus = ['belarus','minsk','bessr','bielaruś','belorussia','byelarus','belorussian ssr','belorussian s.s.r.','byelorussian ssr','belarus',
               'byelorussian s.s.r.','republic of belarus', 'gudija', 'byelorussia', 'white russia']
    belgium = ['belgium','brussels','belgië','belgique','belgien','brussel','bruxelles','brüssel',
              'kingdom of belgium','koninkrijk belgië','royaume de belgique','königreich belgien']
    belize = ['belize','belmopan','belize','british honduras']
    benin = ['benin','porto-novo','republic of benin','dahomey','republic of benin','république du bénin']
    bhutan = ['bhutan','thimphu','druk yul','kingdom of bhutan', 'druk gyal khap']
    bolivia = ['bolivia','la paz','chuqiyapu','buliwya','wuliwya','volívia','plurinational state of bolivia','estado plurinacional de bolivia','republic of bolívar','republic of bolivia']
    bosnia_and_herzegovina = ['bosnia and herzegovina','bosnia and herz','bosnia & herceg','sarajevo','сарајево','bosna i hercegovina','bosnia','bosnia-herzegovina','bosnia and hercegovina',
                             'bosnia and herzegovina','republic of bosnia and herzegovina','socialist republic of bosnia and herzegovina',
      'bosnia']
    botswana = ['botswana','gaborone','botswan','bophuthatswana','kalahari','republic of botswana', 'bechuanaland']
    brazil = ['brazil','brasília','brasil','brasilia','federative republic of brazil','pindorama','terra de santa cruz','ilha de vera cruz','empire of brazil',
      'united states of brazil','república federativa do brasil','terra di papaga']
    brunei = ['brunei','bandar seri begawan','bandar seri begawan','bandar','bandar seri begawan or bandar','brunei',
             'nation of brunei','the abode of peace','state of brunei','negara brunei darussalam',
      'brunei darussalam','نڬارا بروني دارالسلام']
    bulgaria = ['bulgaria','sofia','bălgariya','bălgarija','sofiya','sofija',
               'republic of bulgaria','република българия',"people's republic of bulgaria"]
    burkina_faso = ['burkina faso','ouagadougou','upper volta','burkina fasso','haute-volta','bourkina-fasso']
    burundi = ['burundi','gitega','uburundi','urundi','republic of burundi','bujumbura',
              "republika y'uburundi",'république du burundi','kingdom of burundi']
    cambodia = ['cambodia','phnum pénh','kămpŭchéa','phnom penh','kampuchea','khmer republic',
               'kingdom of cambodia','royaume du cambodge','kampuchea','democratic kampuchea',
      "people's republic of kampuchea",'state of cambodia']
    cameroon = ['cameroon','cameroun','yaoundé','yaounde','cameroons','united republic of cameroon','republic of cameron',
               'republic of cameroon','kamerun']
    canada = ['canada','ottawa','victoria','alberta','british columbia','manitoba','new brunswick','newfoundland and labrador',
              'northwest territories','nova scotia','nunavut','ontario','prince edward island','quebec','saskatchewan',
              'yukon territory','dominion of canada', 'dominion du canada', 'acadia', 'francisca']
    cape_verde = ['cape verde','cabo verde','praia','republic of cape verde','republic of cabo verde','república de cabo verde','repúblika di kabu verdi','cabo verde']
    central_african_republic = ['central african republic','bangui','bangî','bêafrîka','centrafrique','cent afr empire',
                                'cent afr republ','ubangi-shari','central african republic','oubangui-chari','ubangi-shari',
                                'république centrafricaine','empire centrafricain','central african empire']
    chad = ['chad','ndjamena','nijāmīnā','tšād','tchad',"n'djamena",'tchad','republic of chad', 'république du tchad', 'jumhūrīyat tashād']
    chile = ['chile','santiago','republic of chile','chilli','chili','reyno de chile','capitania general de chile']
    china = ['china','běijīng','zhōngguó','beijing','macau','oumún','hong kong','heung gong','taipei','zhōnghuá mínguó','táiwān','táiběi',
    'peoples republic of china','mainland china','sinkiang','manchuria','inner mongolia','tibet','macau','peking',
             'hongkong','kowloon','new territories','macao',"people's republic of china",'中华人民共和国','prc','中国','communist china',
      'red china','中共','peoples r china','mainland china','中国大陆/中国內地','mainland china/中国大陆/中国內地',
      'new china/新中国','new china','新中国',"shenzhou/神州","ch'in empire",'cathay',"shenzhou","神州",
      'zhongguo']
    colombia = ['colombia','bogotá','republic of colombia','república de colombia','estados unidos de colombia','confederación granadina',
      'república de la nueva granada']
    comoros = ['comoros','moroni','komori','juzur al-qamar','comores','iles comores','comoro islands',
              'union of the comoros','union des comores','udzima wa komori','al-ittiḥād al-qumurī','united republic of the commoros']
    republic_of_the_congo = ['republic of the congo','brazzaville','balazavile','république du congo','rep congo','repubilika ya kôngo',
                             'republíki ya kongó','congo','republic of the congo','congo (brazzaville)','congo brazzaville', 'french congo',
                            'congo-brazzaville','congo republic','repubilika ya kôngo','republíki ya kongó']
    costa_rica = ['costa rica','san josé','san jose','republic of costa rica', 'república de costa rica']

    cote_ivoire = ["cote d'ivoire",'cote ivoire',"côte d'ivoire",'ivory coast','yamoussoukro','republic of cote diivoire',
                  "republic of côte d'ivoire",'ivory coast','the ivory coast',"république de côte d'ivoire"]
    croatia = ['croatia','zagreb','hrvatska','republic of croatia', 'republika hrvatska', 'hrvatska', 'hrvaška']
    cuba = ['cuba','la habana','havana','republic of cuba', 'república de cuba']
    cyprus = ['cyprus','nicosia','kypros','lefkosia','lefkoşa','republic of cyprus','κυπριακή δημοκρατία','kypriaki dimokratia','kıbrıs cumhuriyeti','κύπρος',
      'kıbrıs','island of venus']
    czech_republic = ['czech republic','prague','česká republika','česko','praha','czechia','česká republika','česko','čr','cr','bohemia','czechland','the czechlands','české země',
      'česká socialistická republika','čsr','csr','protektorát čechy a morava','čechy a morava',
      'bohemia and moravia','země koruny české','koruna česká','česká konfederace']
    denmark = ['denmark','copenhagen','danmark','københavn','faeroe islands','faroe islands','greenland','kalaallit nunaat',
              'kingdom of denmark','kongeriget danmark','dania']
    democratic_republic_of_congo = ['democratic republic of congo','dem rep congo','kinshasa','kinsasa','kinsásá','jamhuri ya kidemokrasia ya kongo','repubilika ya kôngo ya dimokalasi',
                                    'republíki ya kongó demokratíki','république démocratique du congo','zaire','belgian congo',
                                    'katanga','democratic republic of the congo','drc','congo kinshasa','congo belge',
      'congo free state','republic of the congo']
    djibouti = ['djibouti','gabuuti','jabuuti','jībūtī','republic of djibouti','french somaliland','french territory of the afars and the issas','french somaliland','obock territory',
      'territoire français des afars et des issas','côte française des somalis',"territoire d'obock",
      'république de djibouti']
    dominica = ['dominica','roseau','commonwealth of dominica', 'dominique', 'dominik', 'wai‘tu kubuli']
    dominican_republic = ['dominican republic','dominican rep','santo domingo','república dominicana']
    east_timor = ['east timor','díli',"timor lorosa'e",'timor-leste','east timor','democratic republic of timor-leste',
                 'portuguese timor']
    ecuador = ['ecuador','quito','galapagos islands','republic of ecuador','república del ecuador','ikwadur ripuwlika','ekuatur nunka']

    egypt = ['egypt','cairo','misr','masr','al-qāhirah','cairo','arab republic of egypt','united arab republic',
             'united arab rep','unitd arab rep','U A R','united arab rep','arab republic of egypt','meṣr', 'aegyptus', 'kīmi']
    el_salvador = ['el salvador','san salvador','republic of el salvador', 'república de el salvador']
    equatorial_guinea = ['equatorial guinea','malabo','equat guinea','guinea ecuatorial','guinée équatoriale','guiné equatorial','republic of equatorial guinea','spanish guinea','rio muni',
                        'republic of equatorial guinea','república de guinea ecuatorial','république de guinée équatoriale',
      'república da guiné equatorial','guinea ecuatorial','guinée équatoriale','guiné equatorial']
    eritrea = ['eritrea','asmaraa','asmära','iritriya','ertra','asmara','state of eritrea', 'italian eritrea']
    estonia = ['estonia','tallinn','eesti','republic of estonia','esthonia','eesti','eesti vabariik','viru','viro','estland','maarjamaa','igaunija']
    eswatini = ['eswatini','mbabane','swaziland','kingdom of eswatini']
    ethiopia = ['ethiopia','addis abäba',"ityop'ia",'addis ababa','federal democratic republic of ethiopia','ኢትዮጵያ','ītiyop’iya',
      'abyssinia','etiopia','ethiopa','habeshastan','ethiopië','al-habasha']
    fiji = ['fiji','suva','viti','republic of fiji', 'matanitu tugalala o viti']

    finland = ['finland','suomi','helsinki','helsingfors','åland islands','aland islands','republic of finland','suomi','suomen tasavalta','republiken finland','suomenmaa','suopma','suoma dásseváldi',
      'lääˊddjânnam and lääˊddjânnam tääˊssväˊldd','lääˊddjânnam','lääˊddjânnam tääˊssväˊldd','suomâ and suomâ täsiväldi',
      'suomâ','suomâ täsiväldi','финля́ндия','finlyandiya','великое княжество финляндское','velikoye knyazhestvo finlyandskoye',
      'soome']
    france = ['france','paris','cayenne','guyane','polynésie française','papeete','guadeloupe','basse-terre','fort-de-france','martinique',
              'mamoudzou','mayotte','maore','mamoudzou','momoju','nouvelle-calédonie','nouméa','saint-denis','la réunion','réunion',
              'gustavia','saint-barthélemy','st barthelemy','marigot','saint-martin','st martin','saint-pierre','saint-pierre et miquelon','mata utu','matāʻutu',
              'wallis-et-futuna','ʻuvea mo futuna','laire','french guiana','fr.','wallis and futuna','corsica','miquelon and saint pierre',
              'saint pierre and miquelon','miquelon and st. pierre','st. pierre and miquelon','mayotte','french republic','république française',"l'hexagone",'gaul','gaule','γαλλία',
      'gallia','frankreich','francia','farança','tsarfat']
    gabon = ['gabon','république gabonaise','libreville','gabonese republic']
    gambia = ['gambia','banjul','republic of the gambia','bathurst']
    georgia = ['georgia','tbilisi',"sak'art'velo",'georgia ssr','republic of georgia','georgian ssr','georgia (western asia)','georgian s.s.r.',
              'sakartvelo','iberia','colchis','iveria','საქართველოს რესპუბლიკა',
      'sakartvelos respublika','گرجستان','gorjestan']
    germany = ['germany','berlin','deutschland','fed rep ger','ger dem rep','bundes republik','federal republic of germany','bundesrepublik deutschland','deutschland','brd','frg','german empire','weimar republic',
      'duitsland','allemagne','alemania','germania','německo','niemcy','németország','alemannia','alamannia','bavaria',
      'prussia','saksa','deutsch dem rep','east germany','deutsche demokratische republik','german democratic republic','ddr','gdr']
    ghana = ['ghana','accra','nkran','gaana','gana','republic of ghana','gold coast','united gold coast convention','ghana tiŋzuɣu']
    greece = ['greece','hellas','ellada','athinai','athina','ελλάς','ελλάδα','athens','macedonia (greece)','crete',
             'hellenic republic','yunanistan','ελληνική δημοκρατία','kingdom of the hellenes','yavan']
    grenada = ['grenada',"st. george's",'saint george',"saint george's",'gwenad']

    guinea_bissau = ['guinea bissau','guiné-bissau','bissau','guinea-bissau','republic of guinea-bissau','portuguese guinea',
                    'guiné-bissao']
    guatemala = ['guatemala','guatemala city','ciudad de guatemala','republic of guatemala', 'república de guatemala']
    guinea = ['guinea','guinée','gine','conakry','kɔnakiri','konakiri','republic of guinea','french guinea',
             'république de guinée','ߖߌ߬ߣߍ߫','republik bu gine','𞤪𞤫𞤨𞤵𞤦𞤤𞤭𞤳 𞤦𞤵 𞤺𞤭𞤲𞤫','guinée française','guinea-conakry']
    guyana = ['guyana','georgetown','british guiana','co‑operative republic of guyana']
    haiti = ['haiti','port-au-prince','pòtoprens','haïti','ayiti','republic of haiti',"république d'haïti",'repiblik d ayiti','haïti','ayiti','hayti']
    honduras = ['honduras','tegucigalpa','republic of honduras', 'república de honduras']
    hong_kong = ['hong kong','hong kong','heung gong','hongkong','kowloon','new territories']
    hungary = ['hungary','budapest','magyarország','hungary','republic of hungary',"hungarian people's republic",'kingdom of hungary','regnum hungariæ','hungaria',
      'magyarország','magyar köztársaság','magyar népköztársaság','magyar tanácsköztársaság','magyar királyság']
    iceland = ['iceland','reykjavík','ísland','republic of iceland','lýðveldið ísland','fold','thule','frón','ísafold']
    india = ['india','new delhi','bharôt','bhārat','bhārata','bhāratam','bhāratadēsam','bharat','republic of india','sikkim',
            'republic of india','bhārat','भारत','bhārat gaṇarājya','भारत गणराज्य','bhārata','bhāratadēśaṁ',
      'bhāratam','india','union of india','hindustan','भारतवर्ष','bhārat prajatantra','bhāratavarṣa','hodu','aryavarta',
      'jumhūrīyat-e bhārat']
    indonesia = ['indonesia','jakarta','madura','madoera','malay archipelago','sulawesi','celebes','sumatra','bali','java','timor',
                 'irian jaya','indonesian new guinea','west new guinea','indonesian new guinea','west irian','east indies',
                 'netherlands east indies','republic of indonesia','republik indonesia','nusantara','insulinde','indunesia','indonesië',
      'dutch east indies','netherlands east-indies','hindia belanda']

    iran = ['iran','tehrān','īrān','islamic republic of iran',"persia","iri","i.r.i.','i.r.i"]
    iraq = ['iraq','baghdad','bexda','êraq',"al-'iraq",'republic of iraq','republic of iraq', 'mesopotamia', 'assyria', 'babylon']
    ireland = ['ireland','dublin','baile átha cliath','éire','eire','republic of ireland','irish free state','poblacht na héireann','republic of ireland',
      'saorstát éireann','erin','banba','fodla','hibernia']
    israel = ['israel','yerushalayim','al-quds',"yisra'el","israʼiyl",'jerusalem','state of israel','מדינת ישראל',"medinat yisra'el",'دَوْلَة إِسْرَائِيل',"dawlat isra'il",'zion','yehuda','the jewish state','the hebrew state','the holy land',
      'eretz yisrael','tel aviv regime','the zionist entity','occupied palestine','illegal state of zion','zionistan','herzlstan']

    italy = ['italy','roma','rome','italia','sardinia','sicily','italian republic','repubblica italiana','italia','kingdom of italy','the beautiful country','the boot',
      'ausonia','esperia','enotria','tirrenia']
    jamaica = ['jamaica','kingston', 'xamayca']
    japan = ['japan','tōkyō','tokyo','bonin islands','nippon','nihon','yamato','ōyashima','cipangu','zipangu','cipangu/zipangu','gipangu','hinomoto']
    jordan = ['jordan','amman','al-’urdun','‘ammān','hashemite kingdom of jordan', 'hkj']
    kazakhstan = ['kazakhstan','nur-sultan','qazaqstan','kazakhstán','қазақстан','казахстан','kazakh ssr','kazakh s.s.r.','republic of kazakhstan','kazakhstan','republic of kazakhstan','қазақстан','қазақстан республикасы','qazaqstan',
      'alash / алаш / alaş','kazakh soviet socialist republic',
      'қазақ кеңестік социалистік республикасы/qazaq keñestik socïalïstik respwblïkası','qazaq keñestik socïalïstik respwblïkası',
      'казахская советская социалистическая республика','kazakhskaya sovetskaya sotsialisticheskaya respublika']
    kenya = ['kenya','nairobi','republic of kenya','republic of kenya','jamhuri ya kenya','british east africa protectorate','kenya colony']
    kiribati = ['kiribati','tarawa','republic of kiribati','ribaberiki ni kiribati','gilbert islands','kingsmill group']
    north_korea = ['north korea',"p'yŏngyang",'bukchosŏn','chosŏn','pyongyang',"democratic people's republic of korea",'d.p.r.k.','dprk','choson minjujuui inmin konghwaguk','cho-son','dpr korea korea']
    south_korea = ['south korea','seoul','namhan','south korea','republic of korea','r.o.k.','rok','dae-han-min-guk','han-guk','korea republic']
    kosovo = ['kosovo','pristina','kosova','косово','prishtinë','priština','republic of kosovo','republika e kosovës','republika kosovo','република косово','kossovo']
    kuwait = ['kuwait','kuwait city','dawlat ul-kuwayt','il-ikwet','madiinat ul-kuwayt','id-diira','state of kuwait','دولة الكويت','dawlat al-kuwait','الكويت','al-kuwait']
    kyrgyzstan = ['kyrgyzstan','bishkek','kirgizija','кыргызстан','киргизия','kirghizia','kirgizstan','kyrgyz republic','kirghiz ssr',
                  'kirghiz s.s.r.','kyrgyz republic','кыргызстан','кыргыз республикасы','kyrgyz respublikasy','киргизия','кыргызская республика','kyrgyzskaya respublika',
      'kirghiz soviet socialist republic','кыргыз советтик социалисттик республикасы','киргизская советская социалистическая республика',
        'kyrgyz sovettik sotsialisttik respublikasy','kirgizskaya sovetskaya sotsialisticheskaya respublika',
        'кыргыз советтик социалисттик республикасы/kyrgyz sovettik sotsialisttik respublikasy',
      'киргизская советская социалистическая республика/kirgizskaya sovetskaya sotsialisticheskaya respublika','kirghizia',
      'kirgizstan','republic of kyrgyzstan']

    laos = ['laos','lao','vientiane','vieng chan','wīang chan',"lao people's democratic republic",'ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ','sathalanalat paxathipatai paxaxôn',
      'république démocratique populaire lao']
    latvia = ['latvia','rīga','latvija','republic of latvia','latvijas republika','latvija','lettland','letland','letonija','lettonie']
    lebanon = ['lebanon','beirut','lubnān','liban','bayrūt','beyrouth','lebanese republic','the lebanese republic','al-jumhuriyya al-lubnaniyya','lebnan','lubnan','liban','levanon']
    lesotho = ['lesotho','maseru','basutoland','kingdom of lesotho']
    liberia = ['liberia','monrovia','republic of liberia']
    libya = ['libya','tripoli','lībiyā','ṭrables','tarabulus','state of libya','دولة ليبيا','dawlat lībiyā','united libyan kingdom','kingdom of libya','libyan arab republic',
      "socialist people's libyan arab jamahiriya","great socialist people's libyan arab jamahiriya"]

    liechtenstein = ['liechtenstein','vaduz','leichtenstein','principality of liechtenstein', 'fürstentum liechtenstein']
    lithuania = ['lithuania','vilnius','lietuva','lithuania','republic of lithuania','lietuvos respublika','lietuva','литва','litva','lita']
    luxembourg = ['luxembourg','lëtzebuerg','luxemburg','luxemborg','grand duchy of luxembourg','groussherzogdem lëtzebuerg','lëtzebuerg','grand-duché de luxembourg','luxembourg',
      'großherzogtum luxemburg','luxemburg','luxemburgo','lussemburgo']
    madagascar = ['madagascar','antananarivo','madagasikara','antananarivo','tananarive','malagasy republic','malagasy republ','republic of madagascar','madagasikara',"repoblikan'i madagasikara",'république de madagascar']
    malawi = ['malawi','lilongwe','malawi','malaŵi','nyasaland','republic of malawi','republic of malawi', 'nyasaland']
    malaysia = ['malaysia','kuala lumpur','federation of malaya','malaya','malay federation','malaya federation','sabah','sarawak','malay peninsula',
               'malaysia','persekutuan malaysia','federation of malaysia','malaya','sabah and sarawak','tanah melayu','malay land',
      '马来西亚/ mǎláixīyà','mǎláixīyà','malesiya','மலேசியா/ malesiya']
    maldives = ['maldives','dhivehi raajje','malé','republic of maldives','ދިވެހިރާއްޖޭގެ ޖުމްހޫރިއްޔާ/ dhivehi raajjeyge jumhooriyyaa','dhivehi raajjeyge jumhooriyyaa',
      'the maldive islands','mahal dvipa','मालदीव/ maléldvipa','maléldvipa','الدولة المحلديبية/ dhibat-al-mahal',
      'dhibat-al-mahal']
    mali = ['mali','bamako','bamakɔ','republic of mali','république du mali','mali ka fasojamana','ߡߊߟߌ ߞߊ ߝߊߛߏߖߊߡߊߣߊ','renndaandi maali']
    malta = ['malta','valletta','il-belt valletta','republic of malta', "repubblica ta' malta", 'melita']
    marshall_islands = ['marshall islands','majuro','mājro','aorōkin ṃajeḷ','republic of the marshall islands', 'aolepān aorōkin ṃajeḷ']
    mauritania = ['mauritania','nouakchott','muritan','agawec','mūrītānyā','nwakcuṭ','anu ukcuḍ','muritan / agawec','nwakcuṭ / anu ukcuḍ'
                  ,'islamic republic of mauritania', 'république islamique de mauritanie']
    mauritius = ['mauritius','maurice','moris','port louis','port-louis','porlwi','agalega islands','republic of mauritius', 'république de maurice', 'repiblik moris']
    mexico = ['mexico','mexico city','méxico','mēxihco','ciudad de méxico','āltepētl mēxihco','united mexican states','estados unidos mexicanos','méxico','república mexicana','méjico','mex','mx','aztlán','aztlān']
    micronesia = ['micronesia','palikir','federated states of micronesia','johnston island','kiribati','gilbert islands','mariana islands',
                  'marshall islands','nauru','northern mariana islands','pacific islands (trust territory)','tuvalu','ellice islands',
                  'caroline islands','pacific islands','guam','palau','federated states of micronesia', 'fsm']
    moldova = ['moldova','chișinău','moldavia','chisinau','moldavian ssr','moldavian s.s.r.','republic of moldova','republica moldova','moldavian soviet socialist republic','република советикэ сочиалистэ молдовеняскэ',
      'bessarabia bassarabia','boğdan','moldau']
    monaco = ['monaco','múnegu','principality of monaco','principauté de monaco','principatu de mùnegu','principato di monaco','principat de mónegue']
    mongolia = ['mongolia','ulaanbaatar','mongol uls','монгол улс','улаанбаатар','монгол улс']
    montenegro = ['montenegro','crna gora','црна гора','podgorica','подгорица','montenegro','crna gora','republic of montenegro','republika crna gora','mali i zi','karadag','duklja','zeta','black mountain']
    morocco = ['morocco','rabat','amerruk','elmeɣrib','al-maɣréb','amerruk / elmeɣrib','errbaṭ','ar-ribaaṭ','morroco','ifni',
              'kingdom of morocco','al mamkaka al maghribiya','the far west kingdom']
    mozambique = ['mozambique','moçambique','maputo','republic of mozambique','portuguese east africa','republic of mozambique','moçambique','república de moçambique','mozambiki','msumbiji','muzambhiki']
    myanmar = ['myanmar','myanma','nay pyi taw','naypyidaw','burma','republic of the union of myanmar','burma','ပြည်ထောင်စု သမ္မတ မြန်မာနိုင်ငံတော်\u200c','pyidaunzu thanmăda myăma nainngandaw']
    namibia = ['namibia','windhoek','namibië','windhuk','otjomuise','/ae-//gams','s west africa','southwest africa','republic of namibia',
               'south west africa','republic of namibia', 'german south-west africa', 'deutsch-südwestafrika']

    nauru = ['nauru','yaren','naoero','republic of nauru', 'repubrikin naoero', 'pleasant island']
    nepal = ['nepal','kathmandu','nepāl','kāṭhamāṇḍauṃ','kāntipura','federal democratic republic of nepal','federal democratic republic of nepal','संघिय लोकतान्त्रिक गणतन्त्र नेपाल','saṃghiya lokatāntrika gaṇatantra nepāla',
      'kingdom of nepal']
    netherlands = ['netherlands','neth antilles','amsterdam','nederland','nederlân','aruba','curaçao','netherlands','sint maarten','holland','philipsburg',
                   'kingdom of the netherlands','koninkrijk der nederlanden','nederland','holland','batavia','pays-bas']
    new_zealand = ['new zealand','wellington','aotearoa','wellington','poneke/te whanganui-a-tara','poneke','te whanganui-a-tara','niue',
                   'niuē','alofi','tokelau','tokelau islands','new zealand','aotearoa','realm of new zealand','dominion of new zealand']
    nicaragua = ['nicaragua','nicaragua','managua','republic of nicaragua','república de nicaragua']
    niger = ['niger','niamey','republic of niger','republic of the niger', 'the niger', 'république du niger']
    nigeria = ['nigeria','àbújá','abuja','nàìjíríà','naìjíríyà','nijeriya','lagos','federal republic of nigeria','federal republic of nigeria','orílẹ̀-èdè olómìniira àpapọ̀ nàìjíríà','jamhuriyar taraiyar najeriya','ọ̀hàńjíkọ̀ ọ̀hànézè naìjíríyà']
    north_macedonia = ['north macedonia','skopje','severna makedonija','maqedonia e veriut','скопје','shkup','former yugoslav republic of macedonia',
                       'north macedonia','republic of macedonia','republic of north macedonia','republic of north macedonia','republic of macedonia','macedonia','(republic of) macedonia','северна македонија',
      'македонија','the former yugoslav republic of macedonia','fyrom','paeonia','vardar banovina']
    norway = ['norway','oslo','norge','noreg','norga','vuodna','nöörje','svalbard','longyearbyen','kingdom of norway','spitsbergen',
             'kingdom of norway','norge','noreg','kongeriket norge','kongeriket noreg']

    oman = ['oman','muscat','‘umān','umān','masqaṭ','muscat','muscat and oman','sultanate of oman']
    pakistan = ['pakistan','islāmabād','pākistān','islamabad','islamic republic of pakistan','islamic republic of pakistan','federation of pakistan','dominion of pakistan','west pakistan',
      'gandhāra','sindhustan and indoscythia','sindhustan','indoscythia','mumlikat-e khudadaad pakistan']
    palau = ['palau','ngerulmud','belau','warsaw','republic of palau', 'belau']
    palestine = ['palestine','filasṭīn','al-quds al-sharqit','rāmallāh','east jerusalem','ramallah','state of palestine','מדינת פָּלֶשְׂתִּינָה','medinat pālēśtīnā','دَوْلَة فلسطين','dawlat filasṭīn','the holy land','eretz yisrael',
      'filasṭīn','فلسطين','pālēśtīnā','פָּלֶשְׂתִּינָה','as-sulṭa al-filasṭīnīya','palestinian national authority','السلطة الفلسطينية',
      'occupied palestinian territories','west bank and gaza strip','palestinian territories','الأراضي الفلسطينية',
      'occupied palestinian territory','occupied palestinian territory / occupied palestinian territories',
      'israeli-occupied territories','mandatory palestine']
    panama = ['panama','panama city','panamá','ciudad de panamá','republic of panama', 'república de panamá']
    papua_new_guinea = ['papua new guinea','papua n guinea','port moresby','papua new guinea','papua niugini','papua niu gini','port moresby','pot mosbi',
                       'independent state of papua new guinea','independen stet bilong papua niugini','independen stet bilong papua niu gini',
      'papua niugini','papua niu gini']
    paraguay = ['paraguay','asunción','paraguái','republic of paraguay', 'república del paraguay', 'tetã paraguái']

    peru = ['peru','perú','piruw','lima','republic of peru','república del perú','perú','peruvian republic','república peruana']
    philippines = ['philippines','pilipinas','maynila','manila','phillipines','phillippines','philipines','republic of the philippines','republika ng pilipinas','pilipinas','filipinas','las islas filipinas','las islas felipenas',
      'pinás','ph','philippine islands','philippine islands / p.i.','filipina','haríng bayang katagalugan','islas de san lázaro']
    poland = ['poland','warszawa','polska','republic of poland',"polish people's republic",'republic of poland','rzeczpospolita polska','polska','rp',"people's republic of poland",'polska rzeczpospolita ludowa',
      'rzeczpospolita','polish–lithuanian commonwealth','rzeczpospolita obojga narodów','united commonwealth of the two nations',
      'polonia','lechia']
    portugal = ['portugal','lisboa','lisbon','portuguese republic','madeira island','portuguese republic','lusitania','galaico-portuguese','portucale','gallaeci','galician-portuguese','ophiussa','ophiusa','portugalensis patrie',
      'portugalensium patrie','portugaliae','regno portugalensium','portugalis','portugalliae et algarbiae','portugalliae','lusitaniae',
      'purtugall','burtughāl','portingall','portingal','ocidental praia lusitana','pátria lusitana','luxitania','portugraal']
    puerto_rico = ['puerto rico','san juan','puerto rico','commonwealth of puerto rico','estado libre asociado de puerto rico','porto rico','associated free state of puerto rico',
      'borikén','borinquen','borinken']
    qatar = ['qatar','doha','ad-dawḥah','katar','state of qatar','quatar','دولة قطر', 'dawlat qaṭar']

    romania = ['romania','bucurești','românia','bucharest','rumania','roumania','kingdom of romania','regatul româniei',"romanian people's republic",
      'republica populară romînă','socialist republic of romania','republica socialistă românia']
    russia = ['russia','moskva','москва','rossiya','rossiâ','россия','moscow','russian sfsr','russian federation (europe)','russian federation',
              'russian s.f.s.r.','siberia','bashkiria','dagestan','tatarstan','bashkortostan','bashkir republic','tatariya',
              'republic of tatarstan','russian federation','российская федерация','russia','russland','россия','rf','russian empire','russian socialist federative soviet republic','russian soviet federative socialist republic',
      'rsfsr','soviet union','union of soviet socialist republics','ussr','союз советских социалистических республик',
      'soyuz sovetskikh sotsialisticheskikh respublik','ссср','sssr','su','сове́тский сою́з','sovetskiy soyuz','rus','great russia']
    rwanda = ['rwanda','kigali','ruanda','republic of rwanda','rwandese republic','republic of rwanda','république du rwanda',"repubulika y'u rwanda",'pays des mille collines','ruanda']
    saint_lucia = ['saint lucia','castries','st. lucia', 'st lucia','sainte-lucie']

    sahrawi_arab_democratic_republic = ['sahrawi arab democratic republic','laayoune','al-jumhūrīyah',"al-jumhūrīyah al-‘arabīyah aṣ-ṣaḥrāwīyah ad-dīmuqrāṭīyah",'república árabe saharaui democrática',
                                        'république arabe sahraouie démocratique','al-ʿayyūn','el aaiún']
    saint_kitts_and_nevis = ['saint kitts and nevis','basseterre','st. kitts and nevis','st kitts & nevi','nevis','federation of saint christopher and nevis','federation of saint kitts and nevis','saint christopher and nevis',
      'saint christopher-nevis-anguilla','saint kitts-nevis-anguilla','saint kitts','saint christopher','liamuiga and oualie','liamuiga','oualie']
    saint_vincent_and_the_grenadines = ['saint vincent and the grenadines','kingstown','grenadines','st. vincent and the grenadines','saint vincent and the grenadines',
                                       'saint vincent and the grenadines', 'st vincent','saint vincent', 'hairouna']
    são_tomé_and_príncipe = ['sao tome and principe','sao tome & prin','são tomé','são tomé e príncipe','democratic republic of são tomé and príncipe','república democrática de são tomé e príncipe','saint thomas and prince']
    samoa = ['samoa','apia','sāmoa','samoan islands','independent state of samoa', 'malo saʻoloto tutoʻatasi o sāmoa','samoa islands','navigator island','navigator islands','eastern samoa','american samoa','independent state of samoa','western samoa']
    san_marino = ['san marino','san marino','republic of san marino','republic of san marino','repubblica di san marino',"ripóbblica d' san marein",'most serene republic of san marino',
      'serenissima repubblica di san marino']
    saudi_arabia = ['saudi arabia','riyadh',"al-mamlaka al-‘arabiyyah as sa‘ūdiyyah",'ar-riyāḍ','kingdom of saudi arabia','ksa','saudia','al-mamlaka al-arabiyya as-saudiyya','nejd','hijaz','saudi regime',"arav hasa'udit"]

    senegal = ['senegal','dakar','ndakaaru','sénégal','senegaal','republic of senegal','republic of senegal','république du sénégal','réewum senegal','جمهورية السنغال',
      '𞤪𞤫𞤲𞤲𞤣𞤢𞤢𞤲𞤣𞤭 𞤧𞤫𞤲𞤫𞤺𞤢𞥄𞤤','senegal']
    serbia = ['serbia','srbija','србија','beograd','београд','serbia monteneg','republic of serbia','servia','srbija','republika srbija','serbia and montenegro','yugoslavia','kingdom of serbia',
      'raška','serboslavia']
    seychelles = ['seychelles','sesel','victoria','port victoria','republic of seychelles', 'république des seychelles', 'repiblik sesel']
    sierra_leone = ['sierra leone','freetown','republic of sierra leone', 'salone']
    singapore = ['singapore','singapura','xīnjiāpō','singapur','republic of singapore','singapura','sinhapura','xīnjiāpō/新加坡','xīnjiāpō','ciŋkappūr/சிங்கப்பூர்','ciŋkappūr','pulau ujong',
      'shōnan-tō/昭南島','shōnan-tō','little red dot']
    slovakia = ['slovakia','bratislava','slovensko','belusa','slovak republic','slovak republic','slovensko','slovenská republika','sr','slovak socialist republic','slovenská socialistická republika',
      'ssr','slovak state','slovenský štát']
    slovenia = ['slovenia','slovenija','ljubljana','republic of slovenia', 'slovenija', 'republika slovenija', 'rs']

    solomon_islands = ['solomon islands','solomon aelan','honiara','honiala','solomon islands', 'british solomon islands protectorate', 'the solomons']
    somalia = ['somalia','muqdisho','maqadīshū','soomaaliya','aş-şūmāl','mogadishu','federal republic of somalia','jamhuuriyadda federaalka soomaaliya','جمهورية الصومال الاتحادية']
    south_africa = ['south africa','pretoria','cape town','bloemfontein','suid-afrika','iningizimu afrika','umzantsi afrika','afrika-borwa','afrika borwa','aforika borwa','afurika tshipembe','afrika dzonga','isewula afrika','ipitori','pitori','ipitoli','ikapa','ipitoli, ikapa','kaapstad','transkei','s af','s afr','s africa','union of south africa','republic of south africa',
                   'republic of south africa','ciskei','azania','mzansi','suid-afrika','zuid-afrika','union of south africa']
    south_sudan = ['south sudan','juba','south sudan','sudan kusini','paguot thudän','republic of south sudan', 'jamhuri ya sudan kusini']
    spain = ['spain','madrid','madril','españa','espanya','espainia','espanha','balearic islands','canary islands','kingdom of spain','españa','reino de españa','espanya','hispania','espainia','spanish state','la piel de toro',
      'la pell de brau','las batuecas','iberia','sepharad']
    sri_lanka = ['sri lanka','sri jayawardenapura kotte','sri lankā','ceylon','colombo','democratic socialist republic of sri lanka','ceylon','இலங்கை சனநாயக சோசலிசக் குடியரசு','heladiva',
      'land of sinhalese','ratnadeepa','island of gems','elangai/இலங்கை','elangai','taprobane']
    sudan = ['sudan','khartoum','as-sudan','al-khartûm','republic of the sudan','republic of the sudan','the sudan','anglo-egyptian sudan','mahdist state','nubia']
    suriname = ['suriname','suriname','paramaribo','surinam','netherlands guiana','dutch guiana','republic of suriname','dutch guiana','netherlands guiana','republiek suriname','surinam']
    sweden = ['sweden','stockholm','sverige','kingdom of sweden','sverige','konungariket sverige','svea rike','thule','svitjod']
    switzerland = ['switzerland','schweiz','suisse','svizzera','svizra','bern','berne','berna','swiss confederation','schweiz','schweizerische eidgenossenschaft','suisse','confédération suisse','svizzera',
      'confederazione svizzera','svizra','confederaziun svizra','confoederatio helvetica','helvetia','ch']
    syria = ['syria','damascus','suriyah','dimashq','ash-sham','dimashq / ash-sham','syrian arab republic','الجمهورية العربية السورية','al-jumhūrīyah al-ʻarabīyah as-sūrīyah']
    taiwan = ['taiwan','taipei','zhōnghuá mínguó','táiwān','táiběi','formosa','republic of china','republic of china','taiwan','中華民國','臺灣/ 台灣','roc','chinese taipei','zhōnghuá táiběi','separate customs territory of taiwan penghu kinmen and matsu','taipei','taiwan','province of china','nationalist china','free china','formosa','cathay','zhongguo','qing dynasty']

    tajikistan = ['tajikistan','dushanbe','tojikistan','tadjikistan','tadzhik s.s.r.','tadzhik ssr','tadzhikistan','republic of tajikistan','ҷумҳурии тоҷикистон','jumhurii tojikiston','тоҷикистон','تاجیکستان','таджикистан','tadzhikistan']
    tanzania = ['tanzania','dodoma','tanganyika','zanzibar','united republic of tanzania','united republic of tanzania','united republic of tanganyika and zanzibar','deutsch-ostafrika','tanganyika','zanzibar']
    togo = ['togo','lomé','lome','loma','togolese republic', 'république togolaise']
    tonga = ['tonga',"nukuʻalofa",'kingdom of tonga', 'puleʻanga fakatuʻi ʻo tonga']
    thailand = ['thailand','bangkok','thai','prathet thai','ratcha-anachak thai','thai, prathet thai, ratcha-anachak thai','bophuth',
                'kingdom of thailand', 'siam', 'ประเทศสยาม', 'ราชอาณาจักรไทย', 'ประเทศไทย','krung thep','krung thep maha nakhon','maha nakhon','krung thep, krung thep maha nakhon','siam','kingdom of thailand']
    # moldavie ?
    transnistria = ['transnistria','transnistria','transnistrie','tiraspol','pridnestrovie', 'pridnestrovian moldavian republic']
    trinidad_and_tobago = ['trinidad and tobago','trinidad tobago','port of spain','trinid & tabago','tobago','trinidad','republic of trinidad and tobago', 'trinbago', 'iere']
    tunisia = ['tunisia','tunis','tunes','tūns','tunasia','tunisie','tunisin','republic of tunisia', 'تونس', 'الجمهورية التونسية']
    turkey = ['turkey','türkiye','ankara','republic of turkey','türkiye cumhuriyeti','asia minor','anatolia','ottoman empire','turkish empire','porte','sublime porte']
    turkmenistan = ['turkmenistan','türkmenistan','aşgabat','ashgabat','turkmen ssr','turkmen s.s.r.','turkmenistan', 'türkmenistan', 'turkmenia', 'түркменистан', 'туркмения']
    tuvalu = ['tuvalu','fongafale','ellice islands']
    uganda = ['uganda','kampala','republic of uganda', 'jamhuri ya uganda']
    ukraine = ['ukraine','kyjiv','ukrajina','ukssr','ukraine','україна','ua','ukr.','western ukraine','kievan rus','kingdom of galicia–volhynia','kingdom of ruthenia','zaporizhian host']
    united_arab_emirates = ['united arab emirates','u arab emirates','abu dhabi','‘abū ẓabī','abū ẓabī','al-’imārat al-‘arabiyyah al-muttaḥidah','trucial states','united arab emirates',
                           'united arab emirates','trucial states','al emirat al arabbiya al muttahida','uae','u.a.e.','trucial arabia','the emirates']
    united_kingdom = ['united kingdom','gibraltar','england','northern ireland','scotland','wales','isle of man','ellan vannin','douglas','great britain',
    'doolish','st. helier','saint hélier','falkland island','saint hélyi','jersey','jèrri','saint helier','brades estate',
                      'montserrat','adamstown','pitkern ailen','pitcairn islands','jamestown','saint helena','st helena','ascension and tristan da cunha',
                      'cockburn town','turks and caicos islands','turks & caicos','london','llundain','lunnon','lunnainn','londain','loundres',
                      'britain','y deyrnas unedig','unitit kinrick','rìoghachd aonaichte','ríocht aontaithe','an rywvaneth unys',
                      'wales','scotland','north ireland','ascension isl','ascension islan','tristan da cunh','channel islands',
                      'sark','alderney island','jersey island','guernsey island','guernsey','hebrides','united kingdom of great britain and northern ireland','britain','great britain','uk','u.k.','united kingdom','albion',
      'britannia','perfidious albion','anglia','alba','caledonia','cymru','cambria','ulster','british virgin isl','cayman islands']
    united_states = ['united states','guam','united states','guåhån','agaña','saipansaipan','northern mariana islands','notte mariånas','san juan','puerto rico',
    'washington, d.c.','washington','america','estados unidos','états-unis','‘amelika hui pū ‘ia','d.c.','washington d.c.',
                     'wakinekona','wasinetona','charlotte amalie','united states virgin islands','usa','united states of america','america','the states','us','u.s.',
                     'u.s.a.','u.s.a','columbia','freedonia','yankeedom','dixie','appalachia','alleghany','united states','usonia','usono']
    uruguay = ['uruguay','montevideo','oriental republic of uruguay','república oriental del uruguay','república oriental do uruguai']
    uzbekistan = ['uzbekistan','toshkent',"o‘zbekiston",'uzbek ssr','republic of uzbekistan','uzbek s.s.r.','republic of uzbekistan','oʻzbekiston','oʻzbekiston respublikasi','узбекистан','узбекия']
    vanuatu = ['vanuatu','port vila','port-vila','new hebrides','republic of vanuatu','new hebrides','niuhebridis','nouvelles hebrides','république de vanuatu','ripablik blong vanuatu',
      'vanuatri']
    vatican_city = ['vatican city','civitas vaticana','città del vaticano','vatican','vatican city state','status civitatis vaticanae','stato della città del vaticano','holy see']
    venezuela = ['venezuela','caracas','bolivarian republic of venezuela','república bolivariana de venezuela','estado de venezuela','república de venezuela',
      'estados unidos de venezuela','república de venezuela']
    vietnam = ['vietnam','hà nội','việt nam','viet nam','hanoi','north vietnam','socialist republic of vietnam','an nam','champa','đại việt','giao chỉ','french indochina','lĩnh nam','parted in north vietnam',
      'and south vietnam','cộng hòa xã hội chủ nghĩa việt nam','việt nam','srv','s.r.v.','vn v.n.']
    yemen = ['yemen',"sana'a",'al-yaman','ṣan‘ā’','republic of yemen','democratic yemen','sanaa','north yemen','aden','yemen arab rep','yemen peo dem r',
            'republic of yemen','ٱلْجُمْهُورِيَّةُ ٱلْيَمَنِيَّةُ','al-jumhūrīyah al-yamanīyah','yemeni republic']
    zambia = ['zambia','lusaka','republic of zambia','northern rhodesia']
    zimbabwe = ['zimbabwe','harare','rhodesia','southern rhodesia','zimbabwe rhodesia','republic of zimbabwe','zimbabwe rhodes']

    #
    country_list=[zimbabwe,zambia,yemen,vietnam,venezuela,vatican_city,vanuatu,uzbekistan,uruguay,united_states,united_kingdom,
                  united_arab_emirates,ukraine,uganda,tuvalu,turkmenistan,turkey,tunisia,trinidad_and_tobago,transnistria,thailand,
                  tonga,togo,tanzania,tajikistan,taiwan,syria,switzerland,sweden,suriname,sudan,sri_lanka,spain,south_sudan,south_africa,
                  somalia,solomon_islands,slovenia,slovakia,singapore,sierra_leone,seychelles,serbia,senegal,saudi_arabia,san_marino,
                  samoa,são_tomé_and_príncipe,saint_vincent_and_the_grenadines,saint_kitts_and_nevis,sahrawi_arab_democratic_republic,
                  saint_lucia,rwanda,russia,romania,qatar,puerto_rico,portugal,poland,philippines,peru,paraguay,papua_new_guinea,
                  panama,palestine,palau,pakistan,oman,norway,north_macedonia,nigeria,niger,nicaragua,new_zealand,netherlands,nepal,
                  nauru,namibia,myanmar,mozambique,morocco,montenegro,mongolia,monaco,moldova,micronesia,mexico,mauritius,mauritania,
                  marshall_islands,malta,mali,maldives, malaysia, malawi, madagascar, luxembourg, lithuania, liechtenstein, libya, 
                  liberia, lesotho, lebanon, latvia, laos, kyrgyzstan, kuwait, kosovo, south_korea, north_korea, kiribati, kenya, 
                  kazakhstan, jordan, japan, jamaica, italy, israel, ireland, iraq, iran, indonesia, india, iceland, hungary, hong_kong, 
                  honduras, haiti, guyana, guinea, guatemala, guinea_bissau, grenada, greece, ghana, germany, georgia, gambia, gabon, 
                  france, finland, fiji, ethiopia, eswatini, estonia, eritrea, equatorial_guinea, el_salvador, egypt, ecuador, 
                  east_timor, dominican_republic, dominica, djibouti, democratic_republic_of_congo, denmark, czech_republic, cyprus, 
                  cuba, croatia, cote_ivoire, costa_rica, republic_of_the_congo, comoros, colombia, china, chile, chad, 
                  central_african_republic, cape_verde, canada, cameroon, cambodia, burundi, burkina_faso, bulgaria, brunei, brazil, 
                  botswana, bosnia_and_herzegovina, bolivia, bhutan, benin, belize, belgium, belarus, barbados, bangladesh, bahrain, 
                  the_bahamas, azerbaijan, austria, australia, armenia, argentina, antigua_and_barbuda, angola, andorra, algeria, 
                  albania, afghanistan, abkhazia]

    # remove multiple spaces between two strings
    df = df.replace(to_replace=r'\s+', value=' ', regex=True)
    # to remove white space at both ends
    for col in list(df.columns):
        df[col] = df[col].str.strip()
    # lowercase all the columns
    df = df.applymap(lambda x: x.lower() if pd.notnull(x) else x)          
        
    # clean USA    
    df = df.replace(to_replace=r'\w[a-z]+\s+[0-9]+',value='united states', regex=True)
    df = df.replace(to_replace=r'\d+\s*usa*', value='united states', regex=True)
    #df = df.replace(to_replace=r'\w[a-z]\s*usa*', value='united states', regex=True)
    #df = df.replace(['al','ak','as','az','ar','ca','co','ct','de','dc','wi','wy','fl','ga','gu','hi','id','il',
    #                    'in','ia','ks','ky','la','me','md','ma','mi','mn','ms','mo','mt','ne','nv','nh','nj','nm',
     #                   'ny','nc','nd','mp','oh','ok','or','pa','pr','ri','sc','sd','tn','tx','ut','vt','va','vi',
      #                  'wa','wv'], 'united states') 
    
    df = df.replace(to_replace=r'\w+\s+\d+\s*\w*', value='united states', regex=True)
    df = df.replace(to_replace=r'\w[a-z]+\s+usa', value='united states', regex=True)
    df = df.replace(to_replace=r"\w[a-z]+\s+\d+\s+[a-z]+",value='united states', regex=True) #\w[a-z]+\s+[0-9]+\s*\w[a-z]+

    df.replace('united states usa', 'united states', inplace=True)   
    df.replace('united united states', 'united states', inplace=True)
    #
    for country in country_list:
        df = df.replace(country,value=country[0])    
    
    # convert all columns titles to string
    df.columns = df.columns.astype(str)
    # 
    ddf = from_pandas(df, npartitions=2)
    # 
    for col in df.columns.to_list():
        df = clean_country(ddf, col,inplace=True) #
    return df

In [3]:
from functools import reduce

#Import data
data = pd.read_pickle(f'D:\PROJECT/YORK UNIVERSITY/BACTERIOPHAGES/data_analyse/first_clean/data_scholar_all_year_merge.pkl')[
                        ['UT (Unique WOS ID)','Publication Year','Author Full Names','Addresses','Times Cited, All Databases']];


# set accession number as index
data = data.set_index(['UT (Unique WOS ID)','Publication Year','Times Cited, All Databases'])

# dropna base on 'Addresses' columns
data = data.dropna(subset='Addresses').dropna(subset='Author Full Names')

# split and extract first author name
data['first_author'] = data['Author Full Names'].str.split(';', n=1, expand=True)[0]

# Replace last authorsname in each addresses columns by '$'        
address_list = []
for index, rows in data.iterrows():
    for value in range(len(data['Addresses'])):
        Address_df_3 =data['Addresses'][value].replace(data['first_author'][value], '$ ')
        address_list.append(Address_df_3)
        value+=1
    break
Address_df_3 = pd.DataFrame(address_list)

# expand the dataframe by spliting on '$'
Address_df_3_expand = Address_df_3[0].str.split("$", expand=True)

# Drop columns '0' who didn't contains informations of last authors
if len(Address_df_3_expand.columns)>1:
    Address_df_3_expand_drop_0 = Address_df_3_expand.drop(0, axis=1)  

Address_df_3_expand_drop_0[1] = Address_df_3_expand_drop_0[1].fillna('None')  


########
# get country name from each addresse
for col in list(Address_df_3_expand_drop_0.columns):
    Address_df_3_expand_drop_0[col] = Address_df_3_expand_drop_0[col].map(lambda x: str(x).split('[')[0].split(']')[-1].split(';')[0].split(',')[-1]).to_frame() #  lambda x: str(x).split(',')[-1].split(';')[0]


# replace ['None','nan']
Address_df_3_expand_drop_0.replace(['',None,'None','none','nan'], np.nan, inplace=True)

# lowercase all the columns
Address_df_3_expand_drop_0 = Address_df_3_expand_drop_0.applymap(lambda x: x.lower() if pd.notnull(x) else x)

#replace usa
Address_df_3_expand_drop_0.replace('united states usa', 'united states', inplace=True)    

Address_df_3_expand_drop_0 = Address_df_3_expand_drop_0.replace(to_replace=r'\w+\s+usa', value='united states', regex=True)
#df_expand_clean = df_expand_clean.replace(to_replace=r"\w+\s+\d+\s+\w+",value='united states', regex=True)

# Countries cleaning
    # replace unusfull string by NaN
Address_df_3_expand_drop_0 = Address_df_3_expand_drop_0.replace(to_replace=[r"^\w$",
                                                                            r"^[!#$%&'()*+,-./:;<=>?@[\]^_`{|}~]$", ], 
                                                                            value=np.nan, regex=True)  
Address_df_3_expand_drop_0 = Address_df_3_expand_drop_0.replace(r';', '', regex=True) 


## get dummies to have contries on columns
dfs = [pd.get_dummies(Address_df_3_expand_drop_0[col]) for col in Address_df_3_expand_drop_0]
d = reduce(lambda x,y : x.add(y, fill_value=0), dfs)

# data cleaning
cols = [c for c in d.columns if c[-1:] != '.']
df=d[cols]
cols = [c for c in df.columns.to_list() if len(c) != 1]
df=df[cols]

# obtain the columns of the df get dummies and clean using the function
df_columns = df.columns.to_frame()
       # to remove white space at both ends
df_columns[0] = df_columns[0].str.strip().to_frame()
df_columns = df_columns.replace(r';', '', regex=True)  
df_country_clean = country_clean(df_columns).reset_index()

# rename countries names on columns
old_names = df_country_clean['index'].to_list()
new_names  = df_country_clean['0_clean'].to_list()
df_country_clean2 = df.rename(columns=dict(zip(old_names, new_names))).astype(int)    


# Drop NA/NaN in countries columns
col_to_drop = ['','none','None','nan','NA' , 'NaN',np.nan]
for elt in col_to_drop:
    if elt in df_country_clean2.columns :
        df_country_clean2 = df_country_clean2.drop(elt, axis = 1)
    else:
        df_country_clean2
df_country_clean2

# merge and sum the columns with the same countries name
df_country_clean_3 = pd.DataFrame()
df_country_clean3 = pd.DataFrame()
for elt in  df_country_clean2.columns.unique():
    col_array = df_country_clean2.columns.str.fullmatch(elt) # contains
    df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
df_country_clean_3 = pd.concat([df_country_clean_3, df_country_clean3], axis=1)

# replace 0 by NaN to reduce the 
df_country_clean_3.replace(0, np.nan, inplace=True)    


# concat 
#     df_country_clean_4 = df_country_clean_3.loc[~df.index.duplicated(keep='first')]
#     data_2 = data.loc[~data.index.duplicated(keep='first')]

df2 = pd.concat([data.reset_index(), df_country_clean_3], axis=1)
df2.replace(0, np.nan, inplace=True)

#save data
df2.to_csv(f'D:\PROJECT/YORK UNIVERSITY/BACTERIOPHAGES/data_analyse/countries_first_author.csv')
df2.to_pickle(f'D:\PROJECT/YORK UNIVERSITY/BACTERIOPHAGES/data_analyse/countries_first_author.pkl')

print("##########""Total of ", df_country_clean_3.shape[1]," first author countries : ", df2.columns, "###########")

#print the last
df2    



  0%|                                                                                           | 0/15 [00:00<…

Country Cleaning Report:
	175 values cleaned (99.43%)
	1 values unable to be parsed (0.57%), set to NaN
Result contains 175 (99.43%) values in the correct format and 1 null values (0.57%)


  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clean2.loc[:, col_array].sum(axis=1).to_frame()
  df_country_clean_3[elt] = df_country_clea

##########Total of  120  first author countries :  Index(['UT (Unique WOS ID)', 'Publication Year', 'Times Cited, All Databases',
       'Author Full Names', 'Addresses', 'first_author', 'United States',
       'Algeria', 'Argentina', 'Armenia',
       ...
       'Turkmenistan', 'United Arab Emirates', 'Uganda', 'Ukraine', 'Uruguay',
       'Uzbekistan', 'Venezuela', 'Vietnam', 'Yemen', 'Zimbabwe'],
      dtype='object', length=126) ###########


Unnamed: 0,UT (Unique WOS ID),Publication Year,"Times Cited, All Databases",Author Full Names,Addresses,first_author,United States,Algeria,Argentina,Armenia,...,Turkmenistan,United Arab Emirates,Uganda,Ukraine,Uruguay,Uzbekistan,Venezuela,Vietnam,Yemen,Zimbabwe
0,wos:a1975ak27200035,1975,106.0,"modrich, p; richardson, cc","harvard univ, med sch, dept biol chem, boston,...","modrich, p",,,,,...,,,,,,,,,,
1,wos:a1975bb78600005,1975,28.0,"esche, h; schweiger, m; trautner, ta","max planck inst molek genet,abt trautner,ihne ...","esche, h",,,,,...,,,,,,,,,,
2,wos:a1975ak49300025,1975,50.0,"jazwinski, sm; lindberg, aa; kornberg, a","stanford univ, sch med, dept biochem, stanford...","jazwinski, sm",,,,,...,,,,,,,,,,
3,wos:a1975al49000001,1975,31.0,"silberstein, s; inouye, m; studier, fw","suny, dept biochem, stony brook, ny 11794 usa;...","silberstein, s",,,,,...,,,,,,,,,,
4,wos:a1975ay53300001,1975,3.0,"levy, jn","univ washington, dept genet, seattle, wa 98105...","levy, jn",,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57212,wos:000968835000001,2023,,"lan, jinxin; wu, yao; lin, changmei; chen, jia...","[lan, jinxin; wu, yao; lin, changmei; chen, ji...","lan, jinxin",,,,,...,,,,,,,,,,
57213,wos:000933339800001,2023,,"hirao, rie; shigetoh, keisuke; inagaki, shinji...","[hirao, rie; shigetoh, keisuke; inagaki, shinj...","hirao, rie",,,,,...,,,,,,,,,,
57214,wos:000914484900001,2023,5.0,"brogna, carlo; cristoni, simone; brogna, barba...","[brogna, carlo; bisaccia, domenico rocco] dept...","brogna, carlo",,,,,...,,,,,,,,,,
57215,wos:001003139700001,2023,,"lee, l. y. y.; landry, s. a.; jamriska, m.; su...","[lee, l. y. y.; subbarao, k.] univ melbourne, ...","lee, l. y. y.",,,,,...,,,,,,,,,,


In [4]:
df_country_clean_3.count().sort_values(ascending=False)

United States     8460
China             4204
United Kingdom    1564
Germany           1401
Japan             1252
                  ... 
Kosovo               1
Barbados             1
Burkina Faso         1
Myanmar              1
Botswana             1
Length: 120, dtype: int64