In [2]:
# importing libraries
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go

## NY MOMA DATASET

In [3]:
# Load the MoMA artist table into a DataFrame.
# Data source: https://www.kaggle.com/datasets/momanyc/museum-collection
# The file is decoded as latin-1 instead of pandas' default utf-8.
moma = pd.read_csv(
    'artist_moma.csv',
    encoding='latin-1',
)
moma.head()

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930.0,1992.0
1,2,Doroteo Arnaiz,Spanish,Male,1936.0,
2,3,Bill Arnold,American,Male,1941.0,
3,4,Charles Arnoldi,American,Male,1946.0,
4,5,Per Arnoldi,Danish,Male,1941.0,


In [5]:
# selecting the Nationality column and discarding artists with no value recorded
nationality = moma.loc[:, 'Nationality']
n_nation = nationality[nationality.notna()]

print(n_nation)

0        American
1         Spanish
2        American
3        American
4          Danish
           ...   
15082    American
15084    American
15085     Chinese
15086     Chinese
15087     Chinese
Name: Nationality, Length: 12603, dtype: object


In [6]:
# Count how many artists carry each nationality.
# most_common() returns (nationality, count) tuples sorted by descending count.
# The position of each entry matters: later cells pair these rows, by index,
# with a hand-written list of country codes.
# (Counter is imported with the other libraries at the top of the notebook.)
top = Counter(n_nation).most_common()
print(top)

[('American', 5198), ('German', 930), ('French', 839), ('British', 835), ('Italian', 531), ('Japanese', 498), ('Swiss', 280), ('Dutch', 265), ('Nationality unknown', 255), ('Austrian', 243), ('Canadian', 196), ('Russian', 188), ('Brazilian', 155), ('Spanish', 153), ('Argentine', 139), ('Swedish', 130), ('Mexican', 128), ('Polish', 125), ('Danish', 119), ('Belgian', 89), ('Czech', 83), ('Chinese', 81), ('Israeli', 75), ('South African', 69), ('Chilean', 61), ('Finnish', 60), ('Cuban', 58), ('Australian', 56), ('Hungarian', 52), ('Norwegian', 49), ('Colombian', 42), ('Venezuelan', 41), ('Korean', 35), ('Peruvian', 34), ('Indian', 28), ('Scottish', 28), ('Turkish', 26), ('Croatian', 25), ('Yugoslav', 24), ('Uruguayan', 21), ('Iranian', 21), ('Romanian', 20), ('Irish', 19), ('Haitian', 17), ('Portuguese', 17), ('New Zealander', 16), ('Greek', 13), ('Ukrainian', 11), ('Icelandic', 11), ('Slovak', 8), ('Bosnian', 8), ('Egyptian', 8), ('Taiwanese', 7), ('Georgian', 7), ('Vietnamese', 7), ('Al

In [13]:
# turn the list of (nationality, count) tuples into a two-column array,
# then split the columns: labels on the left, counts on the right
np_nation = np.asarray(top)
origin, frequency = np_nation[:, 0], np_nation[:, 1]

In [14]:
# the counts are held as strings at this point, so cast them to integers
freq_int = frequency.astype(np.int_)
freq_int

array([5198,  930,  839,  835,  531,  498,  280,  265,  255,  243,  196,
        188,  155,  153,  139,  130,  128,  125,  119,   89,   83,   81,
         75,   69,   61,   60,   58,   56,   52,   49,   42,   41,   35,
         34,   28,   28,   26,   25,   24,   21,   21,   20,   19,   17,
         17,   16,   13,   11,   11,    8,    8,    8,    7,    7,    7,
          6,    6,    6,    6,    6,    5,    5,    5,    4,    4,    4,
          4,    4,    4,    4,    4,    3,    3,    3,    3,    3,    3,
          3,    3,    3,    3,    3,    3,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1])

In [15]:
# display the nationality labels (same ordering as the frequency array)
origin

array(['American', 'German', 'French', 'British', 'Italian', 'Japanese',
       'Swiss', 'Dutch', 'Nationality unknown', 'Austrian', 'Canadian',
       'Russian', 'Brazilian', 'Spanish', 'Argentine', 'Swedish',
       'Mexican', 'Polish', 'Danish', 'Belgian', 'Czech', 'Chinese',
       'Israeli', 'South African', 'Chilean', 'Finnish', 'Cuban',
       'Australian', 'Hungarian', 'Norwegian', 'Colombian', 'Venezuelan',
       'Korean', 'Peruvian', 'Indian', 'Scottish', 'Turkish', 'Croatian',
       'Yugoslav', 'Uruguayan', 'Iranian', 'Romanian', 'Irish', 'Haitian',
       'Portuguese', 'New Zealander', 'Greek', 'Ukrainian', 'Icelandic',
       'Slovak', 'Bosnian', 'Egyptian', 'Taiwanese', 'Georgian',
       'Vietnamese', 'Algerian', 'Slovenian', 'Czechoslovakian',
       'Serbian', 'Lebanese', 'Guatemalan', 'Moroccan', 'Puerto Rican',
       'Ecuadorian', 'Luxembourgish', 'Zimbabwean', 'Bulgarian',
       'Tunisian', 'Albanian', 'Palestinian', 'Iraqi', 'Estonian',
       'Bolivian', 'Cana

In [31]:
# Country codes for the choropleth, written by hand in ISO 3166-1 alpha-3
# (the format plotly expects). Entry i is the code for the nationality at
# position i of `origin`, so the order here must not change.
# reference: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3
# 'XXX' is a placeholder for entries without a usable country
# (e.g. "Nationality unknown").
countrycodes = [
    'USA', 'DEU', 'FRA', 'GBR', 'ITA', 'JPN', 'CHE', 'NLD', 'XXX', 'AUT',
    'CAN', 'RUS', 'BRA', 'ESP', 'ARG', 'SWE', 'MEX', 'POL', 'DNK', 'BEL',
    'CZE', 'CHN', 'ISR', 'ZAF', 'CHL', 'FIN', 'CUB', 'AUS', 'HUN', 'NOR',
    'COL', 'VEN', 'KOR', 'PER', 'IND', 'GBR', 'TUR', 'HRV', 'YUG', 'URY',
    'IRN', 'ROU', 'IRL', 'HTI', 'PRT', 'NZL', 'GRC', 'UKR', 'ISL', 'SVK',
    'BIH', 'EGY', 'TWN', 'GEO', 'VNM', 'DZA', 'SVN', 'CZE', 'SRB', 'LBN',
    'GTM', 'MAR', 'PRI', 'ECU', 'LUX', 'ZWE', 'BGR', 'TUN', 'ALB', 'PSE',
    'IRQ', 'EST', 'BOL', 'CAN', 'SEN', 'THA', 'PAK', 'MLI', 'NGA', 'SGP',
    'GHA', 'KGZ', 'PHL', 'XXX', 'CRI', 'COG', 'PAN', 'NIC', 'LVA', 'XXX',
    'LTU', 'KEN', 'MYS', 'NAM', 'XXX', 'GBR', 'KAZ', 'KWT', 'GUY', 'PRY',
    'TZA', 'SDN', 'ETH', 'BHS', 'ARE', 'CYP', 'AZE', 'CIV', 'TJK', 'KHM',
    'AFG', 'USA', 'UGA', 'CMR', 'MRT', 'SYR', 'SAU', 'RWA', 'IDN', 'BFA',
    'MKD', 'MOZ', 'AGO', 'ESP', 'ESH',
]

# Scottish and Welsh are filed under Great Britain (GBR).
# Their counts (28 and 2 respectively) get added to the British total and
# their own rows are dropped in later cells.

In [32]:
# spot check: the first country code should sit next to the first (largest) count
first_pair = (countrycodes[0], freq_int[0])
test = list(first_pair)
test

['USA', 5198]

In [33]:
# Pair each country code with the frequency found at the same position.
# The two sequences are matched purely by index, so refuse to continue if their
# lengths differ instead of relying on a hard-coded row count.
if len(countrycodes) != len(freq_int):
    raise ValueError(
        'countrycodes has {} entries but freq_int has {}'.format(
            len(countrycodes), len(freq_int)))

NEWDATA = [[code, count] for code, count in zip(countrycodes, freq_int)]

# array form of the same pairs, built once after the list is complete
data = np.array(NEWDATA)

NEWDATA

[['USA', 5198],
 ['DEU', 930],
 ['FRA', 839],
 ['GBR', 835],
 ['ITA', 531],
 ['JPN', 498],
 ['CHE', 280],
 ['NLD', 265],
 ['XXX', 255],
 ['AUT', 243],
 ['CAN', 196],
 ['RUS', 188],
 ['BRA', 155],
 ['ESP', 153],
 ['ARG', 139],
 ['SWE', 130],
 ['MEX', 128],
 ['POL', 125],
 ['DNK', 119],
 ['BEL', 89],
 ['CZE', 83],
 ['CHN', 81],
 ['ISR', 75],
 ['ZAF', 69],
 ['CHL', 61],
 ['FIN', 60],
 ['CUB', 58],
 ['AUS', 56],
 ['HUN', 52],
 ['NOR', 49],
 ['COL', 42],
 ['VEN', 41],
 ['KOR', 35],
 ['PER', 34],
 ['IND', 28],
 ['GBR', 28],
 ['TUR', 26],
 ['HRV', 25],
 ['YUG', 24],
 ['URY', 21],
 ['IRN', 21],
 ['ROU', 20],
 ['IRL', 19],
 ['HTI', 17],
 ['PRT', 17],
 ['NZL', 16],
 ['GRC', 13],
 ['UKR', 11],
 ['ISL', 11],
 ['SVK', 8],
 ['BIH', 8],
 ['EGY', 8],
 ['TWN', 7],
 ['GEO', 7],
 ['VNM', 7],
 ['DZA', 6],
 ['SVN', 6],
 ['CZE', 6],
 ['SRB', 6],
 ['LBN', 6],
 ['GTM', 5],
 ['MAR', 5],
 ['PRI', 5],
 ['ECU', 4],
 ['LUX', 4],
 ['ZWE', 4],
 ['BGR', 4],
 ['TUN', 4],
 ['ALB', 4],
 ['PSE', 4],
 ['IRQ', 4],
 ['EST',

In [35]:
# build a table from the [code, frequency] pairs, one row per nationality;
# the nationality labels become the row index
column_values = ['Country Code', 'Frequency']
df = pd.DataFrame(NEWDATA, index=origin, columns=column_values)
df

Unnamed: 0,Country Code,Frequency
American,USA,5198
German,DEU,930
French,FRA,839
British,GBR,835
Italian,ITA,531
...,...,...
Macedonian,MKD,1
Mozambican,MOZ,1
Angolan,AGO,1
Catalan,ESP,1


In [47]:
# Fold the Scottish and Welsh counts into the British row so that GBR carries a
# single combined total (835 + 28 + 2 = 865 for this dataset).
# The total is recomputed by label from the untouched origin/freq_int arrays
# instead of patching by value: replace(835, 865) would also rewrite any other
# row whose frequency happened to be 835, and hard-coded numbers go stale if
# the data change. Since df itself is not read, re-running this cell is safe.
uk_labels = ['British', 'Scottish', 'Welsh']
raw_counts = pd.Series(freq_int, index=origin)
missing_labels = [label for label in uk_labels if label not in raw_counts.index]
if missing_labels:
    raise KeyError('expected nationality labels not found: {}'.format(missing_labels))
df.loc['British', 'Frequency'] = raw_counts.loc[uk_labels].sum()
df

Unnamed: 0,Country Code,Frequency
American,USA,5198
German,DEU,930
French,FRA,839
British,GBR,865
Italian,ITA,531
...,...,...
Macedonian,MKD,1
Mozambican,MOZ,1
Angolan,AGO,1
Catalan,ESP,1


In [48]:
# Remove rows that should not appear on the map:
#   - American / Native American (the map is titled as excluding American)
#   - the unknown / "Various" entries, which have no real country code
#   - Welsh and Scottish, already counted in the British (GBR) total
df1 = df.drop(labels=['American','Native American','Nationality Unknown','Nationality unknown', 'nationality unknown',
                     'Various','Welsh', 'Scottish'])

# double checking data looks right: print the full table so it renders as
# aligned rows (a bare to_string() shows one escaped string full of "\n")
print(df1.to_string())

'                Country Code  Frequency\nGerman                   DEU        930\nFrench                   FRA        839\nBritish                  GBR        865\nItalian                  ITA        531\nJapanese                 JPN        498\nSwiss                    CHE        280\nDutch                    NLD        265\nAustrian                 AUT        243\nCanadian                 CAN        196\nRussian                  RUS        188\nBrazilian                BRA        155\nSpanish                  ESP        153\nArgentine                ARG        139\nSwedish                  SWE        130\nMexican                  MEX        128\nPolish                   POL        125\nDanish                   DNK        119\nBelgian                  BEL         89\nCzech                    CZE         83\nChinese                  CHN         81\nIsraeli                  ISR         75\nSouth African            ZAF         69\nChilean                  CHL         61\nFinnish        

In [49]:
# Several nationality labels share one country code (for example Spanish and
# Catalan both map to ESP). Plotly does not add up repeated locations, so the
# frequencies are summed per code first; otherwise a country would be coloured
# by only one of its rows.
country_totals = df1.groupby('Country Code', as_index=False, sort=False)['Frequency'].sum()

# World choropleth: one ISO alpha-3 code per location, coloured by artist count.
choropleth_map = go.Figure(
    data = {
        'type':'choropleth',
        'locationmode':'ISO-3',
        'locations':country_totals['Country Code'],
        'colorscale':'YlGnBu',
        'z':country_totals['Frequency'],
        'colorbar':{'title':'Frequency of Nationality'}
    }, layout = {
      'geo':{
          'scope':'world'
      }
    })

choropleth_map.update_layout(
    title_text = 'Nationality Representation of Artists in NY MOMA (Excluding American)')