In [5]:
import pandas as pd
import plotly.express as px

In [115]:
# Build the cleaned route table: one row per voyage with its place of
# purchase (Source), port of arrival (Destination) and arrival year.
routes_df = pd.read_csv("./data/routes.csv", index_col=False)

# Raw column -> output column. The dict order is also the output column order.
column_names = {
    'voyage_id': 'Voyage_Id',
    'place_of_purchase': 'Source',
    'port_arrival': 'Destination',
    'year_arrival': 'Arrival',
}

text_to_remove = ', port unspecified'

df = (
    # Select + rename in one vectorized step instead of copying row by row.
    routes_df[list(column_names)]
    .rename(columns=column_names)
    .assign(
        # regex=False: the suffix is literal text, and the default value of
        # `regex` differs between pandas versions, so state it explicitly.
        Source=lambda frame: frame['Source'].str.replace(text_to_remove, '', regex=False),
        Destination=lambda frame: frame['Destination'].str.replace(text_to_remove, '', regex=False),
    )
    # A route is only usable when both of its endpoints are known.
    .dropna(subset=['Source', 'Destination'])
)

df.to_csv('./output_data/week4/cleaned_routes.csv', index=False)

Let's go back to our previous approach with our semi-cleaned data.

In [59]:
# Aggregate the semi-cleaned route pairs into per-place totals:
# one table keyed by Source, one keyed by Destination, each summing 'Count'.
routes_df = pd.read_csv("./route-pairs-cleaned.csv", index_col=False)

# Every distinct place name, in order of first appearance when each row is
# read as (Source, Destination). ravel() on the two-column array walks it
# row by row, and pd.unique keeps first-seen order.
countries = pd.unique(
    routes_df[['Source', 'Destination']].to_numpy().ravel()
).tolist()

# sort=False keeps groups in first-appearance order rather than sorting the
# place names alphabetically.
source_df = routes_df.groupby('Source', sort=False, as_index=False)['Count'].sum()
destination_df = routes_df.groupby('Destination', sort=False, as_index=False)['Count'].sum()

# Plain-dict views of the same totals (place name -> summed count).
source_hashmap = dict(zip(source_df['Source'], source_df['Count']))
destination_hashmap = dict(zip(destination_df['Destination'], destination_df['Count']))

country_data = {
    'Source': countries
}

# One-off exports, left disabled. Uncomment to regenerate the CSV files.
# print(countries)

# country_df =pd.DataFrame(country_data)
# country_df.to_csv('./country_data.csv', index=False)
# source_df.to_csv('./source_routes_data.csv', index=False)
# destination_df.to_csv('./destination_routes_data.csv', index=False)


['Gold Coast', 'Bahia', 'Luanda', 'Rio de Janeiro', 'Africa', 'Havana', 'Barbados', 'Pernambuco', 'Jamaica', 'Benguela', 'Cabinda', 'Cuba', 'Kingston', 'Americas', 'Bonny', 'West Central Africa and St. Helena', 'Cap Francais', 'Saint John (Antigua)', 'St. Kitts', 'Whydah', 'Mozambique', 'Santiago de Cuba', 'Malembo', 'Anomabu', 'Bissau', 'Maranhao', 'Martinique', 'Cartagena', 'Gambia', 'Charleston', 'Veracruz', 'Quilimane', 'Portuguese Guinea', 'Suriname', 'Sao Tome', 'New Spain', 'Calabar', 'Spanish Americas', 'Cacheu', 'Ambriz', 'Lagos, Onim', 'Congo North', 'Cape Coast Castle', 'Cape Verde Islands', 'St. Thomas', 'Elmina', 'Dominica', 'Grenada', 'Port-au-Prince', 'Trinidad de Cuba', 'Buenos Aires', 'Ardra', 'Curacao', 'Rio de Janeiro, Sao Paulo, Santa Catarina', 'West Indies (colony unspecified)', 'Princes Island', 'Freetown', 'Matanzas', 'Windward Coast', 'Spanish Circum-Caribbean,unspecified', 'Sierra Leone estuary', 'Para', 'Montego Bay', 'Newcastle (Nevis)', 'Leogane', 'Porto No

In [69]:
# Read the pipe-delimited country lookup and show how many entries fall
# under each value of 'Modern Country'.
countries_df = pd.read_csv("./country_data.dsv", sep='|', index_col=False)
modern_country_counts = countries_df['Modern Country'].value_counts()
print(modern_country_counts)
# Disabled one-off conversion of the lookup to a comma-separated file:
# countries_df.to_csv('./country_data.csv', sep=',', index=False);
# NOTE: the 'Multiple' bucket might be an issue, but 'Uncertain' seems fine.

United States                                42
Brazil                                       39
Uncertain                                    34
Multiple                                     27
Cuba                                         25
                                             ..
Argentina & Uruguay                           1
Suriname, Guyana (possibly French Guiana)     1
Sint Maarten                                  1
Tanzania or Kenya (Mombasa)                   1
United Kingdom (England)                      1
Name: Modern Country, Length: 98, dtype: int64


In [104]:
# Roll the per-place totals (sources and destinations combined) up to one
# total per modern country and write the result to country_count_data.csv.
destination_df = pd.read_csv("./output_data/week4/destination_routes_data.csv", index_col=False)
source_df = pd.read_csv("./output_data/week4/source_routes_data.csv", index_col=False)
countries_df = pd.read_csv('./output_data/week4/country_data.csv', index_col=False)

# Place name -> modern country, built once instead of filtering countries_df
# for every row. setdefault keeps the first entry when a place is listed more
# than once, which is the row the earlier `.iloc[0]` lookup selected.
country_lookup = {}
for original, modern_country in zip(countries_df['Original'], countries_df['Modern Country']):
    country_lookup.setdefault(original, modern_country)

country_hashmap = {}
unmatched_places = []

# Sources are processed before destinations, as before; both feed one total.
for places_df, place_column in ((source_df, 'Source'), (destination_df, 'Destination')):
    for place, count in zip(places_df[place_column], places_df['Count']):
        if place not in country_lookup:
            # Previously an unmatched place silently added its count to the
            # country of the preceding row (or raised NameError on the very
            # first row). Skip it and report it instead.
            unmatched_places.append(place)
            continue
        country_name = country_lookup[place]
        country_hashmap[country_name] = country_hashmap.get(country_name, 0) + count

if unmatched_places:
    print(
        f"Skipped {len(unmatched_places)} place(s) with no entry in country_data.csv: "
        f"{sorted({str(place) for place in unmatched_places})}"
    )

# The country column keeps the header 'Source' so the CSV layout is unchanged.
country_data = {
    'Source': list(country_hashmap.keys()),
    'Count': list(country_hashmap.values()),
}

count_df = pd.DataFrame(country_data).sort_values(by=['Count'], ascending=False)
count_df.to_csv('./country_count_data.csv', index=False)


In [110]:
# Load the two CSV inputs for the next step with identical read options.
# NOTE(review): `countries_df` is rebound here to the contents of
# multiples.csv, replacing whatever an earlier cell stored under that name.
country_count_df, countries_df = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('./country_count_data@5.csv', './multiples.csv')
)



We've cleaned up the data for the choropleth, but now we need to do the same for the routes.