In [1]:
import pandas as pd
import scipy
import numpy as np

# Data

In [222]:
#World Hapiness Data
wh_data = pd.read_csv("https://raw.githubusercontent.com/joannarashid/cse6242_proj/main/WH_data_2019.csv")

#Women's Prosperity Index
wps_data = pd.read_csv("https://raw.githubusercontent.com/joannarashid/cse6242_proj/main/WPS-Index-2021-Data.csv", 
                       encoding = 'unicode_escape')

#Tropical Climate Data
tropical_data = pd.read_csv("https://raw.githubusercontent.com/joannarashid/cse6242_proj/main/tropical_countries.csv")

#climate data
climate_data = pd.read_csv('https://raw.githubusercontent.com/joannarashid/cse6242_proj/main/climate_zones.csv')

#LGBTQ Safety and Welfare Data
lgbtq_data = pd.read_csv("https://raw.githubusercontent.com/joannarashid/cse6242_proj/main/LGBTQ_Safety_Index.csv")

#Industry Sector Data
sector_data = pd.read_csv("https://raw.githubusercontent.com/joannarashid/cse6242_proj/main/country_sectors.csv")

In [223]:
wh_data.shape

(155, 9)

In [224]:
wps_data.shape

(197, 14)

In [225]:
climate_data.shape

(245, 4)

In [226]:
lgbtq_data.shape

(204, 28)

In [227]:
sector_data.shape

(221, 8)

In [228]:
wh_data.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


## Clean Data

In [229]:
#Some column renaming to prepare for joins

#Clean World Happiness Data (preserving all vars)
wh_data.rename(columns = {'Overall rank': 'WH Rank', 
                          'Country or region': 'Country',
                          'Score': 'WH Score'}, inplace = True)

#Clean Women's Prosperity Index Data
wps_data.rename(columns = {'ï»¿WPS Index rank': 'WPS Rank', 
                           'WPS Index score': 'WPS Score', 
                           'Country':'WPS Country'}, inplace = True)
wps_data = wps_data.iloc[:,0:3] #select vars

#Clean Tropical climate data
climate_data.rename(columns = {'Country':'Climate Country'}, inplace = True)
climate_data['Climate Country'] = climate_data['Climate Country'].str.strip() #removing leading spaces

#Clean LGBTQ data
lgbtq_data.rename(columns = {'Rank': 'LGBTQ Rank', 
                             "Score \n(worst is -100)\n(best is 0)": 'LGBTQ Score',
                             'Grade': 'LGBTQ Grade',
                             'Country': 'LGBTQ Country'}, inplace = True)

lgbtq_data = lgbtq_data[['LGBTQ Rank', 'LGBTQ Country', 'LGBTQ Score', 'LGBTQ Grade']]

#Clean sector data
sector_data.rename(columns = {'Country': 'Sector Country'}, inplace = True)
sector_data['Sector Country'] = sector_data['Sector Country'].str.strip() #removing leading spaces


## Joins

In [230]:
#since the World Happiness Index is the objectove value for this application, 
#wh_data is the left df on whihc the df is started which ensures all countries in the WHI are included
#subsequent joins eliminate observations that are not in the WHI

#merge World Happiness df with Women's Prosperity df
df = wh_data.merge(wps_data,
                   how = 'left', 
                   left_on = "Country",
                   right_on = "WPS Country")

#merge LGBTQ data with main df
df = df.merge(lgbtq_data,
              how = 'left',
              left_on = 'Country',
              right_on = 'LGBTQ Country')

#merge climate type data with main df
df = df.merge(climate_data,
              how = 'left',
              left_on = 'Country',
              right_on = 'Climate Country')

#merge economic sector data data with main df
df = df.merge(sector_data,
              how = 'left',
              left_on = 'Country',
              right_on = 'Sector Country')


In [231]:
df.head()

Unnamed: 0,WH Rank,Country,WH Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,WPS Rank,...,Avg_temp_F,Avg_temp_C,Sector Country,Total GDP in mil USD,Agricultural percent,Industrial percent,Service percent,Agricultural in mil USD,Industrial in mil USD,Service in mil USD
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,2.0,...,42.15,5.64,Finland,244900,2.70%,28.20%,69.10%,6612,69062,169226
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,4.0,...,49.8,9.89,Denmark,287800,1.30%,22.90%,75.80%,3741,65906,218152
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,1.0,...,41.05,5.03,Norway,381200,2.30%,33.70%,64.00%,8768,128464,243968
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,3.0,...,37.71,3.17,Iceland,18180,5.80%,19.70%,74.60%,1054,3581,13562
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,10.0,...,52.86,11.59,Netherlands,924400,1.60%,17.90%,70.20%,14790,165468,648929


## Transform Values

In [232]:
#assign int values to LGBTQ letter grades
grades = list(df['LGBTQ Grade'].unique())
grades = [grade for grade in grades if type(grade) == str] #only letter grades
values = sorted(list(range(1,len(grades))), reverse=True) # list of integers in reverse
scores = dict(zip(grades, values))
df['LGBTQ Score'] = df['LGBTQ Grade'].apply(lambda x: scores.get(x)) #new column with inter values for grades


In [233]:
#additing climate infomation detail
climate_codes = df['Climate zone'].unique()

climate_zones = {
                "DFC":["Subartic, severe winter, no dry season, cool summer","Cold"],
                 "CFB":["Marine west coast, warm summer","Temperate"],
                 "ET":["Tundra","Cold"],
                 "DFB":["Humid continental, no dry season, warm summer", "Cold"],
                 "BWH":["Subtropical desert","Arid"],
                 "BSH":["Subtropical steppe","Arid"],
                 "CFA":["Humid subtropical, no dry season","Temperate"],
                 "CSA":["Mediterranean, hot summer","Temperate"],
                 "BSK":["Mid-latitude steppe","Arid"],
                 "CWB":["Temperate highland tropical climate with dry winters","Temperate"],
                 "CSB":["Mediterranean, warm summer","Temperate"],
                 "AM":["Tropical monsoon","Tropical"],
                 "AW":["Tropical wet and dry or savanna","Tropical"],
                 "AF":["Tropical rainforest","Tropical"],
                 "BWK":["Mid-latitude desert", "Arid"],
                 "DWB":["Humid continental, severe dry winter, warm summer","Cold"],
                 "DSC":["Humid continental, dry warm summer","Cold"],
                 "CWA":["Humid subtropical, dry winter","Temperate"],
                 "DSB":["Humid continental, dry warm summer", "Cold"],
                 "DWA":["Humid continental, severe dry winter, hot summer", "Cold"],
                 "DWC":["Subartic, dry winter, cool summer", "Cold"]
                 }

def map_climate_zones(zone):
    """
    Maps descriptions found in climate_zones dict to climate code in df per dict
    """
    if zone in climate_zones:
        return climate_zones[zone]
    else:
        return ["", ""]

# apply the mapping function to the climate zone column and create two new columns for descriptions
df[["Climate description", "Climate type"]] = df["Climate zone"].apply(map_climate_zones).tolist()

In [234]:
#convert sector data to decimal
df['Agricultural percent'] = df['Agricultural percent'].str.rstrip('%').astype('float') / 100.0
df['Industrial percent'] = df['Industrial percent'].str.rstrip('%').astype('float') / 100.0
df['Service percent'] = df['Service percent'].str.rstrip('%').astype('float') / 100.0

#add dominant sector
df["dom_sector"] = df[['Agricultural percent','Industrial percent','Service percent']].idxmax(axis=1)
df["dom_sector"] = df["dom_sector"].str.replace(' percent', '')

In [235]:
#Normalizing data
df['LGBTQ_norm'] = (df['LGBTQ Score'] - df['LGBTQ Score'].min()) / (df['LGBTQ Score'].max() - df['LGBTQ Score'].min())
df['WPS_norm'] = (df['WPS Score'] - df['WPS Score'].min()) / (df['WPS Score'].max() - df['WPS Score'].min())
df['Freedom_norm'] = (df['Freedom to make life choices'] - df['Freedom to make life choices'].min()) / (df['Freedom to make life choices'].max() - df['Freedom to make life choices'].min())
df['GDP_norm'] = (df['GDP per capita'] - df['GDP per capita'].min()) / (df['GDP per capita'].max() - df['GDP per capita'].min())

In [236]:
df.head()

Unnamed: 0,WH Rank,Country,WH Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,WPS Rank,...,Agricultural in mil USD,Industrial in mil USD,Service in mil USD,Climate description,Climate type,dom_sector,LGBTQ_norm,WPS_norm,Freedom_norm,GDP_norm
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,2.0,...,6612,69062,169226,"Subartic, severe winter, no dry season, cool s...",Cold,Service,1.0,0.979814,0.944532,0.795724
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,4.0,...,3741,65906,218152,"Marine west coast, warm summer",Temperate,Service,1.0,0.970497,0.938193,0.821259
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,1.0,...,8768,128464,243968,"Subartic, severe winter, no dry season, cool s...",Cold,Service,0.9,1.0,0.955626,0.88361
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,3.0,...,1054,3581,13562,Tundra,Cold,Service,1.0,0.976708,0.936609,0.819477
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,10.0,...,14790,165468,648929,"Marine west coast, warm summer",Temperate,Service,0.9,0.942547,0.882726,0.828979


# Weighted Sum Optimization Function

In [237]:
def optimize(df, user_profile, n=5):
    """
    Uses values from user_profile to filter df on users prefered climate and dominant economic sector.
    Then uses uses ranked vars in user_profile to calculate weights for each var
    Column is added to df with weighted value for each var
    """
    # Filter for climate and sector

    df = df[(df['Climate type'] == user_profile["climate"]) & (df['dom_sector'] == user_profile["sector"])].copy()  # Make a copy of the filtered DataFrame
    
    # Normalize the ranks so that they sum up to 1
    rank_sum = user_profile["LGBTQ_rank"] +\
                user_profile["WPSI_rank"] + \
                user_profile["freedom_rank"] + \
                user_profile["GDP_rank"]
    LGBTQ_weight = user_profile["LGBTQ_rank"] / rank_sum
    WPS_weight = user_profile["WPSI_rank"] / rank_sum
    freedom_weight = user_profile["freedom_rank"] / rank_sum
    GDP_weight = user_profile["GDP_rank"] / rank_sum
    
    # Create a new column in the dataframe that combines the weights with the corresponding variables
    df.loc[:, 'weighted_sum'] = (LGBTQ_weight * df['LGBTQ_norm']) + \
                                (WPS_weight * df['WPS_norm']) + \
                                (freedom_weight * df['Freedom_norm']) + \
                                (GDP_weight * df['GDP_norm'])
    
    # Find the top n rows with the highest weighted sums
    sorted_df = df.sort_values(by='weighted_sum', ascending=False).reset_index(drop=True)
    n_best = sorted_df.loc[:n-1, 'Country'].tolist()  # Use .loc to slice and get a view of the original data
    
    # Return a list of the 'City' values of the top n rows
    return n_best
    #return sorted_df


In [244]:
#user query

user_profile = {
    'sector' : input("What economic sector is predominat in you ideal country? ('Agricultural', 'Service', 'Industrial'): "),
    'climate' : input("What climate do you prefer? ('Cold', 'Temperate', or 'Tropical'): "),
    'LGBTQ_rank' : int(input("Rank the importance of LGBTQ equality from 1 to 4: ")),
    'WPSI_rank' : int(input("Rank the importance of status of women from 1 to 4: ")),
    'freedom_rank' : int(input("Rank the importance of personal freedom from 1 to 4: ")),
    'GDP_rank' : int(input("Rank the importance of the strength of the economy from 1 to 4: "))
    }


What economic sector is predominat in you ideal country? ('Agricultural', 'Service', 'Industrial'): Service
What climate do you prefer? ('Cold', 'Temperate', or 'Tropical'): Temperate
Rank the importance of LGBTQ equality from 1 to 4: 1
Rank the importance of status of women from 1 to 4: 2
Rank the importance of personal freedom from 1 to 4: 3
Rank the importance of the strength of the economy from 1 to 4: 4


In [245]:
#results
best = optimize(df, user_profile, n=5)
best

['Luxembourg', 'Denmark', 'Ireland', 'Netherlands', 'New Zealand']

## Additional Test Cases

user1 = {
    'sector' : "Service",
    'climate' : "Cold",
    'LGBTQ_rank' : 1,
    'WPSI_rank' : 2,
    'freedom_rank' : 3,
    'GDP_rank' : 4,
    }

user2 = {
    'sector' : "Industrial",
    'climate' : "Temperate",
    'LGBTQ_rank' : 4 ,
    'WPSI_rank' : 3,
    'freedom_rank' : 2,
    'GDP_rank' : 1
    }

user3 = {
    'sector' : "Service",
    'climate' : "Temperate",
    'LGBTQ_rank' : 1,
    'WPSI_rank' : 2,
    'freedom_rank' : 3,
    'GDP_rank' : 4
    }

user4 = {
    'sector' : "Agricultural",
    'climate' : "Tropical",
    'LGBTQ_rank' : 3,
    'WPSI_rank' : 1,
    'freedom_rank' : 2,
    'GDP_rank' : 4
    }

In [249]:
user1_best = optimize(df, user1, n=5)
print("User1 5 Best places: ", user1_best)

user2_best = optimize(df, user2, n=5)
print("User2 5 Best places: ", user2_best)

user3_best = optimize(df, user3, n=5)
print("User3 5 Best places: ", user3_best)

user4_best = optimize(df, user4, n=5)
print("User1 5 Best places: ", user4_best)

User1 5 Best places:  ['Norway', 'Switzerland', 'Iceland', 'Finland', 'Sweden']
User2 5 Best places:  []
User3 5 Best places:  ['Luxembourg', 'Denmark', 'Ireland', 'Netherlands', 'New Zealand']
User1 5 Best places:  ['Sierra Leone', 'Comoros', 'Central African Republic']


Notes: There are still some holes when the solution space for a particular profile yeilds 0 results.
It is TBD what to do in these corner cases.