# Geographical Visualizations

### This script contains the following:
#### 1. Import data and libraries
#### 2. Data wrangling
#### 3. Data preparation
#### 4. Plotting a choropleth

### 1. Import data and libraries

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [6]:
# This magic command prompts matplotlib visuals to appear inline in the notebook

%matplotlib inline

In [7]:
# Establish the project data path.
# The folder can be overridden through the SPOTIFY_DATA_PATH environment variable so the
# notebook can run on another machine; when the variable is unset, the original local
# folder is used unchanged.

data_path = os.environ.get(
    'SPOTIFY_DATA_PATH',
    r'C:\Users\gerar\CareerFoundry Python\Spotify charts streaming analysis',
)

In [8]:
# Import data
# NOTE: unpickling can execute arbitrary code - only load pickle files you produced yourself.
df_spotify = pd.read_pickle(os.path.join(data_path, '02 Data', 'Prepared Data', 'spotify_clean_no_global.pkl'))

# Build the GeoJSON location from data_path instead of repeating the absolute project folder,
# so the project root is defined in exactly one place.
geojson_path = os.path.join(data_path, '02 Data', 'Original Data', 'world-countries.json')

# Load the GeoJSON data into a Python dictionary.
# The encoding is stated explicitly because JSON files are UTF-8, while the platform default
# used by open() (for example cp1252 on Windows) may differ.
with open(geojson_path, 'r', encoding='utf-8') as file:
    country_json = json.load(file)

# Column display: show up to 50 columns when rendering data frames
pd.options.display.max_columns = 50

# Show the first rows
df_spotify.head()

Unnamed: 0,uri,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,weeks_on_chart,streams,week,danceability,energy,key,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language
0,spotify:track:2gpQi3hbcUAcEG8m2dlgfB,Paulo Londra,1.0,Paulo Londra,spotify:artist:3vQ0GE3mI0dAaxIMYe5g7z,argentine hip hop,0,Plan A,1.0,1,4,3003411,2022-04-14,0.583,0.834,0.0,1.0,-4.875,0.0495,0.0,0.0658,0.557,173.935,178.0,Argentina,South America,Spanish
1,spotify:track:2x8oBuYaObjqHqgGuIUZ0b,WOS,1.0,WOS,spotify:artist:5YCc6xS5Gpj3EkaYGdjyNK,argentine indie,0,ARRANCARMELO,1.0,2,2,2512175,2022-04-14,0.654,0.354,5.0,1.0,-7.358,0.724,0.0,0.134,0.262,81.956,184.0,Argentina,South America,Spanish
2,spotify:track:2SJZdZ5DLtlRosJ2xHJJJa,Paulo Londra,1.0,Paulo Londra,spotify:artist:3vQ0GE3mI0dAaxIMYe5g7z,argentine hip hop,0,Chance,2.0,3,2,2408983,2022-04-14,0.721,0.463,1.0,0.0,-9.483,0.241,0.0,0.0929,0.216,137.915,204.0,Argentina,South America,Spanish
3,spotify:track:1O2pcBJGej0pmH2Y9XZMs6,Cris Mj,1.0,Cris Mj,spotify:artist:1Yj5Xey7kTwvZla8sqdsdE,urbano chileno,0,Una Noche en Medellín,1.0,5,8,2080139,2022-04-14,0.87,0.548,10.0,0.0,-5.253,0.0924,4.6e-05,0.0534,0.832,96.018,154.0,Argentina,South America,Spanish
4,spotify:track:1TpZKxGnHp37ohJRszTSiq,Emilia,1.0,Emilia,spotify:artist:0AqlFI0tz2DsEoJlKSIiT9,pop argentino,0,cuatro veinte,1.0,6,3,1923270,2022-04-14,0.761,0.696,7.0,0.0,-3.817,0.0811,6.3e-05,0.101,0.501,95.066,134.0,Argentina,South America,Spanish


### 2. Data wrangling 

In [10]:
# List the column names of the loaded data frame
df_spotify.columns

Index(['uri', 'artist_names', 'artists_num', 'artist_individual', 'artist_id',
       'artist_genre', 'collab', 'track_name', 'album_num_tracks', 'peak_rank',
       'weeks_on_chart', 'streams', 'week', 'danceability', 'energy', 'key',
       'mode', 'loudness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration', 'country', 'region', 'language'],
      dtype='object')

In [29]:
# Drop columns from main dataframe.
# The result is re-bound to df_spotify instead of mutating in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell no longer raises a KeyError.

columns_to_drop = ['artist_names', 'artists_num', 'artist_individual',
                   'artist_id', 'artist_genre', 'peak_rank', 'collab', 'track_name',
                   'album_num_tracks', 'streams', 'key', 'region', 'language']

df_spotify = df_spotify.drop(columns=columns_to_drop, errors='ignore')

In [31]:
# Check for null values: count the missing entries in every column
df_spotify.isnull().sum()

uri                 0
weeks_on_chart      0
week                0
danceability        0
energy              0
mode                0
loudness            0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration            0
country             0
dtype: int64

##### **Checking unusual values**

In [34]:
# Summary statistics per column, used to spot unusual values
df_spotify.describe()

Unnamed: 0,weeks_on_chart,week,danceability,energy,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration
count,1691599.0,1691599,1691599.0,1691599.0,1691599.0,1691599.0,1691599.0,1691599.0,1691599.0,1691599.0,1691599.0,1691599.0
mean,35.91727,2021-10-26 00:42:46.003881472,0.6979214,0.6563172,0.5509273,-6.193268,0.2543078,0.009625871,0.1694139,0.5546963,122.0544,211.5563
min,1.0,2021-02-04 00:00:00,0.0859,0.00316,0.0,-41.41,1.77e-06,0.0,0.0134,1e-05,31.262,31.0
25%,7.0,2021-06-17 00:00:00,0.612,0.557,0.0,-7.522,0.0558,0.0,0.0921,0.392,96.699,173.0
50%,20.0,2021-10-28 00:00:00,0.721,0.673,1.0,-5.865,0.182,0.0,0.118,0.561,119.947,202.0
75%,47.0,2022-03-10 00:00:00,0.797,0.775,1.0,-4.522,0.393,3.47e-05,0.209,0.73,141.066,234.0
max,290.0,2022-07-14 00:00:00,0.986,1.0,1.0,1.906,0.995,0.99,0.99,0.992,232.018,1787.0
std,43.1213,,0.1319994,0.1564129,0.4973998,2.393214,0.2377675,0.06841016,0.1265433,0.2224002,30.18891,58.68089


_From my experience and musicological knowledge there are only 2 values that stand out. **Max duration of "1.787030e+06"** and **max loudness of "1.906000"**_

In [None]:
# Show the rows whose duration is greater than 1,000 (songs with an unusual duration)

df_spotify.loc[df_spotify['duration'].gt(1.00e+03)]

In [None]:
# Show the rows whose loudness is 0 or higher (the comparison includes exactly 0)

df_spotify.loc[df_spotify['loudness'].ge(0)]


For the loudness, Spotify automatically corrects the loudness to 0 dB, but it is possible that artists submitted their songs with a loudness higher than 0 dB. 
I will leave these values in my analysis. 
All other columns look normal. 

232 bpm for the tempo as the maximum is not unusual. 30 bpm in the tempo belongs to a classical soundtrack score. Also possible. 

The song with the unusual duration is included in the Spotify database with 29 minutes and 49 seconds. https://open.spotify.com/track/1atrCmFrGEN5QvjoEWFRHo

### 3. Data preparation

A choropleth map could solve some of my previous ideas about music being mostly in major or minor depending on their country. This could be done by averaging all the songs from a country.

The same could be done with tempo, duration, energy, danceability, and many other values.

2 variations could be done: 

    -Getting that result by considering repeated entries (a song can last longer in the charts thus have more entries) 

    -Or using unique values per song per country. Songs may be repeated but only if they are from different country charts. 

I created both dataframes just in case. 

### Creating copies

**df_spotify_uniques** is made so I can eliminate the duplicates created by the weekly chart entries. Therefore I removed the date column (`week`), so that only one row per combination of song and country is possible.

**df_spotify_repeated** has songs that stayed in the charts. Repeated entries are allowed. 

In [None]:
# List the columns available for building the two working copies
df_spotify.columns

In [None]:
# Create a working copy for each case.
# "repeated" keeps every weekly chart row together with its 'week' column.
df_spotify_repeated = df_spotify.loc[:, [
    'uri', 'country', 'week',
    'danceability', 'energy', 'mode', 'loudness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration',
]].copy()

# "uniques" holds the same columns in the same order, minus 'week'
# (drop returns a new frame, so the two copies stay independent).
df_spotify_uniques = df_spotify_repeated.drop(columns='week')



In [None]:
# Preview the first three rows of the "uniques" copy
df_spotify_uniques.head(3)

In [None]:
# Preview the first three rows of the "repeated" copy
df_spotify_repeated.head(3)

### **"Uniques" Dataset**

In [None]:
# Remove fully identical rows, keeping the first occurrence of each
df_spotify_uniques = df_spotify_uniques.drop_duplicates()

In [None]:
# Spot-check "uniqueness": list every remaining row for one sample track
df_spotify_uniques.loc[df_spotify_uniques['uri'].eq('spotify:track:2x8oBuYaObjqHqgGuIUZ0b')]

##### **Creating Averages**

In [None]:
# Add one "average_<feature>" column per audio feature (danceability ... duration).
# transform('mean') broadcasts each country's mean back onto every row of that country.
for feature in ['danceability', 'energy', 'mode', 'loudness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration']:
    df_spotify_uniques[f'average_{feature}'] = (
        df_spotify_uniques.groupby(['country'])[feature].transform('mean')
    )


# Sample to check values

df_spotify_uniques.sample(n=10)

### **"Repeated" Dataset**

##### **Creating averages** 

In [None]:
# Add one "average_<feature>" column per audio feature (danceability ... duration).
# Weekly repeats are still present here, so each chart entry contributes to its country's mean.
averaged_features = ('danceability', 'energy', 'mode', 'loudness', 'acousticness',
                     'instrumentalness', 'liveness', 'valence', 'tempo', 'duration')

for feature_name in averaged_features:
    country_mean = df_spotify_repeated.groupby(['country'])[feature_name].transform('mean')
    df_spotify_repeated['average_' + feature_name] = country_mean


# Sample to check values

df_spotify_repeated.sample(n=10)

### 4. Plotting a choropleth

In [None]:
# Create a data frame with just the countries and the averaged values we want plotted.
# Each average_* column repeats the same per-country mean on every song row, so one row per
# country carries all the information the choropleth needs; the duplicates are dropped to keep
# the plotting input small and unambiguous (one value per key).

data_to_plot = (
    df_spotify_uniques[['country', 'average_tempo', 'average_mode', 'average_danceability',
                        'average_duration', 'average_energy']]
    .drop_duplicates(subset='country')
    .reset_index(drop=True)
)

# Sample (capped at the number of countries so the call cannot fail on a small data set)
data_to_plot.sample(n=min(5, len(data_to_plot)))

##### **Filtering the GeoJSON to only display the countries in my dataset**

In [None]:
# Collect the distinct country names present in the DataFrame
country_list = set(df_spotify_uniques['country'])

# Keep only the GeoJSON features whose 'name' property is one of those countries
filtered_features = list(filter(
    lambda geo_feature: geo_feature['properties']['name'] in country_list,
    country_json['features'],
))


In [None]:
# Wrap the filtered features in a new GeoJSON FeatureCollection object
filtered_geojson = dict(type='FeatureCollection', features=filtered_features)

### **Tempo Choropleth**

In [None]:
# Base folium map centred on [0, 0] at a high-level (world) zoom
tempo_map = folium.Map(location=[0, 0], zoom_start=2.4)

# The choropleth binds the data frame to the GeoJSON geometries: each country is coloured
# by its 'average_tempo' value.
tempo_layer = folium.Choropleth(
    geo_data=filtered_geojson,
    data=data_to_plot,
    columns=['country', 'average_tempo'],
    key_on='feature.properties.name',  # where the matching key sits inside each GeoJSON feature
    fill_color='YlOrRd',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Tempo: Red means faster music",
)
tempo_layer.add_to(tempo_map)
folium.LayerControl().add_to(tempo_map)


tempo_map

The above map shows how the average tempo of charts is different depending on what country we are looking at. 

Audiences all over the world prefer different types of content.

**High tempo**

Brazil seems to have a stronger preference for faster tempos in the music that reaches the charts. 

It is followed by France, Poland, Bulgaria, Romania and Japan. 


**Low tempo** 

India, Pakistan, Nigeria, South Africa and Sweden have a preference for songs with slower tempos. 

### **Mode Choropleth (major vs. minor)**

In [None]:
# Base folium map centred on [0, 0] at a high-level (world) zoom
mode_map = folium.Map(location=[0, 0], zoom_start=2.4)

# Choropleth settings: colour each country by its 'average_mode' value
mode_choropleth_options = {
    'geo_data': filtered_geojson,
    'data': data_to_plot,
    'columns': ['country', 'average_mode'],
    # where the matching key sits inside each GeoJSON feature
    'key_on': 'feature.properties.name',
    'fill_color': 'PRGn',
    'fill_opacity': 0.6,
    'line_opacity': 0.1,
    'legend_name': "Mode: Purple means minor key, Green is major key",
}
folium.Choropleth(**mode_choropleth_options).add_to(mode_map)
folium.LayerControl().add_to(mode_map)

mode_map

In this dataset, a value of 1 represents that a song is in the major key. A value of 0 means that the song is in the minor key.

By averaging these values for each country we can see how much each country is "pulled" towards a certain mode. 

Minor is represented by purple which is usually also a melancholic sad color. 

Major is represented by green which is an optimistic and "happier" color. 

One can notice that the center of the map is mostly minor, while the "outer" countries are major. 

**Minor**

The clearest preferences for the minor mode are in France and Turkey, followed by Egypt and Morocco. 

**Major**

The clearest preferences for the major mode are in Thailand, Indonesia, the Philippines and Japan, followed by Australia, New Zealand, Canada, Vietnam, and Ireland.

### Conclusion

This type of geographical analysis has given several insights on different preferences for spotify charts.

One should take into account that these maps may reflect the general trend of music in each country, and not necessarily that songs with such characteristics will reach the charts.

Further analysis is needed with different variables and maybe use the repeated values since this analysis only considered the unique values. 

### Duration

In [None]:
# Base folium map centred on [0, 0] at a high-level (world) zoom
duration_map = folium.Map(location=[0, 0], zoom_start=2.4)

# Colour each country by its 'average_duration' value.
# NOTE(review): the describe() output earlier in this notebook shows durations around 200,
# which looks like seconds rather than milliseconds - confirm the unit named in the legend.
duration_layer = folium.Choropleth(
    geo_data=filtered_geojson,
    data=data_to_plot,
    columns=['country', 'average_duration'],
    key_on='feature.properties.name',  # where the matching key sits inside each GeoJSON feature
    fill_color='PRGn',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="Duration in ms",
)
duration_layer.add_to(duration_map)
folium.LayerControl().add_to(duration_map)

duration_map

### Danceability

In [None]:
# Setup a folium map at a high-level zoom
danceability_map = folium.Map(location = [0, 0], zoom_start = 2.4)

# Choropleth maps bind Pandas Data Frames and json geometries.This allows us to quickly visualize data combinations
folium.Choropleth(
    geo_data = filtered_geojson, 
    data = data_to_plot,
    columns = ['country', 'average_danceability'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'PRGn', fill_opacity=0.6, line_opacity=0.1,
    legend_name = "Duration in ms").add_to(danceability_map)
folium.LayerControl().add_to(duration_map)

danceability_map