1. IMPORTS

In [2]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

2. DATA

In [3]:
path = 'data/data.csv'

In [4]:
# creating dataframe
df = pd.read_csv(path)

In [5]:
df.head()

Unnamed: 0,Unnamed: 1,Festival_Name,Location,Attendance_Numbers,Visitor_Demographics,Economic_Impact,Music_Genre
Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various
Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM
Sziget Festival,Budapest,Hungary,500000,18-40,International Attendees,€130 million,Various
Rock am Ring,Nürburg,Germany,90000,20-40,Rock Fans,€80 million,Rock
Roskilde Festival,Roskilde,Denmark,130000,18-40,Alternative Music Lovers,DKK 70 million,Alternative


In [6]:
df.columns

Index(['Festival_Name', 'Location', 'Attendance_Numbers',
       'Visitor_Demographics', 'Economic_Impact', 'Music_Genre'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 157 entries, ('Glastonbury Festival', 'Somerset') to ('Festival Internacional de Jazz', 'Mexico City')
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Festival_Name         157 non-null    object
 1   Location              157 non-null    int64 
 2   Attendance_Numbers    157 non-null    object
 3   Visitor_Demographics  157 non-null    object
 4   Economic_Impact       157 non-null    object
 5   Music_Genre           157 non-null    object
dtypes: int64(1), object(5)
memory usage: 12.9+ KB


3. DATA CLEANING

In [8]:
# splitting 'Festival_Name' column into 3 columns
# columns will be 'Festival_Name', 'Location', and 'Country'

df.head(1)

Unnamed: 0,Unnamed: 1,Festival_Name,Location,Attendance_Numbers,Visitor_Demographics,Economic_Impact,Music_Genre
Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various


In [9]:
# first reset the dataframe
df.reset_index(inplace=True)
df

Unnamed: 0,level_0,level_1,Festival_Name,Location,Attendance_Numbers,Visitor_Demographics,Economic_Impact,Music_Genre
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM
2,Sziget Festival,Budapest,Hungary,500000,18-40,International Attendees,€130 million,Various
3,Rock am Ring,Nürburg,Germany,90000,20-40,Rock Fans,€80 million,Rock
4,Roskilde Festival,Roskilde,Denmark,130000,18-40,Alternative Music Lovers,DKK 70 million,Alternative
...,...,...,...,...,...,...,...,...
152,Electric Picnic,Stradbally,Ireland,50000,18-40,Indie and Electronic Fans,€30 million,Indie/Electronic
153,Blue Balls Festival,Lucerne,Switzerland,35000,25-60,Jazz and Blues Fans,CHF 15 million,Jazz/Blues
154,Lollapalooza Buenos Aires,Buenos Aires,Argentina,60000,18-35,Rock and Electronic Fans,ARS 20 million,Rock/Electronic
155,Valencia Music Festival,Valencia,Spain,35000,18-35,Electronic and Rock Fans,€15 million,Electronic/Rock


In [10]:
# then replace the columns
# df.columns = ['No.','Festival_Name', 'Location','Country', 'Attendance_Numbers','Age_Range','Visitor_Demographics', 'Economic_Impact', 'Music_Genre']

In [11]:
df.head(2)

Unnamed: 0,level_0,level_1,Festival_Name,Location,Attendance_Numbers,Visitor_Demographics,Economic_Impact,Music_Genre
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM


In [12]:
df.to_csv('Outputs/festivals_step_1.csv')

next the dataset was geocoded online to add lat and lon columns for the specific locations
- https://www.geoapify.com/tools/geocoding-online/ 


In [13]:
# new data
path_2 = 'data/festivals_step_1_geocoded.csv'
df = pd.read_csv(path_2)
df.columns

Index(['original_', 'original_No.', 'original_Festival_Name',
       'original_Location', 'original_Country', 'original_Attendance_Numbers',
       'original_Age_Range', 'original_Visitor_Demographics',
       'original_Economic_Impact', 'original_Music_Genre', 'lat', 'lon',
       'formatted', 'district', 'street', 'postcode', 'city', 'county',
       'county_code', 'name', 'state', 'state_code', 'country',
       'confidence_city_level', 'country_code', 'confidence', 'attribution',
       'attribution_license', 'attribution_url'],
      dtype='object')

In [14]:
# dropping all columns that are not useful
df.drop(columns = ['original_','original_No.','formatted', 'district', 'street', 'postcode', 'city', 'county',
       'county_code', 'name', 'state', 'state_code', 'country',
       'confidence_city_level', 'country_code', 'confidence', 'attribution',
       'attribution_license', 'attribution_url'])

Unnamed: 0,original_Festival_Name,original_Location,original_Country,original_Attendance_Numbers,original_Age_Range,original_Visitor_Demographics,original_Economic_Impact,original_Music_Genre,lat,lon
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various,51.161751,-3.075387
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM,51.087379,4.366722
2,Sziget Festival,Budapest,Hungary,500000,18-40,International Attendees,€130 million,Various,47.497879,19.040238
3,Rock am Ring,Nürburg,Germany,90000,20-40,Rock Fans,€80 million,Rock,50.341339,6.951923
4,Roskilde Festival,Roskilde,Denmark,130000,18-40,Alternative Music Lovers,DKK 70 million,Alternative,55.643348,12.081925
...,...,...,...,...,...,...,...,...,...,...
152,Electric Picnic,Stradbally,Ireland,50000,18-40,Indie and Electronic Fans,€30 million,Indie/Electronic,52.130781,-7.461683
153,Blue Balls Festival,Lucerne,Switzerland,35000,25-60,Jazz and Blues Fans,CHF 15 million,Jazz/Blues,47.050545,8.305468
154,Lollapalooza Buenos Aires,Buenos Aires,Argentina,60000,18-35,Rock and Electronic Fans,ARS 20 million,Rock/Electronic,-34.603718,-58.381530
155,Valencia Music Festival,Valencia,Spain,35000,18-35,Electronic and Rock Fans,€15 million,Electronic/Rock,39.469707,-0.376335


In [15]:
df.head(2)

Unnamed: 0,original_,original_No.,original_Festival_Name,original_Location,original_Country,original_Attendance_Numbers,original_Age_Range,original_Visitor_Demographics,original_Economic_Impact,original_Music_Genre,...,name,state,state_code,country,confidence_city_level,country_code,confidence,attribution,attribution_license,attribution_url
0,0,0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various,...,Somerset,England,ENG,United Kingdom,,gb,1,© OpenStreetMap contributors,Open Database License,https://www.openstreetmap.org/copyright
1,1,1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM,...,,Antwerp,,Belgium,1.0,be,1,© OpenStreetMap contributors,Open Database License,https://www.openstreetmap.org/copyright


In [16]:
# Renaming the columns because they had different column names
df.reset_index(inplace=True)
df.head(2)

Unnamed: 0,index,original_,original_No.,original_Festival_Name,original_Location,original_Country,original_Attendance_Numbers,original_Age_Range,original_Visitor_Demographics,original_Economic_Impact,...,name,state,state_code,country,confidence_city_level,country_code,confidence,attribution,attribution_license,attribution_url
0,0,0,0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,...,Somerset,England,ENG,United Kingdom,,gb,1,© OpenStreetMap contributors,Open Database License,https://www.openstreetmap.org/copyright
1,1,1,1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,...,,Antwerp,,Belgium,1.0,be,1,© OpenStreetMap contributors,Open Database License,https://www.openstreetmap.org/copyright


In [17]:
df.columns

Index(['index', 'original_', 'original_No.', 'original_Festival_Name',
       'original_Location', 'original_Country', 'original_Attendance_Numbers',
       'original_Age_Range', 'original_Visitor_Demographics',
       'original_Economic_Impact', 'original_Music_Genre', 'lat', 'lon',
       'formatted', 'district', 'street', 'postcode', 'city', 'county',
       'county_code', 'name', 'state', 'state_code', 'country',
       'confidence_city_level', 'country_code', 'confidence', 'attribution',
       'attribution_license', 'attribution_url'],
      dtype='object')

In [18]:
df = df.drop(columns= ['index', 'original_', 'original_No.','formatted', 'district', 'street', 'postcode', 'city', 'county',
       'county_code', 'name', 'state', 'state_code', 'country',
       'confidence_city_level', 'country_code', 'confidence', 'attribution',
       'attribution_license', 'attribution_url'])

In [19]:
# renaming because columns had the word had original in them
df.columns = ['Festival_Name', 'Location','Country', 'Attendance_Numbers','Age_Range','Visitor_Demographics', 'Economic_Impact', 'Music_Genre','Latitude', 'Longitude']

In [20]:
df 

Unnamed: 0,Festival_Name,Location,Country,Attendance_Numbers,Age_Range,Visitor_Demographics,Economic_Impact,Music_Genre,Latitude,Longitude
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various,51.161751,-3.075387
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM,51.087379,4.366722
2,Sziget Festival,Budapest,Hungary,500000,18-40,International Attendees,€130 million,Various,47.497879,19.040238
3,Rock am Ring,Nürburg,Germany,90000,20-40,Rock Fans,€80 million,Rock,50.341339,6.951923
4,Roskilde Festival,Roskilde,Denmark,130000,18-40,Alternative Music Lovers,DKK 70 million,Alternative,55.643348,12.081925
...,...,...,...,...,...,...,...,...,...,...
152,Electric Picnic,Stradbally,Ireland,50000,18-40,Indie and Electronic Fans,€30 million,Indie/Electronic,52.130781,-7.461683
153,Blue Balls Festival,Lucerne,Switzerland,35000,25-60,Jazz and Blues Fans,CHF 15 million,Jazz/Blues,47.050545,8.305468
154,Lollapalooza Buenos Aires,Buenos Aires,Argentina,60000,18-35,Rock and Electronic Fans,ARS 20 million,Rock/Electronic,-34.603718,-58.381530
155,Valencia Music Festival,Valencia,Spain,35000,18-35,Electronic and Rock Fans,€15 million,Electronic/Rock,39.469707,-0.376335


In [21]:
df.head()

Unnamed: 0,Festival_Name,Location,Country,Attendance_Numbers,Age_Range,Visitor_Demographics,Economic_Impact,Music_Genre,Latitude,Longitude
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various,51.161751,-3.075387
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM,51.087379,4.366722
2,Sziget Festival,Budapest,Hungary,500000,18-40,International Attendees,€130 million,Various,47.497879,19.040238
3,Rock am Ring,Nürburg,Germany,90000,20-40,Rock Fans,€80 million,Rock,50.341339,6.951923
4,Roskilde Festival,Roskilde,Denmark,130000,18-40,Alternative Music Lovers,DKK 70 million,Alternative,55.643348,12.081925


In [22]:
df.dtypes

Festival_Name            object
Location                 object
Country                  object
Attendance_Numbers        int64
Age_Range                object
Visitor_Demographics     object
Economic_Impact          object
Music_Genre              object
Latitude                float64
Longitude               float64
dtype: object

In [23]:
# adding geometry column to the dataset
geometry = gpd.points_from_xy(df['Longitude'], df['Latitude'])
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs= 'epsg: 4326')

In [24]:
gdf.head()

Unnamed: 0,Festival_Name,Location,Country,Attendance_Numbers,Age_Range,Visitor_Demographics,Economic_Impact,Music_Genre,Latitude,Longitude,geometry
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,£100 million,Various,51.161751,-3.075387,POINT (-3.07539 51.16175)
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,€150 million,EDM,51.087379,4.366722,POINT (4.36672 51.08738)
2,Sziget Festival,Budapest,Hungary,500000,18-40,International Attendees,€130 million,Various,47.497879,19.040238,POINT (19.04024 47.49788)
3,Rock am Ring,Nürburg,Germany,90000,20-40,Rock Fans,€80 million,Rock,50.341339,6.951923,POINT (6.95192 50.34134)
4,Roskilde Festival,Roskilde,Denmark,130000,18-40,Alternative Music Lovers,DKK 70 million,Alternative,55.643348,12.081925,POINT (12.08192 55.64335)


In [25]:
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World: Afghanistan, Albania, Algeria, American Samoa, Andorra, Angola, Anguilla, Antarctica, Antigua and Barbuda, Argentina, Armenia, Aruba, Australia, Austria, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belgium, Belgium, Belize, Benin, Bermuda, Bhutan, Bolivia, Bonaire, Saint Eustasius and Saba, Bosnia and Herzegovina, Botswana, Bouvet Island, Brazil, British Indian Ocean Territory, British Virgin Islands, Brunei Darussalam, Bulgaria, Burkina Faso, Burundi, Cambodia, Cameroon, Canada, Cape Verde, Cayman Islands, Central African Republic, Chad, Chile, China, Christmas Island, Cocos (Keeling) Islands, Comoros, Congo, Cook Islands, Costa Rica, Côte d'Ivoire (Ivory Coast), Croatia, Cuba, Curacao, Cyprus, Czechia, Denmark, Djibouti, Dominica, Dominican Republic, East Timor, Ecuador, Egypt, El Salvador, Equatoria

In [26]:
gdf.columns

Index(['Festival_Name', 'Location', 'Country', 'Attendance_Numbers',
       'Age_Range', 'Visitor_Demographics', 'Economic_Impact', 'Music_Genre',
       'Latitude', 'Longitude', 'geometry'],
      dtype='object')

In [27]:
gdf['Economic_Impact']

0        £100 million
1        €150 million
2        €130 million
3         €80 million
4      DKK 70 million
            ...      
152       €30 million
153    CHF 15 million
154    ARS 20 million
155       €15 million
156    MXN 20 million
Name: Economic_Impact, Length: 157, dtype: object

In [28]:
convert_clp = 0.0011
convert_jpy = 0.0065
convert_nok = 0.092
convert_cad = 0.72
convert_pln = 0.25

#lets convert all the Economic Impact into USD
def convert_to_usd(value):
    if '€' in value:
        numeric_value = float(value.replace('€', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 1.1 * 1e6)  # Converting million Euros to USD
    if '£' in value:
        numeric_value = float(value.replace('£', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 1.29 * 1e6)  # Converting million Pounds to USD
    if 'USD' in value:
        numeric_value = float(value.replace('USD', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 1 * 1e6)  # Converting million USD to USD
    if 'MXN' in value:
        numeric_value = float(value.replace('MXN', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * .054 * 1e6)  # Converting million MXN to USD
    if 'AU$' in value:
        numeric_value = float(value.replace('AU$', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 0.65535 * 1e6)  # Converting million AU$ to USD
    if 'ARS' in value:
        numeric_value = float(value.replace('ARS', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 0.0011 * 1e6)  # Converting million ARS to USD
    if 'CHF' in value:
        numeric_value = float(value.replace('CHF', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 1.13 * 1e6)  # Converting million CHF to USD
    if 'DKK' in value:
        numeric_value = float(value.replace('DKK', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 0.15 * 1e6)  # Converting million DKK to USD
    if 'SEK' in value:
        numeric_value = float(value.replace('SEK', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * 0.092 * 1e6)  # Converting million SEK to USD
    if 'CLP' in value:
        numeric_value = float(value.replace('CLP', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * convert_clp * 1e6)  # Converting million CLP to USD
    if 'JPY' in value:
        numeric_value = float(value.replace('JPY', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * convert_jpy * 1e6)  # Converting million JYP to USD
    if 'PLN' in value:
        numeric_value = float(value.replace('PLN', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * convert_pln * 1e6)  # Converting million PLN to USD
    if 'NOK' in value:
        numeric_value = float(value.replace('NOK', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * convert_nok * 1e6)  # Converting million NOK to USD
    if 'CAD' in value:
        numeric_value = float(value.replace('CAD', '').replace(' million', '').replace(' ', ''))
        return '{:.0f}'.format(numeric_value * convert_cad * 1e6)  # Converting million CAD to USD    
    return value

In [29]:
gdf['Economic_Impact'] = gdf['Economic_Impact'].apply(convert_to_usd)  

In [30]:
gdf.head(2)

Unnamed: 0,Festival_Name,Location,Country,Attendance_Numbers,Age_Range,Visitor_Demographics,Economic_Impact,Music_Genre,Latitude,Longitude,geometry
0,Glastonbury Festival,Somerset,UK,200000,18-35,Music Enthusiasts,129000000,Various,51.161751,-3.075387,POINT (-3.07539 51.16175)
1,Tomorrowland,Boom,Belgium,400000,18-30,EDM Fans,165000000,EDM,51.087379,4.366722,POINT (4.36672 51.08738)


In [31]:
gdf['Economic_Impact'].unique()

array(['129000000', '165000000', '143000000', '88000000', '10500000',
       '77400000', '22000000', '16500000', '55000000', '77000000',
       '44000000', '66000000', '33000000', '60500000', '93500000',
       '38500000', '49500000', '51600000', '27500000', '19350000',
       '25800000', '15480000', '12900000', '32250000', '38700000',
       '13107000', '45150000', '11300000', '5000000', '19800000',
       '25000000', '7500000', '9040000', '28380000', '13200000',
       '11000000', '50000000', '1080000', '40000000', '8000000',
       '24200000', '736000', '7200000', '1104000', '20000000', '1380000',
       '10000000', '1350000', '260000', '22000', '1840000', '16950000'],
      dtype=object)

In [36]:
gdf.columns = 

Index(['Festival_Name', 'Location', 'Country', 'Attendance_Numbers',
       'Age_Range', 'Visitor_Demographics', 'Economic_Impact', 'Music_Genre',
       'Latitude', 'Longitude', 'geometry'],
      dtype='object')

In [None]:
# FINAL CLEAN DATA OUTPUT
# gdf.to_csv("Outputs/festivals_clean.csv")