# American Music Awards

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_second_table_2(url, year):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        tables = soup.find_all('table')
        
        if len(tables) >= 4:
            second_table = tables[3]
            data = []
            
            rows = second_table.find_all('tr')
            for i in range(len(rows)):
                row_text = rows[i].get_text().strip().split('\n')
                
                if i % 2 == 0:
                    categories = [category.strip() for category in row_text if category.strip()]
                else:
                    cells = rows[i].find_all(['td', 'th'])
                    
                    winner_1 = ''
                    winner_2 = ''
                    
                    if len(cells) >= 2:
                        winner_1_tag = cells[0].find('b')
                        winner_2_tag = cells[1].find('b')
                        
                        if winner_1_tag:
                            winner_1 = winner_1_tag.get_text().strip()
                        if winner_2_tag:
                            winner_2 = winner_2_tag.get_text().strip()
                    elif len(cells) == 1:
                        bold_winner_tag = cells[0].find('b')
                        if bold_winner_tag:
                            winner_1 = bold_winner_tag.get_text().strip()
                        
                    for idx, category in enumerate(categories):
                        if idx % 2 == 0:
                            winner = winner_1
                        else:
                            winner = winner_2
                        data.append({'Category': category, 'Winner': winner, 'Year': year})
            
            df = pd.DataFrame(data)
            return df
        else:
            print('Non è stata trovata una quarta tabella sulla pagina.')
            return None
    else:
        print('Errore nella richiesta HTTP:', response.status_code)
        return None

urls = [
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2013',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2014',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2015',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2016',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2017',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2018',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2019',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2020',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2021',
    'https://en.wikipedia.org/wiki/American_Music_Awards_of_2022',
]

all_dfs = []

for url in urls:
    year = url.split("_")[-1]
    df_results = scrape_second_table_2(url, year)
    all_dfs.append(df_results)

AmericanAwards = pd.concat(all_dfs, ignore_index=True)

In [4]:
# Add the column 'Competition' that has the same value 'AmericanMusicAwards' to help the integration
AmericanAwards['Competition'] = 'AmericanMusicAwards'

In [5]:
# Transform "Year" into a int variable
AmericanAwards["Year"]=pd.to_numeric(AmericanAwards["Year"])

### Manual Changes

In [6]:
# To change the Category from "Kohl's New Artist of the Year" to "New Artist of the Year"
AmericanAwards.at[1, "Category"] = "New Artist of the Year"

#AmericanAwards[AmericanAwards["Category"]=="New Artist of the Year"]

In [7]:
# To change the Category from "Single of the Year" to "Song of the Year"
AmericanAwards.at[20, "Category"] = "Song of the Year"
AmericanAwards.at[43, "Category"] = "Song of the Year"

#AmericanAwards[AmericanAwards["Category"]=="Song of the Year"]

In [8]:
# To insert the Favorite Female Artist – Pop/Rock in 2019, bacuase it was null
AmericanAwards.at[164, "Winner"] = "Taylor Swift"

In [9]:
# To change the Category from "Kohl's New Artist of the Year" to "New Artist of the Year"
AmericanAwards.at[127, "Category"] = "Artist of the Year"

#AmericanAwards[AmericanAwards["Category"]=="Artist of the Year"]

In [10]:
# To change the Winner from "Beyoncé" to "Beyoncé – Beyoncé"; from "1989" to "1989 – Taylor Swift"; from "Anything Goes" to "Anything Goes – Florida Georgia Line"
AmericanAwards.at[37, "Winner"] = "Beyoncé – Beyoncé"
AmericanAwards.at[53, "Winner"] = "1989 – Taylor Swift"
AmericanAwards.at[57, "Winner"] = "Anything Goes – Florida Georgia Line"
AmericanAwards.at[155,"Winner"] = "Marshmello"

In [11]:
# AmericanAwards['Category'].unique()

In [12]:
#To change some Categories regarding Country
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Country Male Artist', 'Favorite Male Artist – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Male Country Artist', 'Favorite Male Artist – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Country Female Artist', 'Favorite Female Artist – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Female Country Artist', 'Favorite Female Artist – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Country Duo or Group', 'Favorite Band/Duo/Group – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Country Band/Duo/Group', 'Favorite Band/Duo/Group – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Duo or Group – Country', 'Favorite Band/Duo/Group – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Country Album', 'Favorite Album – Country')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Country Song', 'Favorite Song – Country')

# AmericanAwards[AmericanAwards["Category"].str.contains("Country")]["Category"].unique()

In [13]:
#To change some Categories regarding Soul/R&B
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Soul/R&B Male Artist', 'Favorite Male Artist – Soul/R&B')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Soul/R&B Female Artist', 'Favorite Female Artist – Soul/R&B')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Soul/R&B Album', 'Favorite Album – Soul/R&B')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Soul/R&B Song', 'Favorite Song – Soul/R&B')

# AmericanAwards[AmericanAwards["Category"].str.contains("Soul/R&B")]["Category"].unique()

In [14]:
#To change some Categories regarding Rap/Hip-Hop
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Rap/Hip-Hop Artist', 'Favorite Artist – Rap/Hip-Hop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Rap/Hip-Hop Album', 'Favorite Album – Rap/Hip-Hop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Rap/Hip-Hop Song', 'Favorite Song – Rap/Hip-Hop')

# AmericanAwards[AmericanAwards["Category"].str.contains("Rap/Hip-Hop")]["Category"].unique()

In [15]:
#To change some Categories regarding Latin
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Latin Artist', 'Favorite Artist – Latin')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Male Latin Artist', 'Favorite Male Artist – Latin')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Female Latin Artist', 'Favorite Female Artist – Latin')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Latin Song', 'Favorite Song – Latin')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Latin Album', 'Favorite Album – Latin')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Latin Duo or Group', 'Favorite Duo or Group – Latin')

# AmericanAwards[AmericanAwards["Category"].str.contains("Latin")]["Category"].unique()

In [16]:
#To change some Categories regarding Electronic Dance Music
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Electronic Dance Music Artist', 'Favorite Artist – Electronic Dance Music')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite EDM Artist', 'Favorite Artist – Electronic Dance Music')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Dance/Electronic Artist', 'Favorite Artist – Electronic Dance Music')

# AmericanAwards[AmericanAwards["Category"].str.contains("Electronic")]["Category"].unique()

In [17]:
#To change some Categories regarding Pop/Rock
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Artist', 'Favorite Artist – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Male Pop/Rock Artist', 'Favorite Male Artist – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Female Pop/Rock Artist', 'Favorite Female Artist – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Male Artist', 'Favorite Male Artist – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Female Artist', 'Favorite Female Artist – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Song', 'Favorite Song – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Album', 'Favorite Album – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Duo or Group', 'Favorite Duo or Group – Pop/Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop/Rock Band/Duo/Group', 'Favorite Duo or Group – Pop/Rock')

# AmericanAwards[AmericanAwards["Category"].str.contains("Pop/Rock")]["Category"].unique()

In [18]:
#To change some Categories regarding Hip-Hop
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Hip-Hop Artist', 'Favorite Artist – Hip-Hop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Male Hip-Hop Artist', 'Favorite Male Artist – Hip-Hop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Female Hip-Hop Artist', 'Favorite Female Artist – Hip-Hop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Hip-Hop Song', 'Favorite Song – Hip-Hop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Hip-Hop Album', 'Favorite Album – Hip-Hop')

# AmericanAwards[AmericanAwards["Category"].str.contains(" Hip-Hop")]["Category"].unique()

In [19]:
#To change some Categories regarding Rock
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Rock Artist', 'Favorite Artist – Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Rock Song', 'Favorite Song – Rock')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Rock Album', 'Favorite Album – Rock')

# AmericanAwards[AmericanAwards["Category"].str.contains(" Rock")]["Category"].unique()

In [20]:
#To change some Categories regarding R&B
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Male R&B Artist', 'Favorite Male Artist – R&B')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Female R&B Artist', 'Favorite Female Artist – R&B')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite R&B Song', 'Favorite Song – R&B')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite R&B Album', 'Favorite Album – R&B')

# AmericanAwards[AmericanAwards["Category"].str.contains(" R&B")]["Category"].unique()

In [21]:
#To change some Categories regarding Pop
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Male Pop Artist', 'Favorite Male Artist – Pop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Female Pop Artist', 'Favorite Female Artist – Pop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop Song', 'Favorite Song – Pop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop Album', 'Favorite Album – Pop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop Duo or Group', 'Favorite Duo or Group – Pop')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Pop Album', 'Favorite Album – Pop')

# AmericanAwards[AmericanAwards["Category"].str.contains("Pop")]["Category"].unique()

In [22]:
#To change other categories
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Gospel Artist', 'Favorite Artist – Gospel')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite Afrobeats Artist', 'Favorite Artist – Afrobeats')
AmericanAwards['Category'] = AmericanAwards['Category'].replace('Favorite K-pop Artist', 'Favorite Artist – Kpop')
AmericanAwards.loc[246, 'Winner'] = 'Silk Sonic-"Leave the Door Open"'

In [23]:
#To change the position of some values
AmericanAwards.at[43, "Category"] = "Song of the Year"

AmericanAwards.loc[AmericanAwards['Winner'] == 'Cardi B featuring Megan Thee Stallion – "WAP"', 'Winner'] = '"WAP" – Cardi B featuring Megan Thee Stallion'
AmericanAwards.loc[AmericanAwards['Winner'] == 'Doja Cat featuring SZA – "Kiss Me More"', 'Winner'] ='"Kiss Me More" – Doja Cat featuring SZA'
AmericanAwards.loc[AmericanAwards['Winner'] == 'Future featuring Drake and Tems – "Wait For U"', 'Winner'] ='"Wait For U" – Future featuring Drake and Tems'
AmericanAwards.loc[AmericanAwards['Winner'] == 'Wizkid featuring Tems – "Essence"', 'Winner'] ='"Essence" – Wizkid featuring Tems'

In [24]:
AmericanAwards = AmericanAwards.replace('–', '-', regex=True)

### Creation of the final dataframe 

In [25]:
AmericanAwards["Song"]=None
AmericanAwards["Album"]=None
AmericanAwards["Artist 1"]=None
AmericanAwards["Artist 2"]=None
AmericanAwards["Featuring"]=None


# Definition of the functions
import re

def extract_album_first(row):
    if not re.search("-", str(row)):
         return row
    else:
        parts=row.split("-")
        return parts[0].strip()
    
      
def extract_album_second(row):
    if not re.search("-", str(row)):
         return row
    else:
        parts=row.split("-")
        return parts[1].strip()


def extract_song(row):
    if not re.search('-', str(row)):
         return row
    else:
        match=re.search(r'"([^"]*)"', str(row))
        if match:
            return match.group(1)
        
             
def extract_artist(row):
    if str(row).startswith('"'):
        match=re.search('-\s*(.*)', str(row))
        if match:
            return match.group(1)
    else:
        match=re.search(r'^([^-]+)', str(row))
        if match:
            return match.group(1)


def extract_feat(row):
    split_string = str(row).split("featuring")
    return split_string[1].strip()


def extract_artist2(row):
    match=re.search(r'-\s*(.*?)\s*featuring', row)

    if match:
        return match.group(1).strip()

     
def extract_artist3(row):
    split_string = str(row).split("and")
    return split_string[0].strip()


def extract_artist4(row):
    split_string = str(row).split("and")
    return split_string[1].strip()
      
    
    
for i in range(0,len(AmericanAwards)-1):
    category=AmericanAwards["Category"].iloc[i]
    
    if "Artist" in category or "Group" in category or "Icon" in category or "Dick Clark" in category:
        AmericanAwards.at[i, "Artist 1"]=AmericanAwards["Winner"].iloc[i]
        
    elif "Album" in category:
        if AmericanAwards["Year"].iloc[i]<2020:
            AmericanAwards.at[i, "Album"]=extract_album_first(AmericanAwards["Winner"].iloc[i])
            AmericanAwards.at[i, "Artist 1"]=extract_album_second(AmericanAwards["Winner"].iloc[i])
        else:
            AmericanAwards.at[i, "Album"]=extract_album_second(AmericanAwards["Winner"].iloc[i])
            AmericanAwards.at[i, "Artist 1"]=extract_album_first(AmericanAwards["Winner"].iloc[i])
  
    elif "Song" in category or "Collaboration" in category or "Video" in category:
        AmericanAwards.at[i, "Song"]=extract_song(AmericanAwards["Winner"].iloc[i])
        AmericanAwards.at[i, "Artist 1"]=extract_artist(AmericanAwards["Winner"].iloc[i])


     


# Featuring
for i in range(0,len(AmericanAwards)-1):
    artist=AmericanAwards["Winner"].iloc[i]
    
    if "featuring" in artist:
        AmericanAwards.at[i, "Featuring"]=extract_feat(AmericanAwards["Artist 1"].iloc[i])
        AmericanAwards.at[i, "Artist 1"]=extract_artist2(AmericanAwards["Winner"].iloc[i])
        # print(artist,"\n", AmericanAwards.at[i, "Artist 2"] )



# Artist 1 and Artist 2
for i in range(0, len(AmericanAwards)-1):
    artist=AmericanAwards["Artist 1"].iloc[i]
    
    if artist is not None:    
        if artist.find(" and ")!= -1:
            AmericanAwards.at[i, "Artist 2"]=extract_artist4(AmericanAwards["Artist 1"].iloc[i])
            AmericanAwards.at[i, "Artist 1"]=extract_artist3(AmericanAwards["Artist 1"].iloc[i])
            
        
    
AmericanAwards.head(10)   

Unnamed: 0,Category,Winner,Year,Competition,Song,Album,Artist 1,Artist 2,Featuring
0,Artist of the Year,Taylor Swift,2013,AmericanMusicAwards,,,Taylor Swift,,
1,New Artist of the Year,Ariana Grande,2013,AmericanMusicAwards,,,Ariana Grande,,
2,Favorite Male Artist - Pop/Rock,Justin Timberlake,2013,AmericanMusicAwards,,,Justin Timberlake,,
3,Favorite Female Artist - Pop/Rock,Taylor Swift,2013,AmericanMusicAwards,,,Taylor Swift,,
4,Favorite Duo or Group - Pop/Rock,One Direction,2013,AmericanMusicAwards,,,One Direction,,
5,Favorite Album - Pop/Rock,Take Me Home - One Direction,2013,AmericanMusicAwards,,Take Me Home,One Direction,,
6,Favorite Male Artist - Country,Luke Bryan,2013,AmericanMusicAwards,,,Luke Bryan,,
7,Favorite Female Artist - Country,Taylor Swift,2013,AmericanMusicAwards,,,Taylor Swift,,
8,Favorite Band/Duo/Group - Country,Lady Antebellum,2013,AmericanMusicAwards,,,Lady Antebellum,,
9,Favorite Album - Country,Red - Taylor Swift,2013,AmericanMusicAwards,,Red,Taylor Swift,,


### Save as csv file

In [26]:
AmericanAwards.to_csv("AMA_final.csv", encoding='utf-8')

# Analysis

### Who is the most successful Artist?

In [27]:
from collections import Counter

artists_list = [artist for artist in AmericanAwards[['Artist 1', 'Artist 2']].values.ravel() if pd.notnull(artist)]
artist_counter = Counter(artists_list)

most_successful_artist, most_successful_artist_count = artist_counter.most_common(1)[0]

print("The most successful Artist is:", most_successful_artist)
print("Number of Awards:", most_successful_artist_count)

The most successful Artist is: Taylor Swift
Number of Awards: 25


### Which are the most awarded Album?

In [28]:
album_counts = AmericanAwards['Album'].value_counts()
max_occurrences = album_counts.max()
most_awarded_albums = album_counts[album_counts == max_occurrences].index.tolist()

print("The most awarded Albums are:", most_awarded_albums)
print("Number of Awards:", max_occurrences)

The most awarded Albums are: ["Red (Taylor's Version)", '24K Magic']
Number of Awards: 2


### Which is the most awarded Song?

In [29]:
song_counts = AmericanAwards['Song'].value_counts()
max_occurrences = song_counts.max()
most_awarded_songs = song_counts[song_counts == max_occurrences].index.tolist()

print("The most awarded Song is:", most_awarded_songs)
print("Number of Awards:", max_occurrences)

The most awarded Song is: ['Havana']
Number of Awards: 3


### Which artists has made the greatest contribution to the victory?

In [30]:
featuring_counts = AmericanAwards['Featuring'].value_counts()
max_occurrences = featuring_counts.max()
most_frequent_featurings = featuring_counts[featuring_counts == max_occurrences].index.tolist()

print("The Artists who did the highest number of Featuring collaborations are:", most_frequent_featurings)
print("Number of Awards:", max_occurrences)

The Artists who did the highest number of Featuring collaborations are: ['Justin Bieber', 'Young Thug']
Number of Awards: 3


### In what year were the categories most inclusive?

In [31]:
year_counts = AmericanAwards['Year'].value_counts()
max_occurrences = year_counts.max()
most_inclusive_years = year_counts[year_counts == max_occurrences].index.tolist()

print("The most inclusive Year is:", most_inclusive_years)
print("Number of Categories:", max_occurrences)

The most inclusive Year is: [2022]
Number of Categories: 39
