## Import packages and data

In [1]:
import ast
import json
import os
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from bs4 import BeautifulSoup

# Load the Spotify track data from the local CSV.
df = pd.read_csv('data.csv')
# Drop tracks whose popularity is exactly 0. The explicit .copy() makes the
# filtered frame independent of the raw one, so adding columns to it in later
# cells does not raise SettingWithCopyWarning.
df = df[df['popularity'] != 0].copy()
df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


## Looking at and cleaning the data

In [18]:
# Count how many rows share each track name; value_counts lists the most
# frequent names first.
df['name'].value_counts()

White Christmas                                70
Winter Wonderland                              59
Jingle Bells                                   52
Summertime                                     52
Silent Night                                   43
                                               ..
Take Me Back To My Boots And Saddles            1
Cosmic Rays                                     1
99 Red Balloons - Club Mix                      1
What Is This Thing Called Love - Remastered     1
Stan - Live At 43rd Grammy Awards               1
Name: name, Length: 109268, dtype: int64

In [19]:
# People can't get enough holiday music!

In [22]:
# Derive the columns used to look each track up on Genius, then order the
# tracks from most to least popular.
#
# The columns are added with .assign (which returns a new frame) instead of
# item assignment, because `df` is the result of a boolean filter and assigning
# into it directly raises SettingWithCopyWarning.
df = (
    df
    # 'artists' holds the string repr of a Python list; keep its first entry.
    .assign(primary_artist=lambda frame: frame['artists'].apply(lambda raw: ast.literal_eval(raw)[0]))
    .assign(title_artist=lambda frame: frame['name'] + ' ' + frame['primary_artist'])
    # Only spaces are escaped here. This exact string is later used as the
    # merge key against the cached Genius results, so its format must not change.
    .assign(title_artist_search_term=lambda frame: frame['title_artist'].str.replace(' ', '%20', regex=False))
    .sort_values(by='popularity', ascending=False)
)

## Build a quick Linear Regression to see how we can get started on modeling popularity

In [6]:
# Show the column labels currently on df, to pick the model inputs from.
df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'primary_artist', 'title_artist',
       'title_artist_search_term'],
      dtype='object')

In [7]:
from sklearn.model_selection import train_test_split

# Numeric audio features used as predictors; popularity is the target.
feature_columns = [
    'valence', 'acousticness', 'danceability', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
]
X = df[feature_columns]
y = df['popularity']

# Fix the seed so the split -- and therefore every score reported below -- is
# the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the raw features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
# R^2 on the held-out split.
lr.score(X_test, y_test)

0.3150535842087152

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information leaks into the scaler.
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [10]:
# Refit the same linear model on the standardised features and report R^2 on
# the held-out split.
lr = LinearRegression().fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

0.3150535842087152

In [11]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the raw features with PolynomialFeatures' default settings; the
# expansion is fitted on the training split and reused for the test split.
pf = PolynomialFeatures()
pf.fit(X_train, y_train)
X_train_exp = pf.transform(X_train)
X_test_exp = pf.transform(X_test)

In [12]:
# Linear model on the polynomially expanded features; R^2 on the held-out split.
lr = LinearRegression().fit(X_train_exp, y_train)
lr.score(X_test_exp, y_test)

0.39742710135526993

## This data is interesting, but we don't just want to look at the popularity provided by Spotify. It would also be good to bring in some data from other sources - Genius.com will be a great source.

### Genius is a website that provides the backstory behind music lyrics, and the best part is that all of the information is crowdsourced! Passionate fans add the stories behind the lyrics of their favorite songs, so the amount of fan activity on a song's page gives us a good sense of how passionate its listeners are!

## Get search results from Genius API

In [13]:
get_genius_api_results = False # MARK AS FALSE UNLESS YOU WANT TO MAKE 10k API CALLS

if get_genius_api_results:

    print('GETTING API RESULTS - INTERRUPT KERNEL IF THIS IS NOT INTENDED')

    # Columns of the result table, in the order they are written to the CSV.
    genius_columns = ['search_term', 'annotation_count', 'full_title', 'pyongs_count', 'stats_unreviewed_annotations', 'stats_hot', 'song_url', 'primary_artist']

    # The headers are identical for every request, so build them once.
    # NOTE(review): the environment variable names say SPOTIFY although the
    # request goes to api.genius.com -- confirm they hold Genius credentials.
    headers = {
      'Authorization': f'Bearer {os.environ.get("SPOTIFY_API_TOKEN_V2")}',
      'Cookie': os.environ.get("SPOTIFY_API_COOKIE")
    }

    # Query the Genius search API for the first 10,000 search terms and keep
    # the top hit of each. Rows are collected in a list and turned into a
    # DataFrame once at the end: DataFrame.append copied the whole frame on
    # every call and no longer exists in pandas 2.x.
    genius_records = []
    for search_term in df['title_artist_search_term'][0:10000]:
        # The search term only has its spaces escaped ('%20'). Escape every
        # other reserved character as well ('&', '#', '?', '+', ...) so it
        # cannot truncate or split the q= parameter; '%' is kept as-is so the
        # existing '%20' sequences are not double-encoded.
        url = f"https://api.genius.com/search?q={quote(search_term, safe='%')}"
        response = requests.get(url, headers=headers, timeout=30)
        # Fail with the HTTP error itself rather than a confusing KeyError below.
        response.raise_for_status()
        hits = response.json()['response']['hits']
        if hits:
            result = hits[0]['result']
            record = {
                'search_term': search_term,
                'annotation_count': result['annotation_count'],
                'full_title': result['full_title'],
                'pyongs_count': result['pyongs_count'],
                'stats_unreviewed_annotations': result['stats']['unreviewed_annotations'],
                'stats_hot': result['stats']['hot'],
                'song_url': result['url'],
                'primary_artist': result['primary_artist']['name'],
            }
        else:
            # No hit: keep the search term (it is the merge key later on) and
            # mark every other field as 'n/a'.
            record = dict.fromkeys(genius_columns, 'n/a')
            record['search_term'] = search_term
        genius_records.append(record)
        # Pause between calls to stay gentle on the API.
        time.sleep(.5)

    df_genius_api = pd.DataFrame(genius_records, columns=genius_columns)
    df_genius_api.to_csv(r'genius_api_results.csv')
else:
    print('Reading in df from csv')
    df_genius_api = pd.read_csv('genius_api_results.csv')
print('done')

Reading in df from csv
done


## Webscraping lyrics from Genius.com

In [14]:
def get_lyrics(url):
    """Download a Genius song page and return its lyrics as one flat string.

    The lyrics are read from the first <p> inside the first <div> whose class
    attribute starts with "lyrics"; newlines and apostrophes are stripped.

    Parameters
    ----------
    url : str
        Address of the song page. Anything that is not a non-empty string
        (e.g. None or NaN for songs without a search hit) is treated as
        "no page".

    Returns
    -------
    str
        The cleaned lyrics, or 'no lyrics available on Genius' when the URL is
        missing, the request fails or times out, or the page has no lyrics
        block in the expected place.
    """
    no_lyrics = 'no lyrics available on Genius'

    # A missing URL can never be fetched; skip the network round trip.
    if not isinstance(url, str) or not url.strip():
        return no_lyrics

    try:
        # The timeout keeps one stalled page from hanging a long scrape.
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.text, 'lxml')
        lyrics = soup.select_one('div[class^="lyrics"]').find('p').getText()
    except (requests.RequestException, AttributeError):
        # RequestException: network problems or an invalid URL.
        # AttributeError: select_one()/find() returned None, i.e. no lyrics block.
        # Everything else (KeyboardInterrupt, a missing lxml parser, ...) is
        # deliberately left to propagate instead of being silently reported
        # as "no lyrics".
        return no_lyrics

    return lyrics.replace("\n", "").replace("'", "")

In [15]:
scrape_genius_for_lyrics = False # MARK AS FALSE UNLESS YOU WANT TO WEBSCRAPE 10k WEBPAGES

# Scraping is opt-in: when enabled, fetch the lyrics for every song URL and
# refresh the CSV cache.
if scrape_genius_for_lyrics:
    print('WEBSCRAPING - INTERRUPT KERNEL IF THIS IS NOT INTENDED')
    df_genius_api['song_lyrics_text'] = df_genius_api['song_url'].map(get_lyrics)
    df_genius_api.to_csv(r'genius_api_and_lyrics.csv')

# Whether or not the scrape ran, the working frame is always loaded from the CSV.
df_genius = pd.read_csv('genius_api_and_lyrics.csv')

## Merge original Spotify dataframe and Genius data (API and webscraped)

In [16]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Columns kept after the join. Not kept: 'primary_artist_y', 'title_artist',
# 'title_artist_search_term', 'search_term', 'artists', 'year'.
columns_to_keep = [
    'id', 'full_title', 'name', 'primary_artist_x',
    'release_date', 'explicit', 'duration_ms',
    'valence', 'acousticness', 'danceability', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
    'annotation_count', 'popularity', 'pyongs_count',
    'stats_unreviewed_annotations', 'stats_hot', 'song_url', 'song_lyrics_text',
]

# Inner-join the Spotify tracks with the Genius rows on the search term.
df_merged = pd.merge(
    df,
    df_genius,
    how='inner',
    left_on='title_artist_search_term',
    right_on='search_term',
)[columns_to_keep]

# Keep one row per Genius title and renumber the rows.
# NOTE(review): rows without a Genius title all share the same missing
# 'full_title', so only the first of them survives -- confirm that is intended.
df_all = df_merged.drop_duplicates(subset=['full_title']).reset_index(drop=True)
df_all.to_csv(r'df_all.csv')
df_all

## Get most hummed songs list

In [118]:
get_most_hummed_songs = False # MARK AS FALSE UNLESS YOU WANT TO WEBSCRAPE BILLBOARD.COM

if get_most_hummed_songs:
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    DRIVER_PATH = '/usr/local/bin/chromedriver'
    # NOTE(review): recent Selenium releases no longer accept executable_path;
    # switch to a Service object if the installed version rejects it.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    try:
        driver.get('https://www.billboard.com/charts/year-end/top-hummed')
        table = driver.find_element(By.XPATH, '//html/body/main/div/div/div[4]/div/ol')
        table_text = table.text.split('\n')
    finally:
        # Always close the browser, even when the page layout has changed and
        # the lookup above fails.
        driver.quit()

    # The list text repeats as rank / title / artist, one value per line.
    # zip() stops at the shortest slice, so an incomplete trailing record is
    # dropped instead of raising IndexError.
    rank = table_text[0::3]
    title = table_text[1::3]
    artist = table_text[2::3]

    # 'rank' is kept as an ordinary column so the frame has the same shape as
    # the one read back from the CSV in the else-branch; the file written here
    # has the same rank,title,artist layout as before.
    df_hummed_list = pd.DataFrame(list(zip(rank, title, artist)), columns=['rank', 'title', 'artist'])
    df_hummed_list.to_csv(r'songs_hummed.csv', index=False)
else:
    df_hummed_list = pd.read_csv('songs_hummed.csv')

# Attach the hummed-chart rows to the combined Spotify/Genius table by song
# title. (The original merged against the undefined name `df_hummed`, which
# fails with NameError on a fresh kernel.)
df_hummed = pd.merge(df_all, df_hummed_list, how='inner', left_on='name', right_on='title').drop_duplicates()
# Write the merged table to its own file: saving it as 'songs_hummed.csv'
# would overwrite the chart cache that the else-branch reads on the next run.
df_hummed.to_csv('songs_hummed_merged.csv')

## Now we have all of the data we need - let's get to modeling popularity and then predicting the upcoming Grammy Awards