In [2]:
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Load the playlist CSV straight from its raw GitHub URL
df = pd.read_csv(
    'https://raw.githubusercontent.com/dekaghub/Data-Projects-Deka/main/Datasets/Production_Playlist.csv'
)

In [5]:
# Load the country-name list: tab-separated text with no header row,
# read into a single column named 'country'
countries = pd.read_csv(
    'https://github.com/dekaghub/datasets-for-public/raw/main/all_countries_231.txt',
    sep="\t",
    header=None,
    names=['country'],
)

In [6]:
# Peek at the last five rows of the country list
countries.tail(5)

Unnamed: 0,country
226,Wales
227,Wallis & Futuna
228,Yemen
229,Zambia
230,Zimbabwe


In [7]:
# Add the abbreviations U.S., US, UK and U.K. so the Wikipedia lookup can match them.
# This cell rebuilds `countries` from itself, so drop_duplicates keeps it idempotent:
# running it again does not append the abbreviations a second time.
country_aliases = pd.DataFrame({'country': ['U.S.', 'US', 'UK', 'U.K.']})
countries = (
    pd.concat([countries, country_aliases])
    .drop_duplicates(subset='country')
    .sort_values(by='country', ignore_index=True)
)

In [8]:
# Spot-check a positional slice near the end of the sorted list,
# where the added abbreviations should now appear
countries.iloc[-20:-15]

Unnamed: 0,country
215,U.S.
216,UK
217,US
218,Uganda
219,Ukraine


In [10]:
# Pull out the Artist column; each value is a comma-separated string of artist names
artists = df['Artist']

In [11]:
# Preview the first ten raw artist strings
artists.head(10)

0            SBTRKT,Roses Gabor
1                         Ozuna
2                     Dwson,Sio
3                          Biyo
4                       Pat Lok
5    Young Franco,Blair De Milo
6          Garden City Movement
7               Young the Giant
8                   The Strokes
9                        Sigrid
Name: Artist, dtype: object

In [11]:
# Split each comma-separated artist string into a list of names.
# Built from df['Artist'] rather than from `artists` itself so the cell can be
# re-run safely: calling .split on values that are already lists would raise
# AttributeError.
artists = df['Artist'].str.split(",")

In [12]:
# Show the split result: each row is now a list of artist names
artists

0      [SBTRKT, Roses Gabor]
1                    [Ozuna]
2               [Dwson, Sio]
3                     [Biyo]
4                  [Pat Lok]
               ...          
671             [Amber Mark]
672            [Smash Mouth]
673             [Los Retros]
674               [Anomalie]
675               [Anomalie]
Name: Artist, Length: 676, dtype: object

In [13]:
# functions to scrape Wikipedia

def get_tr_lines(wiki_table):
    """Collect the text of table rows that mention "Born" or "Origin".

    Parameters
    ----------
    wiki_table : object
        Anything exposing ``findAll("tr")`` whose rows provide
        ``get_text(separator)`` (here: a BeautifulSoup table tag).

    Returns
    -------
    list of str
        The space-joined text of every matching row, in the order the rows
        are returned by ``findAll``.
    """
    keywords = ("Born", "Origin")
    row_texts = (row.get_text(" ") for row in wiki_table.findAll("tr"))
    return [text for text in row_texts if any(word in text for word in keywords)]

def wiki_country(artist, timeout=10):
    """Look up an artist's country from their English Wikipedia infobox.

    Fetches ``https://en.wikipedia.org/wiki/<artist>``, finds the first table
    whose class is "infobox biography vcard" (falling back to
    "infobox vcard plainlist"), and scans its "Born"/"Origin" rows for any
    name listed in the global ``countries['country']`` column.

    Parameters
    ----------
    artist : str
        Wikipedia page title, e.g. ``"Pat_Lok"`` or ``"Ozuna_(rapper)"``.
    timeout : float, optional
        Seconds to wait for Wikipedia before giving up (default 10).

    Returns
    -------
    str
        The matched country name, or ``''`` when the request fails, the page
        does not return status 200, no infobox is found, or no country matches.

    Notes
    -----
    Matching is a plain substring test, so a short name also matches inside a
    longer word (e.g. "Niger" inside "Nigeria"). When several names match, the
    last one in ``countries`` order within the last matching row wins.
    """
    # Escape the title so characters such as '?' or '#' stay part of the path
    # instead of being read as a query string or fragment.
    url = "https://en.wikipedia.org/wiki/" + quote(artist)
    # Tried in order; the first class that yields a table is used.
    infobox_classes = ("infobox biography vcard", "infobox vcard plainlist")

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A network failure is treated like a missing page so one bad lookup
        # does not abort a long batch run.
        return ''

    if response.status_code != 200:
        return ''

    soup = BeautifulSoup(response.text, 'html.parser')

    wiki_table = None
    for css_class in infobox_classes:
        wiki_table = soup.find("table", {"class": css_class})
        if wiki_table is not None:
            break
    if wiki_table is None:
        return ''

    result = ''
    for t in get_tr_lines(wiki_table):
        for country in countries['country']:
            if country in t:
                result = country

    return result

In [14]:
# function to scrape ra.co

def ra_country(artist, timeout=10):
    """Look up an artist's country from their ra.co DJ page.

    Fetches ``https://ra.co/dj/<artist>`` with a browser User-Agent header,
    takes the first ``<span>`` carrying the class "Text-sc-1t0gn2o-0 ijXAtQ",
    and checks it against every name in the global ``countries['country']``
    column.

    Parameters
    ----------
    artist : str
        The ra.co DJ page slug, e.g. ``"michaelbibi"``.
    timeout : float, optional
        Seconds to wait for ra.co before giving up (default 10).

    Returns
    -------
    str
        The matched country name, or ``''`` when the request fails, the page
        does not return status 200, the span is missing, or nothing matches.
    """
    # Escape the slug so characters such as '?' or '#' stay part of the path.
    url = "https://ra.co/dj/" + quote(artist)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}
    div_class = "Text-sc-1t0gn2o-0 ijXAtQ"

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # A network failure is treated like a missing page so one bad lookup
        # does not abort a long batch run.
        return ''

    if response.status_code != 200:
        return ''

    soup = BeautifulSoup(response.text, 'html.parser')
    ra_span = soup.find("span", {"class": div_class})
    if ra_span is None:
        return ''

    result = ''
    for country in countries['country']:
        # Membership on the tag itself, not on its text: this compares against
        # the tag's child nodes, so only an exact country name matches.
        # NOTE(review): confirm against the bs4 version in use.
        if country in ra_span:
            result = country
    return result

In [15]:
# Manual check of the RA scraper on a single DJ page slug
ra_country('michaelbibi')

'United Kingdom'

In [16]:
def artist_name_util(artist):
    """Build the page-name candidates to try for an artist on each site.

    Parameters
    ----------
    artist : str
        Artist name as it appears in the playlist.

    Returns
    -------
    dict
        ``'wiki'``: the name with spaces turned into underscores, first on its
        own and then with the suffixes ``_(musician)``, ``_(band)``,
        ``_(rapper)`` and ``_(artist)``, in that order.
        ``'ra'``: a one-element list holding the name with spaces removed.
    """
    # str.replace leaves a name without spaces untouched, so one code path
    # covers both single-word and multi-word names.
    wiki_base = artist.replace(" ", "_")
    ra_slug = artist.replace(" ", "")
    suffixes = ('', '_(musician)', '_(band)', '_(rapper)', '_(artist)')

    return {
        'wiki': [wiki_base + suffix for suffix in suffixes],
        'ra': [ra_slug],
    }


In [17]:
def get_country(row):
    """Resolve a country for every artist in one playlist row.

    For each artist the Wikipedia page-name variants are tried in order and
    the first non-empty answer is kept. Only if every Wikipedia variant comes
    back empty are the ra.co variants tried. Artists that cannot be resolved
    on either site are recorded as ``'N/A'``.

    Parameters
    ----------
    row : iterable of str
        The artist names of a single playlist entry.

    Returns
    -------
    dict
        Maps each artist name in ``row`` to a country name or ``'N/A'``.
        A name that appears more than once in ``row`` yields a single key.
    """
    artist_country = {}

    for item in row:
        variations = artist_name_util(item)
        country = ''

        # Wikipedia takes priority: stop at the first variant that resolves.
        for wiki_name in variations['wiki']:
            country = wiki_country(wiki_name)
            if country:
                break

        # Fall back to ra.co once, after all Wikipedia variants have failed,
        # rather than once per failed variant. This keeps requests to ra.co to
        # a minimum.
        if not country:
            for ra_name in variations['ra']:
                country = ra_country(ra_name)
                if country:
                    break

        # Every artist gets an entry, whatever the row's earlier results were.
        artist_country[item] = country or 'N/A'

    return artist_country

In [37]:
# Resolve countries for every playlist row; get_country performs web lookups
# for each artist in the row
artists_country = artists.apply(get_country)

In [38]:
# Show the per-row mapping of artist name to country ('N/A' when not found)
artists_country

0      {'SBTRKT': 'England', 'Roses Gabor': 'N/A'}
1                         {'Ozuna': 'Puerto Rico'}
2                   {'Dwson': 'N/A', 'Sio': 'N/A'}
3                                  {'Biyo': 'N/A'}
4                               {'Pat Lok': 'N/A'}
                          ...                     
671                         {'Amber Mark': 'U.S.'}
672                        {'Smash Mouth': 'U.S.'}
673                          {'Los Retros': 'N/A'}
674                            {'Anomalie': 'N/A'}
675                            {'Anomalie': 'N/A'}
Name: Artist, Length: 676, dtype: object

In [23]:
# Save the per-row artist-to-country mappings next to the other datasets
artists_country.to_csv(path_or_buf='../Datasets/Production_Playlist_Countries.csv')

### RA Status Code Check

Use this to check whether the RA status code is 200 or 403. If it is 403, you will have to wait 12 hours for it to reset before you can try again.

In [None]:
def ra_country_status(artist, timeout=10):
    """Return the HTTP status code ra.co gives for an artist's DJ page.

    Sends the same browser User-Agent header as ``ra_country`` to
    ``https://ra.co/dj/<artist>`` and reports the raw status code.

    Parameters
    ----------
    artist : str
        The ra.co DJ page slug, e.g. ``"shafhuse"``.
    timeout : float, optional
        Seconds to wait for ra.co before giving up (default 10).

    Returns
    -------
    int
        The response status code (e.g. 200 or 403).

    Raises
    ------
    requests.RequestException
        If the request cannot be completed or times out.
    """
    # Escape the slug so characters such as '?' or '#' stay part of the path.
    url = "https://ra.co/dj/" + quote(artist)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

    response = requests.get(url, headers=headers, timeout=timeout)

    return response.status_code

In [None]:
# Manual check: request one DJ page and show the status code ra.co returns
ra_country_status('shafhuse')