In [1]:
import pandas as pd
import json
import time
import numpy as np
from IPython.display import display
import ipywidgets as widgets

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [3]:
# youtube
from googleapiclient.discovery import build
# API_KEY = 'AIzaSyDfhiUHhz21lJHxJxsiy19D5KL5bYGqtL8'
API_KEY = 'AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ'
# Initialize the YouTube Data API client
api_service_name = "youtube"
api_version = "v3"
youtube = build(api_service_name, api_version, developerKey=API_KEY)

In [4]:
# spotify
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials 
# client_id = "125fcec321e048db972933f9f364d74e"
# client_secret = "615237e26e0c4d22a13889fd7b91a46a"
client_id = "ba0b6a6887054b74ae1bc6bcb2c0deb4"
client_secret = "c2d9fab1d5da43df8af735ad7847c7fd"
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) 

In [5]:
## Step 1: Import Pollstar Data

In [6]:
# Load data
data = pd.read_csv('PollStar_KPop_cleaned_dataset.csv')
data.head(3)

Unnamed: 0,Event Date,# Shows,Headliner,Support,Venue,City,State,Country,Market,Currency,Promoter,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min,Ticket Price Max,Ticket Price Avg. USD
0,2/6/2024,1,Xikers,,Melkweg,Amsterdam,,Netherlands,,Euro,Greenhouse Talent,Asian Pop,925,"$94,460.40",1350,69%,95.0,95.0,102.12
1,2/3/2024,1,Twice,,Foro Sol,Mexico City,,Mexico,,Pesos,OCESA,Asian Pop,57044,"$5,582,863.47",57062,100%,838.33,5029.95,97.87
2,2/3/2024,1,ENHYPEN,,New Clark City Stadium,New Clark City,,Philippines,,Pesos,HYBE,Asian Pop,23794,"$4,126,227.29",23850,100%,3000.0,16500.0,173.41


In [7]:
# Data preprocessing for pollstar data

# Check if 'Event Date' column contains valid dates
data['Event Date'] = pd.to_datetime(data['Event Date'], errors='coerce')

# Drop rows where 'Event Date' is null (indicating invalid date)
data = data.dropna(subset=['Event Date'])

# Reset index after dropping rows
data = data.reset_index(drop=True)

In [8]:
# extract month from Event Date to represent seasonality
data['Event Date'] = pd.to_datetime(data['Event Date'], format='%m/%d/%Y')
data['Month'] = data['Event Date'].dt.month

In [9]:
# Remove commas and convert to float
data['Avg. Gross USD'] = data['Avg. Gross USD'].replace({'\$': '', ',': ''}, regex=True).astype(float)

In [10]:
# create a Location column using 'City', 'State','Country'
data['Location'] = data['City'] + '_' + data['State'] + '_' + data['Country']

In [11]:
## Step 2: Extract artist names

In [12]:
artist_names = data['Headliner'].unique()

# Print or use the unique values as needed
print((artist_names))

['Xikers' 'Twice' 'ENHYPEN' 'SEVENTEEN' 'Chuu' '"Power 96.1 Jingle Ball"'
 '"Hot 99.5 Jingle Ball"' '"KISS 108 Jingle Ball"' 'BABYMETAL'
 'TOMORROW X TOGETHER' '"KIIS FM Jingle Ball"'
 '"iHeartradio Jingle Ball Tour"' 'EVERGLOW' 'The Rose' 'xikers'
 'Accusefive' '"KBS Immortal Songs Live"' '"Music Bank K-Pop Festival"'
 '"Mcountdown In France"' 'tripleS' 'Stay-C' 'Radwimps'
 'BABYMETAL, Metalocalypse: Dethklok' 'P1harmony' 'Loona' 'Jonathan Lee'
 'ATEEZ' '"Krazy K-Pop Super Concert"' '"Kamp Festival "' 'BLACKPINK'
 '"Lollapalooza Aftershows", The Rose' 'Suga' 'Rain' 'Enhypen' 'KARD'
 'NCT Dream' 'SUGA' 'Red Velvet' 'Nmixx' 'Mayday' 'JJ Lin'
 '"We Bridge Music Festival & Expo"' 'Sabaton' 'Baek Z Young' 'Stray Kids'
 'CIX' 'NCT 127' 'Seventeen' 'DPR Live, DPR IAN, DPR Cream'
 '"MetaMoon Music Festival"' 'ITZY' 'MUSTB' '"KCon"' '"HallyuPopFest"'
 'G I-DLE' 'Tomorrow X Together' 'Monsta X' 'The Boyz' '"K.Flex"' 'BTS'
 '"WIOQ Q102 Jingle Ball"' '"iHeart Radio\'s Jingle Ball"' 'Day6'
 '"B96 

In [13]:
# Define substrings to be dropped
to_drop = ['Festival', 'Jingle Ball', 'Awards','K-Pop']

# Filter out unwanted values
artist_names_filtered = [name for name in artist_names if not any(substring in name for substring in to_drop)]

# Print or use the filtered values as needed
print(artist_names_filtered)

['Xikers', 'Twice', 'ENHYPEN', 'SEVENTEEN', 'Chuu', 'BABYMETAL', 'TOMORROW X TOGETHER', 'EVERGLOW', 'The Rose', 'xikers', 'Accusefive', '"KBS Immortal Songs Live"', '"Mcountdown In France"', 'tripleS', 'Stay-C', 'Radwimps', 'BABYMETAL, Metalocalypse: Dethklok', 'P1harmony', 'Loona', 'Jonathan Lee', 'ATEEZ', 'BLACKPINK', '"Lollapalooza Aftershows", The Rose', 'Suga', 'Rain', 'Enhypen', 'KARD', 'NCT Dream', 'SUGA', 'Red Velvet', 'Nmixx', 'Mayday', 'JJ Lin', 'Sabaton', 'Baek Z Young', 'Stray Kids', 'CIX', 'NCT 127', 'Seventeen', 'DPR Live, DPR IAN, DPR Cream', 'ITZY', 'MUSTB', '"KCon"', '"HallyuPopFest"', 'G I-DLE', 'Tomorrow X Together', 'Monsta X', 'The Boyz', '"K.Flex"', 'BTS', 'Day6', '"B96 Jingle Bash"', 'Got7', 'Little Simz', 'Ateez', '"KCon", Mamamoo', 'Band Of Horses', '"KIIS FM Wango Tango"', '"93.3 Summer Kickoff"', 'Fei Yu Ching', '"BBC Asian Network Live"', 'Winner', 'Ssingssing', 'Jacky Cheung', 'Rene Liu', 'B.A.P.', 'Jane Zhang', 'Wanna One', '"Reform And Open Up 40th Annive

In [14]:
## Step 3: Pull and Merge Youtube Data

In [15]:
def get_channel_basic_info_name(channel_names):
    channel_info_list = []
    
    for channel_name in channel_names:
        try:
            # Search for channels with the specified name
            search_request = youtube.search().list(
                part='snippet',
                type='channel',
                q=channel_name
            )
            search_response = search_request.execute()

            # Check if there are any search results
            if 'items' in search_response and search_response['items']:
                # Select the most relevant channel from the search results
                top_result = search_response['items'][0]
                channel_id = top_result['id']['channelId']
                
                # Call the channels.list method to retrieve information about the channel
                request = youtube.channels().list(
                    part="snippet,statistics",
                    id=channel_id
                )
                response = request.execute()

                # Extract relevant information from the API response
                channel_info = response['items'][0]
                snippet = channel_info['snippet']
                statistics = channel_info['statistics']

                # Store the information in a dictionary
                channel_data = {
                    "yt name":channel_name,
                    "yt Channel ID": channel_id,
                    "yt Title": snippet['title'],
                    "yt Description": snippet['description'],
                    "yt Published At": snippet['publishedAt'],
                    "yt View Count": statistics.get('viewCount', 0),
                    "yt Subscriber Count": statistics.get('subscriberCount', 0),
                    "yt Video Count": statistics.get('videoCount', 0)
                }

                # Append channel information to the list
                channel_info_list.append(channel_data)
                # print(channel_data)
            else:
                print(f"No channel found with the name: {channel_name}")

        except Exception as e:
            print(f"Error occurred: {e}")
    # Create a DataFrame from the list of dictionaries
    channel_df = pd.DataFrame(channel_info_list)
    return channel_df

In [16]:
yt_channel_info_df = get_channel_basic_info_name(artist_names_filtered)
# yt_channel_info_df = get_channel_basic_info_name(['2NE1'])
print(yt_channel_info_df)

Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=Xikers&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "YouTube Data API v3 has not been used in project 558190355524 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/youtube.googleapis.com/overview?project=558190355524 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.". Details: "[{'message': 'YouTube Data API v3 has not been used in project 558190355524 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/youtube.googleapis.com/overview?project=558190355524 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'domain': 'usageLimits', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com'}]">
Er

Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=Loona&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "YouTube Data API v3 has not been used in project 558190355524 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/youtube.googleapis.com/overview?project=558190355524 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.". Details: "[{'message': 'YouTube Data API v3 has not been used in project 558190355524 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/youtube.googleapis.com/overview?project=558190355524 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'domain': 'usageLimits', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com'}]">
Err

Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=G+I-DLE&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "YouTube Data API v3 has not been used in project 558190355524 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/youtube.googleapis.com/overview?project=558190355524 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.". Details: "[{'message': 'YouTube Data API v3 has not been used in project 558190355524 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/youtube.googleapis.com/overview?project=558190355524 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'domain': 'usageLimits', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com'}]">
E

No channel found with the name: "Geyou Benefit Concert"
No channel found with the name: "Rong Bang 10Th Anniversary Cenebration"
No channel found with the name: "Korea OPPA Fantasy Bubble Show"
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=snippet%2Cstatistics&id=UCdWs6SAfswwMj3-tOdgXNOg&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=snippet%2Cstatistics&id=UCzw-C7fNfs018R1FzIKnlaA&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "The request cannot be completed because you have exceed

Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=Mr.&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=%22Heavy+Montreal%22&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtu

Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=FTIsland%2C+CNBlue&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=%22SMTOWN+Live%22&key=AIzaSyDTpHq4WiHOedllYpR8cMell8jRLZ1xABQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'dom

In [17]:
# Merge data
merged_yt = pd.merge(data, yt_channel_info_df, how='left', left_on='Headliner', right_on='yt name')

## Step 4: Pull and Merge Spotify Data

In [18]:
def get_artist_basic_info_by_name(artist_names, sp):
    basic_spotify_records = {
        'sp artist_name': [],
        'sp artist_genre': [],
        'sp followers': [],
        'sp popularity': []
    }

    for artist_name in artist_names:
        result = sp.search(artist_name, type='artist')  # Search for the artist
        # if result:
        if result and 'artists' in result and 'items' in result['artists'] and result['artists']['items']:
            artist = result['artists']['items'][0]
            artist_id = artist['id']
            
            artist_info = sp.artist(artist_id)

            basic_spotify_records['sp artist_name'].append(artist_name)
            basic_spotify_records['sp artist_genre'].append(artist['genres'])
            basic_spotify_records['sp followers'].append(artist_info['followers']['total'])
            basic_spotify_records['sp popularity'].append(artist['popularity'])
        else:
            print(f"No artist found for {artist_name}")

    df_basic_spotify_records = pd.DataFrame(basic_spotify_records)
    return df_basic_spotify_records


In [19]:
basic_spotify_records_df=get_artist_basic_info_by_name(artist_names_filtered, sp)
print(basic_spotify_records_df)

No artist found for "93.3 Summer Kickoff"
No artist found for "Invite Love Concert"
No artist found for "KISW Pain In The Grass"
No artist found for "Zhejiang 20Th Anniversary Gala"
No artist found for "M!Countdown Asia"
No artist found for "Haha 2016 New Year Performance"
No artist found for "Rock On The Range"
No artist found for "Perfect Voice, Warmest Love"
No artist found for "I Want Music 'Cool'"
No artist found for "Singer Returned Live"
             sp artist_name  \
0                    Xikers   
1                     Twice   
2                   ENHYPEN   
3                 SEVENTEEN   
4                      Chuu   
..                      ...   
160         Chang Cheng-Yue   
161  Chang Cheng-Yue, Free9   
162               Box-O-Car   
163                  Danzig   
164                 Anthrax   

                                       sp artist_genre  sp followers  \
0                                    [k-pop boy group]        355657   
1                       [k-pop, k-

In [20]:
# merge data
merged_yt_sp = pd.merge(merged_yt, basic_spotify_records_df, how='left', left_on='Headliner', right_on='sp artist_name')

In [21]:
# Create a Pandas Excel writer， Save Data
with pd.ExcelWriter(f'Merged_data.xlsx', engine='xlsxwriter') as writer:
    # Write each DataFrame to a different sheet in the Excel file
    merged_yt_sp.to_excel(writer, sheet_name=f'merged_yt_sp', index=False)

## Step 5: Merge with Daisuki K-pop Dataset

In [22]:
# Load data
app_data = pd.read_csv('app_kpop_group.csv')
app_data.head(3)

Unnamed: 0,id,is_collab,name,kname,previous_name,previous_kname,fname,alias,id_company,members,...,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71
0,1,n,100%,백퍼센트,,,,,61,male,...,,,,,,,,,,
1,2,n,24K+,투포케이플러스,24K,투포케이,,,60,male,...,,,,,,,,,,
2,3,n,2AM,투에이엠,,,,,642,male,...,,,,,,,,,,


In [23]:
selected_columns = ['name', 'sales', 'gaondigital_times','gaondigital_firsts','yawards_total']
app_data_clean = app_data[selected_columns].copy() 
app_data_clean.head(3)

Unnamed: 0,name,sales,gaondigital_times,gaondigital_firsts,yawards_total
0,100%,103148,3,0,0
1,24K+,10083,0,0,0
2,2AM,239440,194,4,7


In [24]:
# preprocess the columns to mae them uniform
app_data_clean['name_processed'] = app_data_clean['name'].str.lower().str.replace('[^\w\s"]', '')
merged_yt_sp['Headliner_processed'] = merged_yt_sp['Headliner'].str.lower().str.replace('[^\w\s"]', '')
# merge
# Merge the datasets
merged_df = pd.merge(merged_yt_sp, app_data_clean,
                     how='left',
                     left_on='Headliner_processed',
                     right_on='name_processed')
# Drop the processed columns
merged_df.drop(['name_processed', 'Headliner_processed'], axis=1, inplace=True)


  app_data_clean['name_processed'] = app_data_clean['name'].str.lower().str.replace('[^\w\s"]', '')
  merged_yt_sp['Headliner_processed'] = merged_yt_sp['Headliner'].str.lower().str.replace('[^\w\s"]', '')


In [26]:
# Create a Pandas Excel writer
with pd.ExcelWriter(f'Merged_data.xlsx', engine='xlsxwriter') as writer:
    # Write each DataFrame to a different sheet in the Excel file
    merged_df.to_excel(writer, sheet_name=f'merged_df', index=False)