In [15]:
import pandas as pd
import json
import time
import numpy as np
from IPython.display import display
import ipywidgets as widgets

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [36]:
# youtube
from googleapiclient.discovery import build
API_KEY = 'AIzaSyDfhiUHhz21lJHxJxsiy19D5KL5bYGqtL8'
# Initialize the YouTube Data API client
api_service_name = "youtube"
api_version = "v3"
youtube = build(api_service_name, api_version, developerKey=API_KEY)

In [43]:
# spotify
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials 
client_id = "125fcec321e048db972933f9f364d74e"
client_secret = "615237e26e0c4d22a13889fd7b91a46a"
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) 

In [37]:
## Step 1: Import Pollstar Data

In [38]:
# Load data
data = pd.read_csv('a-round-pollstar.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,Event Date,# Shows,Headliner,Support,Venue,City,State,Country,Market,...,Promoter,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min,Ticket Price Max,Ticket Price Avg. USD,Unnamed: 21
0,1,2023-11-04,1,xikers,,Warfield Theatre,San Francisco,California,United States,San Francisco-Oakland-San Jose,...,Sean Healy Presents,Asian Pop,1261.0,152175.02,1780.0,71.0,50.0,175.0,120.68,
1,2,2023-11-03,1,EVERGLOW,,Kings Theatre,Brooklyn,New York,United States,New York,...,Sean Healy Presents,Asian Pop,941.0,100970.0,2312.0,41.0,60.0,195.0,107.3,
2,3,2023-11-01,1,Xikers,,Orpheum Theatre,Los Angeles,California,United States,Los Angeles,...,Sean Healy Presents,Asian Pop,1812.0,209839.65,1924.0,94.0,50.0,175.0,115.81,


In [39]:
# Data preprocessing for pollstar data

# Check if 'Event Date' column contains valid dates
data['Event Date'] = pd.to_datetime(data['Event Date'], errors='coerce')

# Drop rows where 'Event Date' is null (indicating invalid date)
data = data.dropna(subset=['Event Date'])

# Reset index after dropping rows
data = data.reset_index(drop=True)

In [20]:
# extract month from Event Date to represent seasonality
data['Event Date'] = pd.to_datetime(data['Event Date'], format='%m/%d/%Y')
data['Month'] = data['Event Date'].dt.month

In [21]:
# Remove commas and convert to float
data['Avg. Gross USD'] = data['Avg. Gross USD'].astype(float)

In [22]:
# create a Location column using 'City', 'State','Country'
data['Location'] = data['City'] + '_' + data['State'] + '_' + data['Country']

In [23]:
## Step 2: Extract artist names

In [42]:
artist_names = data['Headliner'].unique()

# Print or use the unique values as needed
print((artist_names))

['xikers' 'EVERGLOW' 'Xikers' 'The Rose' 'ENHYPEN'
 '"Mcountdown In France"' 'tripleS' 'Stay-C' 'Radwimps'
 'BABYMETAL, Metalocalypse: Dethklok' 'P1harmony' 'Loona' 'Twice'
 'Jonathan Lee' 'SEVENTEEN' 'ATEEZ' 'TOMORROW X TOGETHER'
 '"Lollapalooza Aftershows", The Rose' 'Suga' 'Rain' 'Enhypen' 'KARD'
 'NCT Dream' 'BLACKPINK' 'Red Velvet' 'Nmixx' 'Mayday' 'JJ Lin' 'Sabaton'
 'Baek Z Young' 'Stray Kids' 'CIX' 'NCT 127' 'Seventeen'
 'DPR Live, DPR IAN, DPR Cream' 'ITZY' 'MUSTB' 'G I-DLE'
 'Tomorrow X Together' 'Monsta X' 'The Boyz' 'BTS' 'BABYMETAL' 'Day6'
 'Got7' 'Little Simz' 'Ateez' '"KCon", Mamamoo' 'Band Of Horses'
 '"Billboard Music Awards"' 'Fei Yu Ching' 'Winner' 'Ssingssing'
 'Jacky Cheung' 'Rene Liu' 'B.A.P.' 'Jane Zhang' 'Wanna One' 'Up10tion'
 'Leehom Wang' 'Zhang Jun' 'Nine Percent' 'Chris Lee (Li Yuchun)'
 'Yu-Ching Fei' 'Snh48' 'A-Mei' 'Kuang Program' 'FTIsland' 'Yu-Rong Yang'
 'Lala Hsu' 'G-Dragon' 'Silence (Wang Su Long)' 'Bibi Zhou' 'Taeyang'
 'Ronghao Li' 'Lion' 'Pu Shu'

In [25]:
## Step 3: Pull and Merge Youtube Data

In [40]:
def get_channel_basic_info_name(channel_names):
    channel_info_list = []

    try:
        for channel_name in channel_names:
            # Search for channels with the specified name
            search_request = youtube.search().list(
                part='snippet',
                type='channel',
                q=channel_name
            )
            search_response = search_request.execute()

            # Check if there are any search results
            if 'items' in search_response and search_response['items']:
                # Select the most relevant channel from the search results
                top_result = search_response['items'][0]
                channel_id = top_result['id']['channelId']
                
                # Call the channels.list method to retrieve information about the channel
                request = youtube.channels().list(
                    part="snippet,statistics",
                    id=channel_id
                )
                response = request.execute()

                # Extract relevant information from the API response
                channel_info = response['items'][0]
                snippet = channel_info['snippet']
                statistics = channel_info['statistics']

                # Store the information in a dictionary
                channel_data = {
                    "yt name":channel_name
                    "yt Channel ID": channel_id,
                    "yt Title": snippet['title'],
                    "yt Description": snippet['description'],
                    "yt Published At": snippet['publishedAt'],
                    "yt View Count": statistics.get('viewCount', 0),
                    "yt Subscriber Count": statistics.get('subscriberCount', 0),
                    "yt Video Count": statistics.get('videoCount', 0)
                }

                # Append channel information to the list
                channel_info_list.append(channel_data)
            else:
                print(f"No channel found with the name: {channel_name}")

        # Create a DataFrame from the list of dictionaries
        channel_df = pd.DataFrame(channel_info_list)

        return channel_df
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [41]:
yt_channel_info_df = get_channel_basic_info_name(artist_names)
print(yt_channel_info_df)

No channel found with the name: "Northside Festival", Pelada
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=7+Senses&key=AIzaSyDfhiUHhz21lJHxJxsiy19D5KL5bYGqtL8&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
None


In [None]:
# Merge data
merged_yt = pd.merge(data, yt_channel_info_df, how='left', left_on='Headliner', right_on='yt name')

## Step 4: Pull and Merge Spotify Data

In [46]:
def get_artist_basic_info_by_name(artist_names, sp):
    basic_spotify_records = {
        'sp artist_name': [],
        'sp artist_genre': [],
        'sp followers': [],
        'sp popularity': []
    }

    for artist_name in artist_names:
        result = sp.search(artist_name, type='artist')  # Search for the artist
        if result:
            artist = result['artists']['items'][0]
            artist_id = artist['id']
            
            artist_info = sp.artist(artist_id)

            basic_spotify_records['sp artist_name'].append(artist_name)
            basic_spotify_records['sp artist_genre'].append(artist['genres'])
            basic_spotify_records['sp followers'].append(artist_info['followers']['total'])
            basic_spotify_records['sp popularity'].append(artist['popularity'])
        else:
            print(f"No artist found for {artist_name}")

    df_basic_spotify_records = pd.DataFrame(basic_spotify_records)
    return df_basic_spotify_records


In [47]:
basic_spotify_records_df=get_artist_basic_info_by_name(artist_names, sp)
print(basic_spotify_records_df)

             sp artist_name  \
0                    xikers   
1                  EVERGLOW   
2                    Xikers   
3                  The Rose   
4                   ENHYPEN   
..                      ...   
131         Chang Cheng-Yue   
132  Chang Cheng-Yue, Free9   
133               Box-O-Car   
134                  Danzig   
135                 Anthrax   

                                       sp artist_genre  sp followers  \
0                                    [k-pop boy group]        331687   
1                            [k-pop, k-pop girl group]       2097851   
2                                    [k-pop boy group]        331687   
3                             [k-pop, k-pop boy group]       2604531   
4                             [anime, k-pop boy group]       8054496   
..                                                 ...           ...   
131  [classic mandopop, mandopop, taiwan pop, taiwa...        189833   
132                                                

In [48]:
# merge data
merged_yt_sp = pd.merge(merged_yt, basic_spotify_records_df, how='left', left_on='Headliner', right_on='sp artist_name')

NameError: name 'merged_yt' is not defined

In [None]:
## Step 5: Save Data

In [None]:
# Create a Pandas Excel writer
with pd.ExcelWriter(f'Merged_data.xlsx', engine='xlsxwriter') as writer:
    # Write each DataFrame to a different sheet in the Excel file
    merged_yt_sp.to_excel(writer, sheet_name=f'merged_yt_sp', index=False)