In [7]:
import pandas as pd
import re
import os
import time
import csv
import requests
from dotenv import load_dotenv

In [10]:
def extract_unique_studios(input_file, output_file):
    # Read the CSV file
    df = pd.read_csv(input_file)
    
    # Filter out rows with 'Not specified'
    df = df[df['recorded_at'] != 'Not specified']
    
    # Create an empty set to store unique studios
    unique_studios = set()
    
    # Iterate through each row and split the studios
    for studios in df['recorded_at']:
        if isinstance(studios, str):  # Check if the value is a string
            # Split by semicolon and strip whitespace
            studio_list = [studio.strip() for studio in studios.split(';')]
            unique_studios.update(studio_list)
    
    # Convert set to sorted list
    sorted_studios = sorted(unique_studios)
    
    # Create a DataFrame with the unique studios
    output_df = pd.DataFrame({'studio_name': sorted_studios})
    
    # Save to CSV
    output_df.to_csv(output_file, index=False)
    
    return len(sorted_studios)  # Return count of unique studios

In [18]:
# Example usage
input_file = 'top_album_studios_2022_2024.csv'
output_file = 'unique_studios_2022_24.csv'

num_studios = extract_unique_studios(input_file, output_file)

In [19]:
print(f"Successfully extracted {num_studios} unique studios to {output_file}")

Successfully extracted 412 unique studios to unique_studios_2022_24.csv


In [23]:
studios_2022_24 = pd.read_csv('unique_studios_2022_24.csv')

In [26]:
studio_list = studios_2022_24['studio_name'].tolist()

In [27]:
studio_list

['08001 Studio',
 '38 Fresh',
 '4 Stars Recording Studios',
 '54 Sound',
 'ABC Studios, Adelaide',
 'ABS Recording',
 'AR Studios',
 'Abbey Road Studios',
 'Adorable Trap',
 'Air Lyndhurst Hall',
 'All Faders Up',
 'All Hallows-on-the-Wall Church, London',
 'All Saints Church, Tooting, London',
 'Almelo Jazz Club',
 'Ameraycan Studios',
 'American Recording Co.',
 'Analog Lab',
 'Angel Sound Studio, Barcelona',
 'Angelic Studios',
 'Apmamman',
 'Atlantic Studios',
 'Atlantis Metronome',
 'Audu Music Studio',
 'Augustana Hochschule Neuendettelsau',
 'Avatar Studios',
 "BBC Radio 1's Live Lounge",
 'BOOMcity Studios, Nashville, TN',
 'Backstage Recording Studio',
 'Balboa Recording Studio',
 'Barber Shop Studios',
 'Barron Studios',
 'Baseline Studios',
 'Basement Beats',
 'Basilica di San Marco, Venice',
 'Beauty Supply Studios',
 "Betty's",
 'Beuron, Kirche Der Erzabtei St. Martin',
 'Big Bag Sound, Los Angeles, CA',
 'Big Hit Studio',
 'Big Loud Studios',
 'Big Mercy Studio',
 'Big So

In [28]:
import os
import time
import requests
import csv
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
DISCOGS_TOKEN = os.getenv('discogs_token')
USER_AGENT = os.getenv('USER_AGENT')

def search_studio(studio_name):
    """
    Search for a studio by name using Discogs API
    """
    headers = {
        'Authorization': f'Discogs token={DISCOGS_TOKEN}',
        'User-Agent': USER_AGENT
    }
    
    params = {
        'q': studio_name,
        'type': 'label',
        'per_page': 1
    }
    
    while True:
        try:
            response = requests.get(
                'https://api.discogs.com/database/search',
                headers=headers,
                params=params
            )
            if response.status_code == 429:
                retry_after = int(response.headers.get('Retry-After', 1))
                print(f"Rate limit hit. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
                continue

            response.raise_for_status()
            results = response.json()
            
            if results['results']:
                resource_url = results['results'][0]['resource_url']
                studio_id = resource_url.split('/')[-1]
                return studio_id
            return None
        except requests.exceptions.RequestException as e:
            print(f"Error searching for studio '{studio_name}': {e}")
            return None

def fetch_studio_data(studio_id):
    """
    Fetch detailed information about a studio from Discogs API
    """
    headers = {
        'Authorization': f'Discogs token={DISCOGS_TOKEN}',
        'User-Agent': USER_AGENT
    }
    
    url = f'https://api.discogs.com/labels/{studio_id}'
    
    while True:
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 429:
                retry_after = int(response.headers.get('Retry-After', 30))
                print(f"Rate limit hit. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
                continue

            response.raise_for_status()
            studio_data = response.json()
            
            # If there's a parent label, fetch its details
            parent_label_data = None
            if studio_data.get('parent_label'):
                parent_url = studio_data['parent_label']['resource_url']
                time.sleep(1)  # Rate limiting delay
                parent_response = requests.get(parent_url, headers=headers)
                if parent_response.status_code == 200:
                    parent_label_data = parent_response.json()
            
            return studio_data, parent_label_data
            
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for studio ID {studio_id}: {e}")
            return None, None

def process_studios(studio_names):
    """
    Process a list of studio names and return their data
    """
    studios_data = []
    
    for studio_name in studio_names:
        print(f"Processing: {studio_name}")
        
        studio_id = search_studio(studio_name.strip())
        
        if studio_id:
            studio_data, parent_label_data = fetch_studio_data(studio_id)
            
            if studio_data:
                # Extract links from the URLs array
                urls = studio_data.get('urls', [])
                formatted_links = '; '.join(urls) if urls else ''
                
                # Get parent label information
                parent_label_info = ''
                if parent_label_data:
                    parent_name = parent_label_data.get('name', '')
                    parent_location = parent_label_data.get('profile', '').split('\r\n')[0]  # Often first line has location
                    parent_label_info = f"{parent_name} ({parent_location})" if parent_location else parent_name
                
                studio_info = {
                    'name': studio_data.get('name'),
                    'profile': studio_data.get('profile', ''),
                    'contact_info': studio_data.get('contact_info', ''),
                    'links': formatted_links,
                    'parent_label': parent_label_info
                }
                studios_data.append(studio_info)
        else:
            studios_data.append({
                'name': studio_name.strip(),
                'profile': 'Not found',
                'contact_info': '',
                'links': '',
                'parent_label': ''
            })
            
        time.sleep(1)
    
    return studios_data

def save_to_csv(studios_data, output_file='studios_data_discogs_pull_2022_24.csv'):
    """
    Save studios data to CSV file
    """
    if not studios_data:
        print("No data to save")
        return
    
    fieldnames = ['name', 'profile', 'contact_info', 'links', 'parent_label']
    
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(studios_data)
        print(f"Data saved to {output_file}")
        
def main():
    studio_names = studio_list  # Make sure to define this variable with your studio names
    studios_data = process_studios(studio_names)
    save_to_csv(studios_data)
    
if __name__ == "__main__":
    main()

Processing: 08001 Studio
Processing: 38 Fresh
Processing: 4 Stars Recording Studios
Processing: 54 Sound
Processing: ABC Studios, Adelaide
Processing: ABS Recording
Processing: AR Studios
Processing: Abbey Road Studios
Processing: Adorable Trap
Processing: Air Lyndhurst Hall
Processing: All Faders Up
Processing: All Hallows-on-the-Wall Church, London
Processing: All Saints Church, Tooting, London
Processing: Almelo Jazz Club
Processing: Ameraycan Studios
Processing: American Recording Co.
Processing: Analog Lab
Processing: Angel Sound Studio, Barcelona
Processing: Angelic Studios
Processing: Apmamman
Processing: Atlantic Studios
Processing: Atlantis Metronome
Processing: Audu Music Studio
Processing: Augustana Hochschule Neuendettelsau
Processing: Avatar Studios
Processing: BBC Radio 1's Live Lounge
Processing: BOOMcity Studios, Nashville, TN
Processing: Backstage Recording Studio
Processing: Balboa Recording Studio
Processing: Barber Shop Studios
Processing: Barron Studios
Processing:

Processing: Johanniskirche, Göttingen
Processing: Jon's Studio
Processing: Jubilee Road Recording Suite
Processing: Jungle City Studios
Processing: KDS Music Studios
Processing: KHJ Studios
Processing: Kass Cave
Processing: Kempinski Hotel Corvinus
Processing: Kingsway Studios (3)
Processing: Kirche Neumünster, Zürich
Processing: Kiss The Chief Studios
Processing: Kulturzentrum Grand Hotel, Gustav-Mahler-Saal
Processing: LVRN Studios
Processing: La Base (3)
Rate limit hit. Retrying after 1 seconds...
Rate limit hit. Retrying after 30 seconds...
Processing: La Caja De Música (2)
Processing: La Casa de Isaza, Bogotá
Processing: La Fabrique
Processing: La Frette Studios
Processing: Larrabee Sound Studios
Processing: Larrabee West
Processing: Larry And George Studios
Processing: Leeds Town Hall
Processing: Legacy Towers
Processing: LeoRD Produciendo Studio
Processing: Levcon Studios
Processing: Liederkranz Hall, New York
Processing: Little Seed Studios
Processing: Logos Studio (2)
Processi

Processing: Sound Stage Studios
Processing: Sound Techniques, London
Processing: Southern Ground Studios, Nashville
Processing: Soy Sauce Studio
Processing: Square Houze
Processing: St Alban's Church, Holborn
Processing: St John-at-Hackney, London
Processing: St. John's, Smith Square
Processing: St. Osdag Church
Processing: Stamford Street
Processing: Stankonia Recording
Processing: Stockholm Syndrome Sound
Processing: Strongroom
Processing: Studio 112 (4)
Processing: Studio Bambi Gang
Processing: Studio Engine 55
Processing: Studio In The Clouds
Processing: Studio Wonderlust
Processing: Studio Young
Processing: Studio-T
Processing: Studios De l'O.R.T.F.
Processing: Studios at Fischer
Processing: Sun Studios
Processing: Sun Studios, Dublin
Processing: Sunset Sound
Processing: Superlegal Studios
Processing: Symphony Hall, Salt Lake City
Processing: Take Away Studios
Processing: Terminal C
Processing: The Barony Of Rosendal
Rate limit hit. Retrying after 30 seconds...
Processing: The Bas

In [29]:
def extract_location(row):
    """Extract city and country from profile and contact_info"""
    # Initialize default values
    city = ''
    country = ''
    
    # Common country abbreviations and full names
    country_map = {
        'U.S.A.': 'USA',
        'US': 'USA',
        'United States': 'USA',
        'USA': 'USA'
    }
    
    # Try to find city and country in contact_info first
    if pd.notna(row['contact_info']):
        # Look for city, state format (e.g., "Dallas, Texas" or "Baltimore, MD")
        city_state = re.search(r'(?i)[A-Za-z ]+,\s*[A-Za-z]{2}\s*\d', row['contact_info'])
        if city_state:
            city = city_state.group(0).split(',')[0].strip()
        
        # Look for country
        for c in country_map.keys():
            if c in row['contact_info']:
                country = country_map[c]
                break
    
    # If not found in contact_info, try profile
    if (not city or not country) and pd.notna(row['profile']):
        # Look for "in City, State" pattern
        location_match = re.search(r'(?i)in ([A-Za-z ]+),\s*[A-Za-z]{2}', row['profile'])
        if location_match:
            city = location_match.group(1).strip()
            country = 'USA'  # Assume USA if state format is found
    
    return pd.Series({'City': city, 'Country': country})

def extract_contact_details(row):
    """Extract email, phone, and fax from contact_info"""
    email = ''
    phone = ''
    fax = ''
    
    if pd.notna(row['contact_info']):
        # Extract email
        email_match = re.search(r'(?i)(?:email:\s*)?([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', row['contact_info'])
        if email_match:
            email = email_match.group(1).lower()
        
        # Extract phone
        phone_match = re.search(r'(?i)(?:phone|p|telephone|tel|t|mobile|mob|m):\s*([0-9()-.\s]{10,})', row['contact_info'])
        if phone_match:
            phone = phone_match.group(1).strip()
        
        # Extract fax
        fax_match = re.search(r'(?i)(?:fax|f):\s*([0-9()-.\s]{10,})', row['contact_info'])
        if fax_match:
            fax = fax_match.group(1).strip()
    
    return pd.Series({'Email': email, 'Phone': phone, 'Fax': fax})

def process_studio_data(input_file, output_file):
    """Process the studio data and add new columns"""
    # Read the CSV file
    df = pd.read_csv(input_file)
    
    # Extract location information
    location_info = df.apply(extract_location, axis=1)
    df[['City', 'Country']] = location_info
    
    # Extract contact details
    contact_info = df.apply(extract_contact_details, axis=1)
    df[['Email', 'Phone', 'Fax']] = contact_info
    
    # Clean up the data
    df['City'] = df['City'].str.title()
    df['Email'] = df['Email'].str.lower()
    
    # Save the processed data
    df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")
    
    # Print a sample of extracted information
    print("\nSample of extracted information:")
    print(df[['name', 'City', 'Country', 'Email', 'Phone', 'Fax']].head())
    
    # Print some statistics
    print("\nExtraction statistics:")
    print(f"Total records: {len(df)}")
    print(f"Records with city: {df['City'].notna().sum()}")
    print(f"Records with email: {df['Email'].notna().sum()}")
    print(f"Records with phone: {df['Phone'].notna().sum()}")
    print(f"Records with fax: {df['Fax'].notna().sum()}")

if __name__ == "__main__":
    input_file = "studios_data_discogs_pull_2022_24.csv"
    output_file = "studios_data_processed_2022_24.csv"
    process_studio_data(input_file, output_file)

Processed data saved to studios_data_processed_2022_24.csv

Sample of extracted information:
                        name         City Country Email Phone Fax
0               08001 Studio                                     
1                   38 Fresh  Los Angeles     USA                
2  4 Stars Recording Studios  Miami Beach                        
3                   54 Sound     Ferndale     USA                
4      ABC Studios, Adelaide                                     

Extraction statistics:
Total records: 412
Records with city: 412
Records with email: 412
Records with phone: 412
Records with fax: 412


In [34]:
processed_2019 = pd.read_csv("studios_data_processed_2019.csv")
processed_2020_21 = pd.read_csv("studios_data_processed_2020_21.csv")
processed_2022_24 = pd.read_csv("studios_data_processed_2022_24.csv")

df = pd.concat([processed_2019, processed_2020_21, processed_2022_24])
df = df.drop_duplicates(subset='name', keep='first')
df.to_csv("collated_studios_processed_2022_24.csv", index=False)

In [49]:
import pandas as pd

def update_studio_data(collated_df, sample):
    """
    Update matching studio records in collated_df with values from sample
    
    Parameters:
    collated_df (pandas.DataFrame): The main collated dataset
    sample (pandas.DataFrame): The 2019 dataset with updated values
    
    Returns:
    pandas.DataFrame: Updated version of collated_df
    """
    # Create a copy of collated_df to avoid modifying the original
    result_df = collated_df.copy()
    
    # Define columns to update (columns that exist in both dataframes)
    update_columns = ['address', 'city', 'country', 'links', 'email', 'phone']
    
    # Ensure column names match between dataframes
    sample_renamed = sample.copy()
    
    # Create a mask for matching studios
    matching_studios = result_df['name'].isin(sample_renamed['name'])
    
    # For each column we want to update
    for col in update_columns:
        if col in sample_renamed.columns and col in result_df.columns:
            # Create a mapping dictionary for this column
            update_map = dict(zip(sample_renamed['name'], sample_renamed[col]))
            
            # Update only non-null values from 2019 dataset
            for studio_name, new_value in update_map.items():
                if pd.notna(new_value):  # Only update if the new value isn't NaN
                    mask = (result_df['name'] == studio_name)
                    result_df.loc[mask, col] = new_value
    
    # Print summary of updates
    num_matches = matching_studios.sum()
    print(f"Found {num_matches} matching studios")
    for col in update_columns:
        if col in sample_renamed.columns:
            updates = sum(pd.notna(sample_renamed[sample_renamed['name'].isin(result_df['name'])][col]))
            print(f"Updated {updates} non-null values in '{col}' column")
    
    return result_df


In [53]:
# Read the CSV files
sample = pd.read_csv('studios_data_processed_2020_21.csv')
collated = pd.read_csv('collated_studios_processed_2019_24.csv')

# Update the data
updated_df = update_studio_data(collated, sample)

Found 570 matching studios
Updated 250 non-null values in 'address' column
Updated 470 non-null values in 'city' column
Updated 480 non-null values in 'country' column
Updated 303 non-null values in 'links' column
Updated 79 non-null values in 'email' column
Updated 139 non-null values in 'phone' column


In [61]:
updated_df

0

In [55]:
updated_df.to_csv("collated_studios_processed_2019_24.csv", index=False)