# Databricks Summit Word Analysis 
### for 2023 versus 2024

## Get Transcripts from YouTube Videos

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
import re
import plotly.graph_objects as go
from collections import Counter

def get_youtube_transcripts(video_ids):
    """
    Download the transcript of every given video and merge them into one
    lower-cased string.

    A video whose transcript cannot be retrieved or assembled is reported on
    stdout and skipped, so the result may cover only some of the videos (or
    be empty).

    Args:
    video_ids (list): List of YouTube video IDs as strings.

    Returns:
    str: Lower-cased transcript text; each video's text is followed by a
    single space.
    """
    pieces = []
    for video_id in video_ids:
        try:
            # Each entry is indexed by 'text' to get the caption string
            entries = YouTubeTranscriptApi.get_transcript(video_id)
            spoken = ' '.join(entry['text'] for entry in entries)
        except Exception as e:
            # Best effort: report the failure and carry on with the next video
            print(f"Error fetching transcript for video ID {video_id}: {e}")
        else:
            pieces.append(spoken + ' ')

    return ''.join(pieces).lower()

# YouTube video IDs (taken from each video's URL) for the conference keynotes:
video_ids_2023 = ['h4z4vBoxQ6s', 'un_telVvKiY']  # Databricks 2023
video_ids_2024 = ['UfbyzK488Hk', 'uB0n4IZmS34']  # Databricks 2024

# Run the function to get the (lower-cased) transcripts from YouTube
transcript_2023 = get_youtube_transcripts(video_ids_2023)
transcript_2024 = get_youtube_transcripts(video_ids_2024)

# Clean up the misquoted text that YouTube created when it tried to convert
# speech to text. Each entry is a whole-word regex for one observed
# mis-transcription of "databricks".
databricks_variations = [
    r'\bdata breaks\b', r'\bdata bricks\b', r'\bdata brick\b', r'\bdata breakes\b',
    r'\bdata bri\b', r'\bdata brak\b', r'\bdata braks\b',
    r'\bdatabrick\b', r'\bdatabriks\b', r'\bdata briks\b', r'\bdata breakers\b',
    r'\bdata brakes\b', r'\bdatab bricks\b',
    r'\bdata brake\b', r'\bdata break\b', r'\bdatab briak\b', r'\bdatab briaks\b',
    r'\bdatab brick\b',
    r'\bdata brecks\b', r'\bdata briak\b', r'\bdata brook\b', r'\bdata bir\b',
    r"\bdata burk's\b", r'\bdata brs\b',
]
# The variations are literal, word-bounded and cannot overlap one another, so
# a single pass with one combined alternation gives the same result as
# substituting each variation in turn, without rescanning the text per entry.
databricks_pattern = re.compile('|'.join(databricks_variations), flags=re.IGNORECASE)
transcript_2023 = databricks_pattern.sub('databricks', transcript_2023)
transcript_2024 = databricks_pattern.sub('databricks', transcript_2024)

# Fix other speech foul-ups
sequel_pattern = re.compile(r'\bsequel\b', flags=re.IGNORECASE)
transcript_2023 = sequel_pattern.sub('sql', transcript_2023)
transcript_2024 = sequel_pattern.sub('sql', transcript_2024)

# Split the text on whitespace to get the word list and final word count
transcript_2023_words = transcript_2023.split()
transcript_2024_words = transcript_2024.split()
transcript_2023_wordcount = len(transcript_2023_words)
transcript_2024_wordcount = len(transcript_2024_words)

# Now, get the count of the following words: "spark" "ai", "data", "model" for each year's transcript
words_to_count = ['spark', 'ai', 'data', 'model']

def count_words(word_list, words_to_count):
    """
    Count how often each word of interest occurs in a list of words.

    Args:
    word_list (list): Words to tally (exact, case-sensitive matches).
    words_to_count (list): Words whose occurrence counts are wanted.

    Returns:
    dict: Maps each entry of words_to_count to its number of occurrences
    in word_list (0 when it never occurs).
    """
    frequencies = Counter(word_list)
    tallies = {}
    for target in words_to_count:
        # A Counter yields 0 for a key it has never seen
        tallies[target] = frequencies[target]
    return tallies

# Tally the keywords of interest separately for each year's word list
count_2023, count_2024 = (
    count_words(yearly_words, words_to_count)
    for yearly_words in (transcript_2023_words, transcript_2024_words)
)

# For each year, express the count of each of those 4 words as a percentage of that year's total word count
def calculate_percentage(count_dict, total_words):
    """
    Convert raw word counts into percentages of a total word count.

    Args:
    count_dict (dict): Maps each word to its number of occurrences.
    total_words (int): Total number of words the counts were taken from.

    Returns:
    dict: Maps each word to count / total_words * 100. When total_words is 0
    (e.g. an empty transcript) every word maps to 0.0 instead of raising
    ZeroDivisionError.
    """
    if total_words == 0:
        # Nothing was counted, so every share is zero by definition
        return {word: 0.0 for word in count_dict}
    return {word: (count / total_words) * 100 for word, count in count_dict.items()}

# Turn each year's keyword counts into percentages of that year's word total
percentages_2023, percentages_2024 = (
    calculate_percentage(yearly_counts, yearly_total)
    for yearly_counts, yearly_total in (
        (count_2023, transcript_2023_wordcount),
        (count_2024, transcript_2024_wordcount),
    )
)


# Graph it

In [None]:

# Prepare data for Plotly: one [2023, 2024] pair of percentages per keyword
years = ['2023', '2024']
yearly_percentages = (percentages_2023, percentages_2024)
data_percentages = [pct['data'] for pct in yearly_percentages]
ai_percentages = [pct['ai'] for pct in yearly_percentages]
model_percentages = [pct['model'] for pct in yearly_percentages]
spark_percentages = [pct['spark'] for pct in yearly_percentages]

# One bar trace per keyword, each with its own colour
trace_data = go.Bar(name='Data', x=years, y=data_percentages, marker_color='#98102A')
trace_ai = go.Bar(name='AI', x=years, y=ai_percentages, marker_color='#2272B4')
trace_model = go.Bar(name='Model', x=years, y=model_percentages, marker_color='#FFAB00')
trace_spark = go.Bar(name='Spark', x=years, y=spark_percentages, marker_color='#00A972')

# Title anchored to the right edge of the figure
summit_title = dict(
    text=''' Summit Keyword Freq Comparison<br>(as Percentage of Total Words)''',
    x=1,  # Right justify the title
    xanchor='right',
    yanchor='top',
)

# Logo positioned in paper coordinates, just above the top-left of the plot
databricks_logo = dict(
    source="https://upload.wikimedia.org/wikipedia/commons/6/63/Databricks_Logo.png",
    xref="paper",
    yref="paper",
    x=0,
    y=1.05,
    sizex=0.40,
    sizey=0.40,
    xanchor="left",
    yanchor="bottom",
)

# Grouped-bar layout on a white background
layout = go.Layout(
    title=summit_title,
    xaxis={'title': 'Summit Year'},
    yaxis={'title': 'Percentage of Total Words'},
    barmode='group',
    width=600,  # Keep the plot narrow
    plot_bgcolor='white',
    paper_bgcolor='white',
    images=[databricks_logo],
)

# Assemble the figure and display it
fig = go.Figure(
    data=[trace_data, trace_ai, trace_model, trace_spark],
    layout=layout,
)
fig.show()