# Crawler para encontrar e baixar vídeos do youtube baseado em um assunto

## Import das bibliotecas

In [1]:
!pip install pandas
!pip install youtube_dl



In [2]:
import time
import subprocess
import pandas as pd
from youtube_dl import YoutubeDL
from __future__ import unicode_literals

## Configurações iniciais

In [3]:
# Dataset do google com labels adicionados manualmente referente a eventos de audio. Leia mais em: https://research.google.com/audioset/index.html
# Também pode ser obtido em: http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
df_eval_segments = pd.read_csv('../../data/external/eval_segments.csv', skipinitialspace = True, quotechar='"', names=["YTID", "start_seconds", "end_seconds", "labels"])

# Dataset contendo os nomes e demais informações dos eventos.
# Também pode ser obtido em: http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
df_class_labels_index = pd.read_csv('../../data/external/youtube_class_labels_index.csv')

In [4]:
# Remoção das 3 primeiras linhas. Contém informações sobre o dataset e não serviriam para a automação
df_eval_segments = df_eval_segments.iloc[3:]

In [5]:
# Filtrando o dataset pelas linhas que contem o label referente a categoria Cat
df_eval_segments = df_eval_segments[df_eval_segments.labels.str.contains('/m/01yrx')]

In [6]:
# Métodos para substituir os labels de df_eval_segments pelo display_name do dataframe `df_class_labels_index` para facil compreensão

#captura o nome do label no dataset df_class_labels_index
def label_description(label):
  return df_class_labels_index[df_class_labels_index.mid == label]['display_name'].values[0]

#substitui o código do label pelo nome do labeldef get_text_from_label(labels):
def get_text_from_label(labels):
  labels_array = labels.split(',')

  for label in labels_array:
    labels = labels.replace(label, label_description(label))
    
  return labels


In [7]:
#cria nova coluna com o nome dos labels
df_eval_segments['Label_names'] = df_eval_segments['labels'].apply(lambda x: get_text_from_label(x) )

In [8]:
# Calcula a duração do intervalo desejado em segundos

# Converte as colunas de str para float
df_eval_segments = df_eval_segments.astype({"end_seconds": float, "start_seconds": float})

# Insere em nova coluna
df_eval_segments.insert(loc=3, column='duration_seconds', value=df_eval_segments['end_seconds'] - df_eval_segments['start_seconds'])

## Funções para download e conversão dos audios

In [9]:
def download_youtube_audio(yt_id, start, duration):
  destination_folder = '../../data/raw/youtube_audios/' + yt_id + '.wav'
  
  ydl_opts = {
      'format': 'bestaudio/best',
      'postprocessors': [{
          'key': 'FFmpegExtractAudio',
          'preferredcodec': 'wav',
          'preferredquality': '192',
      }],
      'outtmpl': destination_folder
  }

  yt_url = 'https://www.youtube.com/watch?v=' + yt_id

  retorno = ''

  try:
    ydl = YoutubeDL(ydl_opts)
    result = ydl.extract_info(yt_url, download=True)
    video = result['entries'][0] if 'entries' in result else result

    url = video['url']
    retorno = video['title']

    # Atrasando o download do prõximo video para não bloquear o google
    time.sleep(1)
  except Exception as e:
      retorno = e

  return retorno

In [10]:
def download_audio_dataset(df):
  line = 0
  df['title'] = ''

  for index, row in df.iterrows():
    yt_id = row[0]
    start = row[1]
    duration = row[3]
    
    line = line + 1
    df.loc[index, 'title'] = download_youtube_audio(yt_id, start, duration)

  return df 

In [11]:
df_with_titles = download_audio_dataset(df_eval_segments)

ing webpage
[download] Destination: ..\..\data\raw\youtube_audios\8ewn7JCJbEM.wav
[download] 100% of 1.05MiB in 00:00                          
[ffmpeg] Post-process file ..\..\data\raw\youtube_audios\8ewn7JCJbEM.wav exists, skipping
[youtube] 8gyY5kN5IjA: Downloading webpage
[download] Destination: ..\..\data\raw\youtube_audios\8gyY5kN5IjA.wav
[download] 100% of 865.55KiB in 00:00                  
[ffmpeg] Correcting container in "..\..\data\raw\youtube_audios\8gyY5kN5IjA.wav"
[ffmpeg] Post-process file ..\..\data\raw\youtube_audios\8gyY5kN5IjA.wav exists, skipping
[youtube] 8ws1ligErwo: Downloading webpage
[youtube] 8ws1ligErwo: Downloading MPD manifest
[download] Destination: ..\..\data\raw\youtube_audios\8ws1ligErwo.wav
[download] 100% of 156.66KiB in 00:00                  
[ffmpeg] Correcting container in "..\..\data\raw\youtube_audios\8ws1ligErwo.wav"
[ffmpeg] Post-process file ..\..\data\raw\youtube_audios\8ws1ligErwo.wav exists, skipping
[youtube] 8xoenlVVBZw: Downloading web

## Resumo dos downloads

In [12]:
df_with_titles

Unnamed: 0,YTID,start_seconds,end_seconds,duration_seconds,labels,Label_names,title
79,-9byM5-ih2M,0.0,10.0,10.0,"/m/01yrx,/m/068hy,/m/07qrkrw,/m/07r_k2n,/m/07r...","Cat,Domestic animals, pets,Meow,Yip,Bow-wow,Sp...",ERROR: This video has been removed for violati...
224,-ZJqu_4zLMc,30.0,40.0,10.0,"/m/01yrx,/m/068hy,/m/07qrkrw,/m/07r81j2,/m/09x...","Cat,Domestic animals, pets,Meow,Caterwaul,Spee...",СУПЕР !!! Кот УРИ отвечает... CAT URI
269,-gSfPQqi6nI,30.0,40.0,10.0,"/m/01yrx,/m/068hy,/m/07qrkrw,/m/07r81j2,/m/0jbk","Cat,Domestic animals, pets,Meow,Caterwaul,Animal",Cute Cat
366,-uGHAvfqs2I,30.0,40.0,10.0,"/m/01yrx,/m/068hy,/m/07qrkrw,/m/07r81j2,/m/0jbk","Cat,Domestic animals, pets,Meow,Caterwaul,Animal",Maine Coon Cat Is Anxious For Breakfast
370,-vZ-_4oFCzs,17.0,27.0,10.0,"/m/01yrx,/m/068hy,/m/07qrkrw,/m/07r81j2,/m/0jbk","Cat,Domestic animals, pets,Meow,Caterwaul,Animal",Cat Speak
...,...,...,...,...,...,...,...
19701,xa-lg7S1vns,51.0,61.0,10.0,"/m/01yrx,/m/02yds9,/m/068hy,/m/0jbk","Cat,Purr,Domestic animals, pets,Animal","Kitten, about 1 week old, purring"
20151,zCdOEvduBTo,30.0,40.0,10.0,"/m/01yrx,/m/03qc9zr,/m/068hy,/m/0jbk","Cat,Screaming,Domestic animals, pets,Animal",Bo reacts to cat screams
20164,zE6BVRSQNZU,50.0,60.0,10.0,"/m/01yrx,/m/02yds9,/m/068hy,/m/0jbk","Cat,Purr,Domestic animals, pets,Animal",Cat purr - Loud.
20298,zh4tznWde1M,330.0,340.0,10.0,"/m/01yrx,/m/02yds9,/m/068hy,/m/0jbk","Cat,Purr,Domestic animals, pets,Animal",Adorable rescue kittens purring SO loud! (bes...
