In [1]:
import pandas as pd
import re

from nltk.tokenize import RegexpTokenizer

from youtube_data_api import get_videos_with_keyword, get_video_statistics, get_channel_statistics



In [None]:
def standardize_text(df_filtered, text_field):
    df_filtered[text_field] = df_filtered[text_field].str.lower()
    df_filtered[text_field] = df_filtered[text_field].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs
    return df_filtered

In [4]:
MAX_RESULTS_PER_PAGE = 50
TOTAL_VIDEOS = 200
DELAY_BETWEEN_REQUESTS = 0.1  # delay in seconds

In [5]:

videos = get_videos_with_keyword('KSW', TOTAL_VIDEOS, MAX_RESULTS_PER_PAGE, DELAY_BETWEEN_REQUESTS)

In [6]:
video_data = []
for video in videos:
    video_id = video['id']['videoId']
    title = video['snippet']['title']
    statistics = get_video_statistics(video_id)
    view_count = statistics.get('viewCount', 0)
    channel_id = video['snippet']['channelId']
    channel = video['snippet']['channelTitle']
    channel_statistics = get_channel_statistics(channel_id)
    channel_subscribers = channel_statistics.get('subscriberCount', 0)
    video_info = {
        'video_id': video_id,
        'title': title,
        'view_count': view_count,
        'channel': channel,
        'channel_subscribers': channel_subscribers
    }
    video_data.append(video_info)

# Confersion of dicts to DataFrame
df = pd.DataFrame(video_data)


In [44]:
videos[0]

{'kind': 'youtube#searchResult',
 'etag': 'XAGCV0vnznx9K5ubd5wtShjap3U',
 'id': {'kind': 'youtube#video', 'videoId': 'aYZgNn1aaaY'},
 'snippet': {'publishedAt': '2023-10-02T14:00:01Z',
  'channelId': 'UCvgfXK4nTYKudb0rFR6noLA',
  'title': 'Bobby Green vs Tony Ferguson | FREE FIGHT | UFC Vegas 80',
  'description': 'Tune in to UFC Vegas 80 main card action on September 23 at 7pm ET / 4pm PT! Subscribe to get all the latest UFC content: ...',
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/aYZgNn1aaaY/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/aYZgNn1aaaY/mqdefault.jpg',
    'width': 320,
    'height': 180},
   'high': {'url': 'https://i.ytimg.com/vi/aYZgNn1aaaY/hqdefault.jpg',
    'width': 480,
    'height': 360}},
  'channelTitle': 'UFC',
  'liveBroadcastContent': 'none',
  'publishTime': '2023-10-02T14:00:01Z'}}

In [8]:
df.head(10)

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
0,MAlCshkwRIM,OSTASZEWSKI | Walka ZAŁĘCKI vs OMIELAŃCZUK. Sy...,16038,FANSPORTU TV,118000
1,wG9SyhB-obU,Sebastian PRZYBYSZ - faul Wikłacza | Słowa Mar...,9032,InTheCagePL,56200
2,9ApiSJm0nQo,KSW 84: TOP 10 Moments - Najlepsze akcje gali,20338,KSW,509000
3,_FsoU_XHAYk,"Nokaut, poddanie i walka wieczoru gali KSW 86 ...",16624,KSW,509000
4,UY2W1B2GEPQ,OMIELAŃCZUK | &quot;NIENAWIDZĘ DENISA ZAŁĘCKIE...,26650,FANSPORTU TV,118000
5,D5A7tFSgpp0,KSW 87: Roman Szymański vs Leo Brichta - Trailer,9015,KSW,509000
6,av889OfoiQE,Najlepsze momenty gali XTB KSW Colosseum 2 | X...,89293,KSW,509000
7,O9hQFOw_btA,&quot;Bardzo frajerski numer&quot;! Szef KSW w...,64391,KLATKA po KLATCE,12500
8,4UAwsq7B3fY,BARTOSZ KOWAL | STANY | CAVE | FREAKI | KHALID...,375,Tv F-O-T-O wydarzenia i wywiady,2440
9,outU0orcBTU,Otwarcie gali XTB KSW Colosseum 2 | Grand Opening,112416,KSW,509000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   video_id             200 non-null    object
 1   title                200 non-null    object
 2   view_count           200 non-null    object
 3   channel              200 non-null    object
 4   channel_subscribers  200 non-null    object
dtypes: object(5)
memory usage: 7.9+ KB


I need to get rid of videos from official KSW channels as they are not my target and make disturbs in data

In [10]:
df_filtered = df[df['channel'] != 'KSW']

In [11]:
df_filtered

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
0,MAlCshkwRIM,OSTASZEWSKI | Walka ZAŁĘCKI vs OMIELAŃCZUK. Sy...,16038,FANSPORTU TV,118000
1,wG9SyhB-obU,Sebastian PRZYBYSZ - faul Wikłacza | Słowa Mar...,9032,InTheCagePL,56200
4,UY2W1B2GEPQ,OMIELAŃCZUK | &quot;NIENAWIDZĘ DENISA ZAŁĘCKIE...,26650,FANSPORTU TV,118000
7,O9hQFOw_btA,&quot;Bardzo frajerski numer&quot;! Szef KSW w...,64391,KLATKA po KLATCE,12500
8,4UAwsq7B3fY,BARTOSZ KOWAL | STANY | CAVE | FREAKI | KHALID...,375,Tv F-O-T-O wydarzenia i wywiady,2440
...,...,...,...,...,...
193,o5IpISkgUn8,🎤 KSW Colosseum 2: MEDIA DAY [WSZYSTKIE WYWIADY],3367,InTheCagePL,56200
195,x_pve7KEvLw,K. Głowacki po głośnym debiucie w KSW zmienia ...,30451,Filip Lewandowski - rozmowy z fighterami,6980
197,I6xNnHmQ6WY,GWIAZDY KSW TYPUJĄ WALKĘ SZPILKA - PUDZIANOWSKI!,26966,KOLOSEUM,70000
198,aRoLjJR3pMo,BOREK ROZDAJE BONUSY NA XTB KSW 81,19215,Kanał Sportowy,1120000


In [13]:
clean_df = standardize_text(df_filtered, "title")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[text_field] = df_filtered[text_field].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[text_field] = df_filtered[text_field].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs


In [14]:
clean_df.head()

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
0,MAlCshkwRIM,ostaszewski | walka załęcki vs omielańczuk. sy...,16038,FANSPORTU TV,118000
1,wG9SyhB-obU,sebastian przybysz - faul wikłacza | słowa mar...,9032,InTheCagePL,56200
4,UY2W1B2GEPQ,omielańczuk | &quot;nienawidzę denisa załęckie...,26650,FANSPORTU TV,118000
7,O9hQFOw_btA,&quot;bardzo frajerski numer&quot;! szef ksw w...,64391,KLATKA po KLATCE,12500
8,4UAwsq7B3fY,bartosz kowal | stany | cave | freaki | khalid...,375,Tv F-O-T-O wydarzenia i wywiady,2440


In [15]:
clean_df.groupby("channel").count()

Unnamed: 0_level_0,video_id,title,view_count,channel_subscribers
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6PAK,1,1,1,1
AntyFakty,1,1,1,1
BOXDEL,1,1,1,1
Binge Society - Dark,1,1,1,1
Cage of MMA,2,2,2,2
FANSPORTU TV,8,8,8,8
FightsportPL,5,5,5,5
Filip Lewandowski - rozmowy z fighterami,1,1,1,1
Fortuna - Zakłady bukmacherskie,3,3,3,3
InTheCagePL,16,16,16,16


In [16]:


tokenizer = RegexpTokenizer(r'\w+')

clean_df["tokens"] = clean_df["title"].apply(tokenizer.tokenize)
clean_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["tokens"] = clean_df["title"].apply(tokenizer.tokenize)


Unnamed: 0,video_id,title,view_count,channel,channel_subscribers,tokens
0,MAlCshkwRIM,ostaszewski | walka załęcki vs omielańczuk. sy...,16038,FANSPORTU TV,118000,"[ostaszewski, walka, załęcki, vs, omielańczuk,..."
1,wG9SyhB-obU,sebastian przybysz - faul wikłacza | słowa mar...,9032,InTheCagePL,56200,"[sebastian, przybysz, faul, wikłacza, słowa, m..."
4,UY2W1B2GEPQ,omielańczuk | &quot;nienawidzę denisa załęckie...,26650,FANSPORTU TV,118000,"[omielańczuk, quot, nienawidzę, denisa, załęck..."
7,O9hQFOw_btA,&quot;bardzo frajerski numer&quot;! szef ksw w...,64391,KLATKA po KLATCE,12500,"[quot, bardzo, frajerski, numer, quot, szef, k..."
8,4UAwsq7B3fY,bartosz kowal | stany | cave | freaki | khalid...,375,Tv F-O-T-O wydarzenia i wywiady,2440,"[bartosz, kowal, stany, cave, freaki, khalidov..."


In [31]:
all_words = [word for sublist in clean_df["tokens"] for word in sublist]

# Convert array to Pandas Series
word_series = pd.Series(all_words)

# Filter out words that have at least 3 characters
filtered_words = word_series[word_series.apply(lambda x: len(x) > 3)]

# Count the occurrences of each word
word_counts = filtered_words.value_counts()

#I need to create a list of words that can disturb my data mining. These are parts of speech other than nouns and words with a meaning broader than mma
list_of_disturbing_words = ['quot', 'gorąco']

# Choose the 10 most common words
hot_words = word_counts.head(30).index.tolist()

for word in hot_words:
    if word in list_of_disturbing_words:

        hot_words.remove(word)

top_10_words = hot_words[0:10]
top_10_words.extend(['ksw', 'mma'])

In [32]:
print(top_10_words)

['colosseum', 'szpilka', 'borek', 'parnasse', 'khalidov', 'pudzianowski', 'wikłacz', 'przybysz', 'lewandowski', 'pudzian', 'ksw', 'mma', 'ufc']


now I'll read more data from API

In [36]:
TOTAL_VIDEOS = 10000

videos = get_videos_with_keyword(top_10_words, TOTAL_VIDEOS, MAX_RESULTS_PER_PAGE, DELAY_BETWEEN_REQUESTS)

video_data = []
for video in videos:
    video_id = video['id']['videoId']
    title = video['snippet']['title']
    publish_time = video['snippet']['publishTime']
    statistics = get_video_statistics(video_id)
    view_count = statistics.get('viewCount', 0)
    channel_id = video['snippet']['channelId']
    channel = video['snippet']['channelTitle']
    channel_statistics = get_channel_statistics(channel_id)
    channel_subscribers = channel_statistics.get('subscriberCount', 0)
    video_info = {
        'video_id': video_id,
        'title': title,
        'view_count': view_count,
        'channel': channel,
        'channel_subscribers': channel_subscribers,
        'publish_time': publish_time,
    }
    video_data.append(video_info)

# Conversion of dicts to DataFrame
df = pd.DataFrame(video_data)


In [37]:
videos[0]

{'kind': 'youtube#searchResult',
 'etag': 'XAGCV0vnznx9K5ubd5wtShjap3U',
 'id': {'kind': 'youtube#video', 'videoId': 'aYZgNn1aaaY'},
 'snippet': {'publishedAt': '2023-10-02T14:00:01Z',
  'channelId': 'UCvgfXK4nTYKudb0rFR6noLA',
  'title': 'Bobby Green vs Tony Ferguson | FREE FIGHT | UFC Vegas 80',
  'description': 'Tune in to UFC Vegas 80 main card action on September 23 at 7pm ET / 4pm PT! Subscribe to get all the latest UFC content: ...',
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/aYZgNn1aaaY/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/aYZgNn1aaaY/mqdefault.jpg',
    'width': 320,
    'height': 180},
   'high': {'url': 'https://i.ytimg.com/vi/aYZgNn1aaaY/hqdefault.jpg',
    'width': 480,
    'height': 360}},
  'channelTitle': 'UFC',
  'liveBroadcastContent': 'none',
  'publishTime': '2023-10-02T14:00:01Z'}}

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562 entries, 0 to 561
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   video_id             562 non-null    object
 1   title                562 non-null    object
 2   view_count           562 non-null    object
 3   channel              562 non-null    object
 4   channel_subscribers  562 non-null    object
 5   publish_time         562 non-null    object
dtypes: object(6)
memory usage: 26.5+ KB


In [39]:
df.head()

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers,publish_time
0,aYZgNn1aaaY,Bobby Green vs Tony Ferguson | FREE FIGHT | UF...,430705,UFC,16900000,2023-10-02T14:00:01Z
1,-xAKq2yWM3s,Charles Oliveira vs Islam Makhachev | FREE FIG...,4027978,UFC,16900000,2023-09-25T17:40:56Z
2,p5TkMyRqWe8,Charles Oliveira ACCUSES UFC of UNFAIR TREATME...,88542,Full Mount MMA,481000,2023-10-02T19:15:06Z
3,1N7SF_issiQ,Paulo Costa vs Luke Rockhold | FREE FIGHT | UF...,872278,UFC,16900000,2023-09-30T14:00:02Z
4,HdTiQnBzoEk,Dana forcing Belal to fight Usman - No Title Shot,21581,Mojahed Fudailat,531000,2023-10-03T14:18:18Z


In [40]:

# Export to CSV
df.to_csv('youtube_data.csv', index=False)


Here I finished data aggregation. Futher work will be done in seperate file.