In [1]:
import pandas as pd
import re

from nltk.tokenize import RegexpTokenizer

from youtube_data_api import get_videos_with_keyword, get_video_statistics, get_channel_statistics

In [2]:
MAX_RESULTS_PER_PAGE = 50
TOTAL_VIDEOS = 200
DELAY_BETWEEN_REQUESTS = 1  # delay in seconds

In [3]:

videos = get_videos_with_keyword('KSW', TOTAL_VIDEOS, MAX_RESULTS_PER_PAGE, DELAY_BETWEEN_REQUESTS)

In [4]:
video_data = []
for video in videos:
    video_id = video['id']['videoId']
    title = video['snippet']['title']
    statistics = get_video_statistics(video_id)
    view_count = statistics.get('viewCount', 0)
    channel_id = video['snippet']['channelId']
    channel = video['snippet']['channelTitle']
    channel_statistics = get_channel_statistics(channel_id)
    channel_subscribers = channel_statistics.get('subscriberCount', 0)
    video_info = {
        'video_id': video_id,
        'title': title,
        'view_count': view_count,
        'channel': channel,
        'channel_subscribers': channel_subscribers
    }
    video_data.append(video_info)

# Confersion of dicts to DataFrame
df = pd.DataFrame(video_data)


In [5]:
df.head(10)

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
0,jAPParfHrxE,KSW 85: Salahdine Parnasse vs Robert Ruchała |...,9255,KSW,506000
1,vDICqILiGQQ,"HISTORYCZNA SPORTOWA WALKA NA FAME MMA?(KSW,UF...",4371,MMA INFO,11600
2,T1Emy34rGIk,Najlepsze nokauty i poddania bohaterów XTB KSW...,256569,KSW,506000
3,nTj04YcA37k,Najlepsze nokauty i walka gali XTB KSW Colosse...,165422,KSW,506000
4,zbvfrGob5mY,"Kizo ft. Kabe, ReTo, Gruby Mielzky, Borixon - ...",6935619,MY TO SUKCES,1000000
5,9ApiSJm0nQo,KSW 84: TOP 10 Moments - Najlepsze akcje gali,17273,KSW,506000
6,av889OfoiQE,Najlepsze momenty gali XTB KSW Colosseum 2 | X...,84654,KSW,506000
7,cEy5oHrnCoQ,KSW 84: Bonusy - Najlepsze nokauty i poddania ...,13688,KSW,506000
8,AMrL_AcVf04,OLIWIER JARZECKI | 2 WYGRANE NA JEDNEJ GALI | ...,236,Tv F-O-T-O wydarzenia i wywiady,2250
9,-oG1dAzfq3I,Dzień z Adamem Soldaevem - Droga do XTB KSW Co...,138014,KSW,506000


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   video_id             200 non-null    object
 1   title                200 non-null    object
 2   view_count           200 non-null    object
 3   channel              200 non-null    object
 4   channel_subscribers  200 non-null    object
dtypes: object(5)
memory usage: 7.9+ KB


I need to get rid of videos from official KSW channels as they are not my target and make disturbs in data

In [7]:
df_filtered = df[df['channel'] != 'KSW']

In [8]:
df_filtered

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
1,vDICqILiGQQ,"HISTORYCZNA SPORTOWA WALKA NA FAME MMA?(KSW,UF...",4371,MMA INFO,11600
4,zbvfrGob5mY,"Kizo ft. Kabe, ReTo, Gruby Mielzky, Borixon - ...",6935619,MY TO SUKCES,1000000
8,AMrL_AcVf04,OLIWIER JARZECKI | 2 WYGRANE NA JEDNEJ GALI | ...,236,Tv F-O-T-O wydarzenia i wywiady,2250
10,e1-u_eF0UBA,KSW Free Fight: Mamed Khalidov vs. Mariusz Pud...,144250,KSW International,16700
15,HDQDYa5jnWE,OGROMNE PROBLEMY KSW! CO DALEJ Z GALAMI? (VIAP...,10558,MMA INFO,11600
...,...,...,...,...,...
189,73HjFUU4ZgM,"KSW 84 - KREW, POT I ŁZY. NAROŻNIKI I BACKSTAG...",10297,Kanał Sportowy,1070000
190,SFopg1oGO-A,Fadipe vs Kaszubowski #ksw 84,233,afanik,613
192,6R0gjEtKNNc,SZPILKA ODGRYZA SIĘ PUDZIANOWSKIEMU po wygrane...,214322,KOLOSEUM,66700
195,WFuJ2oz7TkM,Kizo w KSW…,75380,MY TO SUKCES,1000000


In [9]:
def standardize_text(df_filtered, text_field):
    df_filtered[text_field] = df_filtered[text_field].str.lower()
    df_filtered[text_field] = df_filtered[text_field].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs
    return df_filtered

clean_df = standardize_text(df_filtered, "title")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[text_field] = df_filtered[text_field].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[text_field] = df_filtered[text_field].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs


In [10]:
clean_df.head()

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
1,vDICqILiGQQ,"historyczna sportowa walka na fame mma?(ksw,uf...",4371,MMA INFO,11600
4,zbvfrGob5mY,"kizo ft. kabe, reto, gruby mielzky, borixon - ...",6935619,MY TO SUKCES,1000000
8,AMrL_AcVf04,oliwier jarzecki | 2 wygrane na jednej gali | ...,236,Tv F-O-T-O wydarzenia i wywiady,2250
10,e1-u_eF0UBA,ksw free fight: mamed khalidov vs. mariusz pud...,144250,KSW International,16700
15,HDQDYa5jnWE,ogromne problemy ksw! co dalej z galami? (viap...,10558,MMA INFO,11600


In [11]:
clean_df.groupby("channel").count()

Unnamed: 0_level_0,video_id,title,view_count,channel_subscribers
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6PAK,1,1,1,1
BOXDEL,1,1,1,1
FANSPORTU TV,10,10,10,10
Fair Play Thinker podcast,1,1,1,1
FightsportPL,3,3,3,3
Fortuna - Zakłady bukmacherskie,2,2,2,2
InTheCagePL,15,15,15,15
Interia Sport,1,1,1,1
KANAŁ O WALKACH,1,1,1,1
KLATKA po KLATCE,19,19,19,19


In [12]:


tokenizer = RegexpTokenizer(r'\w+')

clean_df["tokens"] = clean_df["title"].apply(tokenizer.tokenize)
clean_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["tokens"] = clean_df["title"].apply(tokenizer.tokenize)


Unnamed: 0,video_id,title,view_count,channel,channel_subscribers,tokens
1,vDICqILiGQQ,"historyczna sportowa walka na fame mma?(ksw,uf...",4371,MMA INFO,11600,"[historyczna, sportowa, walka, na, fame, mma, ..."
4,zbvfrGob5mY,"kizo ft. kabe, reto, gruby mielzky, borixon - ...",6935619,MY TO SUKCES,1000000,"[kizo, ft, kabe, reto, gruby, mielzky, borixon..."
8,AMrL_AcVf04,oliwier jarzecki | 2 wygrane na jednej gali | ...,236,Tv F-O-T-O wydarzenia i wywiady,2250,"[oliwier, jarzecki, 2, wygrane, na, jednej, ga..."
10,e1-u_eF0UBA,ksw free fight: mamed khalidov vs. mariusz pud...,144250,KSW International,16700,"[ksw, free, fight, mamed, khalidov, vs, marius..."
15,HDQDYa5jnWE,ogromne problemy ksw! co dalej z galami? (viap...,10558,MMA INFO,11600,"[ogromne, problemy, ksw, co, dalej, z, galami,..."


In [13]:
all_words = [word for sublist in clean_df["tokens"] for word in sublist]

# Convert array to Pandas Series
word_series = pd.Series(all_words)

# Filter out words that have at least 3 characters
filtered_words = word_series[word_series.apply(lambda x: len(x) > 3)]

# Count the occurrences of each word
word_counts = filtered_words.value_counts()

#I need to create a list of words that can disturb my data mining. These are parts of speech other than nouns and words with a meaning broader than mma
list_of_disturbing_words = ['quot', 'przed', 'gali', 'fight', 'shorts', 'podsumowanie', 'mateusz', 'free', 'walce', 'słowa', '2023', 'dalej']

# Choose the 10 most common words
hot_words = word_counts.head(30).index.tolist()

for word in hot_words:
    if word in list_of_disturbing_words:

        hot_words.remove(word)

top_10_words = hot_words[0:10]

In [14]:
print(top_10_words)

['colosseum', 'szpilka', 'pudzianowski', 'khalidov', 'fame', 'mamed', 'pudzian', 'klatka', 'klatce', 'parnasse']


now I'll read more data from API

In [15]:
TOTAL_VIDEOS = 1000

videos = get_videos_with_keyword(top_10_words, TOTAL_VIDEOS, MAX_RESULTS_PER_PAGE, DELAY_BETWEEN_REQUESTS)

video_data = []
for video in videos:
    video_id = video['id']['videoId']
    title = video['snippet']['title']
    publish_time = video['snippet']['publishTime']
    statistics = get_video_statistics(video_id)
    view_count = statistics.get('viewCount', 0)
    channel_id = video['snippet']['channelId']
    channel = video['snippet']['channelTitle']
    channel_statistics = get_channel_statistics(channel_id)
    channel_subscribers = channel_statistics.get('subscriberCount', 0)
    video_info = {
        'video_id': video_id,
        'title': title,
        'view_count': view_count,
        'channel': channel,
        'channel_subscribers': channel_subscribers,
        'publish_time': publish_time,
    }
    video_data.append(video_info)

# Confersion of dicts to DataFrame
df = pd.DataFrame(video_data)


In [23]:
videos[0]

{'kind': 'youtube#searchResult',
 'etag': 'tjulP9HNDhumXRAEAuaHbSCY4cM',
 'id': {'kind': 'youtube#video', 'videoId': 'FdswhegK0mU'},
 'snippet': {'publishedAt': '2023-05-31T14:00:09Z',
  'channelId': 'UCRqvJVpMT9dzhX5UzRXPQbg',
  'title': 'KSW Free Fight: Salahdine Parnasse vs. Sebastian Rajewski',
  'description': "France's Salahdine Parnasse captured his second KSW title with this win over Sebastian Rajewski from KSW 76. He now faces ...",
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/FdswhegK0mU/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/FdswhegK0mU/mqdefault.jpg',
    'width': 320,
    'height': 180},
   'high': {'url': 'https://i.ytimg.com/vi/FdswhegK0mU/hqdefault.jpg',
    'width': 480,
    'height': 360}},
  'channelTitle': 'KSW International',
  'liveBroadcastContent': 'none',
  'publishTime': '2023-05-31T14:00:09Z'}}

In [21]:
df.head()

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers,publish_time
0,FdswhegK0mU,KSW Free Fight: Salahdine Parnasse vs. Sebasti...,148591,KSW International,16700,
1,IS6DTtPdb9I,Salahdine Parnasse - Wirtuoz i showman | KSW 68,37359,KSW,506000,
2,X_PW5kCXfzk,KSW Free Fight: Salahdine Parnasse vs. Daniel ...,231898,KSW International,16700,
3,KzaM4AOf-nU,TAKTYKI NA PARNASSE | TRENING,11911,Robert Ruchała,4140,
4,2pKUU93zyvQ,"Salahdine Parnasse, Le Jeune Prodige du MMA - ...",182965,ZACK,663000,


In [16]:

# Export to CSV
df.to_csv('youtube_data.csv', index=False)

Here I finished data aggregation. Futher work will be done in seperate file.