In [None]:
import requests
import time
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from youtube_data_api import get_videos_with_keyword, get_video_statistics

In [None]:
# Search settings passed to get_videos_with_keyword in the fetch cells.
MAX_RESULTS_PER_PAGE = 50  # presumably the number of results requested per API page - confirm in youtube_data_api
TOTAL_VIDEOS = 500  # total number of videos requested for the initial 'KSW' search
DELAY_BETWEEN_REQUESTS = 1  # delay in seconds

In [None]:

# Initial search: fetch videos matching the keyword 'KSW'.
videos = get_videos_with_keyword('KSW', TOTAL_VIDEOS, MAX_RESULTS_PER_PAGE, DELAY_BETWEEN_REQUESTS)

# Build one flat record per video; statistics are looked up per video id.
video_data = []
for item in videos:
    vid, name = item['id']['videoId'], item['snippet']['title']
    stats = get_video_statistics(vid)
    video_data.append({
        'Video ID': vid,
        'Title': name,
        # Fall back to 0 when the statistics dict lacks the key.
        'View Count': stats.get('viewCount', 0),
        'Channel Subscribers': stats.get('subscriberCount', 0),
    })

# Conversion of the list of dicts to a DataFrame
df = pd.DataFrame(video_data)


In [None]:
# NOTE(review): `data_search` and `df_views` are not defined in any cell visible
# in this notebook, so this cell fails on a fresh kernel (Restart & Run All).
# It also overwrites the `df` built by the previous cell.

# Stack the `items` of every search page with a single concat call instead of
# re-concatenating the growing frame inside a loop (which copies it each time).
# Row labels are intentionally not reset: they restart at 0 for every page and
# the column-wise concat below aligns on them - presumably `df_views` carries
# the same labels; verify before changing.
search_items = pd.concat(
    [pd.DataFrame(data_search[i]['items']) for i in range(len(data_search))]
)

# Expand each `snippet` dict into its own columns (title, description, ...).
df = search_items['snippet'].apply(pd.Series)

# Attach the view counts side by side, then drop the columns not used later.
df = pd.concat([df, df_views], axis=1)
df = df.drop(['thumbnails', 'channelId', 'publishedAt'], axis=1)

In [None]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 585 entries, 0 to 34
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 585 non-null    object
 1   description           585 non-null    object
 2   channelTitle          585 non-null    object
 3   liveBroadcastContent  585 non-null    object
 4   publishTime           585 non-null    object
 5   viewCount             585 non-null    object
dtypes: object(6)
memory usage: 32.0+ KB


In [None]:
# Preview the first rows of the search results.
df.head()

Unnamed: 0,title,description,channelTitle,liveBroadcastContent,publishTime,viewCount
0,MIKE TYSON I FRANCIS NGANNOU W JEDNYM TEAMIE ?...,Legendarny bokser Mike Tyson jest gotowy na ws...,MMA INFO,none,2023-07-25T13:38:06Z,1895
1,"Kizo ft. Kabe, ReTo, Gruby Mielzky, Borixon - ...",SPRAWDŹ W CYFRZE⬇ https://kizo.lnk.to/colosseu...,MY TO SUKCES,none,2023-05-11T13:00:12Z,6675688
2,KSW 84: Bonusy - Najlepsze nokauty i poddania ...,"Zobaczcie, do kogo trafiły bonusy po gali KSW ...",KSW,none,2023-07-18T09:58:54Z,13239
3,Najlepsze nokauty i poddania bohaterów XTB KSW...,Zobaczcie najlepsze nokauty i poddania bohater...,KSW,none,2023-06-03T09:26:47Z,256247
4,Najlepsze nokauty i walka gali XTB KSW Colosse...,"Zobaczcie, do kogo trafił bonusy po historyczn...",KSW,none,2023-06-05T17:00:35Z,165160


I need to remove the videos published by the official KSW channel: they are not my target and they distort the data.

In [None]:
# Keep only videos that were not published by the channel titled exactly 'KSW'.
# .copy() makes df_filtered an independent frame, so the column assignments done
# in later cells do not raise SettingWithCopyWarning.
df_filtered = df[df['channelTitle'] != 'KSW'].copy()

In [None]:
# Display the filtered frame (the notebook abbreviates the middle rows).
df_filtered

Unnamed: 0,title,description,channelTitle,liveBroadcastContent,publishTime,viewCount
0,MIKE TYSON I FRANCIS NGANNOU W JEDNYM TEAMIE ?...,Legendarny bokser Mike Tyson jest gotowy na ws...,MMA INFO,none,2023-07-25T13:38:06Z,1794
1,"Kizo ft. Kabe, ReTo, Gruby Mielzky, Borixon - ...",SPRAWDŹ W CYFRZE⬇ https://kizo.lnk.to/colosseu...,MY TO SUKCES,none,2023-05-11T13:00:12Z,6675387
4,BORYS MAŃKOWSKI | Kontuzja | Powrót | KSW Colo...,Wesprzyj nas na YouTube: ➡️ https://www.youtub...,myMMApl,none,2023-07-25T16:58:05Z,103
8,BGMI LIVE WITH TEAM KSW || Let&#39;s go for 40...,Hey folks! Watch me play BGMI with my team KHA...,RubyRuth,none,2023-07-23T20:06:07Z,76
9,"SEKUNDĘ OD TRAGEDII NA KSW 84 (MMA, DE FRIES, ...",ODCINEK SPONSOROWANY **** Skrót wszystkich wal...,MMA INFO,none,2023-07-16T14:58:41Z,40570
...,...,...,...,...,...,...
31,ビリミリオン神曲すぎる！#music #ビリミリオン #shorts,,天才バカ世games,none,2023-07-24T03:00:21Z,6612
32,"July 24, 2023",,Rachelle Kajer,none,2023-07-24T22:44:17Z,80315
33,やっぱり子供が欲しい。このタイミングで壊れる！？/掃除/チェキ会/胃カメラ検査/手作りおやつ...,,あたしの日常,none,2023-07-23T10:35:11Z,15933
34,Priyanka chahar Choudhary New Song Teaser 😮@pr...,Priyanka chahar Choudhary New Song Teaser ।‎@a...,Bollywood Highlights,none,2023-07-25T04:30:39Z,171403


In [None]:
def standardize_text(df_filtered, text_field):
    """Return a copy of ``df_filtered`` with ``text_field`` lower-cased and URLs removed.

    The input frame is left untouched. Working on a copy avoids pandas'
    SettingWithCopyWarning when the caller passes a filtered slice, and makes
    the function safe to re-run.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Frame containing the text column to clean.
    text_field : str
        Name of the column holding the text.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; ``text_field`` is lower-cased and
        every ``http...`` token (up to the next whitespace) is deleted.
        Missing values stay missing instead of raising. HTML entities such as
        ``&#39;`` are not decoded.
    """
    cleaned = df_filtered.copy()
    cleaned[text_field] = (
        cleaned[text_field]
        .str.lower()
        .str.replace(r"http\S+", "", regex=True)  # get rid of URLs
    )
    return cleaned

# Lower-case the video titles and strip URLs from them.
clean_df = standardize_text(df_filtered, "title")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[text_field] = df_filtered[text_field].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[text_field] = df_filtered[text_field].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs


In [None]:
# Check the cleaned titles on the first rows.
clean_df.head()

Unnamed: 0,title,description,channelTitle,liveBroadcastContent,publishTime,viewCount
0,mike tyson i francis ngannou w jednym teamie ?...,Legendarny bokser Mike Tyson jest gotowy na ws...,MMA INFO,none,2023-07-25T13:38:06Z,1794
1,"kizo ft. kabe, reto, gruby mielzky, borixon - ...",SPRAWDŹ W CYFRZE⬇ https://kizo.lnk.to/colosseu...,MY TO SUKCES,none,2023-05-11T13:00:12Z,6675387
4,borys mańkowski | kontuzja | powrót | ksw colo...,Wesprzyj nas na YouTube: ➡️ https://www.youtub...,myMMApl,none,2023-07-25T16:58:05Z,103
8,bgmi live with team ksw || let&#39;s go for 40...,Hey folks! Watch me play BGMI with my team KHA...,RubyRuth,none,2023-07-23T20:06:07Z,76
9,"sekundę od tragedii na ksw 84 (mma, de fries, ...",ODCINEK SPONSOROWANY **** Skrót wszystkich wal...,MMA INFO,none,2023-07-16T14:58:41Z,40570


In [None]:
# Count the non-null entries of every column per channel (one row per channel).
clean_df.groupby("channelTitle").count()

Unnamed: 0_level_0,title,description,liveBroadcastContent,publishTime,viewCount
channelTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6PAK,2,2,2,2,2
AI New World,1,1,1,1,1
AntyFakty,1,1,1,1,1
Artur Przybysz,2,2,2,2,2
BOXDEL,1,1,1,1,1
...,...,...,...,...,...
あたしの日常,1,1,1,1,1
ナナヲアカリ OFFICIAL,1,1,1,1,1
天才バカ世games,1,1,1,1,1
炊き鍋,1,1,1,1,1


In [None]:
from nltk.tokenize import RegexpTokenizer

# r'\w+' matches runs of word characters, so punctuation and whitespace act as
# separators and are dropped from the token lists.
tokenizer = RegexpTokenizer(r'\w+')

# assign() returns a new frame rather than writing a column into a frame that
# may be a slice of another one, which avoids SettingWithCopyWarning and keeps
# the cell safe to re-run.
clean_df = clean_df.assign(tokens=clean_df["title"].apply(tokenizer.tokenize))
clean_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["tokens"] = clean_df["title"].apply(tokenizer.tokenize)


Unnamed: 0,title,description,channelTitle,liveBroadcastContent,publishTime,viewCount,tokens
0,mike tyson i francis ngannou w jednym teamie ?...,Legendarny bokser Mike Tyson jest gotowy na ws...,MMA INFO,none,2023-07-25T13:38:06Z,1794,"[mike, tyson, i, francis, ngannou, w, jednym, ..."
1,"kizo ft. kabe, reto, gruby mielzky, borixon - ...",SPRAWDŹ W CYFRZE⬇ https://kizo.lnk.to/colosseu...,MY TO SUKCES,none,2023-05-11T13:00:12Z,6675387,"[kizo, ft, kabe, reto, gruby, mielzky, borixon..."
4,borys mańkowski | kontuzja | powrót | ksw colo...,Wesprzyj nas na YouTube: ➡️ https://www.youtub...,myMMApl,none,2023-07-25T16:58:05Z,103,"[borys, mańkowski, kontuzja, powrót, ksw, colo..."
8,bgmi live with team ksw || let&#39;s go for 40...,Hey folks! Watch me play BGMI with my team KHA...,RubyRuth,none,2023-07-23T20:06:07Z,76,"[bgmi, live, with, team, ksw, let, 39, s, go, ..."
9,"sekundę od tragedii na ksw 84 (mma, de fries, ...",ODCINEK SPONSOROWANY **** Skrót wszystkich wal...,MMA INFO,none,2023-07-16T14:58:41Z,40570,"[sekundę, od, tragedii, na, ksw, 84, mma, de, ..."


In [None]:
# Flatten the per-title token lists into one flat list of words
all_words = [word for sublist in clean_df["tokens"] for word in sublist]

# Turn the word list into a pandas Series so it can be filtered and counted
word_series = pd.Series(all_words)

# Keep only the words longer than 3 characters (i.e. at least 4)
filtered_words = word_series[word_series.map(len) > 3]

# Count the occurrences of every word (most frequent first)
word_counts = filtered_words.value_counts()

# Words that would disturb the data mining: parts of speech other than nouns,
# and words whose meaning is broader than MMA
list_of_disturbing_words = ['quot', 'przed', 'gali', 'fight', 'shorts', 'podsumowanie', 'mateusz', 'free', 'walce', 'słowa', '2023', 'dalej']

# Take the 30 most frequent words as candidates, then drop the disturbing ones.
# A new list is built instead of calling remove() on the list being iterated:
# removing during iteration skips the element that follows each removed word,
# so two adjacent disturbing words were not both filtered out.
candidate_words = word_counts.head(30).index.tolist()
hot_words = [word for word in candidate_words if word not in list_of_disturbing_words]

# The (up to) 10 most frequent remaining words
top_10_words = hot_words[0:10]

In [None]:
# Show the keywords selected for the follow-up search.
print(top_10_words)

['colosseum', 'pudzianowski', 'szpilka', 'narodowym', 'pudzian', 'khalidov']


Now I'll fetch more data from the API, using the most frequent words found above as search keywords.

In [None]:
# Follow-up search with a larger budget; this rebinds the TOTAL_VIDEOS setting.
TOTAL_VIDEOS = 1000

# NOTE(review): top_10_words is a list, whereas the first search passed a single
# string ('KSW') - confirm that get_videos_with_keyword accepts a list of keywords.
videos = get_videos_with_keyword(top_10_words, TOTAL_VIDEOS, MAX_RESULTS_PER_PAGE, DELAY_BETWEEN_REQUESTS)


def _video_record(video):
    """Build the flat record (id, title, view count, subscribers) for one search result."""
    video_id = video['id']['videoId']
    title = video['snippet']['title']
    statistics = get_video_statistics(video_id)
    return {
        'Video ID': video_id,
        'Title': title,
        # Fall back to 0 when the statistics dict lacks the key.
        'View Count': statistics.get('viewCount', 0),
        'Channel Subscribers': statistics.get('subscriberCount', 0),
    }


video_data = [_video_record(video) for video in videos]

# Conversion of the list of dicts to a DataFrame
df = pd.DataFrame(video_data)

In [None]:

# Export the collected video data to CSV; index=False leaves the row index out of the file.
df.to_csv('youtube_data.csv', index=False)

This concludes the data aggregation. Further work will be done in a separate file.