In [51]:
import pandas as pd
import re

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


from nltk.tokenize import RegexpTokenizer


In [5]:
initial_df = pd.read_csv('youtube_data.csv')


In [6]:
initial_df.head()

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers
0,FdswhegK0mU,KSW Free Fight: Salahdine Parnasse vs. Sebasti...,148591,KSW International,16700
1,IS6DTtPdb9I,Salahdine Parnasse - Wirtuoz i showman | KSW 68,37359,KSW,506000
2,X_PW5kCXfzk,KSW Free Fight: Salahdine Parnasse vs. Daniel ...,231898,KSW International,16700
3,KzaM4AOf-nU,TAKTYKI NA PARNASSE | TRENING,11911,Robert Ruchała,4140
4,2pKUU93zyvQ,"Salahdine Parnasse, Le Jeune Prodige du MMA - ...",182965,ZACK,663000


In [7]:
initial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   video_id             544 non-null    object
 1   title                544 non-null    object
 2   view_count           544 non-null    int64 
 3   channel              544 non-null    object
 4   channel_subscribers  544 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 21.4+ KB


In [24]:
df = initial_df[initial_df['channel'] != 'KSW']
df = df[df['channel'] != 'KSW International']

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 3 to 543
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   video_id             500 non-null    object
 1   title                500 non-null    object
 2   view_count           500 non-null    int64 
 3   channel              500 non-null    object
 4   channel_subscribers  500 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 23.4+ KB


In [26]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs
    return df

clean_df = standardize_text(df, "title")

In [None]:
# Separating publish_time into day parts and lebel encoding of it
clean_df['publish_time'] = pd.to_datetime(clean_df['publish_time'])
clean_df['hour'] = clean_df['publish_ime'].clean_df.hour
clean_df['day_part'] = pd.cut(clean_df['hour'], bins=[0, 12, 18, 24], labels=['morning', 'afternoon', 'evening'], right=False)
clean_df['day_part'] = preprocessing.LabelEncoder().fit_transform(clean_df['day_part'])

In [47]:

clean_df['view_count'] = clean_df['view_count'].astype(float)
clean_df['view_level'] = pd.cut(clean_df['view_count'], bins=[0, 25000, 50000, 75000, 100000, 150000, 200000, 10000000], labels=['0-25tys', '25-50tys', '50-75tys', '75-100tys', '100-150tys', '150-200tys', '>200tys'], right=False)
clean_df['view_level'] = preprocessing.LabelEncoder().fit_transform(clean_df['view_level'])

In [50]:
clean_df.head()

Unnamed: 0,video_id,title,view_count,channel,channel_subscribers,view_level,tokens
3,KzaM4AOf-nU,taktyki na parnasse | trening,11911.0,Robert Ruchała,4140,0,"[taktyki, na, parnasse, trening]"
4,2pKUU93zyvQ,"salahdine parnasse, le jeune prodige du mma - ...",182965.0,ZACK,663000,2,"[salahdine, parnasse, le, jeune, prodige, du, ..."
7,KoDAt2aC5Pk,salahdine parnasse: „marian nie jest prawdziwy...,4731.0,MMA-bądź na bieżąco,117000,0,"[salahdine, parnasse, marian, nie, jest, prawd..."
8,mB39DNW--QU,secret du succes salahdine mma,52.0,MMA FR highlights,353,0,"[secret, du, succes, salahdine, mma]"
12,7__IhT1IEn4,salahdine parnasse épisode 10 : salahdine parn...,247523.0,Salahdine PARNASSE,37500,6,"[salahdine, parnasse, épisode, 10, salahdine, ..."


In [52]:
X = clean_df['title']
y = clean_df['view_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

In [44]:
vectorizer = TfidfVectorizer() # analyzer='word', token_pattern=r'\\w+'
vectors = vectorizer.fit_transform(clean_df['title']).toarray()

In [45]:
vectorizer.get_feature_names_out()

array(['02', '04', '07', ..., 'واحد', 'يا', '𝕃𝕀𝕍𝔼'], dtype=object)

In [46]:
print(vectors.shape)

(500, 1474)


In [53]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [60]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


poly_svm_clf = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", SVC(kernel = "poly", degree = 3, coef0 = 1, C = 5))])

poly_svm_clf.fit(X_train_tfidf , y_train)


In [61]:
y_pred = poly_svm_clf.predict(X_test_tfidf)

# Dokonaj oceny modelu
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f"Dokładność modelu: {accuracy}")

report = classification_report(y_test, y_pred)
print("Raport klasyfikacji:")
print(report)

Dokładność modelu: 0.69
Raport klasyfikacji:
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        71
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         3
           3       0.33      0.22      0.27         9
           4       0.67      0.40      0.50         5
           5       0.00      0.00      0.00         3
           6       0.20      0.17      0.18         6

    accuracy                           0.69       100
   macro avg       0.28      0.24      0.25       100
weighted avg       0.61      0.69      0.64       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# from sklearn.model_selection import cross_val_predict

# cv_pred = cross_val_predict(ovr_clf, X_train, y_train, cv = 3)

# accuracy_score(y_train, cv_pred)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]