In [1]:
import numpy as np
import pandas as pd
from typing import List, Set, Dict, Tuple, Optional
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import sklearn
import cloudpickle

from warnings import filterwarnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas' chained-assignment
# (SettingWithCopy) warnings raised further down — consider narrowing the filter.
filterwarnings('ignore')
# Echo the installed scikit-learn version so it is recorded in the cell output.
sklearn.__version__

'0.24.2'

In [2]:
# Load the dataset from a gzip-compressed JSON file; the path is relative to the
# notebook's working directory.
df = pd.read_json('for_subs_transformer.json.gz')
# Display (rows, columns) as a quick sanity check of the load.
df.shape

(297614, 35)

In [3]:
# Rows whose subtitles cell has length zero get a single-space placeholder string.
df.loc[df['subtitles'].map(len) == 0, 'subtitles'] = " "

In [4]:
# Keep only the videos whose space-joined subtitle text is longer than 500 characters.
# .copy() makes df2 an independent frame, so the column assignment on df2 in the
# following cell does not write into a slice of df (SettingWithCopy).
df2 = df[ df['subtitles'].apply(lambda x: len(' '.join(x)) > 500 )].copy()

In [5]:
# Collapse every subtitles cell of df2 into one space-separated string.
df2.loc[:, 'subtitles'] = df2['subtitles'].map(' '.join)

In [6]:
# TF-IDF over unigrams and bigrams, English stop words removed; only terms that
# appear in at least 30% and at most 95% of the documents are kept.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=.3,
    max_df=.95,
    ngram_range=(1, 2),
)

# Learn the vocabulary / idf weights on the filtered subtitles and vectorise them.
X = vectorizer.fit_transform(df2['subtitles'])


In [7]:
# Five-component PCA; random_state is fixed so the fit is repeatable.
pca = PCA(n_components = 5, random_state=11)

In [8]:
# Fit the PCA on the TF-IDF matrix and project it; .toarray() converts X to a
# dense array before it is handed to PCA.
X_pca = pca.fit_transform(X.toarray())


In [9]:
# Variance captured by each fitted component (absolute values, not the ratio).
pca.explained_variance_

array([0.0621822 , 0.05017643, 0.03006869, 0.02640344, 0.02010825])

In [10]:
class Add_PCA(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):
    """Append PCA components of the TF-IDF-vectorised subtitles as columns.

    For each video, the subtitle text is run through the pre-trained TF-IDF
    vectorizer and then the pre-trained PCA; one ``pca_<n>`` column is added per
    principal component.

    The vectorizer and PCA are read from the notebook globals ``vectorizer`` and
    ``pca`` when an instance is created, so both must exist at that point and be
    fitted before ``transform`` is called.
    """

    def __init__(self):

        # Passing the pre-trained vectorizer and pca objects

        self.vectorizer = vectorizer
        self.pca = pca

    @staticmethod
    def _as_text(cell) -> str:
        """Collapse one ``subtitles`` cell into a single string.

        Missing or empty cells become a single space (the same placeholder the
        notebook uses elsewhere), a cell that is already a string is returned
        unchanged, and a list of segments is joined with spaces.
        """
        is_missing = cell is None or (isinstance(cell, float) and cell != cell)
        if is_missing or len(cell) == 0:
            return " "
        if isinstance(cell, str):
            # Joining a str would insert a space between every character.
            return cell
        return ' '.join(cell)

    def fit(self, df: pd.DataFrame, y=None):
        """No-op fit: the wrapped vectorizer and PCA are already trained.

        Returns ``self`` so that ``fit_transform`` and ``Pipeline.fit`` work.
        """
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the ``pca_<n>`` columns appended.

        Parameters
        ----------
        df : pd.DataFrame
            Must contain a ``subtitles`` column whose cells are lists of strings
            (or plain strings).

        Returns
        -------
        pd.DataFrame
            A new frame; the frame passed in is left unmodified.
        """
        text = df['subtitles'].apply(self._as_text)

        X_vec = self.vectorizer.transform(text)
        X_pca = self.pca.transform(X_vec.toarray())

        # One column per component the PCA actually produced.
        pca_cols = [ 'pca_' + str(n) for n in range(X_pca.shape[1]) ]

        out = df.copy()
        out[pca_cols] = X_pca

        return out

In [11]:
class YT_features(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):

    """
    Stateless transformer that derives simple text features from the
    ``vid_name``, ``description`` and ``subtitles`` columns.

    Features added by this transformer:
    
    vid_name_chars -- number of characters in the video title
    vid_name_words -- number of words in the video title
    desc_chars -- number of characters in the video description
    desc_words -- number of words in the video description
    subtitle_chars -- number of characters in the video subtitles
    subtitle_words -- number of words in the video subtitles
    subtitle_words_unique -- number of distinct words appearing in the video subtitles
    has_profanity -- whether the video subtitles contain (redacted) profanity (boolean)
    has_music -- whether the video indicates musical accompaniment (boolean)
    has_links -- whether the video description has URLs (boolean)
    link_perc -- how much of the description text is devoted to URLs
    
    Missing descriptions and missing/empty subtitles are treated as a single
    space, so their character counts are 1 and their word counts are 0.
    """

    def __init__(self):

        pass

    @staticmethod
    def _as_text(cell) -> str:
        """Collapse one ``subtitles`` cell into a single string.

        Missing or empty cells become a single space, a cell that is already a
        string is returned unchanged, and a list of segments is joined with
        spaces (identical to taking the first element when the list holds one
        segment).
        """
        is_missing = cell is None or (isinstance(cell, float) and cell != cell)
        if is_missing or len(cell) == 0:
            return " "
        if isinstance(cell, str):
            return cell
        return ' '.join(cell)

    def fit(self, df: pd.DataFrame, y=None):
        """No-op fit (nothing is learned); returns ``self`` per the estimator API."""
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the feature columns listed above appended.

        Parameters
        ----------
        df : pd.DataFrame
            Must contain ``vid_name``, ``description`` and ``subtitles`` columns.

        Returns
        -------
        pd.DataFrame
            A new frame; the frame passed in is left unmodified and the original
            ``subtitles`` / ``description`` values are preserved in the output.
        """

        bleep = '[ __ ]'
        music = '[Music]'
        # NOTE(review): this pattern matches from "http" to the end of the line,
        # so text following a URL on the same line is counted as link text.
        url_regex = re.compile(r'http.*')

        # Handling degenerate cases

        subtitles = df['subtitles'].apply(self._as_text)
        description = df['description'].fillna(" ")
        vid_name = df['vid_name']

        def link_share(desc: str) -> float:
            # Fraction of the description's characters covered by URL matches.
            links = url_regex.findall(desc)
            return len(''.join(links)) / len(desc) if links else 0.0

        # Adding features

        out = df.copy()

        out['vid_name_chars'] = vid_name.apply(len)
        out['vid_name_words'] = vid_name.apply(lambda x: len(x.split()))

        out['desc_chars'] = description.apply(len)
        out['desc_words'] = description.apply(lambda x: len(x.split()))

        out['subtitle_chars'] = subtitles.apply(len)
        out['subtitle_words'] = subtitles.apply(lambda x: len(x.split()))
        out['subtitle_words_unique'] = subtitles.apply(lambda x: len(set(x.split())))

        out['has_profanity'] = subtitles.apply(lambda x: bleep in x)
        out['has_music'] = subtitles.apply(lambda x: music in x)

        out['has_links'] = description.apply(lambda x: bool(url_regex.search(x)))

        # Computed per cell in one pass instead of a label-based row loop, which
        # was slow and failed when the index contained duplicate labels.
        out['link_perc'] = description.apply(link_share)

        return out

In [12]:
# Instantiate the two custom transformers; Add_PCA picks up the fitted
# `vectorizer` and `pca` globals inside its constructor.
ytt = YT_features()
pipe_pca = Add_PCA()

In [13]:
# Chain the transformers: text features first, then the subtitle PCA columns.
test_pipe = Pipeline(steps=[
    ('Basic Features', ytt),
    ('PCA', pipe_pca),
])

In [14]:
# Smoke test on a small random sample; the seed makes the displayed rows
# reproducible across re-runs of the notebook.
test_pipe.transform(df.sample(5, random_state=11))

Unnamed: 0,vid_id,chan_query,chan_id,chan_name,chan_viewcount,chan_subcount,chan_start_dt,chan_thumb,chan_vidcount,vid_name,...,subtitle_words_unique,has_profanity,has_music,has_links,link_perc,pca_0,pca_1,pca_2,pca_3,pca_4
1087281,0Zu2JtnkKWI,none,UCwbXPUMNQwUGCG_nCkAq3uA,Natural Life TV,904726734,3150000,2016-08-25T02:19:36Z,https://yt3.ggpht.com/ytc/AMLnZu_91WMFr_8DUpFZ...,2672,How to cook rice fried recipe - Cooking skill,...,22,False,True,False,0.0,0.123304,0.794111,0.25321,-0.160128,-0.010172
177847,oP3yAp7iA-w,German chocolate,UCabq3No3wXbs6Ut-Pux6SzA,The TRY Channel,401192622,1210000,2017-10-20T18:29:59Z,https://yt3.ggpht.com/ytc/AMLnZu8H4wWeeXVok_No...,715,Irish People Try Halloween Monster Cereal,...,550,True,True,True,0.187207,-0.464038,0.128316,-0.198684,0.020232,0.089166
218005,DJyhZgjSTj0,huevos rancheros eggs,UCCojmcJaaqnuBFCxYtPhiYw,We Be Cooking,1628,36,2020-09-21T07:00:37.814556Z,https://yt3.ggpht.com/ytc/AMLnZu8E3Q8OHn2MpBPO...,65,Roasted Garlic Shrimp,...,248,False,False,False,0.0,-0.203533,-0.150709,-0.040387,0.116234,-0.155216
189284,CciXylE5k3Q,petits fours,UCXPc8VcZdbPtk1m-HF1E0HQ,فن الطبخ مع عمورة | Ammoora's Kitchen,2407117,33700,2021-07-25T21:06:57.301501Z,https://yt3.ggpht.com/Z90yPb_qDje-t2XbUWyzlNvd...,42,شوربه القرع الاحمر بالكريمه | شوربة اليقطين با...,...,5,False,False,True,0.135511,0.453788,0.058343,-0.321114,0.217714,0.030967
6376,9VmBGdEkCzo,none,UCsP7Bpw36J666Fct5M8u-ZA,How To Cook That,889832468,4910000,2011-04-16T12:02:40Z,https://yt3.ggpht.com/ytc/AMLnZu8xCcCQLMP_kaid...,494,Frozen Fever Party Jelly Snowflakes by How To ...,...,9,False,False,True,0.50241,0.309502,-0.067866,-0.169119,-0.020473,-0.087384


In [15]:
# Persist the pipeline; the context manager guarantees the file is flushed and
# closed before the next cell reads it back.
with open('feat_pca_pipe.cloudpickle', 'wb') as pipe_file:
    cloudpickle.dump(test_pipe, pipe_file)

In [16]:
# Reload the pipeline written by the previous cell to verify the round trip.
# NOTE: unpickling executes arbitrary code — only load pickle files you created yourself.
with open('feat_pca_pipe.cloudpickle', 'rb') as pipe_file:
    recon = cloudpickle.load(pipe_file)

In [17]:
# Run the reloaded pipeline on a seeded random sample so the displayed rows are
# reproducible across re-runs of the notebook.
recon.transform(df.sample(10, random_state=11))

Unnamed: 0,vid_id,chan_query,chan_id,chan_name,chan_viewcount,chan_subcount,chan_start_dt,chan_thumb,chan_vidcount,vid_name,...,subtitle_words_unique,has_profanity,has_music,has_links,link_perc,pca_0,pca_1,pca_2,pca_3,pca_4
615876,sD6aWaDiFA8,none,UCAxtVn4eS0WSE5jaKK6qlVA,mahalodotcom,690232400,1540000,2007-06-08T03:01:28Z,https://yt3.ggpht.com/ytc/AMLnZu80qHdVfJlFDHPf...,14032,Learn Adobe Photoshop - Ellipse Tool,...,208,False,False,True,0.037823,0.191145,0.109192,-0.238255,-0.028629,0.025812
167153,pJaM9DwtRuM,Nanaimo bars,UCw7Gvxtaa-_7CwjGW9glglg,The Tipsy Baker ST,819,27,2011-03-08T19:52:29Z,https://yt3.ggpht.com/ytc/AMLnZu8QviGoxKzO8doa...,45,The Tipsy Baker presents making pie dough fro...,...,1,False,False,False,0.0,0.0219,0.544572,0.000278,-0.025486,0.131388
380480,soDNhHzHiGU,sweet and sour chicken,UCQNUQ7x-aEPY5jmIq1RllEg,No Recipes,7687893,114000,2010-01-21T07:50:16Z,https://yt3.ggpht.com/MV90LeRmTAMbD6J82B6OmBQu...,239,1000 Pancakes! REAL Pancake Cereal Recipe,...,443,False,True,True,0.198205,-0.044685,-0.119297,0.132171,0.289244,-0.044774
183037,UtlfUFN44n8,simnel cake,UC40QkSCR2pwycpDJLFdvQ2Q,Gluten Free Victorian Woman Home Cooking,36829,629,2019-05-19T21:40:03Z,https://yt3.ggpht.com/VDeueCL0-fwMcVFck6evsFiR...,241,Gluten Free High Protein Chocolate Chip Cookies,...,254,False,True,True,0.058259,0.057539,0.103208,0.262779,-0.026383,-0.066867
32800,XyfbKQ7RoAY,none,UCbULqc7U1mCHiVSCIkwEpxw,Hiroyuki Terada - Diaries of a Master Sushi Chef,356208249,2020000,2011-10-24T05:59:15Z,https://yt3.ggpht.com/ytc/AMLnZu-Eb-9ExdPtueQL...,754,Nashville Hot Steak Experiment | BEST Asian Fr...,...,374,False,True,True,0.305511,-0.314232,0.103656,0.042966,0.162768,0.313787
438238,Le-qdx-PV8I,chop suey,UCNYqX0nxjhLaMeNsIgyZffQ,Lady S & Family,47578,2150,2020-07-18T20:47:15.914009Z,https://yt3.ggpht.com/f2whoLA0Dc9o4H88RjPIGJrm...,278,Learn to love and accept yourself,...,171,False,False,False,0.0,-0.039384,0.24886,-0.3413,-0.05523,-0.022328
27674,2CzMagqdjko,none,UChBEbMKI1eCcejTtmI32UEw,Joshua Weissman,936326402,7090000,2014-02-28T00:03:34Z,https://yt3.ggpht.com/ytc/AMLnZu-2pyUTWSFy7qzi...,465,The Easy Guide On Making Just About Any Smoothie,...,397,False,True,True,0.131371,0.006717,-0.154713,0.025755,-0.021947,-0.065001
51438,hc3TEaT3WHA,none,UCbpMy0Fg74eXXkvxJrtEn3w,Bon Appétit,1672520299,6150000,2008-04-29T22:26:01Z,https://yt3.ggpht.com/ytc/AMLnZu-8wcultJDj1204...,1373,Every Way to Cook a Potato (63 Methods) | Bon ...,...,1128,True,True,True,0.180396,-0.052669,-0.284443,0.166043,0.110519,0.002847
342077,y991oiNelFc,French dip,UCeaexWLoAkQoVZ24AYHPtCg,cookingfordads,8628015,21100,2007-09-10T15:32:21Z,https://yt3.ggpht.com/ytc/AMLnZu-4S0-t3S7LPNJz...,229,Indian Quesadilla with Chicken Curry,...,178,False,False,False,0.0,0.137808,-0.305968,0.094707,-0.280545,-0.007037
198012,YuYkaDYUIHo,Bloody Mary,UC72A1DHgfcOlz9f7G-jMQEw,Lanes BBQ,331750,4790,2019-04-18T01:25:21Z,https://yt3.ggpht.com/YEQd8JS1Al_AqWyhVo1uRFUJ...,66,Fried Catfish Tacos on a Campsite firepit with...,...,332,False,True,True,0.383224,-0.101644,-0.187213,0.240159,-0.07083,0.043587
