In [13]:
import api_utils as au
import extract_video_info as evi
import numpy as np
import pandas as pd
from typing import List, Set, Dict, Tuple, Optional
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import sklearn
import cloudpickle

In [2]:
# Load the gzip-compressed JSON export into a DataFrame; every later cell
# works from this frame.
df = pd.read_json('rnn_data.json.gz', compression='gzip')

# Starting data ~293K rows

In [25]:
# Rows whose 'subtitles' entry has length zero get a single-space placeholder
# so that later cells can safely index into the value.
df.loc[df['subtitles'].map(len).eq(0), 'subtitles'] = " "

In [4]:
# Keep only rows whose first 'subtitles' element is longer than 500 characters.
# .copy() makes df2 an independent frame, so the column assignment in the next
# cell does not raise SettingWithCopyWarning or depend on view/copy semantics.
df2 = df[ df['subtitles'].apply(lambda x: len(x[0]) > 500 )].copy()

In [6]:
# Replace each 'subtitles' value with its first element.
# Values that are already strings are left untouched, so re-running this cell
# does not truncate every transcript to its first character.
df2.loc[:,'subtitles'] = df2.loc[:,'subtitles'].apply(lambda x: x if isinstance(x, str) else x[0])

In [7]:
# TF-IDF over unigrams and bigrams with English stop words removed.
# min_df / max_df are given as floats, i.e. document-frequency proportions.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=.3,
    max_df=.95,
)

# Fit on the filtered subtitle text and keep the resulting document-term matrix.
X = vectorizer.fit_transform(df2['subtitles'])


128


In [8]:
# Five-component PCA; random_state is fixed so the fit is reproducible.
pca = PCA(n_components = 5, random_state=11)

In [9]:
# Fit the PCA on the TF-IDF matrix (converted to a dense array first) and keep
# the projected coordinates.
X_pca = pca.fit_transform(X.toarray())


In [11]:
# Variance captured by each fitted component (displayed as the cell output).
pca.explained_variance_

array([0.05912749, 0.05323177, 0.02636224, 0.0235809 , 0.02184137])

In [39]:
class Add_PCA(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):

    """
    Adds one column per PCA component ('pca_0', 'pca_1', ...) holding each
    video's subtitle text projected through the TF-IDF vectorizer and the PCA.

    The vectorizer and PCA are the already-fitted notebook-level objects
    `vectorizer` and `pca`; this transformer never refits them.
    """

    def __init__(self):

        # Passing the pre-trained vectorizer and pca objects

        self.vectorizer = vectorizer
        self.pca = pca

    @staticmethod
    def _subtitle_text(x):
        """Return the text for one 'subtitles' value (list-like or string)."""

        # Some subtitle fields may have empty lists; use a blank placeholder.
        if len(x) == 0:
            return " "

        # Already-flattened strings are used as they are; otherwise the first
        # element of the container is the text.
        return x if isinstance(x, str) else x[0]

    def fit(self, df: pd.DataFrame, y=None):
        """
        No-op: the wrapped objects are pre-trained.

        Returns self, as the scikit-learn estimator contract requires, so that
        fit_transform() and Pipeline.fit() can chain .transform() on the result.
        """

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of `df` with the PCA component columns appended.

        `df` must have a 'subtitles' column. The input frame is not modified,
        and its 'subtitles' column is carried over unchanged.
        """

        out = df.copy()

        subtitles = df['subtitles'].apply(self._subtitle_text)

        X_vec = self.vectorizer.transform(subtitles)
        X_pca = self.pca.transform(X_vec.toarray())

        # Name the columns after however many components the PCA produced,
        # rather than assuming five.
        pca_cols = [ 'pca_' + str(n) for n in range(X_pca.shape[1]) ]

        out[pca_cols] = X_pca

        return out

In [96]:
class YT_features(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):

    """
    Stateless transformer that appends simple text-derived features.

    Reads the 'vid_name', 'description' and 'subtitles' columns.

    Features added by this transformer:

    vid_name_chars -- number of characters in the video title
    vid_name_words -- number of words in the video title
    desc_chars -- number of characters in the video description
    desc_words -- number of words in the video description
    subtitle_chars -- number of characters in the video subtitles
    subtitle_words -- number of words in the video subtitles
    subtitle_words_unique -- number of distinct words appearing in the video subtitles
    has_profanity -- whether the video subtitles contain (redacted) profanity (boolean)
    has_music -- whether the video indicates musical accompaniment (boolean)
    has_links -- whether the video description has URLs (boolean)
    link_perc -- how much of the description text is devoted to URLs


    """

    def __init__(self):

        # Nothing to configure.
        pass

    @staticmethod
    def _subtitle_text(x):
        """Return the text for one 'subtitles' value (list-like or string)."""

        # Degenerate case: empty container or empty string -> blank placeholder.
        if len(x) == 0:
            return " "

        # Already-flattened strings are used as they are; otherwise the first
        # element of the container is the text.
        return x if isinstance(x, str) else x[0]

    def fit(self, df: pd.DataFrame, y=None):
        """
        No-op: there is nothing to learn.

        Returns self, as the scikit-learn estimator contract requires, so that
        fit_transform() and Pipeline.fit() can chain .transform() on the result.
        """

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of `df` with the feature columns listed in the class
        docstring appended.

        The input frame is not modified, and the original 'subtitles' and
        'description' columns are carried over unchanged.
        """

        bleep = '[ __ ]'
        music = '[Music]'

        # NOTE(review): this pattern is greedy -- it matches from the first
        # 'http' to the end of that line, so link_perc also counts any text that
        # follows a URL on the same line. Kept as-is because changing it would
        # shift feature values; confirm with downstream consumers before
        # tightening it.
        url_regex = re.compile(r'http.*')

        out = df.copy()

        # Handling degenerate cases

        subtitles = df['subtitles'].apply(self._subtitle_text)
        description = df['description'].fillna(" ")
        vid_name = df['vid_name']

        def link_fraction(desc):
            # Share of the description's characters covered by regex matches.
            # No match (including an empty description) gives 0.0, so there is
            # never a division by zero.
            link_text = ''.join(url_regex.findall(desc))
            return len(link_text) / len(desc) if link_text else 0.0

        # Adding features

        out['vid_name_chars'] = vid_name.apply(len)
        out['vid_name_words'] = vid_name.apply(lambda x: len(x.split()))

        out['desc_chars'] = description.apply(len)
        out['desc_words'] = description.apply(lambda x: len(x.split()))

        out['subtitle_chars'] = subtitles.apply(len)
        out['subtitle_words'] = subtitles.apply(lambda x: len(x.split()))
        out['subtitle_words_unique'] = subtitles.apply(lambda x: len(set(x.split())))

        out['has_profanity'] = subtitles.apply(lambda x: bleep in x)
        out['has_music'] = subtitles.apply(lambda x: music in x)

        out['has_links'] = description.apply(lambda x: bool(url_regex.search(x)))

        # Computed column-wise instead of one .loc write per row: faster, works
        # with duplicate index labels, and the column exists for empty frames.
        out['link_perc'] = description.apply(link_fraction)

        return out

In [None]:
# Feature transformer instance used as the first pipeline step below.
ytt = YT_features()

In [100]:
# Chain the text-derived features with the subtitle PCA columns.
# The second step must be the Add_PCA wrapper, which vectorizes the
# 'subtitles' column and appends pca_* columns. The bare sklearn `pca`
# estimator cannot transform a DataFrame of mixed-type columns.
test_pipe = Pipeline(
    steps = [
        ('Basic Features', ytt),
        ('PCA', Add_PCA())
    ]
)

In [101]:
# Smoke test on five randomly sampled rows (no seed, so rows differ per run).
test_pipe.transform(df.sample(5))

Unnamed: 0,vid_id,chan_query,chan_id,chan_name,chan_viewcount,chan_subcount,chan_start_dt,chan_thumb,chan_vidcount,vid_name,...,subtitle_words_unique,has_profanity,has_music,has_links,link_perc,pca_0,pca_1,pca_2,pca_3,pca_4
121132,DRIP-nyNKMI,none,UCEjkioV3LO_OIUaSWRxFZ3A,Cheap Lazy Vegan,54811056,787000,2015-03-19T22:30:55Z,https://yt3.ggpht.com/ytc/AMLnZu87391ggfGUKf32...,850,How to: Vegan Breakfast & Lunch | Cheap Lazy V...,...,201,False,False,True,0.28045,0.215231,-0.277057,-0.114375,0.228549,-0.099765
288683,J6DQtFOtdKs,kim chi,UCt24PsusUxDWPCyflGQficw,Crazy Korean Cooking,4470519,47900,2008-04-11T07:06:48Z,https://yt3.ggpht.com/ytc/AMLnZu_Lz9f2v1D8dg36...,117,"Kimchi Jjim, Braised Kimchi and Pork",...,31,False,True,True,0.46558,0.203233,0.630152,0.12774,0.087116,-0.341182
274804,43OqWbAjYlU,stuffed mushrooms,UCcheUFAnBi4wFMsxgVtmPkQ,Jayne & Paul - Mostly Cooking,94627,374,2020-03-28T20:21:26.26274Z,https://yt3.ggpht.com/ytc/AMLnZu_hzvE6shqMNHMq...,256,Pears in wine with vanilla sugar and cardamom,...,289,False,False,False,0.0,0.011494,-0.238842,-0.048636,0.268194,-0.014247
194883,eUnELA2NssA,vegetable chilis,UCiy4YYqi_vNLfu4g-hGo8Fg,Simply Elegant Home Cooking,3768861,25800,2018-01-06T01:19:38Z,https://yt3.ggpht.com/ytc/AMLnZu8lli0KfrYMYvGL...,107,Shaved Zucchini Salad With Homemade Greek Lemo...,...,358,False,False,False,0.0,0.227365,-0.300496,-0.082801,0.114823,-0.124159
341015,x6dGA5q-vVE,chow mein,UCgxKxxQY9BKV9bYkS5Jswxw,Papa's Kitchen Recipes,10092347,137000,2017-07-20T02:49:57Z,https://yt3.ggpht.com/ytc/AMLnZu9mzbrW0RSuQ9aP...,229,Super Delicious Egg Curry ...!!!!😋 | Egg Recip...,...,1,False,True,False,0.0,0.367572,0.932859,0.182773,0.159696,-0.163719


In [102]:
# Persist the pipeline. The context manager guarantees the file is flushed and
# closed before the next cell reads it back.
with open('feat_pca_pipe.cloudpickle', 'wb') as pipe_file:
    cloudpickle.dump(test_pipe, pipe_file)

In [103]:
# Reload the pipeline to check that it round-trips through cloudpickle.
# Unpickling executes arbitrary code, so only load files from a trusted source
# (here, the file written by the previous cell).
with open('feat_pca_pipe.cloudpickle', 'rb') as pipe_file:
    recon = cloudpickle.load(pipe_file)


In [104]:
# Run the reloaded pipeline on ten randomly sampled rows to confirm it still works.
recon.transform(df.sample(10))

Unnamed: 0,vid_id,chan_query,chan_id,chan_name,chan_viewcount,chan_subcount,chan_start_dt,chan_thumb,chan_vidcount,vid_name,...,subtitle_words_unique,has_profanity,has_music,has_links,link_perc,pca_0,pca_1,pca_2,pca_3,pca_4
204643,7JQ5F5Gq-NY,Monte Carlo,UClUojscPc9HTms34DwLFrHg,Monte Carlo Alimentos: Viver de food service,8494,153,2020-06-14T19:36:11.73606Z,https://yt3.ggpht.com/ytc/AMLnZu_FVlWnojUL6Bi6...,26,LINHAS DE CRÉDITO PRA OPERADORES FOOD SERVICE,...,6,False,False,True,0.225806,0.038073,0.194502,-0.150221,-0.103016,0.152356
301569,Y-XD3emJv5A,passover seder,UCfz7MZNxGZjqDUQW9Z5VdMA,Mrs.LNoble,101235,2630,2018-03-16T02:57:14Z,https://yt3.ggpht.com/3HW2a6JpaoUAX_yToQ01JO1u...,288,LET'S TALK ABOUT NATURAL HAIR ~ BLACK HAIR DIS...,...,265,True,True,True,0.080399,-0.328966,0.073285,-0.140236,-0.039597,-0.296276
298822,Unsjyz308og,lobster,UCjc7wI5Ztt7xwKQik4-24UQ,Fire & Water Cooking,1671148,10400,2018-04-16T18:39:52Z,https://yt3.ggpht.com/ytc/AMLnZu-TAcRMCcgUYjJl...,408,Sous Vide Medium Rare Beef Short Ribs on the K...,...,467,False,True,True,0.225141,-0.024352,-0.272447,0.399464,-0.080794,-0.050978
1020055,UNboAMkE1aM,none,UCoMpb6mYV8vgLDHpkROfzfQ,Mama Sue Garrett,1552063,21300,2020-07-25T15:56:23.813773Z,https://yt3.ggpht.com/KIxX9kpPAjYqXqYrmDjfEJWW...,387,Mama Sue makes NOODLES & TOMATOES | Mama Sue's...,...,295,False,True,True,0.527426,-0.003208,-0.17449,0.06882,0.069955,0.008676
272469,4p5BLUDt0I8,paella,UCl4i6QsatTho9QJW50GbLAA,BBC Good Food,81277070,404000,2006-12-19T12:11:49Z,https://yt3.ggpht.com/ytc/AMLnZu-qTKNCx6DKvpO4...,789,Ep. 6 | Halloween - Tom Kerridge Podcast - BBC...,...,1168,False,True,True,0.206413,-0.422685,0.062413,-0.151945,-0.114151,-0.068065
1051428,WXf0IMHLntk,none,UCsE_m2z1NrvF2ImeNWh84mw,Active Self Protection,1450686487,2770000,2013-04-09T16:46:57Z,https://yt3.ggpht.com/ytc/AMLnZu_mmx6i6xuXkIJ4...,2849,Ambush Knife Attack Caught on Camera,...,224,False,False,True,0.092907,-0.092552,-0.088796,0.142185,-0.095488,0.066943
40634,JJUAQ7beu_g,none,UC5xAkS4828lDivq8cKFGSyw,ochikeron,142487433,928000,2007-04-27T03:26:57Z,https://yt3.ggpht.com/ytc/AMLnZu90Hn7u84cV3m9-...,943,Zeppole di San Giuseppe (Father's Day Idea) ゼッ...,...,191,False,False,True,0.235822,0.402432,-0.115125,-0.256241,0.010094,0.056479
71905,_a43zC2HzQM,none,UC_R8qIXaTKpkAJuuiZhHTmA,Vahchef - VahRehVah,814135486,2480000,2007-07-16T03:21:01Z,https://yt3.ggpht.com/ytc/AMLnZu96p3jlbFCw_L7O...,2237,Tahini Sauce - By Vahchef @ vahrehvah.com,...,250,False,False,True,0.361702,0.204587,-0.165645,-0.264786,-0.093527,-0.076085
124428,kLWCz2iXOuw,none,UCDa7IzZX_Lb882M4qZc6nzg,Stump Kitchen,969395,6250,2016-03-08T21:08:22Z,https://yt3.ggpht.com/ytc/AMLnZu-wg0GI3izhHy1v...,442,How to make vegan Peanut Butter Cups! Stump Ki...,...,2,False,False,True,0.290686,0.123952,0.229421,-0.2141,-0.088312,0.14334
1067397,q_kd40ePFi0,none,UCu99W5TTPAucVN83TYQtCqQ,Homesteading Family,41543390,659000,2016-09-09T14:51:51Z,https://yt3.ggpht.com/ytc/AMLnZu9mNQcPoZRewlvu...,440,"Super Easy, Ready to Use Garlic; Fearless Ferm...",...,337,False,True,True,0.300904,0.096883,-0.219259,0.049551,0.014279,-0.15546
