In [3]:
import api_utils as au
import extract_video_info as evi
import numpy as np
import pandas as pd
from typing import List, Set, Dict, Tuple, Optional
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import sklearn
import cloudpickle

In [4]:
df = pd.read_json('data/rnn_data.json.gz', compression='gzip')

# Starting data ~293K rows

In [5]:
df.loc[ df['subtitles'].apply(lambda x: len(x) == 0), 'subtitles'] = " "

In [6]:
df2 = df[ df['subtitles'].apply(lambda x: len(' '.join(x)) > 500 )]

In [7]:
df2.loc[:,'subtitles'] = df2.loc[:,'subtitles'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.loc[:,'subtitles'] = df2.loc[:,'subtitles'].apply(lambda x: ' '.join(x))


In [8]:
vectorizer = TfidfVectorizer(stop_words = 'english', min_df = .3, max_df = .95, ngram_range=(1,2))

X = vectorizer.fit_transform(df2.subtitles)


In [9]:
pca = PCA(n_components = 5, random_state=11)

In [10]:
X_pca = pca.fit_transform(X.toarray())


In [11]:
pca.explained_variance_

array([0.06240621, 0.05025503, 0.03003561, 0.02643839, 0.02015723])

In [12]:
class Add_PCA(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):

    # Adds five rows for each video's subtitle transformation via TFIDF vectorizer and PCA

    def __init__(self):

        # Passing the pre-trained vectorizer and pca objects

        self.vectorizer = vectorizer
        self.pca = pca

        return None
    
    def fit(self, df: pd.DataFrame):

        return df

    def transform(self, df: pd.DataFrame):

        nd = df.copy()

        pca_cols = [ 'pca_' + str(n) for n in range(5) ]

        # Some subtitle fields may have empty lists; this is a poor fix

        nd.loc[ nd['subtitles'].apply(lambda x: len(x) == 0), 'subtitles'] = " "
        nd.loc[:,'subtitles'] = nd.loc[:,'subtitles'].apply(lambda x: ' '.join(x))

        X_vec = self.vectorizer.transform(nd.subtitles)
        X_pca = self.pca.transform(X_vec.toarray())

        df[pca_cols] = X_pca
        
        return df

In [13]:
class YT_features(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):

    """
    Features added by this transformer:
    
    vid_name_chars -- number of characters in the video title
    vid_name_words -- number of words in the video title
    desc_chars -- number of characters in the video description
    desc_words -- number of words in the video description
    subtitle_chars -- number of characters in the video subtitles
    subtitle_words -- number of words in the video subtitles
    subtitles_words_unique -- number of distinct words appearing in the video subtitles
    has_profanity -- whether the video subtitles contain (redacted) profanity (boolean)
    has_music -- whether the video indicates musical accompaniment (boolean)
    has_links -- whether the video description has URLs (boolean)
    link_perc -- how much of the description text is devoted to URLs
    
    
    """

    def __init__(self):

        return None

    def fit(self, df: pd.DataFrame):

        return df

    def transform(self, df: pd.DataFrame):

        nd = df.copy()

        bleep = '[ __ ]'
        music = '[Music]'
        url_regex = re.compile(r'http.*')

        # Handling degenerate cases

        nd.loc[ nd['subtitles'].apply(lambda x: len(x) == 0), 'subtitles'] = " "
        nd.loc[:,'subtitles'] = nd.loc[:,'subtitles'].apply(lambda x: x[0])

        nd.loc[ nd.description.isna(), 'description'] = " "

        # Adding features

        df.loc[:, 'vid_name_chars'] = nd.loc[:,'vid_name'].apply(len)
        df.loc[:, 'vid_name_words'] = nd.loc[:,'vid_name'].apply(lambda x: len(x.split()))

        df.loc[:, 'desc_chars'] = nd.loc[:,'description'].apply(len)
        df.loc[:, 'desc_words'] = nd.loc[:,'description'].apply(lambda x: len(x.split()))


        df.loc[:, 'subtitle_chars'] = nd.loc[:,'subtitles'].apply(len)
        df.loc[:, 'subtitle_words'] = nd.loc[:,'subtitles'].apply(lambda x: len(x.split()))
        df.loc[:, 'subtitle_words_unique'] = nd.loc[:,'subtitles'].apply(lambda x: len(set(x.split())))


        df.loc[:, 'has_profanity'] = nd.loc[:,'subtitles'].apply(lambda x: bleep in x)
        df.loc[:, 'has_music'] = nd.loc[:, 'subtitles'].apply(lambda x: music in x)

        df.loc[:, 'has_links'] = nd.loc[:, 'description'].apply(lambda x: bool(url_regex.search(x)))

        for i in nd.index:

            if df.loc[i, 'has_links']:
                desc = nd.loc[i, 'description']

                link_text = ''.join(url_regex.findall(desc))
                df.loc[i, 'link_perc'] = len(link_text) / len(desc)

            else:
                df.loc[i, 'link_perc'] = 0

        return df

In [14]:
ytt = YT_features()
pipe_pca = Add_PCA()

In [15]:
test_pipe = Pipeline(
    steps = [
        ('Basic Features', ytt),
        ('PCA', pipe_pca)

    ]
)

In [16]:
test_pipe.transform(df.sample(5))

Unnamed: 0,vid_id,chan_query,chan_id,chan_name,chan_viewcount,chan_subcount,chan_start_dt,chan_thumb,chan_vidcount,vid_name,...,subtitle_words_unique,has_profanity,has_music,has_links,link_perc,pca_0,pca_1,pca_2,pca_3,pca_4
1070554,7ncXVNjNUlE,none,UCuw4-InDchDnYg0LPjHiGQA,Jessica O'Donohue,14321510,103000,2011-09-30T00:38:07Z,https://yt3.ggpht.com/ytc/AMLnZu9fWg7hcb_UVKY3...,375,HUGE WALMART GROCERY HAUL | EASY CROCKPOT RECI...,...,540,False,True,True,0.15829,-0.221926,-0.135469,-0.024123,-0.041443,-0.066559
34189,zwDDM0PXeFw,none,UCbRj3Tcy1Zoz3rcf83nW5kw,SAM THE COOKING GUY,524065067,3440000,2011-07-22T05:32:54Z,https://yt3.ggpht.com/ytc/AMLnZu_J4iIfeusuVEFl...,1589,Simple Chicken Flatbread | SAM THE COOKING GUY,...,309,False,False,True,0.194139,-0.17017,-0.155699,0.022694,0.089975,0.015101
262231,KVERSJIK3kQ,gnocchi,UCIAJ7fd67S0XpsWk15jeIcQ,Cooking Light,10718341,43000,2006-08-12T15:20:38Z,https://yt3.ggpht.com/ytc/AMLnZu8EnMJDCK90eYOx...,890,Cheesy Pull-Apart Eggplant | Wow! | Cooking Light,...,1,False,False,True,0.304403,0.072432,0.743272,0.228359,-0.038117,0.094982
57992,LBcaPSbdtks,none,UC8Y-jrV8oR3s2Ix4viDkZtA,Food Network,563975854,2150000,2006-11-16T09:06:09Z,https://yt3.ggpht.com/AIUdnGJYCXdxZPNUtT5khTOb...,5067,7 Tips for Baking a Cake From Our Food Network...,...,2,False,False,True,0.22651,0.146404,0.260532,-0.178524,0.025297,0.083467
318121,Zzmd__QVZYU,smoked pork,UCfOZuEpYpw34VOjEUxL06dw,Weekend Warrior BBQ,8123305,59500,2017-06-17T13:49:32Z,https://yt3.ggpht.com/ytc/AMLnZu8uhTFL2W7QOKls...,257,Smoked Pork Picnic | How to Smoke a Picnic Sho...,...,570,False,True,True,0.253203,-0.11809,-0.197336,0.295464,0.311546,0.046933


In [17]:
cloudpickle.dump(test_pipe, open('data/feat_pca_pipe.cloudpickle', 'wb'))

In [18]:
recon = cloudpickle.load(open('data/feat_pca_pipe.cloudpickle', 'rb'))


In [19]:
recon.transform(df.sample(10))

Unnamed: 0,vid_id,chan_query,chan_id,chan_name,chan_viewcount,chan_subcount,chan_start_dt,chan_thumb,chan_vidcount,vid_name,...,subtitle_words_unique,has_profanity,has_music,has_links,link_perc,pca_0,pca_1,pca_2,pca_3,pca_4
235123,jgz5sYmS6o4,dulce de,UC51O5CJFmd_5cyvDfeHv1ZQ,Mi Vida en un Dulce,17635008,176000,2015-12-22T19:48:51Z,https://yt3.ggpht.com/ytc/AMLnZu-D26J_yQ5ScSta...,253,Queque de Camote con Miel y Especias,...,11,False,False,True,0.049689,0.405978,-0.14234,0.011053,-0.379024,0.24806
1034336,3eoSbLpLA-w,none,UCqJkAAmi4QKCPCF62r_-BhQ,India Food Network,117758625,462000,2012-08-28T10:02:17Z,https://yt3.ggpht.com/ytc/AMLnZu8tGj3ow2dHlAzf...,2952,Baisakhi Special Puri Aloo By Seema,...,176,False,False,True,0.282178,0.325901,-0.163726,-0.081613,-0.075811,0.000991
1089564,v8E4CkeEy1s,none,UCwiTOchWeKjrJZw7S1H__1g,Food Insider,1705649569,4460000,2017-10-17T21:11:29Z,https://yt3.ggpht.com/ytc/AMLnZu80hN5x1GVlHZ3V...,1332,Mega Poke Bowl Weighs 20 Pounds,...,128,False,True,True,0.28267,-0.166292,0.314252,0.178171,0.02138,-0.119654
114769,0PWBu2o2oyk,none,UCYk8BsOEXzccXnoLyhuhevQ,Toniawyt,354622,2280,2018-11-21T20:46:18Z,https://yt3.ggpht.com/ytc/AMLnZu8EPZGl4TSfiK5Z...,251,Lemon Yogurt Cake Recipe || Lemon Bundt Cake R...,...,2,False,True,False,0.0,0.254448,0.917287,0.396814,-0.132439,-0.144223
1107487,x1bebdk-SZ0,none,UCyeXIDrwTEyFe-16MWN9xZg,CupofJoe Caribbean,620204,4640,2014-08-06T00:21:42Z,https://yt3.ggpht.com/ytc/AMLnZu--7E0r3KAQZMD6...,1532,SPICED FROZEN HOT CHOCOLATE | Joel | Zan | Ova...,...,112,True,True,True,0.545673,0.050354,0.189444,-0.055459,0.051938,0.06378
116572,gH7iXkJlUiU,none,UCOaZMVOKzeunRuvTvNBvtMA,Beers-Jack of BBQ,1047240,6400,2017-12-07T22:41:52Z,https://yt3.ggpht.com/ytc/AMLnZu8Cu7fqvL04nKsg...,191,Smashburgers on the Kamado Joe Soapstone! Pit ...,...,340,False,True,True,0.212436,-0.102059,-0.225031,0.20187,-0.025316,-0.006821
155118,SogcWMwjxS0,none,UCgoxyzvouZM-tCgsYzrYtyg,NishaMadhulika,2728583942,13100000,2009-08-02T13:13:39Z,https://yt3.ggpht.com/821Dq97TRsnhpQhewahvSJPa...,1973,Rava upma recipe - Sooji Upma Recipe - Semolin...,...,5,False,False,True,0.422535,0.589815,0.058319,-0.31245,0.272837,0.113087
144774,XMCPIXOI_ZA,none,UCmoX4QULJ9MB00xW4coMiOw,Sanjeev Kapoor Khazana,1374735008,7130000,2009-07-29T04:09:28Z,https://yt3.ggpht.com/GVTtEiBD6Mt1FR_Y5FKgYa8m...,12787,Apple Halwa | एप्पल हलवा | Sanjeev Kapoor Khazana,...,1,False,False,True,0.117647,0.254448,0.917287,0.396814,-0.132439,-0.144223
27676,NFiHLJ-E_xs,none,UChBEbMKI1eCcejTtmI32UEw,Joshua Weissman,936326402,7090000,2014-02-28T00:03:34Z,https://yt3.ggpht.com/ytc/AMLnZu-2pyUTWSFy7qzi...,465,How To Make Steamed Dumplings Completely From ...,...,612,False,True,True,0.377868,-0.062335,-0.100895,-0.05975,0.115902,-0.075143
1091792,zQawJBSx0f0,none,UCwr3OhGtU4HLWSjYOxXliZA,Sharmin’s Kitchen,2412118,14000,2019-12-08T03:20:48.639258Z,https://yt3.ggpht.com/ytc/AMLnZu-Xmm3xJplRvQ-6...,780,How to make Fried Grey Snapper Curry Guyanese ...,...,294,False,True,False,0.0,0.123817,-0.159194,0.23467,0.416281,0.180853
