In [1]:
import os
import numpy as np
import pandas as pd
import cloudpickle
import emoji

from sklearn.preprocessing import OrdinalEncoder
import tensorflow as tf

from tqdm import notebook

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Source table for this notebook; load_data() reads its `vid_id` and `thumb_name` columns.
ens_data = pd.read_json('/home/alevink/capstone/ens_data.json.gz')
# Directory that the `thumb_name` file names are joined onto.
images_path = '/home/alevink/capstone/ens_images/images'
# Location of the saved Keras model restored later via tf.keras.models.load_model.
cnn_model_path = '/home/alevink/capstone/checkpoints_folder_fit/ckpt/19-2.163541'

In [3]:
AUTOTUNE = tf.data.AUTOTUNE


def image_prep(image_path, image_size:tuple):
    """Load one JPEG file as a resized, rescaled 3-channel image tensor.

    Args:
        image_path: Path of the JPEG file to read.
        image_size: Target size handed to ``tf.image.resize``.

    Returns:
        The decoded image, resized to ``image_size`` and divided by 255.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.io.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, image_size)
    return resized / 255

def load_data(dataframe:pd.DataFrame, images_path:str, image_size:tuple=(224, 224), shuffle:bool=True):
    """Build a one-image-per-batch dataset plus a Series to collect predictions in.

    Args:
        dataframe: Table with ``vid_id`` and ``thumb_name`` columns; it is copied,
            so the caller's frame is left untouched.
        images_path: Directory that every ``thumb_name`` is joined onto.
        image_size: Size each image is resized to by ``image_prep``.
        shuffle: When true, the rows are put in random order first.

    Returns:
        A tuple ``(dataset, for_preds)``: ``dataset`` yields ``(image, vid_id)``
        batches of size 1, and ``for_preds`` is a float64 Series named
        ``cnn_thumb_preds`` indexed by ``vid_id`` and created without data.
    """
    frame = dataframe.copy()
    if shuffle:
        frame = frame.sample(frac=1)

    video_ids = frame['vid_id'].values
    for_preds = pd.Series(index=video_ids, name='cnn_thumb_preds', dtype='float64')

    # Turn bare file names into full paths under images_path.
    frame['thumb_name'] = [os.path.join(images_path, name) for name in frame['thumb_name']]

    paths_and_ids = tf.data.Dataset.from_tensor_slices((frame['thumb_name'].values, video_ids))

    def _load(path, video_id):
        # Decode the image and pass the id through unchanged.
        return image_prep(path, image_size=image_size), video_id

    images_and_ids = paths_and_ids.map(_load, num_parallel_calls=AUTOTUNE)

    return images_and_ids.batch(1).prefetch(buffer_size=AUTOTUNE), for_preds


In [4]:
# Build the batch-of-one image dataset and the not-yet-populated, vid_id-indexed Series
# that the prediction loop fills in. load_data() shuffles the rows by default.
ens_data_for_cnn, for_preds = load_data(ens_data, images_path)

2022-12-10 06:57:26.119223: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-10 06:57:26.746506: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14635 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:3b:00.0, compute capability: 7.0


In [5]:
# Restore the saved Keras model used to score the thumbnail images.
CnnModel = tf.keras.models.load_model(cnn_model_path)

In [6]:
# Score every thumbnail and store the model's first output value under its vid_id.
# predict_on_batch() is used instead of predict(): each dataset element is already a
# single batch, and calling Model.predict() repeatedly inside a Python loop rebuilds
# its data-handling machinery on every call, which is slow over tens of thousands
# of iterations.
for data_batch in notebook.tqdm(ens_data_for_cnn):
    image, vid_id = data_batch

    preds = CnnModel.predict_on_batch(image)

    # Batches hold one sample, so element 0 is the only one; ids arrive as bytes.
    for_preds[vid_id.numpy()[0].decode('utf8')] = preds[0][0]

  0%|          | 0/71233 [00:00<?, ?it/s]

2022-12-10 06:57:29.950177: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8401


In [7]:
# Attach the CNN score to each row by matching `vid_id` against the Series index.
# Any existing prediction column is dropped first so that re-running this cell
# replaces the scores instead of producing `cnn_thumb_preds_x` / `_y` duplicates.
ens_data = (
    ens_data
    .drop(columns=for_preds.name, errors='ignore')
    .merge(for_preds, left_on='vid_id', right_index=True)
)

In [8]:
# Shuffle with a fixed seed so train/test membership is identical on every
# Restart & Run All, then cut the shuffled frame at the train_frac mark.
train_frac = .80
split_seed = 42
ens_data = ens_data.sample(frac=1, random_state=split_seed)
split_at = int(train_frac*ens_data.shape[0])  # first row index belonging to test
train = ens_data.iloc[:split_at, :]
test = ens_data.iloc[split_at:, :]
train.shape, test.shape

((56986, 36), (14247, 36))

In [9]:
# Load the pickled transformer and apply it to both splits.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('/home/alevink/capstone/feat_pca_pipe.cloudpickle', 'rb') as transformer_file:
    transformer = cloudpickle.load(transformer_file)
train = transformer.transform(train)
test = transformer.transform(test)
train.shape, test.shape

((56986, 52), (14247, 52))

In [11]:
def show_stats(df):
    """Summarise every column of ``df`` by dtype, null count and unique count.

    Args:
        df: The DataFrame to describe.

    Returns:
        A DataFrame indexed by the column names of ``df`` with the columns
        ``'dtypes'``, ``'null count'`` and ``'unique count'``. The unique count
        is NaN for a column whose first value is a list, because lists are
        unhashable and ``nunique()`` cannot process them.
    """
    def _unique_count(col):
        # Inspect the first value by POSITION: ``col[0]`` is a label lookup and raises
        # KeyError whenever label 0 is absent from the index (e.g. after a shuffle/split).
        if len(col) > 0 and isinstance(col.iloc[0], list):
            return np.nan
        return col.nunique()

    summary = pd.concat([df.dtypes, df.isnull().sum(), df.apply(_unique_count)], axis=1)
    summary.columns = ['dtypes', 'null count', 'unique count']
    return summary

In [12]:
# Display dtype, null count and unique count for each column of the transformed train split.
show_stats(train)

Unnamed: 0,dtypes,null count,unique count
vid_id,object,0,56986.0
chan_query,object,0,183.0
chan_id,object,0,4981.0
chan_name,object,0,4979.0
chan_viewcount,int64,0,5303.0
chan_subcount,int64,0,2397.0
chan_start_dt,object,0,4981.0
chan_thumb,object,0,4982.0
chan_vidcount,int64,0,961.0
vid_name,object,0,56779.0


In [13]:
# Columns removed from both splits.
cols_to_drop = [
    'chan_query', 'chan_id', 'chan_name', 'chan_viewcount', 'chan_subcount',
    'chan_start_dt', 'chan_thumb', 'chan_vidcount',
    'vid_name', 'vid_publish_dt', 'vid_thumb', 'vid_duration',
    'vid_viewcount', 'vid_likecount', 'vid_commentcount',
    'description', 'age_limit', 'is_live', 'thumb_name', 'subtitles',
]

# Apply the same in-place column drop to train and test.
for _split in (train, test):
    _split.drop(columns=cols_to_drop, inplace=True)

In [14]:
# Join each row's `categories` entries into a single space-separated string.
for _split in (train, test):
    _split['categories'] = _split['categories'].apply(' '.join)

In [15]:
# One-hot encode `categories`. The indicator columns are defined by the train split and
# the test indicators are aligned to them: a category missing from test becomes an
# all-zero column, and a category seen only in test is dropped. Encoding the two splits
# independently could leave them with different columns.
train_dummies = pd.get_dummies(train['categories']).astype('int64')
test_dummies = (
    pd.get_dummies(test['categories'])
    .reindex(columns=train_dummies.columns, fill_value=0)
    .astype('int64')
)
train = train.merge(train_dummies, left_index=True, right_index=True).drop('categories', axis=1)
test = test.merge(test_dummies, left_index=True, right_index=True).drop('categories', axis=1)

In [16]:
# Row totals of the last 15 columns, largest first.
# NOTE(review): assumes those 15 trailing columns are exactly the one-hot category columns.
train.iloc[:, -15:].sum().sort_values(ascending=False)

Howto & Style            27896
People & Blogs           13673
Entertainment             7795
Education                 2500
Travel & Events           1414
Film & Animation          1046
News & Politics            705
Gaming                     492
Comedy                     387
Sports                     373
Music                      221
Science & Technology       206
Nonprofits & Activism      136
Pets & Animals              83
Autos & Vehicles            59
dtype: int64

In [17]:
# Indicator columns folded into a single `other_category` column.
to_combine = [
    'Gaming',
    'Comedy',
    'Sports',
    'Music',
    'Science & Technology',
    'Nonprofits & Activism',
    'Pets & Animals',
    'Autos & Vehicles',
]

for _split in (train, test):
    # Row-wise sum of the listed indicators, then remove the originals in place.
    _split['other_category'] = _split[to_combine].sum(axis=1)
    _split.drop(columns=to_combine, inplace=True)

In [18]:
def tag_stats(df:pd.DataFrame, drop_tags_col:bool=True):
    """Add tag-count and emoji-count columns derived from the ``tags`` column.

    ``df`` is modified in place and also returned.

    Args:
        df: Frame with a ``tags`` column whose cells are joined with spaces
            (presumably lists of strings -- confirm against the data).
        drop_tags_col: When true, the ``tags`` column is removed afterwards.

    Returns:
        The same ``df`` object with ``num_tags`` and ``num_emoji_in_tags`` added.
    """
    tags = df['tags']
    df['num_tags'] = tags.apply(len)
    df['num_emoji_in_tags'] = tags.apply(lambda tag_list: emoji.emoji_count(' '.join(tag_list)))
    if not drop_tags_col:
        return df
    df.drop(columns='tags', inplace=True)
    return df

# Add num_tags / num_emoji_in_tags to both splits; tag_stats drops `tags` by default.
train = tag_stats(train)
test = tag_stats(test)

In [19]:
# Show the five train columns with the most null values.
train.isnull().sum().sort_values(ascending=False)[:5]

vbr       8493
abr         25
width        1
height       1
fps          1
dtype: int64

In [20]:
# Per-codec count of rows whose `vbr` is missing, shown next to that codec's total row count.
nas = (
    train[['vcodec', 'vbr']]
    .loc[lambda frame: frame['vbr'].isnull()]
    # the fill value is arbitrary: it only makes the null cells visible to count()
    .fillna(1)
    .groupby('vcodec')
    .count()
)
vc = train['vcodec'].value_counts()
nas.merge(vc, left_index=True, right_index=True).rename(
    {'vbr':'vbr_null_count', 'vcodec': 'total_count'}, axis=1
)

Unnamed: 0,vbr_null_count,total_count
avc1.42001E,1,1
avc1.4d400a,7,8
avc1.4d400b,117,117
avc1.4d400c,123,129
avc1.4d4014,213,213
avc1.4d4016,101,109
avc1.64001F,24,24
avc1.64001e,2256,2265
none,1,1
vp9,5650,17441


In [21]:
# Remove, in place, every row that still contains a null value in either split.
for _split in (train, test):
    _split.dropna(axis=0, inplace=True)

In [22]:
# Integer-encode the codec columns. Categories are learned from train only, so a codec
# that occurs only in test is mapped to -1 instead of making transform() raise ValueError.
# Several codecs have just a handful of rows, so this can happen depending on the split.
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
codec_cols = ['vcodec', 'acodec']
train[codec_cols] = ord_enc.fit_transform(train[codec_cols])
test[codec_cols] = ord_enc.transform(test[codec_cols])
train.shape, test.shape

((48493, 40), (12104, 40))

In [23]:
# Persist the processed train and test splits as JSON files.
train.to_json('/home/alevink/capstone/train_data.json.gz')
test.to_json('/home/alevink/capstone/test_data.json.gz')