In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |▎                               | 10kB 24.8MB/s eta 0:00:01[K     |▌                               | 20kB 33.8MB/s eta 0:00:01[K     |▊                               | 30kB 22.1MB/s eta 0:00:01[K     |█                               | 40kB 19.1MB/s eta 0:00:01[K     |█▏                              | 51kB 15.5MB/s eta 0:00:01[K     |█▌                              | 61kB 14.6MB/s eta 0:00:01[K     |█▊                              | 71kB 14.1MB/s eta 0:00:01[K     |██                              | 81kB 13.5MB/s eta 0:00:01[K     |██▏                             | 92kB 13.4MB/s eta 0:00:01[K     |██▍                             | 102kB 13.0MB/s eta 0:00:01[K     |██▋                             | 112kB 13.0MB/s eta 0:00:01[K     |███                             | 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# copy USvideos from google drive
!cp "/content/drive/MyDrive/DL Project_Recommendation System/USvideos.csv" ./

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from transformers import BertTokenizer, TFBertModel

df = pd.read_csv('USvideos.csv',encoding  ='utf8',error_bad_lines = False, sep=',')

# group by name and take mean of 'views', 'likes', 'dislikes', 'comment_total'
df  = df.groupby(['video_id','title', 'tags'])['views', 'likes', 'dislikes', 'comment_total'].mean().reset_index()

# format tags column
df['tags'] = df['tags'].astype(str)
df['tags'] = df['tags'].str.replace('|', ',')

# replace [non] with empty string
df['tags'] = df['tags'].str.replace('[none]', '')

## generate 20 users
print("generating 20 users...")
df['userId'] = np.random.randint(1, 21, df.shape[0])

# manipulate the data, cold start
print("manipulating user click and user rating..")
df['user_click'] = np.random.randint(0, 2, df.shape[0])
df['user_rating'] = np.random.randint(0, 6, df.shape[0])
print("manupulating user like and time sepend on each video ...")
df['user_like'] = np.random.randint(0, 2, df.shape[0])
df['time_spend'] = np.random.randint(0, 11, df.shape[0])
df.loc[df['user_click'] == 0, 'user_like'] = 0
df.loc[df['user_click'] == 0, 'time_spend'] = 0

print("manupulating recommendated video ranking position (sorted by view count)...")
df['position'] = df.groupby("userId")["views"].rank(ascending=False)
df['pos_bias'] = df['user_click']+1
df.loc[df['pos_bias'] == 2, 'pos_bias'] = 0

# missing feature: device info
print("manuipulate device info...")
df['device_info'] = [random.choice(['ios','android','web']) for _ in range(0,df.shape[0])]

#embedding for query and candidate items (user and context features)
print("generating bert embedding videos ....")

# video embeddings
df['video_emb'] = df['title']

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertModel.from_pretrained('bert-base-cased')
max_length = 10 
batch_encoding = tokenizer.batch_encode_plus(df['video_emb'].tolist(), max_length=max_length, pad_to_max_length=True)

outputs = model(tf.convert_to_tensor(batch_encoding['input_ids'])) # shape: (batch,sequence length, hidden state)
embeddings_video = tf.reduce_mean(outputs[0],1)
df['video_emb'] = embeddings_video.numpy().tolist()

print("generating bert embedding for user...")
# get user embeddings
# assuming tags as user interested tags
batch_encoding_user = tokenizer.batch_encode_plus(df['tags'].tolist(), max_length=max_length, pad_to_max_length=True)
outputs_user = model(tf.convert_to_tensor(batch_encoding_user['input_ids'])) # shape: (batch,sequence length, hidden state)
embeddings_user = tf.reduce_mean(outputs_user[0],1)
df['user_emb'] = embeddings_user.numpy().tolist()

# to speedup:
df = df.reset_index(drop=True) #shuffle df


b'Skipping line 2401: expected 11 fields, saw 21\nSkipping line 2800: expected 11 fields, saw 21\nSkipping line 5297: expected 11 fields, saw 12\nSkipping line 5299: expected 11 fields, saw 12\nSkipping line 5300: expected 11 fields, saw 12\nSkipping line 5301: expected 11 fields, saw 12\n'
  # Remove the CWD from sys.path while we load stuff.


generating 20 users...
manipulating user click and user rating..
manupulating user like and time sepend on each video ...
manupulating recommendated video ranking position (sorted by view count)...
manuipulate device info...
generating bert embedding videos ....


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defa

generating bert embedding for user...


Save file as csv for future needs

In [5]:
df.to_csv('videos.csv') 
#!cp "videos.csv" "/content/drive/MyDrive/DL Project_Recommendation System/"

In [6]:
df.head()

Unnamed: 0,video_id,title,tags,views,likes,dislikes,comment_total,userId,user_click,user_rating,user_like,time_spend,position,pos_bias,device_info,video_emb,user_emb
0,--JinobXWPk,DANGEROUS Jungle Spider!,"advtur,advturus,aimals,brakig,brakig trail,cyt...",1319945.0,38949.0,533.0,6768.0,20,1,3,0,10,22.0,0,web,"[-0.09443346410989761, -0.032111819833517075, ...","[0.3605319857597351, 0.11368177086114883, 0.52..."
1,-1fzGnFwz9M,9 Things You Need To Know About Kittens - Simo...,"cart,sims cat,sim's cat,simscat,sim tfild,sim ...",189414.0,7070.0,112.0,288.0,15,1,4,1,3,78.0,0,ios,"[0.268877774477005, 0.2353297770023346, -0.039...","[0.6430438160896301, 0.2000298798084259, 0.220..."
2,-3AGlBYyLjo,Best Tom Petty Interview Ever,"tm,ptty,tm ptty,Tm Ptty,Tm Ptty (Musical Artis...",2143.0,16.0,2.0,4.0,5,1,1,1,2,122.0,0,ios,"[-0.123487189412117, 0.16194400191307068, 0.12...","[0.3175366520881653, 0.2795766592025757, 0.465..."
3,-3lMEZ6k5NA,170912 BTS singing 'Closer' with The Chainsmokers,170912 BTS,201901.0,10034.0,148.0,591.0,20,1,4,1,4,70.0,0,ios,"[0.1369580775499344, 0.2950386106967926, -0.02...","[-0.005064257886260748, 0.1868782937526703, -0..."
4,-5sCWsLlTCI,SNL Host Kumail Nanjiani and P!nk Share Favori...,"saturday ight liv,sl,sl sas 43,kumail ajiai,sl...",85052.0,1458.0,97.0,132.0,15,1,0,1,1,96.0,0,web,"[0.09312273561954498, -0.25278744101524353, -0...","[0.11355461925268173, -0.22810769081115723, 0...."
