## YouTube Recommendation Cleaner ##

This script processes, cleans and enriches the YouTube recommendation data gathered through an experiment by De Volkskrant and De Correspondent. Participants (n=78) were asked to run a script [ad link] on their computer. This script searched for ten keywords on YouTube, clicked on ten recommendations and repeated the last step 3 times (so the depth is 4). We also asked the participants to provide some personal data. Identifying data has been hashed to protect the privacy of the participants.

In [6]:
#!pip3 install --upgrade google-api-python-client


Requirement already up-to-date: google-api-python-client in /home/dim/Environments/youtube/lib/python3.6/site-packages (1.7.4)


In [8]:
import pandas as pd
import numpy as np
import os
import json
import glob
import re
import csv
import hashlib
from config import *

#and import some Google API libraries for later on
from apiclient.discovery import build
from apiclient.errors import HttpError
#from oauth2client.tools import argparse

In [None]:
#load json documents from the experiment and collect them in one dataframe

# NOTE(review): only `from config import *` is executed above, so `config`
# must be a name exported by the config module itself - confirm.
path = config.path_raw_data
path_temp = config.path_temp_data

# Raw strings keep the regex escapes (\s, \d) out of Python's own escape handling.
prefix_pattern = re.compile(r'youtube(-onderzoek)?-(mac|win)-\s?')
suffix_pattern = re.compile(r'-\d{4}-\d{2}-\d{2}\.json')

frames = []
for file in glob.glob(path + '/**/*.json', recursive=True):
    # The file name, minus platform prefix and date suffix, is stored as the search term.
    name = os.path.basename(file)
    name = prefix_pattern.sub('', name)
    name = suffix_pattern.sub('', name)
    # The directory, minus the hard-coded root, is stored as the subject.
    # NOTE(review): this root is an absolute path on the author's machine; it
    # only strips anything when config.path_raw_data points there - confirm.
    dirname = os.path.dirname(file)
    dirname = dirname.replace('/Users/mindyourownbusiness/Desktop/resultaten_yt/', '')
    try:
        with open(file) as handle:
            records = json.load(handle)
        for item in records:
            item['search_term'] = name
            item['subject'] = dirname
    except (OSError, ValueError, TypeError) as err:
        # Unreadable, malformed or unexpectedly shaped files are still skipped
        # (best effort), but no longer silently.
        print('Skipping {}: {}'.format(file, err))
        continue
    if records:
        frames.append(pd.DataFrame(records))

# One concat at the end replaces the repeated DataFrame.append calls, which
# copied the whole frame on every file and no longer exist in pandas 2.x.
# The per-file row index is kept, as append did.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
#hash email addresses in dataframe: 'subject' is replaced by the md5 hex digest of its utf-8 bytes

subject_bytes = df['subject'].str.encode('utf-8')
df['user_hash'] = [hashlib.md5(raw).hexdigest() for raw in subject_bytes]
# the readable identifier is removed once the hash exists
df = df.drop('subject', axis=1)

In [None]:
#write dataframe to file (path_temp is concatenated as-is, so it has to end with a path separator)

recommendations_csv_path = path_temp + 'recommendations.csv'
df.to_csv(recommendations_csv_path)

## Extract titles from YouTube ##

Most recommendations are in a list and only the video_id is given. It is therefore necessary to extract the lists to rows and use the video_id to query the YouTube API for extra information, like title, channel, etc.

In [None]:
# reload the recommendations from the temp directory
recommendations_csv_path = path_temp + 'recommendations.csv'
vids = pd.read_csv(recommendations_csv_path)

In [None]:
#Extract the video ids from the list in the recommendations column

def splitDataFrameList(df, target_column, separator):
    ''' Split a delimited string column into one row per element.

    df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a new dataframe (default integer index) with each entry for the
    target column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.

    Rows whose target value is not a string (e.g. NaN for a missing value)
    cannot be split and are carried over unchanged as a single row.
    An empty input yields an empty dataframe with the same columns.
    '''
    rows = []
    for record in df.to_dict('records'):
        value = record[target_column]
        # Only strings have something to split; anything else passes through as-is.
        parts = value.split(separator) if isinstance(value, str) else [value]
        for part in parts:
            new_row = dict(record)
            new_row[target_column] = part
            rows.append(new_row)

    if not rows:
        # Without rows the constructor cannot infer columns; keep the input layout.
        return pd.DataFrame(columns=df.columns)
    # The column order is deliberately left to the DataFrame constructor.
    return pd.DataFrame(rows)

In [None]:
# one row per comma-separated entry of the 'recommendations' column
vids_new = splitDataFrameList(df=vids, target_column='recommendations', separator=',')

In [None]:
# and clean it up a bit

vids_new['recommendations'] = vids_new['recommendations'].astype(str) #convert to strings
# Remove quotes, square brackets and whitespace. The pattern is a regular
# expression, so regex=True is passed explicitly: pandas 2.0 changed the
# default to literal matching, which would leave the clutter in place.
vids_new['recommendations'] = vids_new['recommendations'].str.replace(r"'|\[|\]|\s", '', regex=True)

In [None]:
#persist the unpacked and cleaned recommendations in the temp directory

unpacked_csv_path = path_temp + 'recommendations_unpacked.csv'
vids_new.to_csv(unpacked_csv_path)

In [6]:
#reload the unpacked file; the distinct values of 'recommendations' are the ids to query YouTube for

unpacked_csv_path = path_temp + 'recommendations_unpacked.csv'
vids_new = pd.read_csv(unpacked_csv_path)
lookup = vids_new['recommendations'].unique()

In [3]:
#copy the API settings from the config object into module-level constants

DEVELOPER_KEY, YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION = (
    config.DEVELOPER_KEY,
    config.YOUTUBE_API_SERVICE_NAME,
    config.YOUTUBE_API_VERSION,
)

In [4]:
#load search function

def youtube_search(video_id):
    ''' Request the "snippet" part of the video resource(s) for video_id.

    video_id = value passed as the `id` filter of videos().list
               (the API reference describes it as a comma-separated list of ids)
    returns: the response object produced by .execute()

    The API client is built on the first call and cached on the function
    object, so later calls do not set up a new client for every video.
    Re-running this cell drops the cache, which is needed after changing
    DEVELOPER_KEY or the service name/version.
    '''
    youtube = getattr(youtube_search, '_client', None)
    if youtube is None:
        youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                        developerKey=DEVELOPER_KEY)
        youtube_search._client = youtube
    request = youtube.videos().list(id=video_id, part="snippet")
    return request.execute()

In [7]:
#query the YouTube API. Due to the large amount of videos the ids are sent in batches:
#every request carries several comma-separated ids instead of a single one.

#lookup = lookup[230179:] you can uncomment this when the session breaks, e.g. due to ssl problems (which I had). Just continue from where you left off.

# NOTE(review): 50 is the commonly cited maximum number of ids per
# videos.list request - confirm against the API reference.
batch_size = 50
fieldnames = ['id', 'publishedAt', 'title', 'channelId', 'channelTitle']

# Missing values (NaN after the csv round trip) and empty strings are not
# valid ids and would break the comma join, so they are left out.
video_ids = [video_id for video_id in lookup if isinstance(video_id, str) and video_id]
total_count = len(video_ids)

# The file is opened once, in append mode so an interrupted run can be resumed.
# newline='' is what the csv module asks for; utf-8 keeps non-ASCII titles intact.
with open(path_temp + "results.csv", "a", newline="", encoding="utf-8") as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames)
    for start in range(0, total_count, batch_size):
        batch = video_ids[start:start + batch_size]
        try:
            video_json = youtube_search(",".join(batch))
        except HttpError as err:
            # A rejected request costs this batch only; the comparison of
            # lookup and results further down shows which ids are missing.
            print('Request failed for ids {}-{}: {}'.format(start, start + len(batch) - 1, err))
            continue
        for video in video_json.get('items', []):
            try:
                snippet = video['snippet']
                row = {'id': video['id'],
                       'publishedAt': snippet['publishedAt'],
                       'title': snippet['title'],
                       'channelId': snippet['channelId'],
                       'channelTitle': snippet['channelTitle']}
            except KeyError as err:
                # Items lacking an expected field are skipped, as before.
                print('Skipping item without field {}'.format(err))
                continue
            writer.writerow(row)
        # Flush per batch so little is lost when the session breaks.
        csvFile.flush()

In [11]:
#load what was fetched and rebuild the list of requested ids, so both can be compared for gaps

result_column_names = ['id', 'date', 'title', 'channel_id', 'channel_name']
results = pd.read_csv(path_temp + 'results.csv', header=None, names=result_column_names)
# the unique ids become a one-column dataframe
lookup = pd.DataFrame(vids_new.recommendations.unique())

In [27]:
# name the single column, then order the requested ids
lookup.columns = ['id']
lookup = lookup.sort_values('id')

In [28]:
# order the fetched rows by video id
results = results.sort_values('id')

In [30]:
# ids that occur in exactly one of the two sets (fetched results, requested lookup)
fetched_ids = pd.Index(results.id)
requested_ids = pd.Index(lookup.id)
todo = fetched_ids.symmetric_difference(requested_ids)

In [32]:
# plain list of the outstanding ids (this rebinds `vids`, which held the recommendations dataframe earlier)
vids = list(todo)

## Bring it all together

So now we have three csv files:
1. recommendations_unpacked.csv - this is the raw json data from the experiment, flattened into a dataframe.
2. antwoorden_yt_experiment.csv - these are the answers from the subjects, which can be used as filters (edge attributes if we use a graph).
3. results.csv - this contains the YouTube API metadata for the video ids extracted from the lists in df['recommendations']: video id, upload date, video title, channel id and channel title (descriptions are not stored).

In [10]:
# Answers of the participants: semicolon-separated, ISO-8859-1 encoded.
# The file is located through `path_temp`, the variable every other cell uses
# for the temp directory (`path_temp_data` is never assigned in this notebook);
# os.path.join works whether or not path_temp ends with a separator.
users = pd.read_csv(os.path.join(path_temp, 'antwoorden_yt_experiment.csv'), sep=';', encoding="ISO-8859-1")

In [15]:
# md5 hex digest of the utf-8 encoded 'mail' value, mirroring how 'user_hash' was derived for the recommendations
mail_bytes = users['mail'].str.encode('utf-8')
users['user_hash'] = [hashlib.md5(raw).hexdigest() for raw in mail_bytes]
# the plain address is removed once the hash exists
users = users.drop('mail', axis=1)

In [131]:
# results.csv has no header row, so the column names are supplied here
columns = ['video_id', 'upload_date', 'video_title', 'channel_id', 'channel_title']
results_csv_path = path_temp + 'results.csv'
results = pd.read_csv(results_csv_path, header=None, names=columns)

In [132]:
# order the metadata rows by video id
results = results.sort_values('video_id')

In [133]:
# Read the unpacked recommendations from the temp directory, where the
# earlier cell wrote them; the bare file name only resolved when the
# notebook happened to be started from that directory.
recommendations = pd.read_csv(path_temp + 'recommendations_unpacked.csv')

In [134]:
# 'recommendations' becomes 'video_id' so it can serve as the join key with results;
# index=str additionally maps every index label through str()
recommendations = recommendations.rename(index=str, columns={"recommendations": "video_id"})

In [135]:
# left join: every recommendation row is kept, user answers are attached where the hash matches
semi_final = recommendations.merge(users, how='left', on='user_hash')

In [136]:
# left join: attach the YouTube metadata to every row with a matching video_id
yt = semi_final.merge(results, how='left', on='video_id')

In [137]:
#and it still needs some cleaning: remove the leftover csv index columns and the fields that are not carried into the final table

unused_columns = ['Unnamed: 0', 'Unnamed: 0.1', 'id', 'key', 'mult', 'nb_recommendations', 'Timestamp', 'yt_gebruik', 'sociale_media', 'deel_computer']
yt = yt.drop(columns=unused_columns)


In [138]:
# convert these columns to the pandas 'category' dtype
categorical_columns = ['depth', 'search_term', 'user_hash', 'woonplaats', 'geslacht', 'opleiding', 'politiek']
yt = yt.astype({name: 'category' for name in categorical_columns})

In [139]:
# Relabel all columns by position: the n-th name replaces the n-th column.
# NOTE(review): this depends on the exact column order produced by the cells
# above, and how pandas orders columns built from dicts has changed between
# releases - verify the order of yt.columns before trusting these labels.
# NOTE(review): 'from_video_id' appears to land on the former 'video_id'
# (the recommended video) - confirm the from/to naming.
new_column_names = [
    'from_channel_title',
    'depth',
    'dislikes',
    'likes',
    'from_video_id',
    'search_term',
    'from_video_title',
    'user_hash',
    'views',
    'woonplaats',
    'leeftijd',
    'geslacht',
    'opleiding',
    'politiek',
    'upload_date',
    'to_video_title',
    'to_channel_id',
    'to_channel_title',
]
yt.columns = new_column_names

In [140]:
# final column layout: source video, recommended video, search context, participant attributes
final_column_order = [
    'from_video_id',
    'from_video_title',
    'from_channel_title',
    'depth',
    'likes',
    'dislikes',
    'views',
    'to_video_title',
    'upload_date',
    'to_channel_id',
    'to_channel_title',
    'search_term',
    'user_hash',
    'woonplaats',
    'leeftijd',
    'geslacht',
    'opleiding',
    'politiek',
]
yt = yt[final_column_order]

In [141]:
# keep only the rows that have a from_video_id
yt = yt[yt['from_video_id'].notna()]

In [142]:
# keep only the rows that have a to_video_title
yt = yt[yt['to_video_title'].notna()]

In [143]:
# keep only the rows that have a user_hash
yt = yt[yt['user_hash'].notna()]

In [144]:
#store the final table in the temp directory

final_csv_path = path_temp + 'yt_final.csv'
yt.to_csv(final_csv_path)

That's it. You can find the Notebook for the analysis of the data here [ad link]