In [29]:
import csv
import json
from functools import reduce
import dateutil.parser
from datetime import timedelta

In [30]:
def get_rows(filename):
    """Read a CSV file and return its data rows, de-duplicated on the first column.

    The first line of the file is treated as a header and discarded. For every
    following row, only the first occurrence of each distinct first-column
    value is kept; later rows with the same first-column value are dropped.
    Completely blank lines are skipped.

    Args:
        filename: Path of the CSV file to read (decoded as ISO-8859-1,
            comma-delimited).

    Returns:
        A list of rows in file order, each row being a list of strings.
    """
    rows = []
    seen_ids = set()
    # newline="" hands line-ending handling to the csv module, which is what
    # the csv documentation requires so that quoted fields containing line
    # breaks are parsed correctly.
    with open(filename, "r", encoding="ISO-8859-1", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # Discard the header line; the default keeps an empty file from raising.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields an empty list for a blank line; there is no
            # first column to de-duplicate on, so skip it.
            if not row:
                continue
            row_id = row[0]
            if row_id not in seen_ids:
                seen_ids.add(row_id)
                rows.append(row)
    return rows

In [31]:
def adjust_date_with_timezone(ts, utc_hours_diff):
    """Return ``ts`` shifted by ``utc_hours_diff`` hours.

    Args:
        ts: Value supporting addition with a ``timedelta`` (the caller in this
            notebook passes a parsed datetime).
        utc_hours_diff: Number of hours to add; may be negative or fractional.

    Returns:
        The result of adding the hour offset to ``ts``. Only the clock value
        moves; no timezone information is attached or changed here.
    """
    offset = timedelta(hours=utc_hours_diff)
    shifted = ts + offset
    return shifted

In [39]:
def unify_data_from_countries(nations=None, data_dir="../youtube-new"):
    """Load the per-country CSV files and merge them into one list.

    For each ``(nation, utc_diff)`` pair the file
    ``{data_dir}/{nation}videos.csv`` is read through ``get_rows``. From every
    returned row, column 5 is parsed as an ISO-8601 timestamp and shifted by
    ``utc_diff`` hours, and column 7 is carried along unchanged.

    Args:
        nations: Optional iterable of ``(country_code, utc_hours_diff)`` pairs.
            When omitted, the notebook's original ten-country table is used.
        data_dir: Directory containing the ``<code>videos.csv`` files.

    Returns:
        A list of ``(shifted_publish_timestamp, views)`` tuples, where
        ``views`` is the raw string read from column 7.
    """
    if nations is None:
        # NOTE(review): these are fixed hour offsets, so they do not follow
        # daylight-saving changes — confirm this is acceptable for the analysis.
        nations = [("CA", -4), ("DE", 2), ("FR", 2), ("GB", 1), ("IN", 5.5), ("JP", 9), ("KR", 9), ("MX", -5), ("RU", 3), ("US", -5)]

    unified_data = []
    for nation, utc_diff in nations:
        print(f"Preparing for nation: {nation}")
        filename = f"{data_dir}/{nation}videos.csv"
        for row in get_rows(filename):
            # Column 5 holds the publish timestamp, column 7 the view count.
            published_at = dateutil.parser.isoparse(row[5])
            unified_data.append((adjust_date_with_timezone(published_at, utc_diff), row[7]))
    return unified_data

In [40]:
# Build the combined list of (shifted publish timestamp, view count) tuples
# for all countries; one progress line is printed per nation while loading.
unified_data = unify_data_from_countries()

Preparing for nation: CA
Preparing for nation: DE
Preparing for nation: FR
Preparing for nation: GB
Preparing for nation: IN
Preparing for nation: JP
Preparing for nation: KR
Preparing for nation: MX
Preparing for nation: RU
Preparing for nation: US


In [81]:
def group_by_weekday_and_hour(unified_data):
    """Sum view counts per (weekday, hour) of the timestamp.

    Args:
        unified_data: Iterable of entries whose item 0 is a datetime-like
            object (providing ``weekday()`` and ``hour``) and whose item 1 is
            convertible with ``int``.

    Returns:
        A list of ``[hour, weekday, total_views]`` lists, one per distinct
        (weekday, hour) pair, in order of first appearance in the input.
    """
    # Convert every entry up front so a bad value fails before any grouping.
    triples = [
        (entry[0].weekday(), entry[0].hour, int(entry[1]))
        for entry in unified_data
    ]

    totals = {}
    for weekday, hour, views in triples:
        bucket = (weekday, hour)
        totals[bucket] = totals.get(bucket, 0) + views

    # Note the output puts the hour first, then the weekday.
    return [[hour, weekday, total] for (weekday, hour), total in totals.items()]


In [82]:
def group_by_month_and_hour(unified_data):
    """Sum view counts per (month, hour) of the timestamp.

    Args:
        unified_data: Iterable of entries whose item 0 is a datetime-like
            object (providing ``month`` and ``hour``) and whose item 1 is
            convertible with ``int``.

    Returns:
        A list of ``[hour, month, total_views]`` lists, one per distinct
        (month, hour) pair, in order of first appearance in the input.
    """
    # Convert every entry up front so a bad value fails before any grouping.
    triples = [
        (entry[0].month, entry[0].hour, int(entry[1]))
        for entry in unified_data
    ]

    totals = {}
    for month, hour, views in triples:
        bucket = (month, hour)
        totals[bucket] = totals.get(bucket, 0) + views

    # Note the output puts the hour first, then the month.
    return [[hour, month, total] for (month, hour), total in totals.items()]

In [83]:
# Aggregate total views per (month, hour); each entry is [hour, month, total_views].
grouped = group_by_month_and_hour(unified_data)
# Bare expression so the notebook displays the aggregated list.
grouped

[[13, 11, 341793755],
 [15, 11, 300881891],
 [14, 11, 287110668],
 [7, 11, 200036553],
 [3, 11, 78263808],
 [19, 11, 357007293],
 [16, 11, 347118092],
 [10, 11, 272314061],
 [11, 11, 265462557],
 [12, 11, 329510418],
 [18, 11, 342593365],
 [21, 11, 182309126],
 [8, 11, 203165111],
 [17, 11, 263584474],
 [22, 11, 262113127],
 [20, 11, 247062048],
 [0, 11, 150383834],
 [23, 11, 191204402],
 [4, 11, 134835817],
 [1, 11, 140869603],
 [21, 10, 37627],
 [2, 11, 137507789],
 [15, 10, 234649],
 [9, 11, 199754365],
 [2, 2, 173841663],
 [5, 11, 111111988],
 [15, 12, 443477645],
 [17, 9, 313519],
 [18, 9, 134298],
 [6, 11, 279807687],
 [12, 10, 287771],
 [22, 12, 353698866],
 [22, 3, 371629529],
 [16, 10, 4385013],
 [1, 3, 243707363],
 [4, 10, 426284],
 [22, 9, 585832],
 [11, 12, 365804063],
 [12, 12, 434121596],
 [8, 12, 177207507],
 [10, 12, 463107848],
 [0, 12, 290039378],
 [1, 12, 192330533],
 [16, 12, 439840516],
 [9, 12, 287853837],
 [19, 12, 500004078],
 [14, 12, 411406257],
 [20, 12, 4194

In [84]:
# Number of distinct (month, hour) combinations present in the data.
len(grouped)

271