In [1]:
import csv
import json
from functools import reduce
import dateutil.parser

In [2]:
def get_rows(filename):
    """Read a videos CSV and return one parsed record per unique video id.

    The first line of the file is treated as a header and skipped. Every later
    row is keyed by its first column; only the first row seen for a given key
    is kept, later duplicates are ignored. Completely blank lines are skipped.

    Each kept row is reduced to a 7-element list built from the CSV columns
    1, 4, 5, 7, 8, 9 and 10:

        [row[1], int(row[4]), isoparse(row[5]),
         int(row[7]), int(row[8]), int(row[9]), int(row[10])]

    Downstream code in this notebook reads index 1 as the category id and
    index 3 as the view count.
    # NOTE(review): the remaining columns are presumably trending date,
    # publish time, likes, dislikes and comment count -- confirm against the
    # CSV header.

    Args:
        filename: Path of the CSV file; it is decoded as ISO-8859-1.

    Returns:
        A list of 7-element lists, in the order the rows appear in the file.

    Raises:
        ValueError: If a numeric column is not an integer or column 5 is not a
            valid ISO-8601 timestamp.
        IndexError: If a non-blank data row has fewer than 11 columns.
    """
    rows = []
    seen_ids = set()
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires for correct parsing of quoted multi-line fields.
    with open(filename, "r", encoding="ISO-8859-1", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # Skip the header line; the default keeps an empty file from raising.
        next(csv_reader, None)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to parse.
                continue
            video_id = row[0]
            if video_id in seen_ids:
                continue
            seen_ids.add(video_id)
            rows.append([
                row[1],
                int(row[4]),
                dateutil.parser.isoparse(row[5]),
                int(row[7]),
                int(row[8]),
                int(row[9]),
                int(row[10]),
            ])
    return rows

In [3]:
def group_by_category(rows):
    """Aggregate video count and total views per category.

    Args:
        rows: Iterable of records where index 1 holds the category id and
            index 3 holds the number of views.

    Returns:
        A dict mapping the category id (converted to ``str``) to
        ``{"count": <number of records>, "views": <sum of views>}``, with keys
        in order of first appearance.
    """
    totals = {}
    for record in rows:
        key = str(record[1])
        # Create the bucket on first sight of this category, then accumulate.
        bucket = totals.setdefault(key, {"count": 0, "views": 0})
        bucket["count"] += 1
        bucket["views"] += record[3]
    return totals

For the first graph of our visualization (a map showing the category distribution per country), we need the following data for each country:
1. Number of total views
2. Number of videos
3. Average number of views per video (Number of total views / Number of videos)
4. The distribution of views and number of videos per category

In [4]:
def category_distribution_per_nation(nations=None, data_dir="../youtube-new"):
    """Build per-nation view/video totals and their per-category breakdown.

    For each nation code the file ``{data_dir}/{nation}videos.csv`` is loaded
    with ``get_rows`` and grouped with ``group_by_category``. One progress line
    is printed per nation.

    Args:
        nations: Iterable of nation codes to process. Defaults to the ten
            codes CA, DE, FR, GB, IN, JP, KR, MX, RU and US.
        data_dir: Directory containing the ``<nation>videos.csv`` files.

    Returns:
        A dict mapping each nation code to::

            {
                "category_distribution": {category_id: {"count", "views"}},
                "nb_views": total views over all categories,
                "nb_videos": total number of videos,
                "avg_views_per_video": nb_views // nb_videos (0 if no videos),
            }
    """
    if nations is None:
        nations = ["CA", "DE", "FR", "GB", "IN", "JP", "KR", "MX", "RU", "US"]

    distr = {}
    for nation in nations:
        print(f"Preparing for nation: {nation}")
        filename = f"{data_dir}/{nation}videos.csv"
        rows = get_rows(filename)
        per_category = group_by_category(rows)

        nb_views = sum(stats["views"] for stats in per_category.values())
        nb_videos = sum(stats["count"] for stats in per_category.values())

        distr[nation] = {
            "category_distribution": per_category,
            "nb_views": nb_views,
            "nb_videos": nb_videos,
            # Exact integer division; a file with no data rows yields 0
            # instead of raising ZeroDivisionError.
            "avg_views_per_video": nb_views // nb_videos if nb_videos else 0,
        }
    return distr

# Build the summary for the function's built-in list of nations; this reads
# one CSV per nation and prints a progress line for each.
distribution = category_distribution_per_nation()

Preparing for nation: CA
Preparing for nation: DE
Preparing for nation: FR
Preparing for nation: GB
Preparing for nation: IN
Preparing for nation: JP
Preparing for nation: KR
Preparing for nation: MX
Preparing for nation: RU
Preparing for nation: US


In [5]:
# Show the complete nested per-nation result as this cell's output.
distribution

{'CA': {'category_distribution': {'10': {'count': 1564, 'views': 2055194663},
   '23': {'count': 1946, 'views': 1080548028},
   '24': {'count': 8248, 'views': 3724671352},
   '25': {'count': 2941, 'views': 645131321},
   '22': {'count': 2559, 'views': 929579787},
   '26': {'count': 1272, 'views': 541943057},
   '1': {'count': 1151, 'views': 609046665},
   '28': {'count': 636, 'views': 362522426},
   '20': {'count': 772, 'views': 386218794},
   '17': {'count': 1932, 'views': 1069969973},
   '29': {'count': 50, 'views': 24957964},
   '15': {'count': 211, 'views': 80578126},
   '19': {'count': 202, 'views': 49516868},
   '2': {'count': 248, 'views': 83092413},
   '27': {'count': 590, 'views': 174064063},
   '43': {'count': 104, 'views': 55528692},
   '30': {'count': 1, 'views': 225528}},
  'nb_views': 11872789720,
  'nb_videos': 24427,
  'avg_views_per_video': 486051},
 'DE': {'category_distribution': {'24': {'count': 10846, 'views': 3222709357},
   '23': {'count': 1756, 'views': 85180054