In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

<br>
<br>
<br>

### Data Collection

In [2]:
# Read the Great Britain trending-videos JSON export into a DataFrame,
# then preview the first two rows.
data = pd.read_json(path_or_buf="../data/europe/great_britain.json")
data.head(n=2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed
0,Jw1Y-zhQURU,1510597800,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10 02:08:29,"[christmas, john lewis christmas, john lewis, ...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False
1,3s1rvMFUweQ,1510597800,Taylor Swift: …Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12 00:54:44,"[SNL, Saturday Night Live, SNL Season 43, Epis...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False


In [3]:
# Show the dtype of every column as loaded, before any conversion is applied.
data.dtypes

video_id                          object
trending_date                      int64
title                             object
channel_title                     object
category_id                        int64
publish_time              datetime64[ns]
tags                              object
views                              int64
likes                              int64
dislikes                           int64
comment_count                      int64
thumbnail_link                    object
comments_disabled                   bool
ratings_disabled                    bool
video_error_or_removed              bool
dtype: object

<br>
<br>
<br>

### Data Preparation

#### video_id

In [4]:
# True if any row is missing its video_id.
data["video_id"].isna().any()

False

In [5]:
# Count rows whose video_id is an empty string (True) versus non-empty (False).
data["video_id"].eq("").value_counts()

False    38916
Name: video_id, dtype: int64

<br>
<br>

#### trending_date

In [6]:
def format_date(x):
    """Convert a POSIX timestamp (seconds since the epoch) to a naive datetime.

    No ``tz`` argument is passed to ``datetime.fromtimestamp``, so the value
    is expressed in the local timezone of the machine running the notebook
    and carries no tzinfo.

    NOTE(review): the same epoch value therefore maps to different wall-clock
    values on machines in different timezones — confirm the source timestamps
    were written as local midnights on the machine that produced them.
    """
    local_dt = datetime.fromtimestamp(x)
    return local_dt

In [7]:
# convert to datetime
# The column is overwritten in place, so the conversion is guarded to keep the
# cell idempotent: once trending_date is already datetime64, re-running would
# hand Timestamp objects to format_date, which expects numeric epoch seconds.
if not pd.api.types.is_datetime64_any_dtype(data["trending_date"]):
    data["trending_date"] = data["trending_date"].apply(format_date)
data["trending_date"].head(2)

0   2017-11-14
1   2017-11-14
Name: trending_date, dtype: datetime64[ns]

<br>
<br>

#### title

In [8]:
# True if any row is missing its title.
data["title"].isna().any()

False

In [9]:
# Count rows whose title is an empty string (True) versus non-empty (False).
# This cell belongs to the title checks, so it must inspect `title`; it
# previously re-tested `video_id`, leaving empty titles unchecked.
(data.title == "").value_counts()

False    38916
Name: video_id, dtype: int64

<br>
<br>

#### channel_title

In [10]:
# First print whether any channel_title is missing, then display how many
# rows hold an empty string (True) versus a non-empty one (False).
print(data["channel_title"].isna().any())
data["channel_title"].eq("").value_counts()

False


False    38916
Name: channel_title, dtype: int64

<br>
<br>

#### category_id

In [11]:
# Show the dtype of the category_id column.
data["category_id"].dtype

dtype('int64')

In [12]:
# True if any row is missing its category_id.
data["category_id"].isna().any()

False

<br>
<br>

#### publish_time

In [13]:
# True if any row is missing its publish_time.
data["publish_time"].isna().any()

False

<br>
<br>

#### tags

In [14]:
# True if any row is missing its tags entry.
data["tags"].isna().any()

False

In [15]:
# Preview the first two tags entries.
data["tags"].head(n=2)

0    [christmas, john lewis christmas, john lewis, ...
1    [SNL, Saturday Night Live, SNL Season 43, Epis...
Name: tags, dtype: object

<br>
<br>

#### views, likes, dislikes, comment_count

In [16]:
# Print the dtype of each engagement-count column, one per line.
for column_name in ("views", "likes", "dislikes", "comment_count"):
    print(data[column_name].dtype)

int64
int64
int64
int64


In [17]:
# Print, per engagement-count column, whether any value is missing.
for column_name in ("views", "likes", "dislikes", "comment_count"):
    print(data[column_name].isna().any())

False
False
False
False


<br>
<br>

#### thumbnail_link

In [18]:
# First print whether any thumbnail_link is missing, then display how many
# rows hold an empty string (True) versus a non-empty one (False).
print(data["thumbnail_link"].isna().any())
data["thumbnail_link"].eq("").value_counts()

False


False    38916
Name: thumbnail_link, dtype: int64

<br>
<br>

#### comments, ratings, video_errors

In [19]:
# Print the dtype of each flag column, one per line.
for column_name in ("comments_disabled", "ratings_disabled", "video_error_or_removed"):
    print(data[column_name].dtype)

bool
bool
bool


In [20]:
# Print, per flag column, whether any value is missing.
for column_name in ("comments_disabled", "ratings_disabled", "video_error_or_removed"):
    print(data[column_name].isna().any())

False
False
False


<br>
<br>
<br>

### Saving

In [21]:
# Final look at the first two rows before they are written to the database.
data.head(n=2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed
0,Jw1Y-zhQURU,2017-11-14,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10 02:08:29,"[christmas, john lewis christmas, john lewis, ...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False
1,3s1rvMFUweQ,2017-11-14,Taylor Swift: …Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12 00:54:44,"[SNL, Saturday Night Live, SNL Season 43, Epis...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False


In [22]:
# verify datatypes: list every column's dtype one last time before saving
data.dtypes

video_id                          object
trending_date             datetime64[ns]
title                             object
channel_title                     object
category_id                        int64
publish_time              datetime64[ns]
tags                              object
views                              int64
likes                              int64
dislikes                           int64
comment_count                      int64
thumbnail_link                    object
comments_disabled                   bool
ratings_disabled                    bool
video_error_or_removed              bool
dtype: object

<br>
<br>

In [23]:
from pymongo import MongoClient

<br>

In [24]:
# Open a client against the MongoDB URI below, then select the
# `youtube_stats` database and its `great_britain` collection.
connection_str = "mongodb://localhost:27017"
client = MongoClient(connection_str)
database = client["youtube_stats"]
collection = database["great_britain"]

In [25]:
# convert to a list of per-row dicts (column name -> value), one per DataFrame row
# DataFrame.to_dict("records") produces the same list as looping over
# data.iloc[i].to_dict(), without constructing an intermediate Series per row.
data_dict = data.to_dict("records")

In [26]:
# Number of record dicts built from the DataFrame (one per row).
len(data_dict)

38916

In [27]:
# Inspect the first record to check its keys and value types before inserting.
data_dict[0]

{'video_id': 'Jw1Y-zhQURU',
 'trending_date': Timestamp('2017-11-14 00:00:00'),
 'title': 'John Lewis Christmas Ad 2017 - #MozTheMonster',
 'channel_title': 'John Lewis',
 'category_id': 26,
 'publish_time': Timestamp('2017-11-10 02:08:29'),
 'tags': ['christmas',
  'john lewis christmas',
  'john lewis',
  'christmas ad',
  'mozthemonster',
  'christmas 2017',
  'christmas ad 2017',
  'john lewis christmas advert',
  'moz'],
 'views': 7224515,
 'likes': 55681,
 'dislikes': 10247,
 'comment_count': 9479,
 'thumbnail_link': 'https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg',
 'comments_disabled': False,
 'ratings_disabled': False,
 'video_error_or_removed': False}

In [28]:
# save data: insert every record dict into the collection with one insert_many call
# NOTE(review): nothing in this notebook clears the collection first, so running
# the whole notebook again presumably appends the same rows a second time —
# confirm whether that is acceptable.
collection.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x140a7c85e50>

In [29]:
# Read back a single document from the collection as a spot check of what was stored.
collection.find_one()

{'_id': ObjectId('640f3ae8a7a2bb8d5ca4579e'),
 'video_id': 'Jw1Y-zhQURU',
 'trending_date': datetime.datetime(2017, 11, 14, 0, 0),
 'title': 'John Lewis Christmas Ad 2017 - #MozTheMonster',
 'channel_title': 'John Lewis',
 'category_id': 26,
 'publish_time': datetime.datetime(2017, 11, 10, 2, 8, 29),
 'tags': ['christmas',
  'john lewis christmas',
  'john lewis',
  'christmas ad',
  'mozthemonster',
  'christmas 2017',
  'christmas ad 2017',
  'john lewis christmas advert',
  'moz'],
 'views': 7224515,
 'likes': 55681,
 'dislikes': 10247,
 'comment_count': 9479,
 'thumbnail_link': 'https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg',
 'comments_disabled': False,
 'ratings_disabled': False,
 'video_error_or_removed': False}

In [30]:
# close connection: the insert and the read-back check are done, so the client is no longer needed
client.close()