In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json


## Pré-processamento

In [0]:
# Load the US trending-videos dataset from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='USvideos.csv')

In [36]:
# List the column labels of the loaded frame.
df.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description'],
      dtype='object')

In [37]:
# Print per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 16 columns):
video_id                  40949 non-null object
trending_date             40949 non-null object
title                     40949 non-null object
channel_title             40949 non-null object
category_id               40949 non-null int64
publish_time              40949 non-null object
tags                      40949 non-null object
views                     40949 non-null int64
likes                     40949 non-null int64
dislikes                  40949 non-null int64
comment_count             40949 non-null int64
thumbnail_link            40949 non-null object
comments_disabled         40949 non-null bool
ratings_disabled          40949 non-null bool
video_error_or_removed    40949 non-null bool
description               40379 non-null object
dtypes: bool(3), int64(5), object(8)
memory usage: 4.2+ MB


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,40949.0,40949.0,40949.0,40949.0,40949.0
mean,19.972429,2360785.0,74266.7,3711.401,8446.804
std,7.568327,7394114.0,228885.3,29029.71,37430.49
min,1.0,549.0,0.0,0.0,0.0
25%,17.0,242329.0,5424.0,202.0,614.0
50%,24.0,681861.0,18091.0,631.0,1856.0
75%,25.0,1823157.0,55417.0,1938.0,5755.0
max,43.0,225211900.0,5613827.0,1674420.0,1361580.0


In [0]:
def count_unique(x):
  """Print the number of distinct values found in each column of ``x``.

  One line is written to stdout per column, formatted as
  ``<column>: <count>``. The count comes from ``Series.nunique()`` called
  with its default arguments (pandas leaves missing values out by default).

  Args:
    x: a pandas DataFrame.

  Returns:
    None; the counts are only printed.
  """
  for column_name in x.columns:
    distinct_total = x[column_name].nunique()
    print(f'{column_name}: {distinct_total}')

In [40]:
# Report how many distinct values each column of the dataset holds.
count_unique(df)

video_id: 6351
trending_date: 205
title: 6455
channel_title: 2207
category_id: 16
publish_time: 6269
tags: 6055
views: 40478
likes: 29850
dislikes: 8516
comment_count: 13773
thumbnail_link: 6352
comments_disabled: 2
ratings_disabled: 2
video_error_or_removed: 2
description: 6901


In [46]:
# Drop columns that carry no useful information for this analysis:
#   thumbnail_link: link to the small image that identifies the video in
#   the search results.
#
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError once the column has already been removed.
df = df.drop(columns='thumbnail_link', errors='ignore')
print(df.columns)

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'comments_disabled', 'ratings_disabled', 'video_error_or_removed',
       'description'],
      dtype='object')


In [82]:
# Read the JSON file that holds the category ids and their names, then list
# every id/name pair. `json` is already imported in the notebook's first
# cell, so it is not re-imported here.

# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('US_category_id.json', 'r', encoding='utf-8') as f:
  categ_us = json.load(f)

# Each entry under 'items' carries its id and, under 'snippet', its title.
for k in categ_us['items']:
  print(k['id'], k['snippet']['title'])



1 Film & Animation
2 Autos & Vehicles
10 Music
15 Pets & Animals
17 Sports
18 Short Movies
19 Travel & Events
20 Gaming
21 Videoblogging
22 People & Blogs
23 Comedy
24 Entertainment
25 News & Politics
26 Howto & Style
27 Education
28 Science & Technology
29 Nonprofits & Activism
30 Movies
31 Anime/Animation
32 Action/Adventure
33 Classics
34 Comedy
35 Documentary
36 Drama
37 Family
38 Foreign
39 Horror
40 Sci-Fi/Fantasy
41 Thriller
42 Shorts
43 Shows
44 Trailers


In [42]:
# Peek at the first ten distinct video ids.
df['video_id'].unique()[:10]

array(['2kyS6SvSYSE', '1ZAPwfrtAFY', '5qpjK5DgCt4', 'puqaWrEC7tY',
       'd380meD0W0M', 'gHZ1Qz0KiKM', '39idVpFF7NQ', 'nc99ccSXST0',
       'jr9QtXwC9vc', 'TUmyygCMMGA'], dtype=object)

In [43]:
# Show every field of the row at position 0.
df.iloc[0]

video_id                                                        2kyS6SvSYSE
trending_date                                                      17.14.11
title                                    WE WANT TO TALK ABOUT OUR MARRIAGE
channel_title                                                  CaseyNeistat
category_id                                                              22
publish_time                                       2017-11-13T17:13:01.000Z
tags                                                        SHANtell martin
views                                                                748374
likes                                                                 57527
dislikes                                                               2966
comment_count                                                         15954
thumbnail_link               https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg
comments_disabled                                                     False
ratings_disa

In [44]:
# Show every field of the row at position 1.
df.iloc[1]

video_id                                                        1ZAPwfrtAFY
trending_date                                                      17.14.11
title                     The Trump Presidency: Last Week Tonight with J...
channel_title                                               LastWeekTonight
category_id                                                              24
publish_time                                       2017-11-13T07:30:00.000Z
tags                      last week tonight trump presidency|"last week ...
views                                                               2418783
likes                                                                 97185
dislikes                                                               6146
comment_count                                                         12703
thumbnail_link               https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg
comments_disabled                                                     False
ratings_disa

In [45]:
# Show every field of the row at position 2.
df.iloc[2]

video_id                                                        5qpjK5DgCt4
trending_date                                                      17.14.11
title                     Racist Superman | Rudy Mancuso, King Bach & Le...
channel_title                                                  Rudy Mancuso
category_id                                                              23
publish_time                                       2017-11-12T19:05:24.000Z
tags                      racist superman|"rudy"|"mancuso"|"king"|"bach"...
views                                                               3191434
likes                                                                146033
dislikes                                                               5339
comment_count                                                          8181
thumbnail_link               https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg
comments_disabled                                                     False
ratings_disa