In [None]:
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
import collections
import pickle
import gc

In [None]:
# Read all files
def country_code_of(path):
    '''Return the country code embedded in a data file name.

    The code is the concatenation of the upper-case characters of the base
    name (e.g. 'USvideos.csv' and 'US_category_id.json' both give 'US').
    Directory components are ignored so that capital letters in a parent
    folder cannot leak into the code.'''
    return ''.join(char for char in os.path.basename(path) if char.isupper())


def sort_by_country_code(paths):
    '''Return (country_codes, paths), both ordered by country code.

    The sort is stable, so paths sharing a code keep their discovery order.
    An empty input yields ((), []) instead of failing while unpacking.'''
    ordered=sorted(paths,key=country_code_of)
    return tuple(country_code_of(path) for path in ordered),ordered


json_files=[]
csv_files=[]
for dirname, _, filenames in os.walk('archive'):
    for filename in filenames:
        fname=os.path.join(dirname, filename)
        # Match the real extension, not just any name that happens to end in the letters
        if fname.endswith('.csv'):
            csv_files.append(fname)
        elif fname.endswith('.json'):
            json_files.append(fname)


# Reorder CSV files
country_codes,csv_files=sort_by_country_code(csv_files)

# Reorder json files (country_codes ends up holding the codes in json order,
# which later cells pair positionally with csv_files)
country_codes,json_files=sort_by_country_code(json_files)


def initialize_country_dataframe(dataframe,json_fname,country_code):
    '''Prepare one country's raw frame for analysis.

    Steps: drop fully duplicated rows, translate the category_id column into
    a readable category column using the titles stored in the JSON file, and
    tag every row with country_code. The input frame is left untouched.

    dataframe    -- raw per-country frame containing a category_id column
    json_fname   -- path of the JSON file whose 'items' carry 'id' and
                    'snippet' -> 'title'
    country_code -- value written to the new country_code column
    '''
    with open(json_fname,'r') as handle:
        category_items=json.load(handle)['items']

    # Ids are stored as strings in the JSON but the replacement keys are ints
    id_to_title={int(item['id']):item['snippet']['title'] for item in category_items}

    deduped=dataframe.drop_duplicates().copy()
    # replace() leaves any id without a title as it is
    deduped['category']=deduped['category_id'].replace(id_to_title)
    deduped=deduped.drop(columns=['category_id'])
    deduped['country_code']=country_code

    return deduped

# Initialize country-by-country dataframe using above written function
dataframes=[]
for ind,code in enumerate(country_codes):
    try:
        df=pd.read_csv(csv_files[ind])
    except Exception:
        # Best-effort fallback: retry with the python engine when the default
        # parser fails. `except Exception` (not a bare except) so that
        # KeyboardInterrupt still stops the cell.
        df=pd.read_csv(csv_files[ind],engine='python')

    df=initialize_country_dataframe(df,json_files[ind],code)
    print(code,df.shape)
    dataframes.append(df)


# Concatenate individual dataframe to form single main dataframe
dataframe=pd.concat(dataframes)
print(dataframe.shape)


# Remove videos with unknown video id.
# The concatenated frame repeats index labels across countries, so dropping by
# label would also delete unrelated rows that share a label with a bad row.
# Filter with a boolean mask instead; .copy() gives an independent frame that
# later cells can add columns to.
unknown_id=dataframe.video_id.isin(['#NAME?','#VALUE!'])
dataframe=dataframe[~unknown_id].copy()

In [5]:
# Display the list of per-country dataframes built in the previous cell
dataframes

[          video_id trending_date  \
 0      n1WpP7iowLc      17.14.11   
 1      0dBIkQ4Mz1M      17.14.11   
 2      5qpjK5DgCt4      17.14.11   
 3      d380meD0W0M      17.14.11   
 4      2Vv-BfVoq4g      17.14.11   
 ...            ...           ...   
 40876  sGolxsMSGfQ      18.14.06   
 40877  8HNuRNi8t70      18.14.06   
 40878  GWlKEM3m2EE      18.14.06   
 40879  lbMKLzQ4cNQ      18.14.06   
 40880  POTgw38-m58      18.14.06   
 
                                                    title     channel_title  \
 0             Eminem - Walk On Water (Audio) ft. Beyoncé        EminemVEVO   
 1                          PLUSH - Bad Unboxing Fan Mail         iDubbbzTV   
 2      Racist Superman | Rudy Mancuso, King Bach & Le...      Rudy Mancuso   
 3                               I Dare You: GOING BALD!?          nigahiga   
 4            Ed Sheeran - Perfect (Official Music Video)        Ed Sheeran   
 ...                                                  ...               ...   
 

In [24]:
# Display the last per-country dataframe in the list
dataframes[-1]

Unnamed: 0,video_id,trending_date,title,channel_title,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,country_code
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,People & Blogs,US
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",Entertainment,US
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,Comedy,US
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,Entertainment,US
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,Entertainment,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40944,BZt0qjTWNhw,18.14.06,The Cat Who Caught the Laser,AaronsAnimals,2018-05-18T13:00:04.000Z,"aarons animals|""aarons""|""animals""|""cat""|""cats""...",1685609,38160,1385,2657,https://i.ytimg.com/vi/BZt0qjTWNhw/default.jpg,False,False,False,The Cat Who Caught the Laser - Aaron's Animals,Pets & Animals,US
40945,1h7KV2sjUWY,18.14.06,True Facts : Ant Mutualism,zefrank1,2018-05-18T01:00:06.000Z,[none],1064798,60008,382,3936,https://i.ytimg.com/vi/1h7KV2sjUWY/default.jpg,False,False,False,,People & Blogs,US
40946,D6Oy4LfoqsU,18.14.06,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,2018-05-18T17:34:22.000Z,I gave safiya nygaard a perfect hair makeover ...,1066451,48068,1032,3992,https://i.ytimg.com/vi/D6Oy4LfoqsU/default.jpg,False,False,False,I had so much fun transforming Safiyas hair in...,Entertainment,US
40947,oV0zkMe1K8s,18.14.06,How Black Panther Should Have Ended,How It Should Have Ended,2018-05-17T17:00:04.000Z,"Black Panther|""HISHE""|""Marvel""|""Infinity War""|...",5660813,192957,2846,13088,https://i.ytimg.com/vi/oV0zkMe1K8s/default.jpg,False,False,False,How Black Panther Should Have EndedWatch More ...,Film & Animation,US


In [25]:
# Trace a single video through the last per-country frame: print its
# trending date, views, comment count and likes for every row it appears in.

df = dataframes[-1]

# Empty frame; not used in this cell (kept in case later cells rely on the name)
df_em = pd.DataFrame()

target_video = "oV0zkMe1K8s"

# Select the matching rows in one vectorized pass instead of testing every row
# in a Python-level iterrows loop.
history = df.loc[df.video_id == target_video, ['trending_date', 'views', 'comment_count', 'likes']]
for record in history.itertuples(index=False, name=None):
    print(*record)
    

18.12.06 5590430 13030 192037
18.13.06 5629147 13065 192568
18.14.06 5660813 13088 192957


### Feature creation 

In [None]:
# Create feature num_days that indicates the number of days the videos are in trend
video_ids=dataframe.video_id.unique().tolist()

# One grouped pass instead of re-filtering the whole frame for every video id.
# dropna=False keeps the count identical to len(Series.unique()), which counts
# a missing trending_date as a value too.
days_per_video=dataframe.groupby('video_id',sort=False).trending_date.nunique(dropna=False).to_dict()

# An id missing from the grouping (a NaN video_id) gets 0, which is what an
# equality filter on that id would have found.
id_days={vid:int(days_per_video.get(vid,0)) for vid in video_ids}
num_days=[id_days[vid] for vid in video_ids]


# ## don't need to do this
# # Create feature num_countries that indicates the number of countries in the videos trended
# video_ids=dataframe.video_id.unique().tolist()
# num_countries=[]
# id_countries={}
# for vid in tqdm(video_ids):
#     days=len(dataframe[dataframe.video_id==vid].country_code.unique())
#     id_countries[vid]=days
#     num_countries.append(days)

In [None]:
# Preview only: num_days holds one entry per unique video id, so show the head
# rather than dumping the whole list into the output
num_days[:10]

In [None]:
# The loop that built num_countries is commented out in an earlier cell, so
# evaluating the bare name fails on a fresh kernel. Compute it here in one
# grouped pass: number of distinct country codes per video id, in order of
# first appearance. dropna=False mirrors len(Series.unique()), which counts a
# missing value too.
id_countries=dataframe.groupby('video_id',sort=False).country_code.nunique(dropna=False).to_dict()
num_countries=list(id_countries.values())
num_countries[:10]

In [None]:
# Create feature days_lapse that indicates the number of days before videos are in trend
def unique_video_id(keep='last'):
    '''Collapse the global ``dataframe`` to a single row per video_id.

    Rows are ordered chronologically by trending_date within each video and
    ``keep`` selects which of them survives.

    Parameters
    ----------
    keep : {'first', 'last', False}, default 'last'
        Forwarded to DataFrame.drop_duplicates: 'first' keeps the earliest
        trending record of each video, 'last' the latest.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataframe`` is not modified and the trending_date
        column keeps its original values.
    '''
    df=dataframe.copy()

    # trending_date is text in 'yy.dd.mm' order, so sorting the raw strings is
    # not chronological (day would outrank month). Sort on parsed dates instead.
    df['_trending_ts']=pd.to_datetime(df['trending_date'],format='%y.%d.%m')
    df=df.sort_values(by=['video_id','_trending_ts'],axis=0)

    # Honour the caller's choice instead of always keeping the last record
    df=df.drop_duplicates(subset='video_id',keep=keep)

    return df.drop(columns='_trending_ts')

# One row per video id (keep='first' requested); the days_lapse feature below is derived from this frame
df=unique_video_id(keep='first')

def publish_date(string):
    '''Return the text before the first 'T' of ``string`` (the date part of a
    'YYYY-MM-DDTHH:MM:SS' style timestamp); the whole string if there is no 'T'.'''
    date_part,_sep,_rest=string.partition('T')
    return date_part

# Parse the two calendar dates needed for the publish-to-trending gap
df['publish_date']=pd.to_datetime(df.publish_time.map(publish_date),format='%Y-%m-%d')
df['trending_date']=pd.to_datetime(df.trending_date,format='%y.%d.%m')

# Whole days between the publish date and the retained trending date
df['days_lapse']=(df['trending_date']-df['publish_date']).dt.days

# Lookup table: video id -> days_lapse
id_days_lapse=dict(zip(df.video_id.values,df.days_lapse.values))

def n_days_lapse_replace(vid):
    '''Look up the days_lapse value stored for video id ``vid`` in id_days_lapse.'''
    lapse=id_days_lapse[vid]
    return lapse

# Copy the per-video value onto every row of the main frame
dataframe['days_lapse']=dataframe.video_id.map(n_days_lapse_replace)

# Create feature trend_month that indicates month the videos are in trend
def trend_month(string):
    '''Return the third dot-separated field of ``string`` as an int
    (the month of a 'yy.dd.mm' trending date).'''
    fields=string.split('.')
    month_field=fields[2]
    return int(month_field)

# Apply trend_month to every trending_date value
dataframe['trend_month']=dataframe.trending_date.map(trend_month)

# Create feature publish_month that indicates the months that the videos are published in
def publish_month(string):
    '''Return the month of a 'YYYY-MM-DDTHH:MM:SS' style timestamp as an int:
    the second '-'-separated field of the text before the first 'T'.'''
    date_part=string.split('T')[0]
    date_fields=date_part.split('-')
    return int(date_fields[1])
# Apply publish_month to every publish_time value
dataframe['publish_month']=dataframe.publish_time.map(publish_month)

# Create feature publish_hour that indicates the hours that the videos are published in
def publish_hour(string):
    '''Return the hour of a 'YYYY-MM-DDTHH:MM:SS' style timestamp as an int:
    the text between the first 'T' and the following ':'.'''
    time_part=string.split('T')[1]
    hour_field=time_part.split(':')[0]
    return int(hour_field)

# Apply publish_hour to every publish_time value
dataframe['publish_hour']=dataframe.publish_time.map(publish_hour)

In [None]:


# Distribution of the number of days a video stays in trend:
# top panel = log frequency per day count, bottom panel = cumulative share.
trending_days=collections.Counter(num_days)
days,freq=zip(*sorted(trending_days.items(),key=lambda val:val[0]))

fig,[ax1,ax2]=plt.subplots(nrows=2,ncols=1,figsize=(14,10))

cmap = plt.get_cmap('GnBu')
colors=[cmap(i) for i in np.linspace(0, 1, len(days))]

# Bars are drawn on a log scale; the raw counts are written above each bar
log_freq=np.log(freq)
ax1.bar(range(len(days)),log_freq,color=colors)
for ind,val in enumerate(log_freq):
    ax1.text(ind,val+0.1,str(freq[ind]),ha='center')

ax1.set_xticks(range(len(days)))
ax1.set_xticklabels(days)
ax1.set_ylabel('Log frequency')

# Share of videos that trended for at most the given number of days.
# Divide by the total: min-max scaling would pin the first bucket to 0
# (hiding its share) and divide by zero when there is only one bucket.
cum_arr=np.cumsum(freq)
ax2.plot(cum_arr/cum_arr[-1])
ax2.set_xticks(range(len(days)))
ax2.set_xticklabels(days)
ax2.set_ylabel('Cumulative proportion of number of videos')
ax2.set_xlabel('For number of days videos are in trend');