In [109]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import datetime
import seaborn as sns
import pickle
import gzip
import matplotlib.pyplot as plt



from helpers import *

The aim of this notebook is to identify the channels that talk about a release event to analyze their evolution in terms of the number of subscribers compared to those that do not talk about the event. First, we focus on 1 event, the release of the iPhone x. Then, we will generalize to the 5 others. From the title and tags of the videos, we classify them according to the presence of the item "iPhone x" in this metadata : treat,if talks about the release, control if not. We focus on videos that have an upload date from the release till one month later.
Then, we classify the channels considering that if a channel has at least one video that talks about the release,then it is a treat channel. 
We compare the growth of channels (measured in terms of the number of subscribers) for treat and control channels to see if channels that talk about a release have a higher growth than the ones that do not. To do this comparison, we use the time series data and focus on a period that goes from 15 days before the release till 15 days after.
Then, we will do an observational study (causal analysis) with the outcome being the number of subscribers while identifying the potential confounders.
We first, focus on 1 event "iPhone x"

1. classify channels in the treat/control group,
2. causal observation (like lab 5) with outcome being the nb of subscribers one month after the event 

### Channels' classification

In [110]:
tech_video_metadata = "/Users/salma/Desktop/ADA/project/datasets/new_datasets/yt_tech_channels_metadata.tsv.gz"
tech_channels_metadata = "/Users/salma/Desktop/ADA/project/datasets/new_datasets/tech_review_channels.tsv.gz"

In [111]:
ITEMS = ['iphone x', 'iphone 7', 'iphone 6', 'galaxy s9', 'galaxy note 9', 'galaxy fold']
RELEASE_DATES = {'iphone x': '2017-11-03','iphone 7': '2016-09-16','iphone 6': '2014-09-19',
    'galaxy s9': '2018-03-16',
   'galaxy note 9': '2018-08-24',
    'galaxy fold': '2019-02-01'
}


In [112]:
df_video = pd.read_table(tech_video_metadata)
df_channel = pd.read_table(tech_channels_metadata)

In [113]:
df_video


Unnamed: 0.1,Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count,tech_review,num_comms,clean_title,clean_tags
0,0,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,0.0,1m8g3JLXh3U,498,36.0,"iFi,Headphone,Headphone Amp,DAC,Digital to Ana...",iFi Headphone Amp and DAC Lineup Review,2019-09-27,1050.0,False,2.0,"['ifi', 'headphone', 'amp', 'dac', 'lineup', '...","['ifi', 'headphone', 'headphone', 'amp', 'dac'..."
1,1,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,1.0,gPB8IxkaIeI,115,12.0,,Raleigh-Durham eCommerce Meetup Recap,2019-09-27,222.0,False,0.0,"['raleigh', 'durham', 'ecommerce', 'meetup', '...",[]
2,2,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,2.0,mgnAMH80Q_U,507,74.0,"pro-ject,pro-ject turntables,pro-ject turntabl...",Pro-Ject T1 Turntable Review,2019-09-25,4120.0,False,12.0,"['pro', 'ject', 'turntable', 'review']","['pro', 'ject', 'pro', 'ject', 'turntable', 'p..."
3,3,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,34.0,izzlMMHvPIE,420,255.0,"Bowers & Wilkins,Bowers & Wilkins PX,Bowers & ...",Bowers & Wilkins PX 7 vs. Bose Noise Cancellin...,2019-09-24,16854.0,False,116.0,"['bower', 'wilkins', 'px', 'bose', 'noise', 'c...","['bower', 'wilkins', 'bower', 'wilkins', 'px',..."
4,4,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,9.0,UdxpNgNLYfk,220,130.0,"Bowers & Wilkins,Bowers & Wilkins PX,Bowers & ...",FIRST LOOK!! Bowers & Wilkins P Series Headpho...,2019-09-20,10767.0,False,44.0,"['first', 'look', 'bower', 'wilkins', 'series'...","['bower', 'wilkins', 'bower', 'wilkins', 'px',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740013,740013,Science & Technology,UCrxStpzUb40yGqo_EaI972g,0.0,8IKgARICTOk,227,0.0,"Increase Internet Speed,Block Unnecessary Apps...",Increase Internet Speed : Block Unnecessary Ap...,2017-07-05,8.0,False,0.0,"['increase', 'internet', 'speed', 'block', 'un...","['increase', 'internet', 'speed', 'block', 'un..."
740014,740014,Science & Technology,UCrxStpzUb40yGqo_EaI972g,52.0,AI-aaZz9ULQ,552,246.0,"How to Download Lynda Tutorials For Free,FULL ...",How to Download Lynda Tutorials For Free - FUL...,2017-03-06,53577.0,False,90.0,"['download', 'lynda', 'tutorial', 'free', 'ful...","['download', 'lynda', 'tutorial', 'free', 'ful..."
740015,740015,Science & Technology,UCrxStpzUb40yGqo_EaI972g,2.0,fx_ERvJFQcU,408,0.0,How To Compress Video For Youtube Without Losi...,How To Compress Video For Youtube Without Losi...,2017-03-05,468.0,False,0.0,"['compress', 'video', 'youtube', 'without', 'l...","['compress', 'video', 'youtube', 'without', 'l..."
740016,740016,Science & Technology,UCrxStpzUb40yGqo_EaI972g,41.0,3SodCZoI_l4,167,217.0,"How To Download Google Drive File With IDM,How...",How To Download Google Drive File With IDM On ...,2017-03-02,31778.0,False,135.0,"['download', 'google', 'drive', 'file', 'idm',...","['download', 'google', 'drive', 'file', 'idm',..."


In [114]:
df_channel

Unnamed: 0.1,Unnamed: 0,join_date,channel_id,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,count_tech_videos
0,0,2010-12-21,UCsTcErHg8oDvUnTzoqsYeNw,Unbox Therapy,15300000,1672,206.0,2.0870,406.0
1,1,2015-10-19,UCOhHO2ICt0ti9KAh-QHvttQ,Technical Guruji,13979890,2539,240.0,2.0870,209.0
2,2,2012-05-18,UCe_vXdMrHHseZ_esYUskSBw,CrazyRussianHacker,11200000,1411,444.0,2.0870,105.0
3,3,2005-06-22,UCE_M8A5yxnLfW0KghEeajjw,Apple,9970000,291,489.0,2.0870,64.0
4,4,2008-03-21,UCBJycsmduvYEL83R_U4JriQ,Marques Brownlee,9400000,1148,530.0,2.0870,370.0
...,...,...,...,...,...,...,...,...,...
1245,1245,2012-06-06,UCgMHqEDFll6X962CgXPzh4w,FocusCamera TV,10000,278,940963.0,40.6530,42.0
1246,1246,2018-02-25,UCJjacdkHH7Zxw8ouKQrirUg,The Best Coders,10200,28,941074.0,42.4845,2.0
1247,1247,2016-07-17,UCviAcL8AlEoGqLDWx7t_iPw,Mobile Tag,10300,20,952387.0,53.1435,6.0
1248,1248,2011-02-20,UCncAHfjOv1B_U8esTmLYukQ,Curtis Paradis,10300,97,963290.0,53.1435,11.0


In [115]:
df_video_cop = df_video.copy()
# We create a new column with the treat attribute being set to 1 if the video talks about the release, 0 otherwise
df_video_cop["treat"]= 0




In [116]:
# We transform all the titles and tags to lower case to look for the item's name
df_video_cop["tags"] = df_video_cop["tags"].str.lower()
df_video_cop["title"]= df_video_cop["title"].str.lower()
df_video_cop

Unnamed: 0.1,Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count,tech_review,num_comms,clean_title,clean_tags,treat
0,0,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,0.0,1m8g3JLXh3U,498,36.0,"ifi,headphone,headphone amp,dac,digital to ana...",ifi headphone amp and dac lineup review,2019-09-27,1050.0,False,2.0,"['ifi', 'headphone', 'amp', 'dac', 'lineup', '...","['ifi', 'headphone', 'headphone', 'amp', 'dac'...",0
1,1,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,1.0,gPB8IxkaIeI,115,12.0,,raleigh-durham ecommerce meetup recap,2019-09-27,222.0,False,0.0,"['raleigh', 'durham', 'ecommerce', 'meetup', '...",[],0
2,2,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,2.0,mgnAMH80Q_U,507,74.0,"pro-ject,pro-ject turntables,pro-ject turntabl...",pro-ject t1 turntable review,2019-09-25,4120.0,False,12.0,"['pro', 'ject', 'turntable', 'review']","['pro', 'ject', 'pro', 'ject', 'turntable', 'p...",0
3,3,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,34.0,izzlMMHvPIE,420,255.0,"bowers & wilkins,bowers & wilkins px,bowers & ...",bowers & wilkins px 7 vs. bose noise cancellin...,2019-09-24,16854.0,False,116.0,"['bower', 'wilkins', 'px', 'bose', 'noise', 'c...","['bower', 'wilkins', 'bower', 'wilkins', 'px',...",0
4,4,Science & Technology,UCzVy1DW9NFp4c910hMd_jIw,9.0,UdxpNgNLYfk,220,130.0,"bowers & wilkins,bowers & wilkins px,bowers & ...",first look!! bowers & wilkins p series headpho...,2019-09-20,10767.0,False,44.0,"['first', 'look', 'bower', 'wilkins', 'series'...","['bower', 'wilkins', 'bower', 'wilkins', 'px',...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740013,740013,Science & Technology,UCrxStpzUb40yGqo_EaI972g,0.0,8IKgARICTOk,227,0.0,"increase internet speed,block unnecessary apps...",increase internet speed : block unnecessary ap...,2017-07-05,8.0,False,0.0,"['increase', 'internet', 'speed', 'block', 'un...","['increase', 'internet', 'speed', 'block', 'un...",0
740014,740014,Science & Technology,UCrxStpzUb40yGqo_EaI972g,52.0,AI-aaZz9ULQ,552,246.0,"how to download lynda tutorials for free,full ...",how to download lynda tutorials for free - ful...,2017-03-06,53577.0,False,90.0,"['download', 'lynda', 'tutorial', 'free', 'ful...","['download', 'lynda', 'tutorial', 'free', 'ful...",0
740015,740015,Science & Technology,UCrxStpzUb40yGqo_EaI972g,2.0,fx_ERvJFQcU,408,0.0,how to compress video for youtube without losi...,how to compress video for youtube without losi...,2017-03-05,468.0,False,0.0,"['compress', 'video', 'youtube', 'without', 'l...","['compress', 'video', 'youtube', 'without', 'l...",0
740016,740016,Science & Technology,UCrxStpzUb40yGqo_EaI972g,41.0,3SodCZoI_l4,167,217.0,"how to download google drive file with idm,how...",how to download google drive file with idm on ...,2017-03-02,31778.0,False,135.0,"['download', 'google', 'drive', 'file', 'idm',...","['download', 'google', 'drive', 'file', 'idm',...",0


In [117]:
# We make a dataframe for each one of the item 

df_iphone_x = df_video_cop.copy()
df_iphone_7 = df_video_cop.copy()
df_iphone_6 = df_video_cop.copy()
df_galaxy_s9 = df_video_cop.copy()
df_galaxy_note9 = df_video_cop.copy()
df_galaxy_fold = df_video_cop.copy()


In [118]:
# We filter to focus on videos uploaded from the release date till 1 month after.

df_iphone_x = df_iphone_x.loc[(df_iphone_x["upload_date"] > '2017-08-03')& (df_iphone_x["upload_date"] <'2018-02-03')]
df_iphone_7 = df_iphone_7.loc[(df_iphone_7["upload_date"] > '2016-06-16')& (df_iphone_7["upload_date"] <'2016-12-16')] 
df_iphone_6 = df_iphone_6.loc[(df_iphone_6["upload_date"] > '2014-06-14')& (df_iphone_6["upload_date"] <'2014-12-19')]
df_galaxy_s9 = df_galaxy_s9.loc[(df_galaxy_s9["upload_date"] > '2017-12-16')& (df_galaxy_s9["upload_date"] <'2018-06-16')]
df_galaxy_note9 = df_galaxy_note9.loc[(df_galaxy_note9["upload_date"] > '2018-05-24')& (df_galaxy_note9["upload_date"] <'2018-11-24')]
df_galaxy_fold = df_galaxy_fold.loc[(df_galaxy_fold["upload_date"] > '2018-11-01')& (df_galaxy_fold["upload_date"] <'2018-05-01')]


In [119]:
df_iphone_6

Unnamed: 0.1,Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count,tech_review,num_comms,clean_title,clean_tags,treat
187,187,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,4.0,hf89nYXqX4c,93,43.0,"time warner (tv station owner),television,time...",tech tip: twc box power save fix,2014-12-13,29006.0,False,6.0,"['tech', 'tip', 'twc', 'box', 'power', 'save',...","['time', 'warner', 'tv', 'station', 'owner', '...",0
188,188,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,1.0,gkCeGDRD9D8,196,27.0,"jl audio (business operation),subwoofer,high e...",jl audio e series subwoofers,2014-11-25,5142.0,False,2.0,"['jl', 'audio', 'series', 'subwoofer']","['jl', 'audio', 'business', 'operation', 'subw...",0
189,189,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,1.0,yIUwIohjoSw,134,18.0,"bowers & wilkins (award winner),sound,loudspea...",bowers & wilkins b&w cm series speaker updates,2014-11-25,7113.0,False,0.0,"['bower', 'wilkins', 'cm', 'series', 'speaker'...","['bower', 'wilkins', 'award', 'winner', 'sound...",0
190,190,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,7.0,bqE43B39s3E,130,10.0,"aq,apple tv (computer),audioquest,power cord,p...","audioquest power cable for apple tv, more",2014-11-25,2098.0,False,2.0,"['audioquest', 'power', 'cable', 'apple', 'tv']","['aq', 'apple', 'tv', 'computer', 'audioquest'...",0
191,191,Music,UCzVy1DW9NFp4c910hMd_jIw,0.0,uMUGy0zGJpo,167,4.0,"electrostatic loudspeaker,martinlogan (busines...",martinlogan: crafting electrostatic panels,2014-07-10,961.0,False,0.0,"['martinlogan', 'crafting', 'electrostatic', '...","['electrostatic', 'loudspeaker', 'martinlogan'...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739690,739690,Science & Technology,UCs-2DhK0sAghcpVCap0cH2g,0.0,OGSeBoIioYE,220,4.0,"rapoo,rapoo m10 mouse,rapoo m10 wireless mouse...",rapoo m10 review [tamil review],2014-06-23,704.0,False,1.0,"['rapoo', 'review', 'tamil', 'review']","['rapoo', 'rapoo', 'mouse', 'rapoo', 'wireless...",0
739691,739691,Science & Technology,UCs-2DhK0sAghcpVCap0cH2g,1.0,YErezpwu2V0,91,6.0,"celkon,celkon campus series,celkon a35k",celkon a35k review hands-on [tamil],2014-06-21,902.0,False,1.0,"['celkon', 'review', 'hand', 'tamil']","['celkon', 'celkon', 'campus', 'series', 'celk...",0
739692,739692,Science & Technology,UCs-2DhK0sAghcpVCap0cH2g,5.0,u0SUg9aqqd0,450,34.0,"samsung electronics (organization founder),sam...",samsung gear 2 review in tamil,2014-06-18,3250.0,False,11.0,"['samsung', 'gear', 'review', 'tamil']","['samsung', 'electronics', 'organization', 'fo...",0
739693,739693,People & Blogs,UCs-2DhK0sAghcpVCap0cH2g,3.0,qT2mHGpGL70,290,21.0,"lapcare,lapcare fusion,laptop coolers,laptop c...",lapcare fusion review [tamil],2014-06-16,1167.0,False,3.0,"['lapcare', 'fusion', 'review', 'tamil']","['lapcare', 'lapcare', 'fusion', 'laptop', 'co...",0


In [120]:
def find_in_name_and_tags(df,item):
  # look for the item's name in the title 
  df.loc[df["title"].str.contains(item),"treat"] = 1 
  #example of display_id of video that talks about iphone x : lFD3YQgZC-o
  # look for the item's name in the tags. Here, we need to pay attention to the missing values 
  df.loc[df["tags"].str.contains(item,na=False),"treat"] = 1 


In [121]:
find_in_name_and_tags(df_iphone_x,"iphone x")
find_in_name_and_tags(df_iphone_7,'iphone 7')
find_in_name_and_tags(df_iphone_6,'iphone 6')
find_in_name_and_tags(df_galaxy_s9,'galaxy s9')
find_in_name_and_tags(df_galaxy_note9,'galaxy note 9')
find_in_name_and_tags(df_galaxy_fold,'galaxy fold')


In [122]:
display(df_iphone_6)
display(df_iphone_6.loc[df_iphone_6['treat'] == 1])

Unnamed: 0.1,Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count,tech_review,num_comms,clean_title,clean_tags,treat
187,187,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,4.0,hf89nYXqX4c,93,43.0,"time warner (tv station owner),television,time...",tech tip: twc box power save fix,2014-12-13,29006.0,False,6.0,"['tech', 'tip', 'twc', 'box', 'power', 'save',...","['time', 'warner', 'tv', 'station', 'owner', '...",0
188,188,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,1.0,gkCeGDRD9D8,196,27.0,"jl audio (business operation),subwoofer,high e...",jl audio e series subwoofers,2014-11-25,5142.0,False,2.0,"['jl', 'audio', 'series', 'subwoofer']","['jl', 'audio', 'business', 'operation', 'subw...",0
189,189,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,1.0,yIUwIohjoSw,134,18.0,"bowers & wilkins (award winner),sound,loudspea...",bowers & wilkins b&w cm series speaker updates,2014-11-25,7113.0,False,0.0,"['bower', 'wilkins', 'cm', 'series', 'speaker'...","['bower', 'wilkins', 'award', 'winner', 'sound...",0
190,190,Howto & Style,UCzVy1DW9NFp4c910hMd_jIw,7.0,bqE43B39s3E,130,10.0,"aq,apple tv (computer),audioquest,power cord,p...","audioquest power cable for apple tv, more",2014-11-25,2098.0,False,2.0,"['audioquest', 'power', 'cable', 'apple', 'tv']","['aq', 'apple', 'tv', 'computer', 'audioquest'...",0
191,191,Music,UCzVy1DW9NFp4c910hMd_jIw,0.0,uMUGy0zGJpo,167,4.0,"electrostatic loudspeaker,martinlogan (busines...",martinlogan: crafting electrostatic panels,2014-07-10,961.0,False,0.0,"['martinlogan', 'crafting', 'electrostatic', '...","['electrostatic', 'loudspeaker', 'martinlogan'...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739690,739690,Science & Technology,UCs-2DhK0sAghcpVCap0cH2g,0.0,OGSeBoIioYE,220,4.0,"rapoo,rapoo m10 mouse,rapoo m10 wireless mouse...",rapoo m10 review [tamil review],2014-06-23,704.0,False,1.0,"['rapoo', 'review', 'tamil', 'review']","['rapoo', 'rapoo', 'mouse', 'rapoo', 'wireless...",0
739691,739691,Science & Technology,UCs-2DhK0sAghcpVCap0cH2g,1.0,YErezpwu2V0,91,6.0,"celkon,celkon campus series,celkon a35k",celkon a35k review hands-on [tamil],2014-06-21,902.0,False,1.0,"['celkon', 'review', 'hand', 'tamil']","['celkon', 'celkon', 'campus', 'series', 'celk...",0
739692,739692,Science & Technology,UCs-2DhK0sAghcpVCap0cH2g,5.0,u0SUg9aqqd0,450,34.0,"samsung electronics (organization founder),sam...",samsung gear 2 review in tamil,2014-06-18,3250.0,False,11.0,"['samsung', 'gear', 'review', 'tamil']","['samsung', 'electronics', 'organization', 'fo...",0
739693,739693,People & Blogs,UCs-2DhK0sAghcpVCap0cH2g,3.0,qT2mHGpGL70,290,21.0,"lapcare,lapcare fusion,laptop coolers,laptop c...",lapcare fusion review [tamil],2014-06-16,1167.0,False,3.0,"['lapcare', 'fusion', 'review', 'tamil']","['lapcare', 'lapcare', 'fusion', 'laptop', 'co...",0


Unnamed: 0.1,Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,tags,title,upload_date,view_count,tech_review,num_comms,clean_title,clean_tags,treat
1284,1284,Science & Technology,UCzLaQ6eeTVuAltzTrN7fzyg,1.0,fOzzidDkB4I,394,55.0,"asktheandroidguy,ask the android guy,android,4...",gear vr oculus cinema,2014-12-18,7082.0,False,4.0,"['gear', 'vr', 'oculus', 'cinema']","['asktheandroidguy', 'ask', 'android', 'guy', ...",1
1285,1285,Science & Technology,UCzLaQ6eeTVuAltzTrN7fzyg,2.0,cOGJOLOuLeI,320,27.0,"asktheandroidguy,ask the android guy,android,4...",gear vr walkthrough (how to navigate),2014-12-18,5091.0,False,1.0,"['gear', 'vr', 'walkthrough', 'navigate']","['asktheandroidguy', 'ask', 'android', 'guy', ...",1
1286,1286,Science & Technology,UCzLaQ6eeTVuAltzTrN7fzyg,1.0,jR9BNq7ekDs,394,35.0,"asktheandroidguy,ask the android guy,android,4...",gear vr 1st impressions by the android guy,2014-12-18,3534.0,False,5.0,"['gear', 'vr', 'st', 'impression', 'android', ...","['asktheandroidguy', 'ask', 'android', 'guy', ...",1
1287,1287,Science & Technology,UCzLaQ6eeTVuAltzTrN7fzyg,1.0,ii_H99JTqwk,308,49.0,"asktheandroidguy,ask the android guy,android,4...",samsung gear vr unboxing by the android guy,2014-12-17,2870.0,False,3.0,"['samsung', 'gear', 'vr', 'unboxing', 'android...","['asktheandroidguy', 'ask', 'android', 'guy', ...",1
1288,1288,Science & Technology,UCzLaQ6eeTVuAltzTrN7fzyg,1.0,TI3nNlH7elo,1636,27.0,"asktheandroidguy,ask the android guy,android,4...",android weekly & q&a live ep 44 - galaxy s6 at...,2014-12-15,1197.0,False,5.0,"['android', 'weekly', 'live', 'ep', 'galaxy', ...","['asktheandroidguy', 'ask', 'android', 'guy', ...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738343,738343,Science & Technology,UCsCex8aMDs2b39TS_49Mpcg,8.0,om3UrfeTla4,284,95.0,"iphone (video game platform),ipod,iphone,touch...",iphone 6 trident aegis case review!,2014-10-14,10918.0,True,18.0,"['iphone', 'trident', 'aegis', 'case', 'review']","['iphone', 'video', 'game', 'platform', 'ipod'...",1
738347,738347,People & Blogs,UCsCex8aMDs2b39TS_49Mpcg,11.0,gPMHFH2Btyg,317,122.0,"iphone (video game platform),battery,charging,...",rapid charging on your iphone?,2014-10-02,3612.0,False,20.0,"['rapid', 'charging', 'iphone']","['iphone', 'video', 'game', 'platform', 'batte...",1
738351,738351,Science & Technology,UCsCex8aMDs2b39TS_49Mpcg,30.0,EvY3qRMw8vo,238,333.0,"iphone (video game platform),apple inc. (publi...",apple iphone 6 leather case review (midnight b...,2014-09-24,75510.0,False,28.0,"['apple', 'iphone', 'leather', 'case', 'review...","['iphone', 'video', 'game', 'platform', 'apple...",1
738352,738352,Science & Technology,UCsCex8aMDs2b39TS_49Mpcg,8.0,byInt8YHkh8,572,252.0,"iphone (video game platform),camera (invention...","complete iphone 6 camera samples (stills, cine...",2014-09-22,19649.0,False,53.0,"['complete', 'iphone', 'camera', 'sample', 'st...","['iphone', 'video', 'game', 'platform', 'camer...",1


In [123]:
def classify_channel(df):
   df_cop = df.copy() 
   # We group the videos by channel's id and do a summation on the treat attribute. 
   # If this summation is strictly positive, then it is a treat channel.
   new_df = df_cop.groupby("channel_id",sort=False)["treat"].count()
    
   # List of the channels
   channel_list = df_cop["channel_id"].values.tolist()
    
   # Select treat channels and put them into a list
   list_channel_treat =[]
   for i in channel_list :
    if (new_df[i]>0) :
     list_channel_treat.append(i)
   list_treat_channel =  list(set(list_channel_treat)) 
    
   # With the list of treat channels, we transform the channel dataframe by adding the channels' 
   #classification as treat/control
   df_channel_cop = df_channel.copy()
   df_channel_cop["treat"]= 0
   df_channel_cop.loc[df_channel_cop['channel_id'].isin(list_treat_channel),"treat"] = 1  
   return df_channel_cop,list_treat_channel


In [124]:
df_channel_iphone_x,list_channels_iphone_x = classify_channel(df_iphone_x)
df_channel_iphone_7, list_channels_iphone_7 = classify_channel(df_iphone_7)
df_channel_iphone_6, list_channels_iphone_6 = classify_channel(df_iphone_6)
df_channel_galaxy_s9, list_channels_galaxy_s9 = classify_channel(df_galaxy_s9)
df_channel_galaxy_note9, list_channels_galaxy_note9 = classify_channel(df_galaxy_note9)
df_channel_galaxy_fold, list_channels_galaxy_fold = classify_channel(df_galaxy_fold)

In [125]:
display(df_channel_iphone_6)

Unnamed: 0.1,Unnamed: 0,join_date,channel_id,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,count_tech_videos,treat
0,0,2010-12-21,UCsTcErHg8oDvUnTzoqsYeNw,Unbox Therapy,15300000,1672,206.0,2.0870,406.0,1
1,1,2015-10-19,UCOhHO2ICt0ti9KAh-QHvttQ,Technical Guruji,13979890,2539,240.0,2.0870,209.0,0
2,2,2012-05-18,UCe_vXdMrHHseZ_esYUskSBw,CrazyRussianHacker,11200000,1411,444.0,2.0870,105.0,1
3,3,2005-06-22,UCE_M8A5yxnLfW0KghEeajjw,Apple,9970000,291,489.0,2.0870,64.0,1
4,4,2008-03-21,UCBJycsmduvYEL83R_U4JriQ,Marques Brownlee,9400000,1148,530.0,2.0870,370.0,1
...,...,...,...,...,...,...,...,...,...,...
1245,1245,2012-06-06,UCgMHqEDFll6X962CgXPzh4w,FocusCamera TV,10000,278,940963.0,40.6530,42.0,1
1246,1246,2018-02-25,UCJjacdkHH7Zxw8ouKQrirUg,The Best Coders,10200,28,941074.0,42.4845,2.0,0
1247,1247,2016-07-17,UCviAcL8AlEoGqLDWx7t_iPw,Mobile Tag,10300,20,952387.0,53.1435,6.0,0
1248,1248,2011-02-20,UCncAHfjOv1B_U8esTmLYukQ,Curtis Paradis,10300,97,963290.0,53.1435,11.0,0


In [126]:
def features_transformation(df):
 # define average upload frequency per channel
 tech_mt_uf = df.copy()
 tech_mt_uf['join_date'] = pd.to_datetime(tech_mt_uf.join_date)
 tech_mt_uf.sort_values(by=['channel_id','join_date'],ascending=True,inplace=True)
 tech_mt_uf['delay'] = tech_mt_uf.upload_date - tech_mt_uf.shift(1).join_date
 # set all first uploads to 0
 first_upload = tech_mt_uf.groupby('channel_id',as_index=False)[['join_date']].min()
 first_upload['is_first'] = True
 tech_df_upload_freq = pd.merge(tech_mt_uf,first_upload,on=['channel_id','join_date'],how='left')
 tech_df_upload_freq.is_first.fillna(False,inplace=True)
 tech_df_upload_freq.loc[tech_df_upload_freq.is_first,'delay'] = pd.Timedelta(seconds=0)
 tech_df_upload_freq['upload_year'] = tech_df_upload_freq.upload_date.dt.year
 tech_df_upload_freq['upload_month'] = tech_df_upload_freq.upload_date.dt.month
 tech_df_avg = tech_df_upload_freq.groupby(['channel_id','upload_year','upload_month'],as_index=False)[['delay']].mean().groupby(['channel_id','upload_year'],as_index=False)[['delay']].mean().groupby(['channel_id'],as_index=False)[['delay']].mean()
 tech_df_avg.rename({'delay':'avg_delay'},axis=1,inplace=True)
 tech_df_avg.avg_delay = (tech_df_avg.avg_delay / np.timedelta64(1, 'D')).astype(float)

 # define average duration
 tech_df_avg_dur = tech_df_upload_freq.groupby(['channel_id','upload_year','upload_month'],as_index=False)[['duration']].mean().groupby(['channel_id','upload_year'],as_index=False)[['duration']].mean().groupby(['channel_id'],as_index=False)[['duration']].mean()
 tech_df_avg_dur.rename({'duration':'avg_duration'},axis=1,inplace=True)

 # define dataframe for observational study
 df_obs = pd.merge(df[['channel_id','subscribers_cc']],tech_df_avg_dur,on='channel_id',how='left')
 df_obs = pd.merge(df_obs,tech_df_avg,on='channel_id',how='left')
 
 #define a new feature 
 df_obs['ratio']= df_obs['count_tech_videos']/df_obs['videos_cc']

 return df_obs

In [127]:
new_df_iphone_x = features_transformation(df_channel_iphone_x)
new_df_iphone_7 = features_transformation(df_channel_iphone_7)
new_df_iphone_6 = features_transformation(df_channel_iphone_6)
new_df_galaxy_s9 = features_transformation(df_channel_galaxy_s9)
new_df_galaxy_note9 = features_transformation(df_channel_galaxy_note9)
new_df_galaxy_fold =features_transformation(df_channel_galaxy_fold)

AttributeError: 'DataFrame' object has no attribute 'upload_date'

In [None]:
# balance the treat and control groups
def match_1 (df) : 
 match = balance_data(df,treat_column='treat',continuous_features=['avg_duration','avg_delay'])
 df_matched = df.iloc[match]
 print("number of channels before the match":df.shape[0])
 print("number of channels after the match ":df_matched.shape[0])
 new_channels_list = df_matched["channel_id"].values.tolist() 
 return df_matched,new_channels_list

In [None]:
def match_2(df) : 
 # balance the treat and control groups
 
 match = balance_data(df,treat_column='treat',continuous_features=['avg_duration','avg_delay','ratio' ])
 df_matched = df.iloc[match]
 print("number of channels before the match":df.shape[0])
 print("number of channels after the match ":df_matched.shape[0])
 new_channels_list = df_matched["channel_id"].values.tolist()  
 return df_matched,new_channels_list

In [None]:
df_match_1_iphone_x, new_channels_list_m1_iphone_x = match_1(new_df_iphone_x )
df_match_1_iphone_7, new_channels_list_m1_iphone_7 = match_1(new_df_iphone_7)
df_match_1_iphone_6,new_channels_list_m1_iphone_6 = match_1(new_df_iphone_6)
df_match_1_galaxy_s9,new_channels_list_m1_galaxy_s9 = match_1(new_df_galaxy_s9)
df_match_1_galaxy_note9,new_channels_list_m1_note9 = match_1(new_df_galaxy_note9)
df_match_1_galaxy_fold,new_channels_list_m1_galaxy_fold  = match_1(new_df_galaxy_fold)


In [None]:
df_match_2_iphone_x, new_channels_list_m2_iphone_x = match_2(new_df_iphone_x )
df_match_2_iphone_7, new_channels_list_m2_iphone_7 = match_2(new_df_iphone_7)
df_match_2_iphone_6,new_channels_list_m2_iphone_6 = match_2(new_df_iphone_6)
df_match_2_galaxy_s9,new_channels_list_m2_galaxy_s9 = match_2(new_df_galaxy_s9)
df_match_2_galaxy_note9,new_channels_list_m2_note9 = match_2(new_df_galaxy_note9)
df_match_2_galaxy_fold,new_channels_list_m2_galaxy_fold  = match_2(new_df_galaxy_fold)

### Plots before the observational analysis

In [None]:
# we load the time series data.
path_time_series = "/Users/salma/Desktop/ADA/project/datasets/df_timeseries_en.tsv.gz"
time_series_data = pd.read_table(path_time_series)
display(time_series_data)

In [None]:

time_series_cop = time_series_data.copy()
time_series_cop["treat"] = 0
#time_series_cop = time_series_cop.loc[(time_series_cop["datetime"] > '2017-10-15')& (time_series_cop["datetime"] <'2017-11-18')]

In [None]:
time_series_iphone_x = time_series_cop.copy()
time_series_iphone_7 = time_series_cop.copy()
time_series_iphone_6 = time_series_cop.copy()
time_series_iphone_s9 = time_series_cop.copy()
time_series_galaxy_note9 = time_series_cop.copy()
time_series_galaxy_fold = time_series_cop.copy()


In [None]:
# filtering the time series data: keep the ones with a datetime from 9 months before the release till 9 months after 
time_series_iphone_x = time_series_iphone_x.loc[(time_series_iphone_x["datetime"] > '2017-02-03')& (time_series_iphone_x["datetime"] <'2018-07-03')]
time_series_iphone_7 = time_series_iphone_7.loc[(time_series_iphone_7["datetime"] > '2016-01-16')& (time_series_iphone_7["datetime"] <'2017-06-16')]
time_series_iphone_6 = time_series_iphone_6.loc[(time_series_iphone_6["datetime"] > '2014-01-19')& (time_series_iphone_6["datetime"] <'2014-06-19')]
time_series_iphone_s9= time_series_iphone_s9.loc[(time_series_iphone_s9["datetime"] > '2017-06-16')& (time_series_iphone_s9["datetime"] <'2018-12-16')]
time_series_galaxy_note9 = time_series_galaxy_note9.loc[(time_series_galaxy_note9["datetime"] > '2017-12-24')& (time_series_galaxy_note9["datetime"] <'2019-05-24')]
time_series_galaxy_fold = time_series_galaxy_fold.loc[(time_series_galaxy_fold["datetime"] > '2018-05-01')& (time_series_galaxy_fold["datetime"] <'2019-11-01')]

In [None]:
#display(time_series_cop)

In [None]:
def classify_channel_in_time_series(list_channel,ts):
  ts.loc[ts['channel'].isin(list_channel),"treat"]= 1
  #filtering the times series dataframe by channel's category
  df_treat = ts.loc[ts["treat"] == 1] # no need to make a copy?
  df_control = ts.loc[ts["treat"] == 0]#idem ? 
  df_treat = df_treat.sort_values("datetime")
  df_control = df_control.sort_values("datetime")  
  return df_treat, df_control

In [None]:
df_treat_m1_iphone_x, df_control_m1_iphone_x = classify_channel_in_time_series(new_channels_list_m1_iphone_x,time_series_iphone_x)
df_treat_m1_iphone_7, df_control_m1_iphone_7 = classify_channel_in_time_series(new_channels_list_m1_iphone_7,time_series_iphone_7)
df_treat_m1_iphone_6, df_control_m1_iphone_6 = classify_channel_in_time_series(new_channels_list_m1_iphone_6,time_series_iphone_6)
df_treat_m1_iphone_s9, df_control_m1_iphone_s9 = classify_channel_in_time_series(new_channels_list_m1_galaxy_s9,time_series_iphone_s9)
df_treat_m1_galaxy_note9, df_control_m1_galaxy_note9 = classify_channel_in_time_series(new_channels_list_m1_galaxy_note9,time_series_galaxy_note9)
df_treat_m1_galaxy_fold, df_control_m1_galaxy_fold = classify_channel_in_time_series(new_channels_list_m1_galaxy_fold,time_series_galaxy_fold)

In [None]:
df_treat_m2_iphone_x, df_control_m2_iphone_x = classify_channel_in_time_series(new_channels_list_m2_iphone_x,time_series_iphone_x)
df_treat_m2_iphone_7, df_control_m2_iphone_7 = classify_channel_in_time_series(new_channels_list_m2_iphone_7,time_series_iphone_7)
df_treat_m2_iphone_6, df_control_m2_iphone_6 = classify_channel_in_time_series(new_channels_list_m2_iphone_6,time_series_iphone_6)
df_treat_m2_iphone_s9, df_control_m2_iphone_s9 = classify_channel_in_time_series(new_channels_list_m2_galaxy_s9,time_series_iphone_s9)
df_treat_m2_galaxy_note9, df_control_m2_galaxy_note9 = classify_channel_in_time_series(new_channels_list_m2_galaxy_note9,time_series_galaxy_note9)
df_treat_m2_galaxy_fold, df_control_m2_galaxy_fold = classify_channel_in_time_series(new_channels_list_m2_galaxy_fold,time_series_galaxy_fold)

In [None]:
def plot_time_series(df_treat,df_control) :
    
# We compare the number of subscribers and its evolution from 9 months before the release 
# till 9 months after this by plotting the evolution of the number of subscribers for the treat/control channels
plt.figure(figsize=(12,4))
sns.barplot(x = df_treat['datetime'], y = df_treat['delta_subs'])
plt.xticks(rotation=90)
plt.title('Evolution of number of subscribers for the treat channels before the  release and after')
plt.show()

plt.figure(figsize=(12,4))# changer le rapport à 18,6
sns.barplot(x = df_control['datetime'], y = df_control['delta_subs'])
plt.xticks(rotation=90)
plt.title('Evolution of number of subscribers for the control channels before the release and after')
plt.show()

    

In [None]:
plot_time_series(df_treat_m1_iphone_x, df_control_m1_iphone_x)
plot_time_series(df_treat_m1_iphone_7, df_control_m1_iphone_7)
plot_time_series(df_treat_m1_iphone_6, df_control_m1_iphone_6)
plot_time_series(df_treat_m1_iphone_s9, df_control_m1_iphone_s9)
plot_time_series(df_treat_m1_galaxy_note9, df_control_m1_galaxy_s9)
plot_time_series(df_treat_m1_galaxy_fold,df_control_m1_galaxy_fold)

In [None]:
plot_time_series(df_treat_m2_iphone_x, df_control_m2_iphone_x)
plot_time_series(df_treat_m2_iphone_7, df_control_m2_iphone_7)
plot_time_series(df_treat_m2_iphone_6, df_control_m2_iphone_6)
plot_time_series(df_treat_m2_iphone_s9, df_control_m2_iphone_s9)
plot_time_series(df_treat_m2_galaxy_note9, df_control_m2_galaxy_s9)
plot_time_series(df_treat_m2_galaxy_fold,df_control_m2_galaxy_fold)

### Conclusion from the plots
The release date is "2017-11-03". We can see that the treat channels (that talk about the release) have more subscribers than the control ones, which don't talk about it. This could even be observed before the release. It is as if the already successful review channels will be the ones that will talk about it. The rise in the subscriber number coincides with the release period (data of "2017-11-05") for the treat channels.This difference in subscribers' number between the treat and control channels appears also far after the release event. We can conclude that there is a kind of long-term effect of talking about the release on the success of the channel, represented by its number of subscribers. Hence, the channels that talk about the release of iPhone x are more successful and have long-term success with a higher number of subscribers.