In [1]:
from db_utils import query_hive_ssh, execute_hive_expression, get_hive_timespan
from censorship import *
import pandas as pd
% matplotlib inline

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 28 days


# Generate Data Set

In [2]:
def get_PVSpanComparison_df(pvsc):
    """
    Pull the rows of a span comparison whose view counts jumped sharply
    from the pre span to the post span.

    The query reads from `%(db)s.%(span_table)s`, with both names taken from
    `pvsc.params`, and keeps rows that satisfy all of:
      * post_n_tpc > 300
      * pre_n_wd > 1000
      * (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc > 2.0
    The last expression is also returned as the extra column
    `clean_tpc_view_delta`.

    Args:
        pvsc: object exposing `params`, a mapping that supplies at least
            `db` and `span_table` for the query template.

    Returns:
        A pandas DataFrame sorted by `normalized_tpc_view_proportion_delta`
        in descending order. Column names of the form `prefix.name` are
        shortened to `name`.
    """
    # NOTE(review): the post-span count is divided by a hard-coded 3,
    # presumably because the post span is about three times as long as the
    # pre span. The same query is also run for comparisons whose two spans
    # have equal length -- confirm that this normalisation is intended there.
    query = """
    SELECT *,
    (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc as clean_tpc_view_delta
    FROM %(db)s.%(span_table)s
    WHERE post_n_tpc > 300
    AND pre_n_wd > 1000
    AND (post_n_tpc / 3 - pre_n_tpc) / pre_n_tpc > 2.0
    """
    df = query_hive_ssh(query % pvsc.params, 'get_PVSpanComparison_df', priority = True)

    # Drop the leading `prefix.` only when the name has exactly one dot;
    # any other name is kept unchanged.
    short_names = []
    for name in df.columns:
        parts = name.split('.')
        short_names.append(parts[1] if len(parts) == 2 else name)
    df.columns = short_names

    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    return df.sort_values('normalized_tpc_view_proportion_delta', ascending = False)

In [3]:
# Control: compare two consecutive 14-day periods, both before the HTTPS transition.
cmp_ctr = PVSpanComparison(
    ['2015-05-01', '2015-05-14'],
    ['2015-05-15', '2015-05-28'],
    'censorship',
    dry=True,
)
# Transition: compare the two weeks before the HTTPS transition with several weeks after it.
cmp = PVSpanComparison(
    ['2015-05-15', '2015-05-28'],
    ['2015-06-17', '2015-07-30'],
    'censorship',
    dry=True,
)

In [4]:
# Pull the reduced data from Hive into local pandas dataframes:
# `df` holds the HTTPS-transition comparison, `df_ctr` the control comparison.
df = get_PVSpanComparison_df(cmp)
df_ctr = get_PVSpanComparison_df(cmp_ctr)

# Investigate Top Articles per Country

In [49]:
# Order rows by clean_tpc_view_delta, largest first.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
df = df.sort_values('clean_tpc_view_delta', ascending = False)

In [53]:
# Columns shown in the per-country and per-article tables.
cols = [
    'p', 't', 'c', 'en_page_title',
    'clean_tpc_view_delta', 'normalized_tpc_view_proportion_delta',
    'pre_n_tpc', 'post_n_tpc',
]
country = 'Bahrain'
# First 100 rows of df (in its current order) for the selected country.
df.loc[df['c'] == country, cols].head(100)

Unnamed: 0,p,t,c,en_page_title,clean_tpc_view_delta,normalized_tpc_view_proportion_delta,pre_n_tpc,post_n_tpc
213485,en.wikipedia,Yakub_Memon,Bahrain,Yakub_Memon,432.166667,5.294950,2,2599
158907,bg.wikipedia,Начална_страница,Bahrain,Main_Page,415.666667,12.881844,1,1250
228356,en.wikipedia,Eid_Mubarak,Bahrain,Eid_Mubarak,271.333333,4.476766,1,817
31270,ar.wikipedia,عبد_الله_بن_عبد_العزيز_بن_مساعد_آل_سعود,Bahrain,,107.666667,6.483884,1,326
10334,ar.wikipedia,محرك_نفاث_عنفي,Bahrain,Turbojet,107.333333,5.915399,1,325
162300,en.wikipedia,Indian_Idol_Junior_(season_2),Bahrain,Indian_Idol_Junior_(season_2),82.500000,6.573423,2,501
221402,ar.wikipedia,صلاة_العيدين,Bahrain,Eid_prayers,76.777778,4.335378,3,700
120009,en.wikipedia,Ruby_Rose,Bahrain,Ruby_Rose,63.833333,1.646020,6,1167
182055,en.wikipedia,Rainbow_flag_(LGBT_movement),Bahrain,Rainbow_flag_(LGBT_movement),49.500000,1.668557,2,303
123389,ar.wikipedia,يوم_الأب,Bahrain,Father's_Day,47.250000,0.885202,4,579


In [None]:
# NOTE(review): this cell held the incomplete expression `df[]`, which is a
# SyntaxError and stops Restart & Run All. The intended selection is unknown;
# a small preview is shown instead -- replace it or delete the cell.
df.head()

# Check Individual Articles

Check whether an article was trending up during the HTTPS transition.

In [58]:
# Title to look up in the following cells (matched against column 't' of df
# and column 'en_page_title' of df_ctr).
article = 'Anal_sex'

In [59]:
# Rows of the HTTPS-transition comparison whose title 't' equals `article`.
df.loc[df['t'] == article, cols].head(30)

Unnamed: 0,p,t,c,en_page_title,clean_tpc_view_delta,normalized_tpc_view_proportion_delta,pre_n_tpc,post_n_tpc
176144,en.wikipedia,Anal_sex,Iran,Anal_sex,13.072917,12.819376,224,9457
176143,en.wikipedia,Anal_sex,France,Anal_sex,9.667634,5.186167,1034,33091


In [21]:
# Rows of the control comparison whose 'en_page_title' equals `article`.
# NOTE(review): this filters on 'en_page_title', while the lookup in df filters
# on 't' -- confirm that the difference is intended.
df_ctr.loc[df_ctr['en_page_title'] == article, cols].head(30)

Unnamed: 0,p,t,c,en_page_title,clean_tpc_view_delta,normalized_tpc_view_proportion_delta,pre_n_tpc,post_n_tpc


# Save to File

In [38]:
def save_PVSpanComparison_df(df, fname):
    """
    Write a renamed subset of a comparison dataframe to a TSV file.

    Keeps the columns 'p', 't', 'c', 'id', 'en_page_title' and
    'tpc_view_delta' (in that order), renames them to 'project', 'title',
    'country', 'wikidata_id', 'en_title' and 'tpc_view_delta', and writes the
    result to `fname` as tab-separated text without the index.
    The dataframe passed in is not modified.
    """
    # (source column, published column), in output order
    renames = [
        ('p', 'project'),
        ('t', 'title'),
        ('c', 'country'),
        ('id', 'wikidata_id'),
        ('en_page_title', 'en_title'),
        ('tpc_view_delta', 'tpc_view_delta'),
    ]
    subset = df[[old for old, new in renames]]
    subset = subset.rename(columns = dict(renames))
    subset.to_csv(fname, sep = '\t', encoding = 'utf8', index = False)

In [39]:
# Write both comparisons to TSV files: the HTTPS-transition comparison and the control.
# NOTE(review): assumes the ./data directory already exists -- confirm.
save_PVSpanComparison_df(df, './data/https_transition_comparison.tsv')
save_PVSpanComparison_df(df_ctr, './data/control_comparison.tsv')

In [3]:
# Reload the HTTPS-transition comparison from the TSV file written by
# save_PVSpanComparison_df above.
d_censorship = pd.read_csv('./data/https_transition_comparison.tsv', sep = '\t', encoding = 'utf8')

In [27]:
# All saved rows whose English title is 'Thor'.
d_censorship.loc[d_censorship['en_title'].eq('Thor')]

Unnamed: 0,project,title,country,wikidata_id,en_title,normalized_wikidata_view_proportion_delta,normalized_article_view_proportion_delta
17098,cs.wikipedia,Thór,United States,Q42952,Thor,-0.475413,5.898402
99481,fa.wikipedia,ثور_(اساطیر),Iran,Q42952,Thor,3.819679,2.565641
129280,en.wikipedia,Thor,Austria,Q42952,Thor,1.262351,2.165867
207447,sk.wikipedia,Thor,Slovak Republic,Q42952,Thor,2.28628,1.533235


In [38]:
# All saved rows for Iran.
# NOTE(review): the output recorded for this cell is a KeyError for
# 'tpc_view_proportion_delta', a column this expression never references --
# the output looks stale; re-run the cell to confirm.
d_censorship.loc[d_censorship['country'].eq('Iran')]

KeyError: 'tpc_view_proportion_delta'