In [1]:
from db_utils import query_hive_ssh, execute_hive_expression, get_hive_timespan
from censorship import *
import pandas as pd
% matplotlib inline

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 25 days


# Generate Data Set

In [4]:
def get_PVSpanComparison_df(pvsc):
    """
    Pull a dataframe with articles that jumped
    at least one standard deviation in view proportion.

    Only rows with ``post_n_tpc > 500`` and
    ``normalized_wdc_view_proportion_delta > 1.0`` are selected.

    Parameters
    ----------
    pvsc : object
        Comparison object exposing a ``params`` mapping that provides the
        ``db`` and ``span_table`` keys interpolated into the Hive query.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with any ``<table>.`` prefix stripped from the
        column names, sorted by ``normalized_wdc_view_proportion_delta``
        in descending order.
    """
    query = """
    SELECT *
    FROM %(db)s.%(span_table)s
    WHERE post_n_tpc > 500
    AND normalized_wdc_view_proportion_delta > 1.0
    """
    df = query_hive_ssh(query % pvsc.params, 'get_PVSpanComparison_df')
    # Column names presumably arrive as "<table>.<column>"; drop the prefix
    # when present and leave un-prefixed names untouched instead of raising.
    df.columns = [c.split('.', 1)[-1] for c in df.columns]
    # DataFrame.sort() was removed from pandas; sort_values() is the replacement.
    return df.sort_values('normalized_wdc_view_proportion_delta', ascending=False)

In [5]:
# Control comparison: two consecutive two-week spans, both before the https transition.
cmp_ctr = PVSpanComparison(
    ['2015-05-01', '2015-05-14'],
    ['2015-05-15', '2015-05-28'],
    'censorship',
    dry=True
)
# Transition comparison: the two weeks before the https transition
# versus several weeks after it.
cmp = PVSpanComparison(
    ['2015-05-15', '2015-05-28'],
    ['2015-06-17', '2015-07-30'],
    'censorship',
    dry=True
)

In [6]:
# Pull the reduced data from Hive into local pandas DataFrames:
# `df` holds the https-transition comparison, `df_ctr` the control comparison.
df = get_PVSpanComparison_df(cmp)
df_ctr = get_PVSpanComparison_df(cmp_ctr)

# Investigate Top Articles per Country

In [18]:
# Country whose top-ranked articles are displayed; change this value to inspect another one.
country = 'Iran'
# Columns shown in this and the following inspection cells.
cols = ['p', 't', 'c', 'en_page_title', 'normalized_wdc_view_proportion_delta']
# Filter on `country` (previously assigned but ignored in favour of a hard-coded
# literal) and select rows and columns in a single .loc call.
df.loc[df['c'] == country, cols].head(10)

Unnamed: 0,p,t,c,en_page_title,normalized_wdc_view_proportion_delta
367600,en.wikipedia,MILF_(slang),Iran,MILF_(slang),35.757029
185555,en.wikipedia,.xxx,Iran,.xxx,26.660146
140303,fa.wikipedia,.xxx,Iran,.xxx,26.660146
430245,fa.wikipedia,تاهیتی,Iran,Tahiti,26.393169
33075,en.wikipedia,Tahiti,Iran,Tahiti,26.393169
151963,en.wikipedia,Alex_Rocco,Iran,Alex_Rocco,24.612433
80424,en.wikipedia,Bitch_Better_Have_My_Money,Iran,Bitch_Better_Have_My_Money,24.17787
25538,fa.wikipedia,پرچم_مکزیک,Iran,Flag_of_Mexico,23.82242
109082,fa.wikipedia,آلما_ماتر,Iran,Alma_mater,23.775999
83698,fa.wikipedia,دریای_سیبری_شرقی,Iran,East_Siberian_Sea,23.633949


# Check Individual Articles

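Check whether an article was trending up during the HTTPS transition by comparing its rows in the transition comparison (`df`) with its rows in the control comparison (`df_ctr`).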

In [19]:
# English page title to look up in both the transition (`df`) and control (`df_ctr`) frames.
article = 'Salman_Rushdie'

In [20]:
# Rows for the chosen article in the https-transition comparison (first 30 at most).
df.loc[df['en_page_title'] == article, cols].head(30)

Unnamed: 0,p,t,c,en_page_title,normalized_wdc_view_proportion_delta
407067,fa.wikipedia,سلمان_رشدی,Iran,Salman_Rushdie,11.139004
416351,en.wikipedia,Salman_Rushdie,Finland,Salman_Rushdie,5.491577
308081,fi.wikipedia,Salman_Rushdie,Finland,Salman_Rushdie,5.491577
192994,bg.wikipedia,Салман_Рушди,Bulgaria,Salman_Rushdie,2.465808
308082,id.wikipedia,Salman_Rushdie,Indonesia,Salman_Rushdie,1.454014
416350,en.wikipedia,Salman_Rushdie,Indonesia,Salman_Rushdie,1.454014


In [21]:
# Rows for the chosen article in the control comparison (first 30 at most).
df_ctr.loc[df_ctr['en_page_title'] == article, cols].head(30)

Unnamed: 0,p,t,c,en_page_title,normalized_wdc_view_proportion_delta
59516,ja.wikipedia,サルマン・ラシュディ,Japan,Salman_Rushdie,2.413006
49604,pl.wikipedia,Salman_Rushdie,Poland,Salman_Rushdie,2.22827


# Save to File

In [30]:
def save_PVSpanComparison_df(df, fname):
    """
    Write a subset of a comparison dataframe to a tab-separated file.

    Seven columns are selected from ``df``, given more descriptive names,
    and written to ``fname`` as UTF-8 without the index. ``df`` itself is
    left unchanged.
    """
    # (source column, exported column) pairs; list order fixes the column order in the file.
    column_pairs = [
        ('p', 'project'),
        ('t', 'title'),
        ('c', 'country'),
        ('id', 'wikidata_id'),
        ('en_page_title', 'en_title'),
        ('normalized_wdc_view_proportion_delta', 'normalized_wikidata_view_proportion_delta'),
        ('normalized_tpc_view_proportion_delta', 'normalized_article_view_proportion_delta'),
    ]
    source_cols = [source for source, _ in column_pairs]
    export = df[source_cols].rename(columns=dict(column_pairs))
    export.to_csv(fname, sep='\t', index=False, encoding='utf8')

In [31]:
# Export both comparisons as tab-separated files (paths are relative to the
# notebook's working directory): the https-transition data first, then the control.
save_PVSpanComparison_df(df, 'https_transition_comparison.tsv')
save_PVSpanComparison_df(df_ctr, 'control_comparison.tsv')