In [65]:
from db_utils import query_hive_ssh, execute_hive_expression, get_hive_timespan
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
% matplotlib inline
import shutil
import os
import numpy as np


# Code 

In [133]:
def get_country_project_condition(d):
        pairs = []
        for c,pl in d.items():
            for p in pl:
                pairs.append("(project = '%s.wikipedia' AND country = '%s')" % (p,c))
        return "(" + " OR ".join(pairs) + ")"
    
def get_hive_ts(d):
    query = """
        DROP TABLE IF EXISTS censorship.daily_ts;
        CREATE TABLE censorship.daily_ts
        AS SELECT 
            ts.year, 
            ts.month, 
            ts.day, 
            ts.country, 
            ts.project, 
            ts.page_title,
            ts.n,
            ts.n / agg.n_agg as proportion,
            wd.en_page_title
        FROM 
            (SELECT
                year, 
                month, 
                day, 
                country, 
                project, 
                page_title,
                SUM(view_count) as n
            FROM wmf.pageview_hourly
                WHERE agent_type = 'user'
                AND year = 2015
                AND page_title not RLIKE ':'
                AND %(cp_conditions)s
            GROUP BY
                year,
                month,
                day,
                country,
                project,
                page_title
            ) ts
        LEFT JOIN
            (SELECT
                year, 
                month, 
                day, 
                project, 
                page_title,
                SUM(view_count) as n_agg
            FROM wmf.pageview_hourly
                WHERE agent_type = 'user'
                AND year = 2015
                AND page_title not RLIKE ':'
            GROUP BY
                year,
                month,
                day,
                project,
                page_title
            ) agg
            ON (    ts.year = agg.year
                AND ts.month = agg.month
                AND ts.day = agg.day
                AND ts.project = agg.project
                AND ts.page_title = agg.page_title)
        LEFT JOIN censorship.wikidata wd
            ON (ts.page_title = wd.page_title AND ts.project = wd.project);
    """
    query %= {'cp_conditions' : get_country_project_condition(cp_dict)}
    query_hive_ssh(query, 'ts', priority = True)

    
    
def get_local_ts(cp_dict, articles):
    
    params = {
        'article_condition': " en_page_title in ('" + "', '".join(articles) + "')",
        'pc_condition': get_country_project_condition(cp_dict)
    }

    query = """
    SELECT *
    FROM censorship.daily_ts
    WHERE %(pc_condition)s
    AND %(article_condition)s
    """

    df =  query_hive_ssh(query % params, 'ts', priority = True)
    df.columns = [c.split('.')[1] for c in df]
    df.index  = pd.to_datetime(df.year*1000000 + df.month*10000 + df.day * 100, format='%Y%m%d%H')
    return df


def get_single_ts(df, start, stop, project, country, title, field):
    indices = (df['project'] == project+ '.wikipedia') & (df['country'] == country) & (df['en_page_title'] == title)
    data = df[indices]
    ts = pd.Series(data[field], index = pd.date_range(start=start, end=stop, freq='d') )
    ts.fillna(0, inplace = True)
    return ts


def plot_series(df, start, stop, project, country, title, fig_dir, smooth = 1):
    ts = get_single_ts(df, start, stop, project, country, title, 'n')
    ts = pd.rolling_mean(ts, smooth)

    ts_prop = get_single_ts(df, start, stop, project, country, title, 'proportion')
    ts_prop = pd.rolling_mean(ts_prop, smooth)
    
    f, axarr = plt.subplots(2, sharex=True)
    
    english_end = datetime.strptime('2015-06-12 09:40', "%Y-%m-%d %H:%M") # End transition of English Wikipedia, including Mobile

    # plot transition point
    axarr[0].axvline(english_end, color='red', label = 'HTTPS transition', linewidth=0.5)
    axarr[1].axvline(english_end, color='red', label = 'HTTPS transition', linewidth=0.5)
    
    axarr[0].plot(ts.index, ts.values)
    axarr[1].plot(ts_prop.index, ts_prop.values)

    
    fig_name = '_'.join([country, title, project, '.pdf'])
    fig_name = fig_name.replace('/', '-')
    plt.savefig(os.path.join(fig_dir, fig_name))
    plt.close(f)

    
def plot_all_series(df, start, stop, cp_dict, articles, fig_dir, smooth = 1):
    if os.path.exists(fig_dir):
        shutil.rmtree(fig_dir)
    os.makedirs(fig_dir)
    for c,pl in cp_dict.items():
        for p in pl:  
            for a in articles:
                plot_series(df, start, stop, p, c, a, fig_dir, smooth = smooth)

# Make Table of Time Series in Hive

In [122]:
cp_dict = {'Iran':                        ['en', 'fa',],
           'Saudi Arabia':                ['en', 'ar',],
           'Turkey':                      ['en', 'tr',],
           'Rebublic of Korea':           ['en', 'ko',],
           'Iraq':                        ['en', 'ar',],
           'Cuba':                        ['en', 'es',],
           'Venezuela':                   ['en', 'es',],
           'Pakistan':                    ['en', 'ur',],
           'Vietnam':                     ['en', 'vi',],
           'Singapore':                   ['en', 'zh',],
           'Uzbekistan':                  ['en', 'uz',],
           'Nigeria':                     ['en', 'en',],
           'Egypt':                       ['en', 'ar',],
           'Thailand':                    ['en', 'th',],
           'Morocco':                     ['en', ],
           'Bangladesh':                  ['en', ],
           'United States':               ['en', ],
           'China':                       ['en', ],
           'Russia':                      ['en', 'ru',],
          }
#get_hive_ts(cp_dict)

# Inspect Topics

In [136]:
start = '2015-05-01 00'
stop = '2015-10-25 03'

In [138]:
articles = [ 'Lesbian', 'LGBT', 'Gay', 'Transgender', 'Bisexuality', 'Homosexuality']
fig_dir = './figs_queer'
df = get_local_ts(cp_dict, articles) 
plot_all_series(df, start, stop, cp_dict, articles, fig_dir, smooth = 7 )

In [139]:
articles = ['Sex', 'Anal_sex', 'BDSM', 'Brazzers', 'Cunnilingus', 'Dildo', 'Fellatio', 'Oral_sex', 'Human_penis', 'Vulva', 'Scrotum', 'Vagina']
fig_dir = './figs_sex'
df = get_local_ts(cp_dict, articles) 
plot_all_series(df, start, stop, cp_dict, articles, fig_dir, smooth = 7)

In [140]:
articles = ['Mustafa_Kemal_Atatürk', 'Human_penis', 'Vulva', 'Scrotum', 'Vagina', 'Opinion_polling_for_the_Turkish_general_election,_June_2015']
fig_dir = './figs_turkey_suggestions'
cp = {'Turkey': ['tr', 'en']}
df = get_local_ts(cp, articles) 
plot_all_series(df, start, stop, cp, articles, fig_dir, smooth = 7 )

In [141]:
articles = ['Salman_of_Saudi_Arabia']
fig_dir = './figs_saudi_king'
cp = {'Saudi Arabia': ['ar', 'en']}
df = get_local_ts(cp, articles) 
plot_all_series(df, start, stop, cp, articles, fig_dir, smooth = 7 )

# Inspect Countries

In [3]:
c_censorship = 'Iran'
c_control = 'United States'
countries = [c_censorship, c_control]
projects = ['en.wikipedia', 'fa.wikipedia']

In [4]:
# get top candidates

In [5]:
c = 'Iran'

# en articles from paper
#blocked_articles = list(pd.read_csv('./data/blocked_articles.tsv')['article']) 

d_censorship = pd.read_csv('./data/https_transition_comparison.tsv', sep = '\t', encoding = 'utf8')

# outliers in censored country

outlier_articles = [tuple(x) for x in d_censorship[d_censorship['country'] == c][['project', 'title']][:300].values]

# their english counter parts
en_outlier_articles = [ ('en.wikipedia', x) for x  in d_censorship[d_censorship['country'] == c][:300]['en_title']]

# get times series for all
articles = set([str(e[1]) for e in outlier_articles + en_outlier_articles if "'" not in str(e[1])])

In [6]:
#d_censorship[d_censorship['country'] == c_censorship].head(200)

In [54]:

df.index  = pd.to_datetime(df.year*1000000 + df.month*10000 + df.day * 100, format='%Y%m%d%H')

In [55]:
df.head()

Unnamed: 0,year,month,day,country,project,page_title,n,proportion,en_page_title,dt
2015-08-28,2015,8,28,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,4,0.666667,Lesbian,2015-08-28
2015-09-14,2015,9,14,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,3,0.6,Lesbian,2015-09-14
2015-10-25,2015,10,25,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,6,1.0,Lesbian,2015-10-25
2015-10-28,2015,10,28,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,7,0.875,Lesbian,2015-10-28
2015-09-03,2015,9,3,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,5,0.555556,Lesbian,2015-09-03


In [33]:
df.head()

Unnamed: 0,year,month,day,country,project,page_title,n,proportion,en_page_title
0,2015,8,28,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,4,0.666667,Lesbian
1,2015,9,14,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,3,0.6,Lesbian
2,2015,10,25,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,6,1.0,Lesbian
3,2015,10,28,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,7,0.875,Lesbian
4,2015,9,3,Vietnam,vi.wikipedia,Người_đồng_tính_nữ,5,0.555556,Lesbian


In [36]:
df = d 
pd.to_datetime(df.year*1000000 + df.month*10000 + df.day * 100, format='%Y%m%d')


TypeError: 'int' object is unsliceable

In [9]:
def compare_countries(start, stop, c_censorship, c_control, a_censorship, a_control, smooth = 4):
    f, axarr = plt.subplots(2, sharex=True)
    
    # plot transition point
    english_end = datetime.strptime('2015-06-12 09:40', "%Y-%m-%d %H:%M") # End transition of English Wikipedia, including Mobile
    axarr[0].axvline(english_end, color='red', label = 'HTTPS transition', linewidth=0.5)
    axarr[1].axvline(english_end, color='red', label = 'HTTPS transition', linewidth=0.5)

    # plot ts for article in censored country
    project = a_censorship[0]
    title = str(a_censorship[1])
    ts0 = get_series(start, stop, project, c_censorship, title)
    ts0 = pd.rolling_mean(ts0, smooth)
    axarr[0].plot(ts0.index, ts0.values)
    ylabel = c_censorship 
    axarr[0].set_ylabel(ylabel)
    
        
    # plot ts for articles in control
    en_project = a_control[0]
    en_title = str(a_control[1])
    ts1 = get_series(start, stop, en_project, c_control, en_title)
    ts1 = pd.rolling_mean(ts1, smooth)
    axarr[1].plot(ts1.index, ts1.values)
    ylabel = c_control #+ us_article[0].split('.')[0] + ' ' + us_article[1]
    axarr[1].set_ylabel(ylabel)
    
    axarr[0].set_title(project.split('.')[0] + ' ' + en_title)
    
    fig_dir = './figs_' + c_censorship
    if en_title is not np.nan:
        fig_name =  en_title  +  '.pdf'
    else:
        fig_name =  title  +  '.pdf'
    
    fig_name = fig_name.replace('/', '-')
    plt.savefig(os.path.join(fig_dir, fig_name))
    plt.close(f)
    

In [10]:
fig_dir = './figs_' + c_censorship
if os.path.exists(fig_dir):
    shutil.rmtree(fig_dir)
os.makedirs(fig_dir)
    
for i, article in enumerate(outlier_articles):
    compare_countries(start, stop, c_censorship, c_control, article, en_outlier_articles[i] , smooth = 24)
    