In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from db_utils import query_hive_ssh, execute_hive_expression, get_hive_timespan
import pandas as pd
import seaborn as sns
from datetime import datetime
import shutil
import os
import numpy as np
from labeling_utils import *
from span_comparison import *
import matplotlib
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt


import copy
import tensorflow as tf

### Create Hive table of Iran time series

In [None]:
# Country name -> Wikipedia language editions to extract for that country.
cp_dict = dict(Iran=['en', 'fa'])
# Date window (ISO strings) used for the extracted series and the plots below.
start, stop = '2015-05-11', '2015-08-01'
ts_table_name = 'iran_daily_ts'
# One-off Hive table creation, left disabled:
#create_hive_daily_ts(cp_dict, start, stop, 'iran_daily_ts')

### Pull down local version of Iran series

In [None]:
# The two commented-out lines rebuild the local TSV cache; the notebook reads the
# cached file instead.
# NOTE(review): the disabled call references `cmp`, which is only assigned in a later cell.
#local_iran_daily_ts = get_all_features(cp_dict, cmp, ts_table_name, limit = 10000000, min_views = 100)
#local_iran_daily_ts.to_csv('data/iran/iran_ts.tsv', sep = '\t')
local_iran_daily_ts = pd.read_csv('data/iran/iran_ts.tsv', sep = '\t')

# (rows, columns) of the cached table
print(local_iran_daily_ts.shape)

In [None]:
# Column names of the form "<prefix>.<name>" keep only "<name>"; anything with zero
# or several dots is left untouched.
local_iran_daily_ts.columns = [
    name.split('.')[1] if name.count('.') == 1 else name
    for name in local_iran_daily_ts.columns
]
# Expand the single-letter key columns to the names used by the rest of the notebook.
local_iran_daily_ts.rename(
    columns=dict(p='project', c='country', t='page_title'),
    inplace=True,
)

### Inspect Time Series From the Report

In [None]:
# Tab-separated table of report articles (per the file name: gold-label blocked articles).
report_df = pd.read_csv('./data/iran/gold_label_blocked_articles.tsv', sep = '\t')

In [None]:
# Preview the first rows of the report table.
report_df.head()

In [None]:
# Join the report with the local series on (project, country, page_title).
# merge() defaults to an inner join, so the two printed shapes show how many
# report rows found matching series rows.
# NOTE(review): this rebinds report_df, so re-running the cell merges an
# already-merged frame.
print(report_df.shape)
report_df = report_df.merge(local_iran_daily_ts, on = ['project', 'country', 'page_title'])
print(report_df.shape)

In [None]:
# Plot the series of every merged report row, one figure per row.
for i, r in report_df.iterrows():
    # the '_x' suffix is the left-hand (report) side of the merge
    print(r['project'], r['en_page_title_x'])
    plt.figure()
    ts, ts_prop, f = plot_series(local_iran_daily_ts, start, stop, r, smooth = 3)
    # pyplot.show() does not accept a figure: a positional argument is interpreted
    # as `block` (and is rejected outright by recent matplotlib releases).
    plt.show()
    # Close everything opened in this iteration (the figure from plt.figure() and
    # the one returned by plot_series) so figures do not pile up across the loop.
    plt.close('all')


#### Create an image of examples for labelers to use as a reference point

### Create Hive table with comparison of access rates from before and after the transition

In [None]:
# Two six-week spans that meet on 2015-06-12: span 1 ends where span 2 begins.
s1start, s1stop = '2015-05-01', '2015-06-12'
s2start, s2stop = '2015-06-12', '2015-07-24'

# Comparison object for the two spans; constructed with dry=True.
cmp = PVSpanComparison(
    [s1start, s1stop],
    [s2start, s2stop],
    'censorship',
    dry=True,
)

### Select those time series with the greatest change in view counts

In [None]:
# Pull the span comparison for Iran, filtered by the two minimum-view thresholds.
span_df = query_span_comparison(
    cmp,
    'Iran',
    min_post_article_view=100,
    min_wikidata_item_view=500,
)
# Same key-column renaming as for the local time-series table.
span_df.rename(
    columns=dict(p='project', c='country', t='page_title'),
    inplace=True,
)
print(span_df.shape)

In [None]:
# Rank articles by normalized_wdc_view_proportion_delta, largest first, and show
# the ten top English titles.
# DataFrame.sort() was removed from pandas; sort_values() is the supported equivalent.
span_df.sort_values('normalized_wdc_view_proportion_delta', inplace=True, ascending=False)
span_df['en_page_title'].head(10)

In [None]:
# Rank articles by normalized_tpc_view_proportion_delta, largest first, and show
# the ten top English titles.
# DataFrame.sort() was removed from pandas; sort_values() is the supported equivalent.
span_df.sort_values('normalized_tpc_view_proportion_delta', inplace=True, ascending=False)
span_df['en_page_title'].head(10)

Sorting by normalized_tpc_view_proportion_delta seems better

### Create labeled set of Iran articles

1. Sample successive working sets of size 2k.
2. Pick k random series plus the next top-k most extreme series, and shuffle them.
3. The labeler is asked to score examples against the reference example images (slider in [-1, 1]).
4. The labeler does not see titles.


In [None]:
# labels file: one already-labeled article id per line (read back to skip labeled items)
labels_filename = './data/iran/iran_labels_2.txt'
# data file: tab-separated rows appended by write_ts_to_file during labeling
data_filename = './data/iran/iran_data_2.tsv'

In [None]:
# Interactive labeling loop.
# Repeatedly takes the next `working_set_size` unlabeled entries of all_id_dicts,
# shows each series, and reads a label from stdin. Typing 'x' stops the session.
# Each labeled item appends one row to the data file and its id to the labels file.
working_set_size = 100

# Both files are opened in append mode (which also creates them if missing) inside a
# `with` block, so they are closed even when the session is interrupted or a helper raises.
with open(labels_filename, "a") as labels_file, open(data_filename, "a") as data_file:

    # ids labeled in earlier sessions
    with open(labels_filename, "r") as labeled_ids_file:
        labeled_set = set(line.strip() for line in labeled_ids_file)

    end = False
    while not end:

        # get next working set: the first `working_set_size` entries not labeled yet
        working_set = []
        for e in all_id_dicts:
            if e['id'] not in labeled_set:
                working_set.append(e)
            if len(working_set) == working_set_size:
                break

        # Nothing left to label: stop, otherwise this loop would never terminate.
        if not working_set:
            break

        df_ts = get_local_ts(working_set, en_titles = True)

        for id_dict in working_set:
            ts, ts_prop, f = plot_series(df_ts, start, stop, id_dict, smooth = 3)
            print(id_dict['en_page_title'])
            # pyplot.show() takes no figure argument (a positional value is read as `block`)
            plt.show()
            label = input()
            plt.close(f)
            if label == 'x':
                end = True
                break
            # Write the data row before recording the id, so an id is never marked as
            # labeled without its data row having been written.
            write_ts_to_file(id_dict, ts, ts_prop, label, data_file)
            labels_file.write(id_dict['id'] + '\n')
            labeled_set.add(id_dict['id'])
            # Flush after every item so an interrupted session keeps its progress.
            data_file.flush()
            labels_file.flush()

#### train a model to predict spike
For now, let's just focus on the proportion time series.

In [None]:
from tensorflow_utils import batch_iter
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split has
# lived in sklearn.model_selection since 0.18. Fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Headerless TSV written by the labeling loop; columns get integer labels 0..N.
# NOTE(review): the next cell repeats this exact read, so this cell is redundant.
data_df = pd.read_csv(data_filename, sep = '\t', header = None)

In [None]:
# Build the feature matrix X_df and the two-column one-hot label frame y_df.
data_df = pd.read_csv(data_filename, sep = '\t', header = None)
# missing cells (in any column, including the label column) become 0
data_df.fillna(0, inplace = True)
# Column 183 is compared against 'y' (presumably the label typed during labeling -
# confirm against write_ts_to_file). First output column: == 'y'; second: != 'y'.
y_df = pd.DataFrame([data_df[183] == 'y', data_df[183] != 'y']).transpose()
y_df = y_df.astype(int)
X_df = data_df.drop(183, axis=1)
# .ix was removed from pandas. The columns carry integer labels (header=None), so
# .ix sliced by label with an inclusive end; .loc does exactly the same: columns 83..165.
# NOTE(review): presumably this range is the proportion series - confirm with write_ts_to_file.
X_df = X_df.loc[:, 83:165]
# number of feature columns; used as the network width further down
n = X_df.shape[1]

In [None]:
# Disabled code: the whole cell is a bare string literal, so nothing below executes.
# It would build a synthetic two-class Gaussian data set and overwrite X_df / y_df / n.
""" 
m = 500
n = 10
mu_1 = np.random.multivariate_normal([3], [[1]], n).squeeze()
mu_2 = np.random.multivariate_normal([3], [[1]], n).squeeze()
cov = np.identity(n) * 10

X_1 = np.random.multivariate_normal(mu_1, cov, m)
X_2 = np.random.multivariate_normal(mu_2, cov, m)

y_1 = np.ones(m) 
y_2 = np.zeros(m)

X = np.concatenate([X_1, X_2])
y = np.concatenate([y_1, y_2])

X_df = pd.DataFrame(X)

y_df = pd.DataFrame([y == 1, y == 0]).transpose()
y_df = y_df.astype(int)
"""

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.33, random_state=42)

In [None]:
# Train a three-hidden-layer tanh perceptron on (X_train, y_train) and record
# train/test accuracy every `display_step` epochs.

# Parameters
learning_rate = 0.001
training_epochs = 250
batch_size = 200
display_step = 10

# Network Parameters: every hidden layer is as wide as the input.
# NOTE: `n` is expected to be the feature count of X_df; a later cell rebinds `n`,
# so running cells out of order changes the network size.
n_hidden_1 = n # 1st hidden layer width
n_hidden_2 = n # 2nd hidden layer width
n_hidden_3 = n # 3rd hidden layer width
n_input = n # number of input features per example
n_classes = 2 # the two one-hot label columns of y_df

# tf Graph input
# NOTE(review): the original comment had https://phabricator.wikimedia.org/T123292
# fused into it; its relation to this cell is not evident from the notebook.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

# Create model
def multilayer_perceptron(_X, _weights, _biases):
    """Return the output-layer logits of a 3-hidden-layer tanh network.

    _X is the input tensor; _weights needs keys 'h1', 'h2', 'h3', 'out' and
    _biases needs keys 'b1', 'b2', 'b3', 'out'. No softmax is applied here.
    """
    layer_1 = tf.tanh(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1']))
    layer_2 = tf.tanh(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2']))
    layer_3 = tf.tanh(tf.add(tf.matmul(layer_2, _weights['h3']), _biases['b3']))
    return tf.matmul(layer_3, _weights['out']) + _biases['out']

# Store layers weight & bias (all drawn from tf.random_normal)
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
# Keyword arguments: newer TensorFlow releases reject the positional (logits, labels)
# form, while the keyword form is accepted by old and new releases alike.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) # Softmax loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) # Adam Optimizer

# Accuracy ops are built once here. Building them inside the training loop, as the
# original did, adds new nodes to the graph at every display step.
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Initializing the variables (use the newer initializer when this TF build has it)
if hasattr(tf, 'global_variables_initializer'):
    init = tf.global_variables_initializer()
else:
    init = tf.initialize_all_variables()

# Launch the graph
sess = tf.Session()
sess.run(init)

train_accs = []
test_accs = []

# Training cycle
for epoch in range(training_epochs):
    avg_cost = 0.
    m = 0  # number of examples seen in this epoch
    batches = batch_iter(X_train, y_train, batch_size)
    # Loop over all batches
    for batch_xs, batch_ys in batches:
        batch_m = len(batch_ys)
        m += batch_m
        # Fit training using batch data
        sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
        # Accumulate the (post-update) batch loss weighted by batch size
        avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys}) * batch_m
    # Display logs per epoch step
    if epoch % display_step == 0:
        print ("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost/m))
        train_acc = accuracy.eval({x: X_train, y: y_train}, session=sess)
        test_acc = accuracy.eval({x: X_test, y: y_test}, session=sess)
        print ("Accuracy:",train_acc )  # training-set accuracy
        print ("Accuracy:",test_acc )   # test-set accuracy
        train_accs.append(train_acc)
        test_accs.append(test_acc)


print ("Optimization Finished!")




### Apply Model

In [None]:
# Fetch series for English Wikipedia in Iran, then parse each raw 'ts' value into
# one row of the feature frame.
df_all_ts_raw = get_all_series(
    {'Iran': ['en']},
    limit=10000,
    min_views=100,
)
df_all_ts = pd.DataFrame(list(map(parse, df_all_ts_raw['ts'])))

In [None]:
# Turn the network logits into class probabilities and evaluate them for every series.
probs = tf.nn.softmax(pred)
class_probs = sess.run(probs, feed_dict={x: df_all_ts.values})
# transpose so that results[k] holds the probability of class k for all rows
results = class_probs.T

In [None]:
# results[0] is the first class column, i.e. the "== 'y'" column of y_df.
df_all_ts_raw['probs'] = results[0]

In [None]:
# Highest predicted probability first.
# DataFrame.sort() was removed from pandas; sort_values() is the supported equivalent.
df_all_ts_raw.sort_values('probs', ascending = False, inplace = True)

In [None]:
# Show the first 100 rows of the scored table.
df_all_ts_raw.head(100)

In [None]:
# Print the 0-based index of the first line of the data file that has exactly 183
# tab-separated fields, then stop.
# NOTE(review): presumably a check for malformed rows, since the label is read from
# column index 183 elsewhere - confirm.
# The file is opened in a `with` block so the handle is closed; the original left it open.
with open(data_filename, 'r') as data_lines:
    for i, line in enumerate(data_lines):
        if (len(line.split('\t'))) == 183:
            print(i)
            break

##### Get daily per country and per article pageview time series for a set of country project pairs

In [None]:
# Country name -> Wikipedia language editions to extract for that country.
# This rebinds cp_dict, start and stop, replacing the Iran-only values set near the
# top of the notebook.
# NOTE(review): 'Rebublic of Korea' looks like a misspelling of 'Republic of Korea';
# confirm against the country names in the source data before relying on this entry.
# NOTE(review): 'Nigeria' lists 'en' twice.
cp_dict = {'Iran':                        ['en', 'fa',],
           'Saudi Arabia':                ['en', 'ar',],
           'Turkey':                      ['en', 'tr',],
           'Rebublic of Korea':           ['en', 'ko',],
           'Iraq':                        ['en', 'ar',],
           'Cuba':                        ['en', 'es',],
           'Venezuela':                   ['en', 'es',],
           'Pakistan':                    ['en', 'ur',],
           'Vietnam':                     ['en', 'vi',],
           'Singapore':                   ['en', 'zh',],
           'Uzbekistan':                  ['en', 'uz',],
           'Nigeria':                     ['en', 'en',],
           'Egypt':                       ['en', 'ar',],
           'Thailand':                    ['en', 'th',],
           'Morocco':                     ['en', ],
           'Bangladesh':                  ['en', ],
           'United States':               ['en', ],
           'China':                       ['en', ],
           'Russia':                      ['en', 'ru',],
          }

# Same date window as the Iran-only configuration.
start = '2015-05-11'
stop = '2015-08-01'
# One-off Hive table creation, left disabled:
#create_hive_daily_ts(cp_dict, start, stop, 'daily_ts2')

# Inspect Topics

In [None]:
# Index the frame by its 'day' column parsed as datetimes.
# NOTE(review): in this notebook df_ts is only assigned inside the labeling loop, so
# this cell depends on whatever working set that loop loaded last.
df_ts.index  = pd.to_datetime(df_ts.day)


In [None]:
# Plot the 'proportion' column against the frame's index.
df_ts['proportion'].plot()

In [None]:
# Topic: LGBT-related articles, plotted for every country/project pair in cp_dict
# with a smoothing window of 7; figures go to fig_dir.
articles = [ 'Lesbian', 'LGBT', 'Gay', 'Transgender', 'Bisexuality', 'Homosexuality']
fig_dir = './figs_queer'
df = get_local_ts(cp_dict, articles) 
plot_all_series(df, start, stop, cp_dict, articles, fig_dir, smooth = 7 )

In [None]:
# Topic: sexuality/anatomy articles, plotted for every country/project pair in
# cp_dict with a smoothing window of 7; figures go to fig_dir.
articles = ['Sex', 'Anal_sex', 'BDSM', 'Brazzers', 'Cunnilingus', 'Dildo', 'Fellatio', 'Oral_sex', 'Human_penis', 'Vulva', 'Scrotum', 'Vagina']
fig_dir = './figs_sex'
df = get_local_ts(cp_dict, articles) 
plot_all_series(df, start, stop, cp_dict, articles, fig_dir, smooth = 7)

In [None]:
# Turkey only (tr and en projects): a hand-picked article list, smoothing window of 7.
articles = ['Mustafa_Kemal_Atatürk', 'Human_penis', 'Vulva', 'Scrotum', 'Vagina', 'Opinion_polling_for_the_Turkish_general_election,_June_2015']
fig_dir = './figs_turkey_suggestions'
# local country/project mapping used instead of the global cp_dict
cp = {'Turkey': ['tr', 'en']}
df = get_local_ts(cp, articles) 
plot_all_series(df, start, stop, cp, articles, fig_dir, smooth = 7 )

In [None]:
# Saudi Arabia only (ar and en projects): a single article, smoothing window of 7.
articles = ['Salman_of_Saudi_Arabia']
fig_dir = './figs_saudi_king'
# local country/project mapping used instead of the global cp_dict
cp = {'Saudi Arabia': ['ar', 'en']}
df = get_local_ts(cp, articles) 
plot_all_series(df, start, stop, cp, articles, fig_dir, smooth = 7 )

# Inspect Countries

In [None]:
# Country under study and the control country it is compared against.
c_censorship = 'Iran'
c_control = 'United States'
countries = [c_censorship, c_control]
projects = ['en.wikipedia', 'fa.wikipedia']
# Number of top candidate rows to keep below.
# NOTE: this rebinds `n`, which earlier holds the feature count used to size the
# network; re-running the model cells after this one would build a width-10 network.
n = 10

In [None]:
# get top candidates

In [None]:
c = 'Iran'

# en articles from paper
#blocked_articles = list(pd.read_csv('./data/blocked_articles.tsv')['article']) 

d_censorship = pd.read_csv('./data/https_transition_comparison.tsv', sep='\t', encoding='utf8')

# first n rows of the comparison table for the country of interest, in file order
top_rows = d_censorship[d_censorship['country'] == c][:n]

# (project, title) pairs of those rows
outlier_articles = [tuple(pair) for pair in top_rows[['project', 'title']].values]

# the English counterparts, all attributed to en.wikipedia
en_outlier_articles = [('en.wikipedia', en_name) for en_name in top_rows['en_title']]

# every distinct title, as a string, skipping any that contain an apostrophe
articles = {
    name
    for name in (str(entry[1]) for entry in outlier_articles + en_outlier_articles)
    if "'" not in name
}

In [None]:
# Display the set of candidate titles.
articles

In [None]:
def compare_countries(start, stop, c_censorship, c_control, a_censorship, a_control, smooth = 4):
    """Save a two-panel figure comparing one article's series in two countries.

    The top panel shows the series for `a_censorship` in `c_censorship`, the bottom
    panel the series for `a_control` in `c_control`. Both are smoothed with a rolling
    mean of `smooth` points and share the x axis. A red vertical line marks the end
    of the English Wikipedia HTTPS transition. The figure is written as a PDF to
    './figs_<c_censorship>/' and then closed; nothing is returned.

    Parameters
    ----------
    start, stop : passed through to get_series as the time window.
    c_censorship, c_control : country names for the top and bottom panel.
    a_censorship, a_control : (project, title) pairs; the title of `a_control` may
        be missing (NaN/None), in which case the censored article's own title is
        used for the plot title and the file name.
    smooth : window size of the rolling mean.
    """
    f, axarr = plt.subplots(2, sharex=True)

    # plot transition point
    english_end = datetime.strptime('2015-06-12 09:40', "%Y-%m-%d %H:%M") # End transition of English Wikipedia, including Mobile
    for ax in axarr:
        ax.axvline(english_end, color='red', label = 'HTTPS transition', linewidth=0.5)

    # plot ts for article in censored country
    project = a_censorship[0]
    title = str(a_censorship[1])
    ts0 = get_series(start, stop, project, c_censorship, title)
    # pd.rolling_mean was removed from pandas; .rolling(window).mean() is the replacement
    ts0 = ts0.rolling(smooth).mean()
    axarr[0].plot(ts0.index, ts0.values)
    axarr[0].set_ylabel(c_censorship)

    # plot ts for articles in control
    en_project = a_control[0]
    # Check for a missing English title BEFORE converting to str: str(nan) is the
    # string 'nan', which made the original `en_title is not np.nan` test always true.
    has_en_title = not pd.isnull(a_control[1])
    en_title = str(a_control[1])
    ts1 = get_series(start, stop, en_project, c_control, en_title)
    ts1 = ts1.rolling(smooth).mean()
    axarr[1].plot(ts1.index, ts1.values)
    axarr[1].set_ylabel(c_control)

    # Prefer the English title; fall back to the local title when there is none.
    display_title = en_title if has_en_title else title
    axarr[0].set_title(project.split('.')[0] + ' ' + display_title)

    fig_dir = './figs_' + c_censorship
    # make sure the target directory exists so savefig cannot fail on a missing path
    os.makedirs(fig_dir, exist_ok=True)
    # '/' in a title would otherwise be read as a path separator
    fig_name = (display_title + '.pdf').replace('/', '-')
    # save this specific figure rather than whatever pyplot considers current
    f.savefig(os.path.join(fig_dir, fig_name))
    plt.close(f)
    

In [None]:
# Recreate the output directory from scratch so figures from earlier runs do not linger.
fig_dir = './figs_{}'.format(c_censorship)
if os.path.exists(fig_dir):
    shutil.rmtree(fig_dir)
os.makedirs(fig_dir)

# One comparison figure per outlier article, paired with its English counterpart.
for censored_article, control_article in zip(outlier_articles, en_outlier_articles):
    compare_countries(
        start,
        stop,
        c_censorship,
        c_control,
        censored_article,
        control_article,
        smooth=24,
    )
    