In [1]:
import pandas as pd
import numpy as np
from prepare_regression import *
from plotter import *
import warnings; warnings.simplefilter('ignore')

In [2]:
# Read data/data_en.csv: the first CSV column is used as the index and the
# 'date' column is parsed into datetimes.
data = pd.read_csv(
    'data/data_en.csv',
    index_col=0,
    parse_dates=['date'],
)

In [3]:
# Preview the frame without its "language" column. The result is neither
# assigned nor applied in place, so `data` itself still has that column.
data.drop(labels="language", axis="columns")

Unnamed: 0,date,article,pageviews
0,2012-01-01,abu_sayyaf,254
1,2012-01-02,abu_sayyaf,351
2,2012-01-03,abu_sayyaf,423
3,2012-01-04,abu_sayyaf,418
4,2012-01-05,abu_sayyaf,804
...,...,...,...
70123,2014-12-27,yemen,1895
70124,2014-12-28,yemen,1929
70125,2014-12-29,yemen,2180
70126,2014-12-30,yemen,0


In [4]:
# Keep only the rows whose article is not 'hamas'.
data = data.loc[data["article"].ne('hamas')]

In [12]:
# Build the regression frame for one fixed window. The three integers occupy
# the same positional slots that the search loop further down fills with its
# (start, end, inter) indices.
relevant = prepare_data_for_regression_with_window(
    data,
    18,
    48,
    20,
)

In [13]:
# Show the prepared regression frame as this cell's output.
relevant

Unnamed: 0,date,pageviews,time,intervention,post_slope
17,2013-06-30,2649743,1,0,0
18,2013-07-31,2330629,2,0,0
19,2013-08-31,2382858,3,1,1
20,2013-09-30,2498279,4,1,2
21,2013-10-31,2487946,5,1,3
22,2013-11-30,2459252,6,1,4
23,2013-12-31,1860747,7,1,5
24,2014-01-31,2395900,8,1,6
25,2014-02-28,1943184,9,1,7
26,2014-03-31,2220918,10,1,8


In [22]:
# Segmented regression of pageviews on the time index, the intervention
# indicator (treated as categorical) and the post-intervention slope term.
# NOTE(review): `smf` is not imported explicitly in this notebook; presumably it
# is statsmodels.formula.api pulled in by one of the star imports — confirm.
model = smf.ols('pageviews ~ time + C(intervention) + post_slope', data=relevant)
res = model.fit()
# A bare `res.summary()` in the middle of a cell is evaluated and then thrown
# away, because only the last expression of a cell is rendered. Print it so the
# coefficient table is actually visible.
print(res.summary())
# Last expression of the cell -> shown as the cell output.
res.rsquared

0.1698680273682167

## Remarks: 
Taking an extended version of the data, we are confronted with the challenge of new confounders, i.e. events that may change online behavior towards the terrorism-related articles. 
We spotted a remarkable drop in the views in July 2014. We would like to test statistically whether this drop is significant, and whether it was due to a behavior-changing event or whether these points were simply outliers. 
We first notice that, in our dataset, there are a lot of articles with zero pageviews in July and August 2014.

After fitting a segmented regression model with the intervention set at June 2014, we see that the p-value of the intervention is small (0.02), i.e. below 0.05. On the other hand, the p-values of the month index and post-slope parameters are high ...
Also, the $R^2$ value of the model is 0.415, which means that the segmented regression with the intervention set at June 2014 explains about 41.5% of the variance in the pageviews.

In [31]:
## Window search: fit the segmented regression for every admissible
## (start, end, inter) combination and report the fits whose R^2 exceeds 0.5.

start_index = 18
end_index = 48
start_end_window = 12     # minimum (end - start); also reused below as minimum (end - inter)
start_inter_window = 5    # minimum (inter - start)

f = range(start_index + 2, end_index - 1)  # not read anywhere in this cell
rg = range(start_index, end_index)
for start in rg:
    for end in rg:
        # Only windows spanning at least `start_end_window` steps are examined.
        if end - start >= start_end_window:
            for inter in range(start, end):
                # The intervention needs enough points before it and after it.
                if inter - start >= start_inter_window and end - inter >= start_end_window:
                    relevant = prepare_data_for_regression_with_window(data, start, end, inter)
                    model = smf.ols(
                        'pageviews ~ time + C(intervention) + post_slope',
                        data=relevant,
                    )
                    res = model.fit()
                    if res.rsquared > 0.5:
                        print(f"start at {start}")
                        print(f"intervention at {inter}")
                        print(f"end at {end}")
                        print(res.rsquared)

start at 18
intervention at 23
end at 35
0.5625443015415559
start at 18
intervention at 23
end at 36
0.5825298172012751
start at 18
intervention at 24
end at 36
0.5825422352369272
start at 18
intervention at 31
end at 43
0.7308953281685422
start at 18
intervention at 31
end at 44
0.6827743056171534
start at 18
intervention at 32
end at 44
0.5180850406213915
start at 18
intervention at 31
end at 45
0.6657745417217451
start at 18
intervention at 32
end at 45
0.5038021796733793
start at 18
intervention at 31
end at 46
0.6560487004826487
start at 18
intervention at 31
end at 47
0.6584917128157275
start at 19
intervention at 24
end at 36
0.5607242002041477
start at 19
intervention at 31
end at 43
0.7214790606931893
start at 19
intervention at 31
end at 44
0.6702349574255951
start at 19
intervention at 31
end at 45
0.6520753827905204
start at 19
intervention at 31
end at 46
0.6417571539510818
start at 19
intervention at 31
end at 47
0.644980202796612
start at 20
intervention at 31
end at 43
