In [29]:
import pandas as pd
import numpy as np
from scipy import stats
from IPython.display import HTML

In [2]:
df_test = pd.read_csv('../Validation/Data/Test_Data/df_fin.csv',index_col='domain')
df_test_url=pd.read_csv('../Validation/Data/Test_Data/df_url.gz',index_col='domain',compression='gzip')
df_test=pd.concat([df_test,df_test_url[['url_char_-','url_number_special_chars']]],join='inner',axis=1)
columns = df_test.loc[:,df_test.columns!='Target'].columns.tolist()
columns.append('domain')

## 1. TTest_rel  
#### " This is a two-sided test for the null hypothesis that 2 related or repeated samples have identical average (expected) values. "
#### Interpretation - "The test measures whether the average score differs significantly across samples (e.g. exams). If we observe a large p-value, for example greater than 0.05 or 0.1 then we cannot reject the null hypothesis of identical average scores. If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%, then we reject the null hypothesis of equal averages. Small p-values are associated with large t-statistics." - https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html

#### Week 1 - 0209 and Original Validation Data

In [3]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02092020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_test[df_test.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
1,a_href_relative,-54.23577,0.0,1
33,total_href,-58.994138,0.0,1
31,script_async_true,-95.790067,0.0,1
29,link_href_out_of_domain,-58.031375,0.0,1
26,iframe_src_.html,-63.644544,0.0,1
24,a_href_out_of_domain,-61.827953,0.0,1
23,a_href_https,-47.496507,0.0,1
22,a_href_http,-37.81877,0.0,1
25,form_action_http,-31.595313,0.0,1
27,img_src_http,-29.936579,0.0,1


#### Week 2 - 0216 and Week 1

In [4]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02092020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [6]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02162020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
16,http_header_cache-control_set_max-age,0.458226,0.64679,0
18,http_header_server_apache,-0.373976,0.70842,0
23,a_href_https,0.353841,0.72346,0
31,script_async_true,0.34003,0.73383,0
30,link_type_text/css,-0.332266,0.73969,0
19,http_header_transfer-encoding_chunked,0.291848,0.7704,0
33,total_href,0.280406,0.77917,0
1,a_href_relative,0.263444,0.79221,0
24,a_href_out_of_domain,0.262464,0.79296,0
0,a_count,0.248976,0.80338,0


### No change from Week 1 to Week 2. It would be interesting to look into the maximum time difference before the features change

#### Week 3 - 0223 and Week 2

In [8]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02162020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [9]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02232020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
22,a_href_http,-0.649391,0.51609,0
25,form_action_http,-0.529032,0.59678,0
20,http_header_vary_user-agent,0.477026,0.63334,0
18,http_header_server_apache,-0.426385,0.66983,0
21,http_header_via_1.1,0.317323,0.751,0
29,link_href_out_of_domain,0.271352,0.78612,0
3,meta_count,-0.23699,0.81267,0
24,a_href_out_of_domain,-0.235339,0.81395,0
1,a_href_relative,-0.209956,0.8337,0
33,total_href,-0.187943,0.85092,0


### Week 4 - 0301  and Week 3

In [3]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02232020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [5]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03012020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
21,http_header_via_1.1,-2.411379,0.01589,1
19,http_header_transfer-encoding_chunked,0.31954,0.74932,0
26,iframe_src_.html,-0.302217,0.76249,0
22,a_href_http,0.287756,0.77353,0
18,http_header_server_apache,-0.283188,0.77703,0
30,link_type_text/css,-0.26617,0.79011,0
24,a_href_out_of_domain,0.23622,0.81326,0
33,total_href,0.194465,0.84581,0
1,a_href_relative,0.186418,0.85212,0
20,http_header_vary_user-agent,0.171293,0.86399,0


### Week 5 - 0308 and Week 4

In [7]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03012020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [8]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03082020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
27,img_src_http,-0.816888,0.41399,0
22,a_href_http,-0.608474,0.54287,0
20,http_header_vary_user-agent,0.431156,0.66636,0
25,form_action_http,-0.205881,0.83688,0
23,a_href_https,0.189905,0.84938,0
17,http_header_content-encoding_gzip,0.183232,0.85462,0
30,link_type_text/css,-0.179933,0.85721,0
18,http_header_server_apache,-0.173802,0.86202,0
2,center_count,0.172224,0.86326,0
0,a_count,0.127315,0.89869,0


### Week 6 - 0315 and Week 5

In [10]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03152020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [11]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03222020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
22,a_href_http,-0.894117,0.37126,0
18,http_header_server_apache,-0.448165,0.65404,0
27,img_src_http,-0.446858,0.65498,0
21,http_header_via_1.1,0.362658,0.71686,0
23,a_href_https,0.339697,0.73409,0
19,http_header_transfer-encoding_chunked,0.311402,0.7555,0
0,a_count,-0.31022,0.75639,0
26,iframe_src_.html,0.302776,0.76206,0
25,form_action_http,-0.289524,0.77218,0
32,total_count,-0.270243,0.78697,0


### Week 7 - 0329 and Week 6

In [14]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03222020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [15]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03292020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
16,http_header_cache-control_set_max-age,0.660579,0.50888,0
18,http_header_server_apache,-0.535153,0.59255,0
25,form_action_http,-0.443375,0.6575,0
22,a_href_http,-0.342512,0.73197,0
21,http_header_via_1.1,0.287687,0.77359,0
32,total_count,-0.275327,0.78307,0
0,a_count,-0.250377,0.8023,0
26,iframe_src_.html,-0.172378,0.86314,0
17,http_header_content-encoding_gzip,-0.145508,0.88431,0
1,a_href_relative,-0.143635,0.88579,0


### Week 8 - 0405 and Week 7

In [16]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_03292020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [17]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_04052020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
22,a_href_http,-0.45944,0.64592,0
25,form_action_http,-0.450325,0.65248,0
18,http_header_server_apache,-0.396299,0.69189,0
17,http_header_content-encoding_gzip,0.377245,0.70599,0
32,total_count,0.355022,0.72257,0
23,a_href_https,0.27128,0.78618,0
29,link_href_out_of_domain,0.231217,0.81715,0
27,img_src_http,-0.20012,0.84139,0
28,js_function_.push(,-0.185542,0.8528,0
4,p_count,0.177067,0.85946,0


### Week 9 - 0412 and Week 8

In [19]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_04052020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [20]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_04122020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
17,http_header_content-encoding_gzip,0.860376,0.38958,0
25,form_action_http,-0.418088,0.67588,0
19,http_header_transfer-encoding_chunked,0.286922,0.77417,0
18,http_header_server_apache,-0.279409,0.77993,0
2,center_count,-0.268722,0.78814,0
27,img_src_http,-0.24476,0.80664,0
30,link_type_text/css,-0.22263,0.82382,0
21,http_header_via_1.1,0.205297,0.83734,0
31,script_async_true,0.201522,0.84029,0
22,a_href_http,-0.180872,0.85647,0


### Week 10  - 0419 and Week 9

In [21]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_04122020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [22]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_04192020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
25,form_action_http,-0.360006,0.71884,0
27,img_src_http,-0.297076,0.76641,0
17,http_header_content-encoding_gzip,0.248034,0.80411,0
23,a_href_https,0.232388,0.81624,0
33,total_href,0.217417,0.82788,0
1,a_href_relative,0.212911,0.8314,0
31,script_async_true,-0.201852,0.84003,0
32,total_count,0.17042,0.86468,0
19,http_header_transfer-encoding_chunked,0.155526,0.87641,0
2,center_count,0.143501,0.88589,0


### Week 10 - 0419 and Week 1 -0209

In [27]:
#### Read Last Weeks Data
df_week_last = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02092020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_last.columns if col.endswith('count')]
df_week_last['total_count']=df_week_last[count_columns].sum(axis=1)
href_columns=[col for col in df_week_last.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_last['total_href']=df_week_last[href_columns].sum(axis=1)

df_week_last = df_week_last.loc[:,columns]
df_week_last.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_last.set_index(['domain'],inplace=True)

In [28]:
#### Read Current Weeks Data
df_week_current = pd.read_csv('../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_04192020.csv',
                       low_memory=False)

count_columns=[col for col in df_week_current.columns if col.endswith('count')]
df_week_current['total_count']=df_week_current[count_columns].sum(axis=1)
href_columns=[col for col in df_week_current.columns if (('href_absolute' in col) | ('href_relative' in col) | ('href_page' in col))  ]
df_week_current['total_href']=df_week_current[href_columns].sum(axis=1)

df_week_current = df_week_current.loc[:,columns]
df_week_current.drop_duplicates(['domain'],keep='first',inplace=True)
df_week_current.set_index(['domain'],inplace=True)

#### Make sure that same domains are used
df_week_last = df_week_last[df_week_last.index.isin(df_week_current.index.values)]
df_week_current = df_week_current[df_week_current.index.isin(df_week_last.index.values)]

#### HTTP columns are 0, True, False, convert them all to int
for col in df_week_current.columns:
    if ('http_') in col:
        df_week_current[col] = df_week_current[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_current[col] = df_week_current[col].astype(int)
        
for col in df_week_last.columns:
    if ('http_') in col:
        df_week_last[col] = df_week_last[col].\
                                apply(lambda x: 1 if x == 'True' else 0 if x== 'False' else x)
        df_week_last[col] = df_week_last[col].astype(int)


#### Calculate relative ttest
ttest_rel_current = []
for col in df_week_current.columns:
    try:
        ttest_rel_current.append((col, stats.ttest_rel(df_week_current[col], df_week_last[col])[0] ,
                                    stats.ttest_rel(df_week_current[col], df_week_last[col])[1] ))
    except:
        print(col)
        
df_week_current_stats = pd.DataFrame(ttest_rel_current)
df_week_current_stats.columns = ['Feature', 't-statistic', 'pvalue']
df_week_current_stats.sort_values(by='pvalue', inplace=True)
df_week_current_stats['pvalue'] = df_week_current_stats['pvalue'].apply(lambda x: np.round(x,5))

df_week_current_stats['changed'] = df_week_current_stats['pvalue'].apply(lambda x:  1 if x < 0.05 else 0)
df_week_current_stats

Unnamed: 0,Feature,t-statistic,pvalue,changed
18,http_header_server_apache,-3.565717,0.00036,1
25,form_action_http,-3.205612,0.00135,1
22,a_href_http,-3.127424,0.00176,1
27,img_src_http,-2.067427,0.0387,1
19,http_header_transfer-encoding_chunked,1.995219,0.04602,1
23,a_href_https,1.920593,0.05479,0
16,http_header_cache-control_set_max-age,1.844095,0.06517,0
17,http_header_content-encoding_gzip,1.714728,0.0864,0
20,http_header_vary_user-agent,1.347347,0.17787,0
30,link_type_text/css,-1.251013,0.21093,0


### Next I intend to use this test - Repeated Measures ANOVA.
### This is an excellent explanation for it. - https://statistics.laerd.com/statistical-guides/repeated-measures-anova-statistical-guide.php

### Check this notebook again by end of day Sunday. It should be updated via the link I shared. If not alternatively view it directly on github at -
    

In [30]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')