In [None]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
def _prepare_week_frame(df_week, columns):
    '''
    Build the per-domain feature table for one weekly snapshot.

    Adds the derived 'total_count' and 'total_href' features, keeps only
    ``columns``, drops repeated domains (first occurrence wins) and indexes
    the result by 'domain'. The caller's dataframe is never modified.
    '''
    # 'total_count' itself ends with 'count'; leave it out of the sum so a
    # frame that already carries that column is not double counted.
    count_columns = [col for col in df_week.columns
                     if col.endswith('count') and col != 'total_count']
    href_columns = [col for col in df_week.columns
                    if any(tag in col for tag in
                           ('href_absolute', 'href_relative', 'href_page'))]

    derived = ('total_count', 'total_href')
    # Copy only the requested source columns (not the whole, possibly very
    # wide, frame). A missing column raises KeyError here.
    week = df_week.loc[:, [col for col in columns if col not in derived]].copy()
    week['total_count'] = df_week[count_columns].sum(axis=1)
    week['total_href'] = df_week[href_columns].sum(axis=1)
    week = week.loc[:, columns]

    return week.drop_duplicates(['domain'], keep='first').set_index('domain')


def _http_flags_to_int(week):
    '''
    Return ``week`` with every column whose name contains 'http_' cast to int.

    Those columns hold a mix of 0, 'True' and 'False'; the strings are mapped
    to 1 / 0 first and everything is then cast with ``astype(int)``.
    '''
    week = week.copy()
    for col in week.columns:
        if 'http_' in col:
            week[col] = week[col].map(
                lambda x: 1 if x == 'True' else 0 if x == 'False' else x
            ).astype(int)
    return week


def ttest_2wk(df_week_last, df_week_current, columns, alpha=0.05):
    '''
    Paired t-test of each feature between two weekly snapshots of the same
    set of domains.

    Parameters
    ----------
    df_week_last, df_week_current : pandas.DataFrame
        Snapshots collected at different times. Neither frame is modified.
    columns : list of str
        Features to compare. Must contain 'domain', which is used to pair
        the rows of the two snapshots. May contain 'total_count' and
        'total_href', which are derived here (sum of the '*count' columns,
        and of the 'href_absolute' / 'href_relative' / 'href_page' columns).
    alpha : float, default 0.05
        Significance threshold used for the 'changed' flag.

    Returns
    -------
    pandas.DataFrame
        One row per tested feature with columns 'Feature', 't-statistic',
        'pvalue' (rounded to 5 decimals) and 'changed' (1 if the unrounded
        p-value is below ``alpha``, else 0), sorted by ascending p-value.
        Features for which the test cannot be computed are reported on
        stdout and left out of the result.

    Raises
    ------
    KeyError
        If an entry of ``columns`` (other than the two derived totals) is
        missing from a snapshot, or if 'domain' is not in ``columns``.

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
    '''
    last = _prepare_week_frame(df_week_last, columns)
    current = _prepare_week_frame(df_week_current, columns)

    #### Make sure that same domains are used, in the same row order.
    # ttest_rel pairs observations by position, so sharing the same *set* of
    # domains is not enough: both frames must list them in the same order.
    current = current[current.index.isin(last.index)]
    last = last.reindex(current.index)

    #### HTTP columns are 0, True, False, convert them all to int
    current = _http_flags_to_int(current)
    last = _http_flags_to_int(last)

    #### Calculate relative ttest
    records = []
    for col in current.columns:
        try:
            result = stats.ttest_rel(current[col], last[col])
        except Exception as err:
            # Best effort: a feature that cannot be tested (e.g. non-numeric
            # values) is reported and skipped instead of aborting the run.
            print('Skipped {}: {}: {}'.format(col, type(err).__name__, err))
            continue
        records.append((col, result[0], result[1]))

    # Passing the column names up front keeps this working when no feature
    # could be tested (empty ``records``).
    stats_df = pd.DataFrame(records, columns=['Feature', 't-statistic', 'pvalue'])
    stats_df = stats_df.astype({'t-statistic': float, 'pvalue': float})
    stats_df = stats_df.sort_values(by='pvalue')

    # Threshold on the exact p-value; rounding first would turn e.g. 0.049996
    # into 0.05 and report it as not significant.
    stats_df['changed'] = (stats_df['pvalue'] < alpha).astype(int)
    stats_df['pvalue'] = stats_df['pvalue'].round(5)
    return stats_df

In [3]:
# Read the two weekly snapshot CSVs that are compared below.
# low_memory=False makes pandas parse each file in one pass.
df_0202, df_0209 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02022020.csv',
        '../../../../reputation/final_datasets/temporal_study/csvs_for_each_week/week_02092020.csv',
    )
)

In [4]:
# Features handed to ttest_2wk. The order is kept as-is because it determines
# the row labels of the returned statistics table.
columns = [
    # Tag-level features
    'a_count', 'a_href_relative', 'center_count', 'meta_count', 'p_count',
    # URL features
    'url_char_.', 'url_char_f', 'url_char_i', 'url_char_l', 'url_char_p',
    'url_char_w', 'url_char_y', 'url_char_z',
    'url_extension_.com', 'url_extensions', 'url_length',
    # HTTP header flags (ttest_2wk casts every 'http_' column to int)
    'http_header_cache-control_set_max-age',
    'http_header_content-encoding_gzip',
    'http_header_server_apache',
    'http_header_transfer-encoding_chunked',
    'http_header_vary_user-agent',
    'http_header_via_1.1',
    # Attribute-level features
    'a_href_http', 'a_href_https', 'a_href_out_of_domain',
    'form_action_http', 'iframe_src_.html', 'img_src_http',
    'js_function_.push(', 'link_href_out_of_domain', 'link_type_text/css',
    'script_async_true',
    # Totals computed inside ttest_2wk
    'total_count', 'total_href',
    'url_char_-', 'url_number_special_chars',
    # Key used by ttest_2wk to de-duplicate and index the rows
    'domain',
]

In [5]:
# Compare the later snapshot (df_0209) against the earlier one (df_0202);
# the resulting table is displayed as the cell output.
ttest_2wk(
    df_week_last=df_0202,
    df_week_current=df_0209,
    columns=columns,
)

Unnamed: 0,Feature,t-statistic,pvalue,changed
16,http_header_cache-control_set_max-age,0.653391,0.51351,0
32,total_count,0.551158,0.58153,0
0,a_count,0.363727,0.71606,0
23,a_href_https,0.358244,0.72016,0
4,p_count,0.34814,0.72774,0
27,img_src_http,0.342264,0.73215,0
24,a_href_out_of_domain,0.324243,0.74575,0
17,http_header_content-encoding_gzip,0.321953,0.74749,0
1,a_href_relative,0.321041,0.74818,0
33,total_href,0.318921,0.74979,0
