# Experiments <a name='table_of_contents'></a>

* ## Data Collection
* ## Feature Engineering
    1. [WHOIS: Keep vs Discard](#whois_keep_vs_discard)
        * [Conclusion](#whois_keep_vs_discard_conclusion)
    2. [WHOIS: KMeans vs Dummy Variable](#whois_kmeans_vs_dummy_variable)
        * [Conclusion](#whois_kmeans_vs_dummy_variable_conclusion)
    3. [WHOIS: ](#whois)
        * [Conclusion](#whois_conclusion)
    4. [Feature: Paper 2](#feature_paper_2)
        * [Conclusion](#feature_paper_2_conclusion)
    5. [Feature: Self-Defined](#feature_self_defined)
        * [Conclusion](#feature_self_defined_conclusion)
    6. [Feature: Selection](#feature_selection)
        * [Conclusion](#feature_selection_conclusion)
    7. [Feature Groups](#feature_groups)
* ## Model Selection
    1. [Tuning: Support Vector Machine](#tuning_support_vector_machine)
        * [Conclusion](#tuning_support_vector_machine_conclusion)
    2. [Tuning: Logistic Regression](#tuning_logistic_regression)
        * [Conclusion](#tuning_logistic_regression_conclusion)
    3. [Tuning: Decision Tree](#tuning_decision_tree)
        * [Conclusion](#tuning_decision_tree_conclusion)
* ## Service

In [1]:
import os
import sys
import traceback
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
from sklearn import datasets

from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

if "./modules" not in sys.path:
    sys.path.append("./modules")    

from BasicUtility import BasicUtility
from ClassifierUtility import ClassifierUtility
from ClustererUtility import ClustererUtility
from TuningUtility import TuningUtility

%matplotlib inline

### Load data

In [2]:
'''df_c = BasicUtility.readData('./dataset/feature_dirty.json')
df_c['label'] = 1

df_l = pd.concat([
    BasicUtility.readData('./dataset/feature_dirty_legitimate.json'), 
    BasicUtility.readData('./dataset/feature_dirty_momo.json')], ignore_index=True)
df_l['label'] = 0

df = df_l
df = df.loc[:, [
    'label',
    'url',
    'hostname',
    'length_of_fqdn',
    'replica_in_fqdn',
    'num_of_currencies_seen',
    'num_of_duplicate_prices_seen',
    'percent_savings',
    'contain_emails',
    'large_iframes',
    'under_a_year',
    'china_registered',
    'in_top_one_million'
]]

df.to_csv(path_or_buf='./dataset/feature_legitimate_paper1.csv', index=False)
'''

"df_c = BasicUtility.readData('./dataset/feature_dirty.json')\ndf_c['label'] = 1\n\ndf_l = pd.concat([\n    BasicUtility.readData('./dataset/feature_dirty_legitimate.json'), \n    BasicUtility.readData('./dataset/feature_dirty_momo.json')], ignore_index=True)\ndf_l['label'] = 0\n\ndf = df_l\ndf = df.loc[:, [\n    'label',\n    'url',\n    'hostname',\n    'length_of_fqdn',\n    'replica_in_fqdn',\n    'num_of_currencies_seen',\n    'num_of_duplicate_prices_seen',\n    'percent_savings',\n    'contain_emails',\n    'large_iframes',\n    'under_a_year',\n    'china_registered',\n    'in_top_one_million'\n]]\n\ndf.to_csv(path_or_buf='./dataset/feature_legitimate_paper1.csv', index=False)\n"

### Set classifiers

In [3]:
# Seed for the only estimator here whose fit is randomized by default.
RANDOM_SEED = 42

# Estimators compared throughout the notebook.
classifier_svm = svm.SVC(C=2.0)
classifier_lr = LogisticRegression()
# scikit-learn permutes candidate features at every split of a decision tree,
# so without a fixed random_state the reported scores change between runs.
classifier_dt = tree.DecisionTreeClassifier(random_state=RANDOM_SEED)

# Each estimator is wrapped in the project's ClassifierUtility, which the
# later cells use for runExperiment / doABTesting / getModelCoefficients.
utility_svm = ClassifierUtility(classifier_svm)
utility_lr = ClassifierUtility(classifier_lr)
utility_dt = ClassifierUtility(classifier_dt)

In [4]:
# Identifier columns, the target and the paper-1 features, in display order.
paper1_columns = [
    'label', 'url', 'hostname',
    'length_of_fqdn', 'replica_in_fqdn',
    'num_of_currencies_seen', 'num_of_duplicate_prices_seen',
    'percent_savings', 'contain_emails', 'large_iframes',
    'under_a_year', 'china_registered', 'in_top_one_million',
]
# Features stored as True/False that are re-encoded as 1/0 below.
boolean_columns = ['replica_in_fqdn', 'large_iframes', 'in_top_one_million']

# First 400 rows of each class -> one 800-row frame.
df_counterfeit = BasicUtility.readData('./dataset/feature_counterfeit_paper1.csv')[:400]
df_legitimate = BasicUtility.readData('./dataset/feature_legitimate_paper1.csv')[:400]
df_800 = pd.concat([df_counterfeit, df_legitimate], ignore_index=True).loc[:, paper1_columns]

# True -> 1, False -> 0; any other value (including missing) maps to NaN.
df_800 = df_800.assign(**{
    name: df_800[name].map({True: 1, False: 0}) for name in boolean_columns
})

### 1. WHOIS: Keep vs Discard <a name="whois_keep_vs_discard"></a>
> Why? 由於撈資料時這兩個欄位的missing value比例很高，而論文中WHOIS是significant的欄位  
> 驗證WHOIS資訊(**under_a_year**, **china_registered**)對model performance(**accuracy**, **precision**, **recall**)的影響
> * 如果對model影響不大則可以discard
> * 影響很大就需要選擇一個適合的方式填missing values

### Get missing value rate

In [5]:
# Everything except the target and the two identifier columns is a feature.
non_feature_columns = ['label', 'url', 'hostname']
wanted_columns = df_800.columns[~df_800.columns.isin(non_feature_columns)]
missing_value_rate = BasicUtility.getMissingValueRate(df_800, wanted_columns)
missing_value_rate

Unnamed: 0,Amount,Proportion
length_of_fqdn,0,0.0
replica_in_fqdn,0,0.0
num_of_currencies_seen,2,0.0025
num_of_duplicate_prices_seen,2,0.0025
percent_savings,7,0.00875
contain_emails,2,0.0025
large_iframes,0,0.0
under_a_year,357,0.44625
china_registered,612,0.765
in_top_one_million,0,0.0


### Ignore samples with missing values

In [6]:
# Complete cases only: a row is kept when every column (label included) is present.
row_condition = df_800.notna().all(axis=1)
df_without_missing_values = df_800.loc[row_condition, :].copy()

### Do feature scaling

In [7]:
# Scale the complete-case frame with a single helper call.
# An earlier draft (now removed) scaled only the columns other than 'label',
# 'url' and 'hostname' and concatenated those three back afterwards; since the
# whole frame is passed here, doFeatureScaling presumably leaves those three
# columns untouched itself -- TODO confirm against BasicUtility.
df_without_missing_values_scaled = BasicUtility.doFeatureScaling(df_without_missing_values)

In [8]:
utility_lr.runExperiment(df_without_missing_values_scaled)

Metric(5-fold mean),Accuracy,Accuracy,Precision,Precision,Recall,Recall,F1 Score,F1 Score
WHOIS,keep,discard,keep,discard,keep,discard,keep,discard
all,0.935673,0.780409,0.955921,0.794615,0.908497,0.727451,0.928182,0.756585
w/o length_of_fqdn,0.935673,0.780409,0.955921,0.794615,0.908497,0.727451,0.928182,0.756585
w/o replica_in_fqdn,0.935673,0.801462,0.955921,0.836282,0.908497,0.727451,0.928182,0.772271
w/o in_top_one_million,0.935673,0.769883,0.955921,0.777704,0.908497,0.727451,0.928182,0.748854
w/o num_of_currencies_seen,0.935673,0.796199,0.955921,0.823306,0.908497,0.727451,0.928182,0.768573
w/o num_of_duplicate_prices_seen,0.930409,0.737427,0.946447,0.719214,0.908497,0.738562,0.923061,0.726605
w/o percent_savings,0.930409,0.753801,0.946447,0.729762,0.908497,0.773203,0.923061,0.745572
w/o contain_emails,0.935673,0.780409,0.955921,0.794615,0.908497,0.727451,0.928182,0.756585
w/o large_iframes,0.935673,0.780409,0.955921,0.794615,0.908497,0.727451,0.928182,0.756585


### Get coefficients from logistic regression

In [9]:
logistic_regression_coefficients = utility_lr.getModelCoefficients(df_without_missing_values_scaled)

# Show each coefficient next to that feature's missing-value rate,
# largest coefficient first.
(
    pd.concat([logistic_regression_coefficients, missing_value_rate], axis=1)
    .sort_values(by='coefficients', ascending=False)
)

Unnamed: 0,coefficients,Amount,Proportion
under_a_year,3.086861,357,0.44625
china_registered,1.732457,612,0.765
percent_savings,0.87696,7,0.00875
replica_in_fqdn,0.25246,0,0.0
num_of_currencies_seen,-0.037557,2,0.0025
contain_emails,-0.143372,2,0.0025
in_top_one_million,-0.3931,0,0.0
large_iframes,-0.469681,0,0.0
length_of_fqdn,-0.552036,0,0.0
num_of_duplicate_prices_seen,-1.187039,2,0.0025


### See if other ML algorithms show the same trend

In [10]:
def WHOIS_Keep_Versus_Discard(dataframe):
    """Compare the keep/discard-WHOIS experiment across the three classifiers.

    Calls ``runExperiment(dataframe)`` on the notebook-level ``utility_lr``,
    ``utility_dt`` and ``utility_svm`` (in that order) and keeps only the
    first row of each result.

    Returns a DataFrame with one row per algorithm ("Logistic Regression",
    "Decision Tree", "SVM") whose columns are those of the SVM result.
    """
    runs = [
        (label, utility.runExperiment(dataframe))
        for label, utility in (
            ("Logistic Regression", utility_lr),
            ("Decision Tree", utility_dt),
            ("SVM", utility_svm),
        )
    ]

    # Column layout is taken from the last (SVM) result.
    _, svm_result = runs[-1]

    return pd.DataFrame(
        data=[result.values.tolist()[0] for _, result in runs],
        columns=svm_result.columns,
        index=[label for label, _ in runs],
    )

WHOIS_Keep_Versus_Discard(df_without_missing_values_scaled)

Metric(5-fold mean),Accuracy,Accuracy,Precision,Precision,Recall,Recall,F1 Score,F1 Score
WHOIS,keep,discard,keep,discard,keep,discard,keep,discard
Logistic Regression,0.935673,0.780409,0.955921,0.794615,0.908497,0.727451,0.928182,0.756585
Decision Tree,0.876316,0.763743,0.883822,0.785556,0.852288,0.705882,0.861035,0.733097
SVM,0.935673,0.790936,0.955921,0.812473,0.908497,0.727451,0.928182,0.763938


### Conclusion <a name="whois_keep_vs_discard_conclusion"></a>
[top](#table_of_contents)

### 2. WHOIS: K-means vs Dummy Variable <a name="whois_kmeans_vs_dummy_variable"></a>
> 選擇要以何種方式來填WHOIS的missing value
* Dummy Variable: 擴充欄位，當missing value rate很高時將會使得該欄位失去significance
* K-means: 用其他欄位去做分群，以每個群label 1/0的比例去填missing value，當其他欄位與WHOIS的相關性不高則填入的missing value意義不大

### Create a smaller dataset to mimic larger dataset, higher missing value rate

In [11]:
# Rows in which at least one WHOIS field is known, capped at 200 per label.
row_condition = df_800['under_a_year'].notna() | df_800['china_registered'].notna()
gb_label = df_800.loc[row_condition, :].groupby(['label'])
df_400 = gb_label.head(n=200)
missing_value_rate_400 = BasicUtility.getMissingValueRate(df_400, wanted_columns)

# Side-by-side missing-value proportions; both 'Amount' columns are dropped.
c = pd.concat([missing_value_rate_400, missing_value_rate], axis=1).drop(columns='Amount')
c.columns = pd.Index(['dataset_400', 'dataset_800'], name='Missing Value Rate')
c

Missing Value Rate,dataset_400,dataset_800
length_of_fqdn,0.0,0.0
replica_in_fqdn,0.0,0.0
num_of_currencies_seen,0.0025,0.0025
num_of_duplicate_prices_seen,0.0025,0.0025
percent_savings,0.01,0.00875
contain_emails,0.0025,0.0025
large_iframes,0.0,0.0
under_a_year,0.0,0.44625
china_registered,0.575,0.765
in_top_one_million,0.0,0.0


### Fill missing values
* Others(low missing value rate)
    * Numerical: fill with mean
    * Boolean: fill with zero

In [12]:
# (column, fill type) pairs for the features with only a few missing values.
low_missing_columns = [
    ('num_of_currencies_seen', 'numerical'),
    ('num_of_duplicate_prices_seen', 'numerical'),
    ('percent_savings', 'numerical'),
    ('contain_emails', 'categorical'),
]
columns_with_low_missing_value_rate = {
    'names': [column_name for column_name, _ in low_missing_columns],
    'types': [column_type for _, column_type in low_missing_columns],
}

# The same name/type lists are applied to both datasets.
fill_arguments = dict(
    column_names=columns_with_low_missing_value_rate['names'],
    column_types=columns_with_low_missing_value_rate['types'],
)
df_800_filled = BasicUtility.fillMissingValues(dataframe=df_800, **fill_arguments)
df_400_filled = BasicUtility.fillMissingValues(dataframe=df_400, **fill_arguments)

### Fill missing values
* WHOIS(high missing value rate)
    * Initial: dummy variable
    * Alternative: kmeans

In [13]:
def _add_whois_missing_indicators(filled_frame):
    """Return a copy of ``filled_frame`` with WHOIS missing-value indicators.

    For 'under_a_year' and 'china_registered' a companion '<name>_dummy'
    column is appended: 1 where the original value is missing, 0 otherwise.
    Every value still missing afterwards (in any column) is replaced by 0.

    The indicator is computed with ``isna()`` instead of ``Series.map`` with a
    ``None`` dictionary key, so it does not depend on how pandas matches NaN
    against dictionary keys.
    """
    with_indicators = filled_frame.copy()
    for whois_column in ('under_a_year', 'china_registered'):
        with_indicators[whois_column + '_dummy'] = (
            filled_frame[whois_column].isna().astype('int64')
        )
    return with_indicators.fillna(0)


df_800_dummy = _add_whois_missing_indicators(df_800_filled)
df_400_dummy = _add_whois_missing_indicators(df_400_filled)

### Average Silhouette Method(平均側影法): 決定K要取多少
> This measure has a range of [-1, 1]
Silhouette coefficients (as these values are referred to as)
near +1 indicate that the sample is far away from the neighboring clusters.
A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and 
negative values indicate that those samples might have been assigned to the wrong cluster.

In [14]:
# One ClustererUtility instance serves both datasets.
util_cluster = ClustererUtility()
# doDummyToClustering receives the dummy-variable frames and max_K=4. Its
# printed output lists silhouette scores for K = 2..4 and a per-cluster table
# with fill_under_a_year / fill_china_registered ratios, so the returned frames
# presumably have the WHOIS gaps filled with those per-cluster ratios --
# TODO confirm against ClustererUtility.
# NOTE(review): no seed is passed here; whether the clustering is reproducible
# depends on ClustererUtility -- verify.
df_800_kmeans = util_cluster.doDummyToClustering(df_800_dummy, max_K=4)
df_400_kmeans = util_cluster.doDummyToClustering(df_400_dummy, max_K=4)

   Silhouette Score
K                  
2          0.538354
3          0.613469
4          0.654701
               cluster_size  under_a_year  fill_under_a_year  \
cluster_label                                                  
0                        84          49.0           0.583333   
1                        63          14.0           0.222222   
2                         9           2.0           0.222222   
3                        32          20.0           0.625000   

               china_registered  fill_china_registered  
cluster_label                                           
0                          42.0               0.500000  
1                           5.0               0.079365  
2                           0.0               0.000000  
3                          20.0               0.625000  
   Silhouette Score
K                  
2          0.533106
3          0.602389
4          0.647411
               cluster_size  under_a_year  fill_under_a_year  \
cluster_l

### Do feature scaling

In [15]:
# Scale the four (dataset x fill-method) variants, in the same order as the
# target names on the left.
(
    df_800_dummy_scaled,
    df_400_dummy_scaled,
    df_800_kmeans_scaled,
    df_400_kmeans_scaled,
) = (
    BasicUtility.doFeatureScaling(unscaled)
    for unscaled in (df_800_dummy, df_400_dummy, df_800_kmeans, df_400_kmeans)
)

### Conclusion <a name="whois_kmeans_vs_dummy_variable_conclusion"></a>
[top](#table_of_contents)

In [16]:
def WHOIS_Kmeans_Versus_Dummy_Variable():
    """Compare k-means filling with dummy-variable filling of the WHOIS gaps.

    Reads the notebook-level frames ``df_400_dummy_scaled``,
    ``df_400_kmeans_scaled``, ``df_800_dummy_scaled`` and
    ``df_800_kmeans_scaled`` and evaluates each one with
    ``utility_lr.doABTesting(frame, test_all=False)``.

    Returns
    -------
    pandas.DataFrame
        Rows 'dataset_400' and 'dataset_800'; columns are a two-level index
        (metric, method) with the k-means score placed before the
        dummy-variable score for every metric.
    """
    metric_keys = ['test_accuracy', 'test_precision', 'test_recall', 'test_f1']
    col_index = pd.MultiIndex.from_product(
        iterables=[["Accuracy", "Precision", "Recall", "F1 Score"], ["k-means", "dummy variable"]],
        names=['Metric(5-fold mean)', 'Method']
    )
    row_index = ['dataset_400', 'dataset_800']

    # (dummy frame, k-means frame) per output row; the dummy variant is
    # evaluated first, as before.
    frame_pairs = [
        (df_400_dummy_scaled, df_400_kmeans_scaled),
        (df_800_dummy_scaled, df_800_kmeans_scaled),
    ]

    rows = []
    for dummy_frame, kmeans_frame in frame_pairs:
        result_dummy = utility_lr.doABTesting(dummy_frame, test_all=False)
        result_kmeans = utility_lr.doABTesting(kmeans_frame, test_all=False)

        # Interleave k-means / dummy per metric and concatenate once, instead
        # of growing a frame from an empty DataFrame column by column.
        pieces = []
        for metric in metric_keys:
            pieces.extend([result_kmeans[metric], result_dummy[metric]])
        row = pd.concat(pieces, axis=1)
        # Label the columns before stacking so the row-wise concat below works
        # on unique column labels.
        row.columns = col_index
        rows.append(row)

    df_result = pd.concat(rows, axis=0)
    df_result.index = row_index

    return df_result
'''iterables = [["Missing Value Rate"], ["under_a_year", "china_registered"]]
col_index = pd.MultiIndex.from_product(iterables=iterables)
row_index = ["dataset_400", "dataset_800"]
matrix = []

matrix.append([
    BasicUtility.getMissingRates(df_400, ['under_a_year_dummy'], is_dummy=True), 
    BasicUtility.getMissingRates(df_400, ['china_registered_dummy'], is_dummy=True)
])
matrix.append([
    BasicUtility.getMissingRates(df_800, ['under_a_year_dummy'], is_dummy=True), 
    BasicUtility.getMissingRates(df_800, ['china_registered_dummy'], is_dummy=True)
])
whois_missing_rate = pd.DataFrame(data=matrix, columns=col_index, index=row_index)
'''

'''iterables = [["Accuracy", "Precision", "Recall"], ["dummy", "clustering"]]
col_index = pd.MultiIndex.from_product(iterables=iterables)
matrix = []

matrix.append([
    exp1_400[('Accuracy', 'keep')]['all'],
    exp2_400['test_accuracy']['all'],
    exp1_400[('Precision', 'keep')]['all'],
    exp2_400['test_precision']['all'],
    exp1_400[('Recall', 'keep')]['all'],
    exp2_400['test_recall']['all']
])
matrix.append([
    exp1_800[('Accuracy', 'keep')]['all'],
    exp2_800['test_accuracy']['all'],
    exp1_800[('Precision', 'keep')]['all'],
    exp2_800['test_precision']['all'],
    exp1_800[('Recall', 'keep')]['all'],
    exp2_800['test_recall']['all']
])
comparison_of_different_filling_method = pd.DataFrame(data=matrix, columns=col_index, index=row_index)
comparison_of_different_filling_method

exp2_result = pd.concat([whois_missing_rate, comparison_of_different_filling_method], axis=1)
exp2_result'''
WHOIS_Kmeans_Versus_Dummy_Variable()

Metric(5-fold mean),Accuracy,Accuracy,Precision,Precision,Recall,Recall,F1 Score,F1 Score
Method,k-means,dummy variable,k-means,dummy variable,k-means,dummy variable,k-means,dummy variable
dataset_400,0.7675,0.9525,0.763874,0.968961,0.77,0.935,0.765892,0.951639
dataset_800,0.72,0.8325,0.716953,0.838471,0.73,0.825,0.722397,0.83119


In [17]:
def getCorrelationCoef(dataframe):
    """Return the correlation-coefficient matrix of the feature columns.

    'label', 'url' and 'hostname' are left out when present; the input frame
    is not modified. The remaining column names are printed, and the result
    is a square DataFrame labelled by those names on both axes.
    """
    features = dataframe.drop(columns=['label', 'url', 'hostname'], errors='ignore')
    print(features.columns)

    # rowvar=False: each column is a variable, each row an observation.
    coefficients = np.corrcoef(features, rowvar=False)
    return pd.DataFrame(
        data=coefficients,
        columns=features.columns,
        index=features.columns,
    )

matrix = getCorrelationCoef(df_without_missing_values_scaled)
matrix[["under_a_year", "china_registered"]]

Index(['length_of_fqdn', 'replica_in_fqdn', 'num_of_currencies_seen',
       'num_of_duplicate_prices_seen', 'percent_savings', 'contain_emails',
       'large_iframes', 'under_a_year', 'china_registered',
       'in_top_one_million'],
      dtype='object')


Unnamed: 0,under_a_year,china_registered
length_of_fqdn,0.010927,-0.104768
replica_in_fqdn,0.090794,0.141327
num_of_currencies_seen,0.366322,0.481356
num_of_duplicate_prices_seen,-0.126726,-0.112299
percent_savings,0.467246,0.494967
contain_emails,-0.06672,-0.055167
large_iframes,-0.150829,-0.124712
under_a_year,1.0,0.781837
china_registered,0.781837,1.0
in_top_one_million,-0.134535,-0.111239


### 3. WHOIS:  <a name="whois"></a>
> A: **under_a_year** is defined, B: **china_registered** is defined  
> all:  
> whois_well: A & B  
> under_a_year_undefined: ~A  
> china_registered_undefined: ~B  
> whois_undefined: (~A) & (~B) 

In [18]:
'''
def runExperiment3(dataframe):
    num_of_records = dataframe.shape[0]
    under_a_year_well = (dataframe['under_a_year_dummy'] == 0)
    china_registered_well = (dataframe['china_registered_dummy'] == 0)
    
    data = []
    columns = ['proportion of label_1', 'proportion of label_0']
    indexes = ['all', 'whois_well', 'under_a_year_undefined', 'china_registered_undefined', 'whois_undefined']
    conditions = [
        dataframe.index,
        (under_a_year_well & china_registered_well),
        ~under_a_year_well,
        ~china_registered_well,
        ((~under_a_year_well) & (~china_registered_well))
    ]
    
    for i in range(0, 5):
        condition = conditions[i]
        df_meets_condition = dataframe.loc[condition, 'label']
        counts = df_meets_condition.value_counts()
        proportion = [counts[1]/num_of_records, counts[0]/num_of_records]
        data.append(proportion)
        
        print(indexes[i])
        print(counts)
        print(proportion)
    
    
    return pd.DataFrame(data=data, columns=columns, index=indexes)

runExperiment3(df_800)
'''

"\ndef runExperiment3(dataframe):\n    num_of_records = dataframe.shape[0]\n    under_a_year_well = (dataframe['under_a_year_dummy'] == 0)\n    china_registered_well = (dataframe['china_registered_dummy'] == 0)\n    \n    data = []\n    columns = ['proportion of label_1', 'proportion of label_0']\n    indexes = ['all', 'whois_well', 'under_a_year_undefined', 'china_registered_undefined', 'whois_undefined']\n    conditions = [\n        dataframe.index,\n        (under_a_year_well & china_registered_well),\n        ~under_a_year_well,\n        ~china_registered_well,\n        ((~under_a_year_well) & (~china_registered_well))\n    ]\n    \n    for i in range(0, 5):\n        condition = conditions[i]\n        df_meets_condition = dataframe.loc[condition, 'label']\n        counts = df_meets_condition.value_counts()\n        proportion = [counts[1]/num_of_records, counts[0]/num_of_records]\n        data.append(proportion)\n        \n        print(indexes[i])\n        print(counts)\n        p

In [19]:
'''
def runExperiment4(dataframe):
    under_a_year_well = (dataframe['under_a_year_dummy'] == 0)
    china_registered_well = (dataframe['china_registered_dummy'] == 0)
    
    data = []
    columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    indexes = ['all', 'whois_well', 'under_a_year_undefined', 'china_registered_undefined', 'whois_undefined']
    conditions = [
        dataframe.index,
        (under_a_year_well & china_registered_well),
        ~under_a_year_well,
        ~china_registered_well,
        ((~under_a_year_well) & (~china_registered_well))
    ]
    
    for i in range(0, 5):
        row_condition = conditions[i]
        col_condition = (dataframe.columns != 'under_a_year_dummy') & (dataframe.columns != 'china_registered_dummy')
        df_meets_condition = dataframe.loc[row_condition, col_condition]
        
        df_testing_result = utility_svm.doABTesting(df_meets_condition, test_all=False)
        data.append(df_testing_result.values.tolist()[0])
        print(indexes[i])
        print(df_testing_result)
        print(df_testing_result.values.tolist())
        
    return pd.DataFrame(data=data, columns=columns, index=indexes)
        
runExperiment4(df_800)
'''

"\ndef runExperiment4(dataframe):\n    under_a_year_well = (dataframe['under_a_year_dummy'] == 0)\n    china_registered_well = (dataframe['china_registered_dummy'] == 0)\n    \n    data = []\n    columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']\n    indexes = ['all', 'whois_well', 'under_a_year_undefined', 'china_registered_undefined', 'whois_undefined']\n    conditions = [\n        dataframe.index,\n        (under_a_year_well & china_registered_well),\n        ~under_a_year_well,\n        ~china_registered_well,\n        ((~under_a_year_well) & (~china_registered_well))\n    ]\n    \n    for i in range(0, 5):\n        row_condition = conditions[i]\n        col_condition = (dataframe.columns != 'under_a_year_dummy') & (dataframe.columns != 'china_registered_dummy')\n        df_meets_condition = dataframe.loc[row_condition, col_condition]\n        \n        df_testing_result = utility_svm.doABTesting(df_meets_condition, test_all=False)\n        data.append(df_testing_result.val

### Conclusion <a name="whois_conclusion"></a>
This experiment only explains why we want to fill the missing WHOIS values.

[top](#table_of_contents)

### 4. Feature: Paper 2 <a name="feature_paper_2"></a>
> add features from **'Learning to Detect and Measure Fake Ecommerce Websites in
Search-Engine Results'**
>
* **has_social_media**: whether a website provides links to **Facebook**, **Instagram** and **Line** fanpage of its brand
* **has_mobile_app**: whether a website provides links to **Google Play** and **Apple Store** to download their mobile apps
* **has_payment_option**: whether a website provides links to **付款方式**
* **contain_phone_numbers**: whether a website contains contact numbers

### Load data

In [20]:
df_counterfeit_paper2 = BasicUtility.readData('./dataset/feature_counterfeit_paper2.csv')
df_legitimate_paper2 = BasicUtility.readData('./dataset/feature_legitimate_paper2.csv')
df_800_paper2 = pd.concat([df_counterfeit_paper2, df_legitimate_paper2], ignore_index=True)

### Merge old features with new ones

In [21]:
# Left-join the paper-2 features onto the scaled dummy-variable frame, keyed on
# ('url', 'hostname'). The right side is first reduced to one row per url
# (the first row of each url group).
# NOTE(review): this cell rebinds df_800_paper2, which is also its own input,
# so running it a second time merges the already-merged frame. Re-run the
# loading cell first before re-executing this one.
df_800_paper2 = pd.merge(
    left=df_800_dummy_scaled, 
    right=df_800_paper2.groupby('url').first().reset_index(), 
    on=['url', 'hostname'], 
    how='left'
)

### Get missing value rate of new features to decide how to fill their missing values

In [22]:
BasicUtility.getMissingRates(
    dataframe=df_800_paper2, 
    column_names=['has_social_media', 'has_mobile_app', 'has_payment_option', 'contain_phone_numbers']
)

Unnamed: 0,amount,proportion
has_social_media,2,0.0025
has_mobile_app,2,0.0025
has_payment_option,8,0.01
contain_phone_numbers,8,0.01


### Fill missing values

In [23]:
df_800_paper2_filled = BasicUtility.fillMissingValues(
    dataframe=df_800_paper2, 
    column_names=['has_social_media', 'has_mobile_app', 'has_payment_option', 'contain_phone_numbers'],
    column_types=['categorical', 'categorical', 'categorical', 'categorical']
)

### Do feature scaling

In [24]:
df_800_paper2_scaled = BasicUtility.doFeatureScaling(df_800_paper2_filled)

In [25]:
utility_lr.doABTesting(df_800_paper2_scaled, ['has_social_media', 'has_mobile_app', 'has_payment_option', 'contain_phone_numbers'])

Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1
all,0.90875,0.894862,0.9275,0.910528
w/o has_social_media,0.84625,0.85736,0.8325,0.843795
w/o has_mobile_app,0.91,0.896517,0.9275,0.911521
w/o has_payment_option,0.91,0.897134,0.9275,0.911643
w/o contain_phone_numbers,0.90875,0.894862,0.9275,0.910528


In [26]:
def runAll(dataframe, columns_for_testing=None):
    """Run the A/B feature test with SVM, LR and DT and put the scores side by side.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Passed unchanged to ``doABTesting`` of the notebook-level
        ``utility_svm``, ``utility_lr`` and ``utility_dt``.
    columns_for_testing : list, optional
        Forwarded as the second positional argument of ``doABTesting``.
        ``None`` (the default) is forwarded as a new empty list, which matches
        the previous ``[]`` default without sharing one list between calls.

    Returns
    -------
    pandas.DataFrame
        Columns are a two-level index (metric, algorithm) with the algorithms
        ordered SVM, LR, DT inside every metric. Rows are the rows of the
        individual ``doABTesting`` results, aligned on their index.
    """
    if columns_for_testing is None:
        columns_for_testing = []

    experiments = {
        'SVM': utility_svm.doABTesting(dataframe, columns_for_testing),
        'LR': utility_lr.doABTesting(dataframe, columns_for_testing),
        'DT': utility_dt.doABTesting(dataframe, columns_for_testing)
    }

    # Metric names are taken from the SVM result.
    metrics = experiments['SVM'].columns
    algorithms = list(experiments)
    columns = pd.MultiIndex.from_product([metrics, algorithms])

    # One concat over all (metric, algorithm) columns instead of growing an
    # initially empty DataFrame one column at a time.
    result = pd.concat(
        [experiments[algorithm][metric] for metric in metrics for algorithm in algorithms],
        axis=1
    )
    result.columns = columns

    return result

In [27]:
report = runAll(df_800_dummy_scaled)
report

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_accuracy,test_precision,test_precision,test_precision,test_recall,test_recall,test_recall,test_f1,test_f1,test_f1
Unnamed: 0_level_1,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT
all,0.81125,0.8325,0.83625,0.795996,0.838471,0.868876,0.8375,0.825,0.795,0.815967,0.83119,0.828188
w/o china_registered,0.81375,0.835,0.845,0.799945,0.83913,0.874966,0.8375,0.83,0.8075,0.818015,0.834075,0.838285
w/o contain_emails,0.81,0.82625,0.8375,0.794098,0.833165,0.873852,0.8375,0.8175,0.7925,0.814943,0.824542,0.829122
w/o in_top_one_million,0.81125,0.835,0.8325,0.795996,0.841019,0.863614,0.8375,0.8275,0.7925,0.815967,0.833685,0.824809
w/o large_iframes,0.81125,0.8325,0.84125,0.795996,0.841813,0.879186,0.8375,0.82,0.7925,0.815967,0.830185,0.83241
w/o length_of_fqdn,0.81,0.8225,0.82,0.794091,0.820757,0.870106,0.8375,0.825,0.7525,0.814962,0.822635,0.806718
w/o num_of_currencies_seen,0.8325,0.8425,0.83875,0.857988,0.861512,0.868156,0.7975,0.8175,0.8025,0.825888,0.838308,0.831849
w/o num_of_duplicate_prices_seen,0.805,0.81375,0.81875,0.786639,0.818215,0.846895,0.8375,0.8075,0.7825,0.811027,0.812412,0.811685
w/o percent_savings,0.8125,0.835,0.80875,0.797947,0.838995,0.840861,0.8375,0.83,0.7625,0.816985,0.834117,0.798759
w/o replica_in_fqdn,0.8125,0.835,0.85125,0.797947,0.842412,0.891994,0.8375,0.825,0.8,0.816985,0.833221,0.84228


In [28]:
report_paper2 = runAll(df_800_paper2_scaled, ['has_social_media', 'has_mobile_app', 'has_payment_option', 'contain_phone_numbers'])
report_paper2

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_accuracy,test_precision,test_precision,test_precision,test_recall,test_recall,test_recall,test_f1,test_f1,test_f1
Unnamed: 0_level_1,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT
all,0.90375,0.90875,0.8825,0.899682,0.894862,0.892588,0.91,0.9275,0.87,0.904286,0.910528,0.881047
w/o contain_phone_numbers,0.905,0.90875,0.8825,0.898059,0.894862,0.890909,0.915,0.9275,0.8725,0.906038,0.910528,0.881457
w/o has_mobile_app,0.905,0.91,0.88375,0.898059,0.896517,0.895251,0.915,0.9275,0.87,0.906038,0.911521,0.882361
w/o has_payment_option,0.9025,0.91,0.8825,0.897418,0.897134,0.886716,0.91,0.9275,0.8775,0.90319,0.911643,0.881912
w/o has_social_media,0.83875,0.84625,0.83375,0.843983,0.85736,0.851784,0.8325,0.8325,0.8125,0.837625,0.843795,0.829715


### Find out which social media platform is dominant

In [29]:
df_label0 = pd.concat([
    BasicUtility.readData('./dataset/social_media_momo.json'),
    BasicUtility.readData('./dataset/social_media_yahoo.json')
], ignore_index=True)
df_label1 = BasicUtility.readData('./dataset/social_media.json')

In [30]:
def _social_media_flags(frame):
    """Keep the 'fb', 'ig' and 'line' columns and add their row-wise OR."""
    flags = frame.loc[:, ['fb', 'ig', 'line']]
    flags['at_least_one_social_media'] = flags['fb'] | flags['ig'] | flags['line']
    return flags


# Legitimate sites (label 0).
df_label0 = _social_media_flags(df_label0)
proportion_label0 = df_label0.sum() / len(df_label0)

# Counterfeit sites (label 1).
df_label1 = _social_media_flags(df_label1)
proportion_label1 = df_label1.sum() / len(df_label1)

# Share of sites per class that link to each platform.
df_social_media_proportion = pd.concat(
    [proportion_label1, proportion_label0],
    axis=1,
    keys=['Counterfeit', 'Legitimate'],
)
df_social_media_proportion.index.name = 'Proportion of Links'
df_social_media_proportion.columns.name = 'Type of Website'
df_social_media_proportion

Type of Website,Counterfeit,Legitimate
Proportion of Links,Unnamed: 1_level_1,Unnamed: 2_level_1
fb,0.021792,0.74424
ig,0.007264,0.260369
line,0.062954,0.40553
at_least_one_social_media,0.077482,0.801843


### Conclusion <a name="feature_paper_2_conclusion"></a>
[top](#table_of_contents)

In [31]:
utility_lr.getModelCoefficients(df_800_paper2_scaled).sort_values(by=['coefficients'], ascending=False)

Unnamed: 0,coefficients
under_a_year,4.137645
under_a_year_dummy,1.696315
percent_savings,1.256759
china_registered,0.569836
num_of_currencies_seen,0.486588
china_registered_dummy,0.257917
replica_in_fqdn,0.163886
contain_phone_numbers,-0.053793
in_top_one_million,-0.251062
has_payment_option,-0.731565


### 5. Feature: Self-Defined <a name="feature_self_defined"></a>
> Add self-defined features; these features come from observing counterfeit websites
>
> how to quantify "simplicity"?
> * **node_counts**: number of DOM nodes in a website
> * **dom_height**: height(depth) of DOM
> * **text_tag_ratio**: text length/node counts
> * **text_length**: inner text length under html “body” element

In [32]:
df_counterfeit_self = BasicUtility.readData('./dataset/feature_counterfeit_self.csv')
df_legitimate_self = BasicUtility.readData('./dataset/feature_legitimate_self.csv')
df_800_self = pd.concat([df_counterfeit_self, df_legitimate_self], ignore_index=True)

In [33]:
# Left-join the self-defined features onto the scaled dummy-variable frame,
# keyed on ('url', 'hostname'). The right side is first reduced to one row per
# url (the first row of each url group).
# NOTE(review): this cell rebinds df_800_self, which is also its own input, so
# running it a second time merges the already-merged frame. Re-run the loading
# cell first before re-executing this one.
df_800_self = pd.merge(
    left=df_800_dummy_scaled, 
    right=df_800_self.groupby('url').first().reset_index(), 
    on=['url', 'hostname'], 
    how='left'
)

In [34]:
BasicUtility.getMissingRates(df_800_self, ['node_counts', 'dom_height', 'text_tag_ratio'])

Unnamed: 0,amount,proportion
node_counts,2,0.0025
dom_height,2,0.0025
text_tag_ratio,4,0.005


In [35]:
df_800_self_filled = BasicUtility.fillMissingValues(df_800_self, ['node_counts', 'dom_height', 'text_tag_ratio'], ['numerical', 'numerical', 'numerical'])

In [36]:
# text_tag_ratio is text length / node count, so multiplying by node_counts
# recovers the text length; the ratio column is then dropped.
# The guard keeps the cell re-runnable: once 'text_tag_ratio' has been deleted,
# a second execution would otherwise raise KeyError.
if 'text_tag_ratio' in df_800_self_filled.columns:
    df_800_self_filled['text_length'] = df_800_self_filled['node_counts'] * df_800_self_filled['text_tag_ratio']
    del df_800_self_filled['text_tag_ratio']

In [37]:
# Scale the filled frame (including the derived 'text_length') in one call.
# An earlier draft (now removed) scaled only the columns other than 'url',
# 'hostname' and 'label' in place; since the whole frame is passed here,
# doFeatureScaling presumably skips those columns itself -- TODO confirm
# against BasicUtility.
df_800_self_scaled = BasicUtility.doFeatureScaling(df_800_self_filled)

In [38]:
# Compare the metrics obtained with all features ("all") against the metrics obtained
# when each listed self-defined feature is left out in turn ("w/o <feature>").
utility_lr.doABTesting(df_800_self_scaled, ['node_counts', 'dom_height', 'text_length'])

Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1
all,0.89375,0.887552,0.9025,0.894631
w/o node_counts,0.87125,0.87081,0.8725,0.871319
w/o dom_height,0.88,0.888266,0.87,0.878291
w/o text_length,0.8975,0.89387,0.9025,0.897818


In [39]:
# Same leave-one-feature-out comparison, reported side by side for the SVM, LR and DT
# classifiers; runAll is defined earlier in the notebook.
report_self = runAll(df_800_self_scaled, ['node_counts', 'dom_height', 'text_length'])
report_self

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_accuracy,test_precision,test_precision,test_precision,test_recall,test_recall,test_recall,test_f1,test_f1,test_f1
Unnamed: 0_level_1,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT
all,0.86125,0.89375,0.89,0.877387,0.887552,0.880441,0.84,0.9025,0.9075,0.857515,0.894631,0.892367
w/o dom_height,0.8425,0.88,0.88875,0.846022,0.888266,0.86811,0.8375,0.87,0.9175,0.841563,0.878291,0.891661
w/o node_counts,0.84875,0.87125,0.85125,0.851657,0.87081,0.854594,0.845,0.8725,0.8575,0.847815,0.871319,0.852685
w/o text_length,0.87,0.8975,0.8775,0.884912,0.89387,0.859251,0.85,0.9025,0.91,0.866546,0.897818,0.882216


### Conclusion <a name="feature_self_defined_conclusion"></a>
[top](#table_of_contents)

In [40]:
# Rank the features (now including the self-defined ones) by fitted coefficient,
# most positive first.
utility_lr.getModelCoefficients(df_800_self_scaled).sort_values(by=['coefficients'], ascending=False)

Unnamed: 0,coefficients
under_a_year,4.325452
under_a_year_dummy,1.839859
num_of_currencies_seen,1.014754
china_registered,0.968805
percent_savings,0.597804
replica_in_fqdn,0.360022
china_registered_dummy,0.292282
in_top_one_million,-0.205392
text_length,-1.197715
large_iframes,-1.278468


In [41]:
# Compare the distribution of text_length between the two labels, using the filled
# but not yet scaled frame so the numbers are in the original units.
gb_text = df_800_self_filled.loc[:, ['label', 'text_length']].groupby(['label'])
print(gb_text.mean())
print(gb_text.std())
# The gap between the two class means is well under one within-class standard
# deviation, so text_length on its own separates the classes poorly.

       text_length
label             
0      1605.298544
1      1171.723240
       text_length
label             
0      1858.128295
1      1411.968798


### 6. Feature Selection <a name="feature_selection"></a>
> * 所有feature放在一起train
> * 以coef的絕對值排序刪除feature
> * 到n_sample:n_feature=30:1

In [42]:
# Put the three structural features next to the paper-2 feature frame.
# The column-wise concat aligns rows on the index.
# NOTE(review): assumes both frames share the same row index — confirm.
structure_features = df_800_self_scaled.loc[:, ['node_counts', 'dom_height', 'text_length']]
df_800_all_scaled = pd.concat([df_800_paper2_scaled, structure_features], axis=1)
runAll(df_800_all_scaled)

Unnamed: 0_level_0,test_accuracy,test_accuracy,test_accuracy,test_precision,test_precision,test_precision,test_recall,test_recall,test_recall,test_f1,test_f1,test_f1
Unnamed: 0_level_1,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT,SVM,LR,DT
all,0.91,0.915,0.89375,0.906669,0.908362,0.89092,0.915,0.925,0.9,0.910388,0.916092,0.894724
w/o china_registered,0.90625,0.91375,0.87,0.902094,0.906163,0.873093,0.9125,0.925,0.8675,0.906773,0.914973,0.869415
w/o contain_emails,0.9075,0.915,0.885,0.900407,0.908362,0.879409,0.9175,0.925,0.8925,0.908493,0.916092,0.885725
w/o contain_phone_numbers,0.9075,0.915,0.8825,0.900407,0.908362,0.890957,0.9175,0.925,0.8725,0.908493,0.916092,0.88134
w/o dom_height,0.90875,0.9125,0.88,0.904391,0.897713,0.88202,0.915,0.9325,0.8775,0.909256,0.914407,0.879606
w/o has_mobile_app,0.90875,0.91125,0.88625,0.902385,0.907331,0.893849,0.9175,0.9175,0.8775,0.909561,0.911996,0.884983
w/o has_payment_option,0.9075,0.9175,0.8925,0.900407,0.912496,0.90032,0.9175,0.925,0.885,0.908493,0.918298,0.891601
w/o has_social_media,0.8625,0.89375,0.88375,0.8745,0.88749,0.869667,0.8475,0.9025,0.91,0.859963,0.89448,0.88763
w/o in_top_one_million,0.90875,0.91625,0.89125,0.902385,0.908461,0.901801,0.9175,0.9275,0.88,0.909561,0.917431,0.8897
w/o large_iframes,0.90875,0.915,0.88625,0.902385,0.908362,0.888389,0.9175,0.925,0.885,0.909561,0.916092,0.885909


### Sort absolute values of coefficients to eliminate undesired features

In [43]:
# Magnitude of every fitted coefficient, largest first; the sign is discarded because
# only the strength of a feature's influence matters for elimination.
absolute_coefficients = (
    utility_lr.getModelCoefficients(df_800_all_scaled)
    .abs()
    .sort_values(by=['coefficients'], ascending=False)
    .rename(columns={'coefficients': 'Absolute Coefficients'})
)
absolute_coefficients

Unnamed: 0,Absolute Coefficients
under_a_year,3.953647
node_counts,2.692064
has_social_media,2.455635
dom_height,1.942607
has_mobile_app,1.719297
under_a_year_dummy,1.579204
length_of_fqdn,1.323884
num_of_duplicate_prices_seen,1.252205
percent_savings,1.224316
text_length,0.990876


In [44]:
# For each cut-off, drop every feature whose absolute coefficient is below it,
# cross-validate the remaining features, and tabulate mean/std of recall and F1
# (train and test) per cut-off.
coefficient_thresholds = [0.5, 1.0, 1.5, 2.0]
utility_used = utility_lr
reported_metrics = ['test_recall', 'train_recall', 'test_f1', 'train_f1']

# One summary Series per threshold; collected in a list and concatenated once after
# the loop instead of growing a DataFrame on every iteration.
cv_summaries = []
for threshold in coefficient_thresholds:
    columns_below_threshold = absolute_coefficients['Absolute Coefficients'] < threshold
    columns_to_eliminate = columns_below_threshold[columns_below_threshold].index.tolist()

    df_800_drop_scaled = df_800_all_scaled.drop(labels=columns_to_eliminate, axis=1)
    cv_result = utility_used.doCrossValidation(df_800_drop_scaled)
    # Aggregate the per-fold scores, keep the reported metrics, and flatten to a
    # (metric, statistic)-indexed Series.
    cv_result = cv_result.T.apply(['mean', 'std']).loc[:, reported_metrics]
    cv_summaries.append(cv_result.T.stack())

# keys= labels each column with the threshold that produced it.
df_feature_selection = pd.concat(cv_summaries, axis=1, keys=coefficient_thresholds)
df_feature_selection.columns.name = 'Coefficient Thresholds'
df_feature_selection

Unnamed: 0,Coefficient Thresholds,0.5,1.0,1.5,2.0
test_recall,mean,0.9275,0.9325,0.925,0.9475
test_recall,std,0.024044,0.034911,0.038528,0.034686
train_recall,mean,0.929375,0.93375,0.925,0.9475
train_recall,std,0.007526,0.007126,0.009632,0.008672
test_f1,mean,0.918433,0.918725,0.912487,0.882014
test_f1,std,0.022008,0.031099,0.034586,0.039453
train_f1,mean,0.920458,0.921083,0.913574,0.880905
train_f1,std,0.006609,0.006414,0.007561,0.008333


### Conclusion <a name="feature_selection_conclusion"></a>
[top](#table_of_contents)

### 7. Feature Groups <a name="feature_groups"></a>
* 簡單說明Paper2的其他feature是因為missing value rate高所以不用,Paper1的則是因為和他們自己有預設brand name
* Define the meaning of the feature and how you get it clearly e.g. has_mobile_app
* Categorical: multiple classes, Boolean: 0-1 class

[top](#table_of_contents)

In [45]:
# Catalogue of the features used by the models, one record per feature:
# (group, feature name, value type, meaning). Keeping all four fields in one record
# prevents the index and the description lists from drifting out of step.
feature_catalog = [
    ('URL', 'length_of_fqdn', 'Numerical', 'length of fully qualified domain name'),
    ('URL', 'replica_in_fqdn', 'Boolean', 'whether keyword "replica" appears in fqdn'),
    ('Content Element', 'num_of_currencies_seen', 'Numerical', 'how many kind of currencies shown in a website'),
    ('Content Element', 'num_of_duplicate_prices_seen', 'Numerical', 'the maximum number of duplicate prices seen in a website'),
    ('Content Element', 'percent_savings', 'Numerical', 'number of percent off'),
    ('Content Element', 'contain_emails', 'Boolean', 'whether a website contains email'),
    ('Content Element', 'contain_phone_numbers', 'Boolean', 'whether a website contains contact numbers'),
    ('Content Element', 'has_social_media', 'Boolean', 'links to **Facebook**, **Instagram** and **Line** fanpage of their brands'),
    ('Content Element', 'has_mobile_app', 'Boolean', 'links to Google Play and Apple Store'),
    ('Content Element', 'has_payment_option', 'Boolean', 'whether a website has links to "付款方式"'),
    ('Content Element', 'large_iframes', 'Boolean', 'whether a website contains large iframe'),
    ('Content Structure', 'node_counts', 'Numerical', 'number of nodes in DOM'),
    ('Content Structure', 'dom_height', 'Numerical', 'height of DOM tree'),
    ('Content Structure', 'text_length', 'Numerical', 'text length under body element'),
    ('WHOIS', 'in_top_one_million', 'Boolean', 'whether a website is in magestic million domain rankings'),
    ('WHOIS', 'china_registered', 'Boolean', 'registration country is china'),
    ('WHOIS', 'under_a_year', 'Boolean', 'registration age is under a year'),
]

# Row labels: (Group, Name) pairs in catalogue order.
indexes = pd.MultiIndex.from_tuples(
    [(group, name) for group, name, _kind, _meaning in feature_catalog],
    names=['Group', 'Name'],
)
columns = ['Type', 'Meaning']
# Row values: [Type, Meaning] in the same order as the index.
feature_spec = [[kind, meaning] for _group, _name, kind, meaning in feature_catalog]
pd.DataFrame(data=feature_spec, index=indexes, columns=columns)

Unnamed: 0_level_0,Unnamed: 1_level_0,Type,Meaning
Group,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
URL,length_of_fqdn,Numerical,length of fully qualified domain name
URL,replica_in_fqdn,Boolean,"whether keyword ""replica"" appears in fqdn"
Content Element,num_of_currencies_seen,Numerical,how many kind of currencies shown in a website
Content Element,num_of_duplicate_prices_seen,Numerical,the maximum number of duplicate prices seen in...
Content Element,percent_savings,Numerical,number of percent off
Content Element,contain_emails,Boolean,whether a website contains email
Content Element,contain_phone_numbers,Boolean,whether a website contains contact numbers
Content Element,has_social_media,Boolean,"links to **Facebook**, **Instagram** and **Lin..."
Content Element,has_mobile_app,Boolean,links to Google Play and Apple Store
Content Element,has_payment_option,Boolean,"whether a website has links to ""付款方式"""


In [46]:
# Apply the 1.0 cut-off: discard every feature whose absolute coefficient is below it
# and hand the reduced frame to the tuning helper used in the sections below.
columns_below_threshold = absolute_coefficients['Absolute Coefficients'] < 1.0
columns_to_eliminate = columns_below_threshold[columns_below_threshold].index.tolist()

df_800_drop_scaled = df_800_all_scaled.drop(columns=columns_to_eliminate)
utility_tuning = TuningUtility(df_800_drop_scaled)

### 1. Tuning: Support Vector Machine <a name="tuning_support_vector_machine"></a>
* Parameters:
    1. C: penalty parameter of the error term; larger values are prone to overfitting, smaller values to underfitting, default=1.0
        * tradeoff between training error and flatness -> larger C, smaller training error
    2. kernel: default='rbf'
        * 'linear'
        * 'polynomial'
            * degree: default=3
            * gamma: default=1/n_features
            * coef0: default=0
        * 'rbf(Gaussian)'
            * gamma
        * 'sigmoid'
            * gamma
            * coef0

In [47]:
# Evaluate the SVM on the reduced feature set (utility_tuning was built from
# df_800_drop_scaled) for each kernel / C combination listed in the result table.
utility_tuning.tuneSVMParameters()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC,Stability,Generalization
kernel,C,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rbf,0.25,0.855,0.823272,0.9075,0.862589,0.96425,0.030104,0.0325
rbf,0.5,0.90625,0.904285,0.91,0.906529,0.966812,0.040889,0.03375
rbf,1.0,0.90625,0.898192,0.9175,0.907392,0.968156,0.032596,0.03125
rbf,2.0,0.91,0.898572,0.925,0.911355,0.970563,0.038528,0.0375
rbf,4.0,0.91,0.898572,0.925,0.911355,0.971812,0.038528,0.0375
linear,0.25,0.89375,0.902122,0.885,0.892674,0.97,0.042757,0.0325
linear,0.5,0.905,0.898059,0.915,0.906038,0.971781,0.032355,0.0325
linear,1.0,0.91,0.898572,0.925,0.911355,0.973406,0.038528,0.0375
linear,2.0,0.91,0.898572,0.925,0.911355,0.973656,0.038528,0.036875
linear,4.0,0.9075,0.900656,0.9175,0.908585,0.973469,0.032596,0.02875


### Conclusion <a name="tuning_support_vector_machine_conclusion"></a>
[top](#table_of_contents)

### 2. Tuning: Logistic Regression <a name="tuning_logistic_regression"></a>
* Parameters:
    1. C: inverse of regularization strength; the smaller C, the stronger the regularization, default=1.0
    2. solver: algorithm to use in the optimization problem, default='liblinear'
        * small datasets use 'liblinear' and large ones use 'sag' & 'saga'
        * 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty
    3. penalty: norm used in penalization, default='l2'
    4. max_iter: maximum number of iterations taken for the solvers to converge, default=100

In [48]:
# Evaluate logistic regression on the reduced feature set for each penalty / C
# combination listed in the result table.
utility_tuning.tuneLogisticRegressionParameters()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC,Stability,Generalization
penalty,C,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
l2,0.25,0.91,0.900663,0.9225,0.911151,0.968719,0.035795,0.03375
l2,0.5,0.91375,0.903204,0.9275,0.915,0.971,0.028504,0.0275
l2,1.0,0.9175,0.905727,0.9325,0.918725,0.971594,0.034911,0.035
l2,2.0,0.91875,0.907916,0.9325,0.919901,0.972687,0.028777,0.02875
l2,4.0,0.9225,0.912539,0.935,0.923509,0.973906,0.025617,0.02625
l1,0.25,0.91,0.892979,0.9325,0.912144,0.965297,0.034911,0.03625
l1,0.5,0.91625,0.903558,0.9325,0.917621,0.971281,0.034911,0.035
l1,1.0,0.91625,0.907953,0.9275,0.917307,0.973906,0.028504,0.0275
l1,2.0,0.92625,0.91987,0.935,0.927053,0.975,0.02054,0.02
l1,4.0,0.9225,0.915092,0.9325,0.923402,0.975875,0.022707,0.023125


### Conclusion <a name="tuning_logistic_regression_conclusion"></a>
[top](#table_of_contents)

### 3. Tuning: Decision Tree <a name="tuning_decision_tree"></a>
* Decision trees tend to overfit on data with a large number of features
* Balance your dataset before training to prevent the tree from being biased toward the classes that are dominant
* Parameters:
    1. criterion: default='gini', 'entropy'
    2. splitter: default='best', 'random' for large number of samples
    3. max_features: default=None, 'sqrt' for n_features > 50
    4. max_depth: default=None, try max_depth=3 as an initial tree depth
    5. min_samples_split: default=2
    6. min_samples_leaf: default=1, try min_samples_leaf=5 as an initial value

In [49]:
# Evaluate the decision tree on the reduced feature set for each
# max_depth / min_samples_leaf combination listed in the result table.
utility_tuning.tuneDecisionTreeParameters()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC,Stability,Generalization
max_depth,min_samples_leaf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1,0.85125,0.900757,0.8075,0.842497,0.917062,0.120416,0.05125
3,5,0.84625,0.895423,0.8025,0.837898,0.930937,0.109473,0.056875
3,10,0.85625,0.900261,0.815,0.847726,0.930719,0.11706,0.045625
4,1,0.895,0.931861,0.8575,0.891375,0.913047,0.030104,0.036875
4,5,0.88875,0.927237,0.85,0.884452,0.937578,0.05,0.040625
4,10,0.90375,0.942586,0.8625,0.899761,0.939516,0.030619,0.02875
5,1,0.90125,0.909314,0.8925,0.900097,0.919766,0.048894,0.044375
5,5,0.88875,0.898529,0.88,0.887254,0.943312,0.065907,0.05375
5,10,0.90375,0.915699,0.8925,0.902343,0.944063,0.057009,0.034375
6,1,0.9,0.916956,0.8825,0.898773,0.919016,0.022707,0.05375


### Conclusion <a name="tuning_decision_tree_conclusion"></a>
[top](#table_of_contents)

In [50]:
# Disabled one-off scripts, kept as string literals so they do not execute.
# They record how the CSV inputs were produced from the scraped JSON files:
#   1st string: feature_self_.json   -> feature_counterfeit_self.csv (or the legitimate variant)
#   2nd string: feature_paper2.json  -> feature_counterfeit_paper2.csv
# Because the second string is the last expression in the cell, it is echoed as the cell output.
# NOTE(review): both scripts call a bare readData(); elsewhere the notebook uses
# BasicUtility.readData — adjust before re-enabling.
'''
#df = pd.concat([readData('./dataset/feature_self_yahoo_.json'), readData('./dataset/feature_self_momo_.json')], ignore_index=True)
df = readData('./dataset/feature_self_.json')
df = df.drop(columns=['in_top_one_million', 'length_of_fqdn', 'replica_in_fqdn'])
df = df.loc[:, ['url', 'hostname', 'node_counts', 'dom_height', 'text_tag_ratio']]
#df.to_csv(path_or_buf='./dataset/feature_legitimate_self.csv', index=False)
df.to_csv(path_or_buf='./dataset/feature_counterfeit_self.csv', index=False)
'''
'''
#df = pd.concat([readData('./dataset/feature_paper2_yahoo.json'), readData('./dataset/feature_paper2_momo.json')], ignore_index=True)
df = readData('./dataset/feature_paper2.json')
df = df.drop(columns=['in_top_one_million', 'length_of_fqdn', 'replica_in_fqdn', 'label'])
df = df.loc[:, ['url', 'hostname', 'contain_phone_numbers', 'has_mobile_app', 'has_social_media', 'has_payment_option']]
field = 'has_mobile_app'
df[field] = df[field].map({True: 1, False: 0})
field = 'has_social_media'
df[field] = df[field].map({True: 1, False: 0})
df.to_csv(path_or_buf='./dataset/feature_counterfeit_paper2.csv', index=False)
'''

"\n#df = pd.concat([readData('./dataset/feature_paper2_yahoo.json'), readData('./dataset/feature_paper2_momo.json')], ignore_index=True)\ndf = readData('./dataset/feature_paper2.json')\ndf = df.drop(columns=['in_top_one_million', 'length_of_fqdn', 'replica_in_fqdn', 'label'])\ndf = df.loc[:, ['url', 'hostname', 'contain_phone_numbers', 'has_mobile_app', 'has_social_media', 'has_payment_option']]\nfield = 'has_mobile_app'\ndf[field] = df[field].map({True: 1, False: 0})\nfield = 'has_social_media'\ndf[field] = df[field].map({True: 1, False: 0})\ndf.to_csv(path_or_buf='./dataset/feature_counterfeit_paper2.csv', index=False)\n"

### Try model selection with GridSearchCV

In [51]:
# Let the tuning helper search the SVM parameters automatically, then evaluate the
# selected estimator on the reduced feature set (test_all=False: only the "all" row).
# NOTE(review): the recorded run picked {'kernel': 'poly', 'C': 0.25}, which scores
# ~0.60 accuracy with 0.99 recall — far below the manually tuned SVMs above. Check
# which scoring metric tuneParameters optimizes before trusting this selection.
test = utility_tuning.tuneParameters(algorithm='SVM')
print(test.best_params_)
utility_test = ClassifierUtility(classifier=test.best_estimator_)
utility_test.doABTesting(df_800_drop_scaled, test_all=False)

{'kernel': 'poly', 'C': 0.25}


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1
all,0.59875,0.555846,0.99,0.711774


### Export model by using pickle

In [52]:
import pickle
from collections import OrderedDict

# Split the reduced frame into its label and feature parts.
label_and_feature = BasicUtility.splitLabelFeature(df_800_drop_scaled)

# Estimators to export, keyed by the short name used for the output file.
# NOTE(review): DecisionTreeClassifier is created without random_state, so the exported
# tree may differ slightly between runs — consider fixing a seed.
classifiers = OrderedDict((
    ('LR', LogisticRegression(penalty='l2', C=4.0)),
    ('DT', tree.DecisionTreeClassifier(max_depth=5, min_samples_leaf=1)),
    ('SVM', svm.SVC(kernel='rbf', C=2.0))
))

# Make sure the output directory exists so a fresh checkout does not fail on open().
model_dir = "./model"
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# The training data is the same for every estimator, so convert it once.
training_features = label_and_feature['feature'].values.tolist()
training_labels = label_and_feature['label']

for key, classifier in classifiers.items():
    # Fit on the full data set, then persist the fitted estimator with pickle.
    classifier.fit(training_features, training_labels)

    file_path = model_dir + "/" + key + ".sav"
    print(file_path)
    with open(file_path, 'wb') as f:
        pickle.dump(classifier, f)

./model/LR.sav
./model/DT.sav
./model/SVM.sav


In [53]:
# Reload the exported decision tree and check that it still evaluates as expected.
file_name = './model/DT.sav'
# Use a context manager so the file handle is closed after loading.
# Only unpickle files written by this notebook: pickle.load can execute arbitrary code.
with open(file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
utility_test.classifier = loaded_model
utility_test.doABTesting(df_800_drop_scaled)

Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1
all,0.9,0.90722,0.8925,0.899004
w/o length_of_fqdn,0.89875,0.910786,0.885,0.896748
w/o num_of_duplicate_prices_seen,0.89,0.899285,0.8825,0.888819
w/o percent_savings,0.9025,0.910212,0.895,0.901699
w/o under_a_year,0.87125,0.894477,0.8425,0.866956
w/o has_mobile_app,0.9,0.905315,0.895,0.899415
w/o has_social_media,0.89125,0.948214,0.83,0.884115
w/o node_counts,0.885,0.90913,0.8625,0.882472
w/o dom_height,0.9,0.909236,0.89,0.898668
