# Swordphish Testing

Python notebook that shows users how the Swordphish features can be used and how the API testing tool works.


**Requirements:**

* Python >= 3.5
* tldextract >= 2.0.1
* pandas >= 0.18.0
* numpy >= 1.10.4
* scikit-learn >= 0.17.1 (imported as `sklearn`)


In [None]:
import math
import os
import time

import numpy as np
import pandas as pd
from sklearn import metrics

import extract_urls as urlext
import swordphish_api as sp

# Swordphish endpoint URL (not referenced directly by the cells in this notebook).
SWORDPHISH_API = 'https://api.easysol.io/swordphish/'
# API key handed to sp.call_swordphish. Export SWORDPHISH_APIKEY in the
# environment to use your own key without editing (and committing) the
# notebook; the literal below is only the fallback when the variable is
# unset or empty.
SWORDPHISH_APIKEY = os.environ.get('SWORDPHISH_APIKEY') or 'df05a8fa48a0418286e5a5b075063b28' # Please specify your API KEY
# Directory that holds the sample csv files read by the cells below.
SAMPLE_DIRECTORY = 'sample/'

### These are all the different options the user has to extract the urls

In [None]:
# The three options below are alternatives shown for illustration: options 1
# and 2 both assign url_array (the second overwrites the first), and the later
# cells of this notebook work from file_content produced by option 3.

# 1. Default extraction of a csv file
# This option reads the file and extracts the first column that contains urls
url_array = urlext.extract_urls_default(SAMPLE_DIRECTORY)

# 2. Override the default and choose the column
# In this option the user chooses the column to be extracted (here: 1)
url_array = urlext.extract_urls_override(SAMPLE_DIRECTORY, 1)

# 3. Users can extract the csv file themselves
# Manually read the csv with pandas and name its two columns
file_content = pd.read_csv(SAMPLE_DIRECTORY + 'combined.csv')
file_content.columns = ['url','classification']
# Preview 10 randomly sampled rows (fixed seed so the preview is reproducible)
file_content.sample(10, random_state=42)

### Swordphish can process only 1000 urls at a time. If the array of urls is longer than that, we need to send the urls in batches of at most 1000

In [None]:
# Cut the url column into ceil(length / 1000) nearly equal batches so that
# no single request carries more than 1000 urls.
length = file_content.shape[0]
print("Number of urls being test: " + str(length))
url_array = file_content[['url']]
index = np.array_split(np.arange(0, length), math.ceil(length / 1000))
# One plain Python list of urls per batch of row positions
final_array = [url_array['url'].iloc[rows].tolist() for rows in index]

### Now we call Swordphish once per batch and we count the time it takes to run all the queries

In [None]:
# Send one request per batch and measure the wall-clock time of the whole run.
final_results = []
start_time = time.time()  # starts counting time
for batch in final_array:
    params = {"urlArray": batch, "force_clf": False}
    results = sp.call_swordphish(SWORDPHISH_APIKEY, params)  # calls Swordphish
    final_results.extend(results)  # accumulate the rows of every batch
elapsed_seconds = time.time() - start_time  # ends the counter
sphish_time = round(elapsed_seconds * 1000, 2)  # total time in milliseconds
avg_query_time = round(sphish_time / length, 2)  # average milliseconds per url

print("** SWORDPHISH PROCESS TIMING ** ")
print("-- Total time elapsed:     ", sphish_time, "ms", sep="")
print("-- Average time per query: ", avg_query_time, "ms", sep="")

## Now we can see the results for each of the different calculations:
### 1. Phishing

In [None]:
# Ask swordphish_api for the PHISHING statistics of the collected results and
# print them.
# NOTE(review): 2 is presumably the position of the phishing score inside each
# result row (url, rank, phishing, dga, malware) - confirm in swordphish_api.
phishing_stats = sp.calculate_stats("PHISHING", 2, final_results)
print(phishing_stats)

### 2. DGA

In [None]:
# Ask swordphish_api for the DGA statistics of the collected results and print
# them.
# NOTE(review): 3 is presumably the position of the DGA score inside each
# result row - confirm in swordphish_api.
dga_stats = sp.calculate_stats("DGA", 3, final_results)
print(dga_stats)

### 3. Malware 

In [None]:
# Ask swordphish_api for the MALWARE statistics of the collected results and
# print them.
# NOTE(review): 4 is presumably the position of the malware score inside each
# result row - confirm in swordphish_api.
malware_stats = sp.calculate_stats("MALWARE", 4, final_results)
print(malware_stats)

### Results comparison:

In [None]:
# Tabulate the output of sp.classify; its last column carries the binary
# phishing prediction derived from the phishing score.
column_names = ['URL', 'Rank', 'Phishing Score', 'DGA Score',
                'Malware Score', 'Phishing Prediction']
results = pd.DataFrame(sp.classify(final_results), columns=column_names)
# Attach the ground-truth labels that were read from the csv
results['Phishing True Label'] = file_content['classification'].tolist()
# Preview 10 randomly sampled rows (fixed seed)
results.sample(10, random_state=42)

In [None]:
# Confusion matrix of the ground truth (rows) against the predictions (columns)
y_true = results['Phishing True Label']
y_pred = results['Phishing Prediction']
res = metrics.confusion_matrix(y_true, y_pred)
# Label rows and columns so the table is readable in the notebook output
pd.DataFrame(
    res,
    index=['Actual Ham', 'Actual Phishing'],
    columns=['Predicted Ham', 'Predicted Phishing'],
)

In [None]:
# One-column table holding the headline classification metrics, one row each.
res_df = pd.DataFrame(columns=['Statistic'])

y_true = results['Phishing True Label']
y_pred = results['Phishing Prediction']
scorers = (
    ('Accuracy', metrics.accuracy_score),
    ('Recall', metrics.recall_score),
    ('Precision', metrics.precision_score),
    ('F1-Score', metrics.f1_score),
)
# Rows are appended in the order listed above
for stat_name, scorer in scorers:
    res_df.loc[stat_name] = scorer(y_true, y_pred)
res_df

#### Finally we can create a csv file that contains all the results

In [None]:
# Write the scored rows to a csv file.
# `final_results` is a plain list (built with `+=` above) and has no `.values`,
# so the tabular data is taken from the `results` DataFrame instead, limited to
# the six columns that the next cell names when it reads the file back.
# NOTE(review): assumes create_csv lives in swordphish_api (imported as sp) and
# expects a list of rows - confirm against that module.
csv_columns = ['URL', 'Rank', 'Phishing Score', 'DGA Score',
               'Malware Score', 'Phishing Prediction']
sp.create_csv(results[csv_columns].values.tolist(), 'sample')

In [None]:
# Load the results file back; it is read without a header row, so the column
# names are supplied here and the first column is used as the index.
csv_names = ['url', 'rank', 'phishing', 'dga', 'malware', 'classification']
results_csv = pd.read_csv(
    'swordphish_sample_results.csv',
    header=None,
    names=csv_names,
    index_col=0,
)
# Show the first five rows
results_csv.head(5)

### We can select what results we want to see, such as phishing results:

In [None]:
# Keep only the url and its phishing score, then preview 10 random rows
wanted_columns = ['url', 'phishing']
phish_res = results_csv.loc[:, wanted_columns]
phish_res.sample(10, random_state=42)

#### This whole process can be done as well by extracting the domains from the urls

In [None]:
# Read only the first csv column (the urls) as a plain Python list
url_array = pd.read_csv(SAMPLE_DIRECTORY + 'combined.csv', usecols=[0]).values.T.tolist()[0]
# The helper must be reached through its module alias: the notebook only does
# `import extract_urls as urlext`, so a bare `extract_domains` is a NameError.
# NOTE(review): assumes extract_domains is defined in extract_urls - confirm.
domain_array = urlext.extract_domains(url_array)
# Wrap the extracted domains in a single-column DataFrame
domain_array = pd.DataFrame(domain_array)
domain_array.columns = ['domain']
# Preview 10 randomly sampled domains (fixed seed)
domain_array.sample(10, random_state=42)

#### Now we can run the whole process again, but using the domains

In [None]:
# Batch the extracted domains (not the original urls) for the second run.
# np.array_split is used because np.split raises unless the length divides
# evenly into the requested number of sections; this also matches the url
# batching cell earlier in the notebook.
final_array = []
length = domain_array.shape[0]
index = np.array_split(np.arange(0, length), math.ceil(length / 1000))
for index_ in index:
    # One plain Python list of at most 1000 domains per batch
    final_array.append(domain_array['domain'].iloc[index_].tolist())
    

In [None]:
# Second timed run: one request per batch, this time with force_clf enabled.
start_time = time.time()  # starts counting time
final_results = []
for batch in final_array:
    params = {
      "urlArray": batch,
      "force_clf": True
    }
    # Called through the `sp` alias: the notebook only does
    # `import swordphish_api as sp`, so a bare call_swordphish is a NameError.
    results = sp.call_swordphish(SWORDPHISH_APIKEY, params)  # calls Swordphish
    final_results += results
sphish_time = round((time.time() - start_time) * 1000, 2)  # ends the counter (milliseconds)
avg_query_time = round(sphish_time / length, 2)  # calculates average time per query
print("** SWORDPHISH PROCESS TIMING ** ")
print("-- Total time elapsed:     " + str(sphish_time) + "ms")
print("-- Average time per query: " + str(avg_query_time) + "ms")

In [None]:
# Print the PHISHING, DGA and MALWARE statistics for the second run.
# calculate_stats is reached through the `sp` alias, as in the first half of
# the notebook; the bare name is never imported and raises NameError.
# NOTE(review): 2, 3 and 4 are presumably the positions of the respective
# scores inside each result row - confirm in swordphish_api.
phishing_stats = sp.calculate_stats("PHISHING", 2, final_results)
print(phishing_stats)
dga_stats = sp.calculate_stats("DGA", 3, final_results)
print(dga_stats)
malware_stats = sp.calculate_stats("MALWARE", 4, final_results)
print(malware_stats)