# Get Unique Sites

Goal: Identify the unique platforms among the listed sites, so that we can group look-alike sites together and narrow down who each group is paying.

In [16]:
import numpy as np
import pandas as pd
import requests
from scipy.stats import ttest_ind
import pickle as pkl
from tqdm.notebook import tqdm as tqdm

In [2]:
# Load the site table from disk and keep only its 'Domain' column,
# which is what the crawl loop iterates over.
T = pd.read_csv(filepath_or_buffer="ScamIPs.csv")
siteNames = T.loc[:, 'Domain']

In [3]:
# calculate the kl divergence
def kl_divergence(p, q):
    """Return sum_i p[i] * log2(p[i] / q[i]) for two equal-length sequences.

    The inputs are used exactly as given: they are NOT normalised to sum
    to 1, so for raw counts this is an un-normalised, KL-style score whose
    magnitude grows with the size of the inputs.

    Entries where ``p[i] == 0`` contribute 0 (the usual ``0 * log 0 = 0``
    convention) instead of turning the whole result into ``nan``.

    Parameters
    ----------
    p, q : sequence of numbers (list or numpy array)
        Must have the same length.

    Returns
    -------
    float
        The score in bits. ``inf`` if some ``q[i] == 0`` while ``p[i] > 0``.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` differ in shape.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if p.shape != q.shape:
        raise ValueError(
            "p and q must have the same shape, got %s and %s" % (p.shape, q.shape)
        )

    # Skip zero-probability terms: 0 * log2(0) would otherwise evaluate to nan.
    nonzero = p != 0
    return float(np.sum(p[nonzero] * np.log2(p[nonzero] / q[nonzero])))

In [4]:
def CreateVector(vocabulary, wordDict, fill=0.01):
    """Build a dense count vector for ``wordDict`` over ``vocabulary``.

    Parameters
    ----------
    vocabulary : sequence
        Ordered keys; position ``i`` of the result corresponds to
        ``vocabulary[i]``.
    wordDict : dict
        Maps a key to its numeric count.
    fill : float, optional
        Value used for keys of ``vocabulary`` that are missing from
        ``wordDict``. Defaults to 0.01, a small non-zero value so that a
        later division / logarithm on the vector does not hit zero.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(vocabulary)``.
    """
    # dict.get supplies the default for missing keys explicitly, instead of
    # relying on a bare ``except`` that would also hide unrelated errors.
    return np.array([wordDict.get(term, fill) for term in vocabulary], dtype=float)

In [5]:
def GetHist(text):
    """Count how many times each element (character) occurs in ``text``.

    Returns a dict mapping each distinct element to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}

    for symbol in text:
        if symbol in counts:
            counts[symbol] += 1
        else:
            counts[symbol] = 1

    return counts

In [6]:
def CompareSites(text1, text2, threshold=100):
    """Decide whether two page texts look like the same site.

    Each text is reduced to a character histogram; both histograms are laid
    out over their shared, sorted character vocabulary and scored with
    ``kl_divergence(vector1, vector2)``.

    Notes
    -----
    * The vectors hold raw character counts (not normalised frequencies),
      so the score scales with page length.
    * The score is asymmetric: swapping ``text1`` and ``text2`` can give a
      different value.

    Parameters
    ----------
    text1, text2 : str
        Page contents to compare.
    threshold : float, optional
        Two pages count as the same site when ``abs(score) < threshold``.
        Defaults to 100.

    Returns
    -------
    (bool, float)
        ``(True, score)`` when the pages are considered the same site,
        ``(False, score)`` otherwise.
    """
    hist1 = GetHist(text1)
    hist2 = GetHist(text2)

    # Union of the characters seen in either text, in a deterministic order
    # so both vectors are aligned position by position.
    vocab = sorted(set(hist1) | set(hist2))

    vector1 = CreateVector(vocab, hist1)
    vector2 = CreateVector(vocab, hist2)

    kl = kl_divergence(vector1, vector2)

    return bool(np.abs(kl) < threshold), kl

In [18]:
siteDict = {}  # group key -> list of site URLs judged similar to that group's page
pageDict = {}  # group key -> HTML of the first page that opened the group
key = 0        # next unused group key
similarBool = 0

# Upper bound (seconds) on each request so one unresponsive host cannot
# stall the whole crawl; requests has no timeout by default.
REQUEST_TIMEOUT = 30

for site in tqdm(siteNames):
    print(site, end='')

    try:
        response = requests.get(site, timeout=REQUEST_TIMEOUT)
    except Exception:
        # Best-effort crawl: any request failure (DNS, TLS, timeout, bad URL)
        # just skips the site. Note this is not necessarily an HTTP 404.
        # ``except Exception`` (rather than a bare except) keeps Ctrl-C working.
        print('...FAIL 404')
        continue

    if response.status_code != 200:
        print('...FAIL')
        continue

    print('...SUCCESS')

    # Reset per site so a result left over from the previous site is never reused.
    similarBool = False
    for pageKey, pageHtml in pageDict.items():
        similarBool, _ = CompareSites(pageHtml, response.text)

        if similarBool:
            siteDict[pageKey].append(site)
            # Stop at the first matching group. Continuing the scan would let
            # a later non-match overwrite similarBool and register this site
            # as a new group as well (or append it to several groups).
            break

    # Scanned every known page without finding a match: open a new group.
    if not similarBool:
        pageDict[key] = response.text
        siteDict[key] = [site]
        key += 1

    # Checkpoint after every successfully fetched site so progress survives
    # an interrupted run.
    with open('sameSites.pkl', 'wb') as f:
        pkl.dump((siteDict, pageDict), f)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1085.0), HTML(value='')))

https://www.hongkong-online-fraud-asset-tracing.com/...SUCCESS
https://fxcccenter.com/...SUCCESS
https://cmegrouptop.com/...FAIL
https://fxcmtop.com/...FAIL
https://blockinmax.com/...FAIL
https://blockinb2c.com/...FAIL
https://bitchembox.com/...FAIL
https://www.bittrex168.com/...SUCCESS
http://norwest.thegoodd.com/...FAIL
http://www.bikuex.com/...SUCCESS
https://www.t0988123.club/...FAIL 404
https://www.t88hjasd.xyz/...FAIL 404
https://www.thefutureguard.com/...SUCCESS
https://www.tgferz.cloud/...FAIL 404
http://bagmiomio.com/#/login...SUCCESS
https://www.flash68.com/bo/...SUCCESS
https://www.flash68.com/gvex/...SUCCESS
https://sfoa.boayy.com/...SUCCESS
http://www.coincusp.vip/...FAIL 404
https://www.coinonels.com/...FAIL 404
https://www.m87aybsadweb.xyz/...FAIL 404
https://www.bihuiglobal.com/...FAIL 404
https://www.kjerkjef.club/...FAIL 404
https://www.pishbbaxc.live/...FAIL 404
http://app.kkyyipa.com/...SUCCESS
https://www.qqkkx.com:7475/onve.html...SUCCESS
https://down.qianmingwww.

In [19]:
# Display the grouping: each key maps to the list of site URLs that were
# judged similar to the page stored for that group.
siteDict

{0: ['https://www.hongkong-online-fraud-asset-tracing.com/'],
 1: ['https://fxcccenter.com/'],
 2: ['https://www.bittrex168.com/'],
 3: ['http://www.bikuex.com/'],
 4: ['https://www.thefutureguard.com/'],
 5: ['http://bagmiomio.com/#/login'],
 6: ['https://www.flash68.com/bo/'],
 7: ['https://www.flash68.com/gvex/'],
 8: ['https://sfoa.boayy.com/'],
 9: ['http://app.kkyyipa.com/',
  'https://www.qqkkx.com:7475/onve.html',
  'https://down.qianmingwww.com:7852/jzpx.html',
  'https://down.yyqqv.com/'],
 10: ['https://h5.ejjsg.com/'],
 11: ['https://etf68.bxanys.com/'],
 12: ['https://amber.bxanys.com/'],
 13: ['https://aomge.com/1bfsroo3', 'https://bbtc.gold/'],
 14: ['https://hkex.bxanys.com/'],
 15: ['https://aomge.com/frbjax1u'],
 16: ['https://www.hbet132.com/'],
 17: ['https://www.zbweb01.com/'],
 18: ['https://www.yhk88.app/'],
 19: ['https://www.yhk888.io/'],
 20: ['https://www.chainalysisline.com/'],
 21: ['https://www.chainalysisweb.com/'],
 22: ['https://www.ghkm.app/'],
 23: ['

In [22]:
# Show the stored HTML for group 9 (in the siteDict output, the group that
# collected several different URLs).
pageDict[9]

