In [1]:
import pandas as pd
import re
import os
import numpy as np

In [2]:
# Load the scam-website export from CSV and show its (rows, columns) shape.
scamSites = pd.read_csv("ScamWebsites.csv")
scamSites.shape

(2075, 27)

In [3]:
# List the column names of scamSites.
list(scamSites)

['Requested domain',
 'API response',
 'Domain name',
 'Domain extension',
 'Trust score',
 'IP address',
 'IP country',
 'ISP name',
 'WHOIS creation date',
 'Registrar name',
 'Registrar IANA code',
 'Registrant name',
 'Registrant company',
 'Registrant email',
 'Registrant country',
 'Agv. review score',
 'Total reviews',
 'Total Facebook comments',
 'Total Scamadviser votes',
 'Request HTTP status',
 'Tags',
 'Keywords',
 'Trust rules',
 'Alexa rank',
 'Pageviews',
 'Created at',
 'Updated at']

In [4]:
def PingSite(siteName):
    """Run ``nslookup`` on the host named in ``siteName`` and return its output.

    The host is taken to be the run of lowercase letters, digits, dots and
    hyphens in ``siteName`` that contains the most dots (the first such run on
    ties), so ``https://www.example.com/`` yields ``www.example.com``.  This is
    a heuristic: a path segment containing more dots than the host would be
    chosen instead.

    Prints ``<host> ...SUCCESS`` when the command produced at least seven
    lines of output and ``<host> ...FAIL`` otherwise.

    Args:
        siteName: Domain name or URL to look up.

    Returns:
        The nslookup output split on newlines, or an empty list when no host
        could be extracted from ``siteName``.
    """
    # Host names are case-insensitive; lowercase first so uppercase input is
    # not chopped into fragments by the lowercase-only pattern.
    if isinstance(siteName, str):
        candidates = re.findall('[a-z0-9.-]+', siteName.lower())
    else:
        candidates = []

    if not candidates:
        # Nothing that looks like a host name (empty string, NaN, symbols only).
        print(siteName, "...FAIL")
        return []

    # max() returns the first maximal item, i.e. the first token with the
    # highest dot count.
    strippedName = max(candidates, key=lambda token: token.count('.'))

    # The pattern above only admits [a-z0-9.-], so the token carries no shell
    # metacharacters into the command string.
    command = 'nslookup %s' % (strippedName)
    with os.popen(command) as pipe:  # close the pipe deterministically
        x = pipe.read().split('\n')

    if len(x) >= 7:
        print(strippedName, "...SUCCESS")
    else:
        print(strippedName, "...FAIL")

    return x

In [6]:
def ScrapeLookup(nslookupOutput):
    """Collect the Name / Address / Alias values from nslookup output lines.

    A line that starts with one of the themes ("Name", "Address", "Alias")
    opens that theme and resets its list of values, so a later header for the
    same theme replaces what an earlier one collected.  The text after the
    header's colon is the first value.  Every following non-empty line that is
    not itself a header is added to the theme that is currently open.

    Args:
        nslookupOutput: List of output lines, as returned by ``PingSite``.

    Returns:
        ``0`` when fewer than seven lines were supplied; otherwise a dict
        mapping each theme that appeared to its list of values.
    """
    lookupThemes = ["Name", "Address", "Alias"]

    if len(nslookupOutput) < 7:
        return 0

    webInfoJSON = {}
    currentTheme = ""
    for line in nslookupOutput:
        if line == '':
            continue

        matchedTheme = next(
            (theme for theme in lookupThemes if line.startswith(theme)), None
        )

        if matchedTheme is not None:
            # Header line: open the theme with a fresh list.
            currentTheme = matchedTheme
            webInfoJSON[currentTheme] = []
            # A header may carry no value after the colon; tolerate that
            # instead of indexing into an empty findall() result.
            valueMatch = re.search(r'(?<=:)\s+.+', line)
            value = valueMatch.group(0).strip('\t ') if valueMatch else ""
        elif currentTheme:
            # Continuation line belonging to the theme that is open.
            value = line.strip('\t ')
        else:
            # Lines before the first header belong to no theme.
            continue

        if value:
            webInfoJSON[currentTheme].append(value)

    return webInfoJSON

In [7]:
# Create a DF with Names, Addresses, and Aliases.
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call.
lookupRows = []

for name in scamSites['Requested domain']:
    pingOutput = PingSite(name)
    outputDict = ScrapeLookup(pingOutput)

    # ScrapeLookup returns 0 for a failed lookup; a result without any
    # address is also skipped because later cells index the first address.
    if outputDict and outputDict.get('Address'):
        outputDict['Domain'] = name
        outputDict['Address'].sort()
        lookupRows.append(outputDict)

# An ordered list (not a set) keeps the column order deterministic.
df = pd.DataFrame(lookupRows, columns=['Address', 'Domain', 'Alias', 'Name'])

mgmmacau.live ...SUCCESS
larcla.live ...SUCCESS
down.a51fan.com ...SUCCESS
down.a51fan.com ...SUCCESS
app.kkyyipa.com ...SUCCESS
down.a51fan.com ...SUCCESS
sjpplcplus.com ...FAIL
btcturocenter.com ...SUCCESS
etradest.xyz ...SUCCESS
bbtc.gold ...SUCCESS
www.bittrex168.com ...SUCCESS
www.fengdongjz.com ...SUCCESS
www.passiveoutcomes.com ...SUCCESS
commercepart-time.com ...SUCCESS
www.mlit.top ...SUCCESS
trader.eightprime-fx.com ...SUCCESS
hublogs.com ...SUCCESS
wnsramq.com ...FAIL
ahh344.com ...FAIL
www.longchen18.com ...SUCCESS
btcflashchain.uk ...SUCCESS
veritexfrost.com ...SUCCESS
h5.coinapm.com ...SUCCESS
app.conceptseries.live ...SUCCESS
mitsui55.com ...SUCCESS
w.shouhuo076.com ...FAIL
www.facoin.co ...FAIL
www.sunrayforex.com ...SUCCESS
facoinplus.com ...SUCCESS
www.ohmeho.com ...SUCCESS
wedatetw.com ...FAIL
idstw666.xyz ...SUCCESS
www.russell4m.com ...SUCCESS
www.jolover.com ...SUCCESS
www.highlevel4dc.com ...SUCCESS
www.clk2btc.com ...SUCCESS
www.newgoodcoin.com ...SUCCESS
app.tb

In [285]:
# Inspect the single row of df with index label 6.
df.loc[6]

Address              [103.127.125.151]
Domain     https://www.bittrex168.com/
Alias                              NaN
Name              [www.bittrex168.com]
Name: 6, dtype: object

In [358]:
# Report the size of the lookup table, then save it to CSV without the index column.
print(df.shape)
df.to_csv('ScamIPs.csv',index=False)

(1085, 4)


In [359]:
# List the column names of df.
list(df)

['Address', 'Domain', 'Alias', 'Name']

In [360]:
# Tally how many rows share each address list, keyed by its string form.
ipCount = {}

for IP in df['Address'].astype('string'):
    ipCount.setdefault(IP, 0)
    ipCount[IP] += 1

# Distinct address lists, in order of first appearance.
domainIPs = pd.unique(df['Address'].astype('string'))

In [361]:
# Keep only the addresses seen more than once, most frequent first, and print
# their counts. The sort is stable, so ties keep the order in which they were
# first counted.
sortedKeys = sorted(
    (ip for ip in ipCount if ipCount[ip] > 1),
    key=ipCount.get,
    reverse=True,
)
print([ipCount[ip] for ip in sortedKeys])

[31, 22, 15, 12, 12, 11, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [362]:
# Collect the rows of df for every shared address, grouped in order of
# decreasing frequency.
# NOTE(review): the [1:] slice skips the single most frequent address --
# confirm that excluding it is intentional.
addressKeys = df['Address'].astype('string')  # cast once, not once per key

# Gather the per-address slices and concatenate once; DataFrame.append was
# removed in pandas 2.0 and copied the growing frame on every iteration.
sharedFrames = [df[addressKeys == key] for key in sortedKeys[1:]]

# pd.concat raises on an empty list, so fall back to an empty frame.
mostCommon = pd.concat(sharedFrames) if sharedFrames else pd.DataFrame()

In [363]:
# Display the rows collected for the shared addresses.
mostCommon

Unnamed: 0,Address,Domain,Alias,Name
229,[34.98.99.30],http://igmarke.com/,,[igmarke.com]
230,[34.98.99.30],https://www.leta.ink/,[www.leta.ink],[leta.ink]
231,[34.98.99.30],https://www.exblkpro.com/,[www.exblkpro.com],[exblkpro.com]
233,[34.98.99.30],http://www.uh-und.com/,,[www.uh-und.com]
234,[34.98.99.30],https://www.bitlbxpro.com/,[www.bitlbxpro.com],[bitlbxpro.com]
...,...,...,...,...
924,[18.162.229.81],https://www.newbnexusdt.com/,,[www.newbnexusdt.com]
904,"[154.198.217.75, 154.198.217.76, 154.198.217.7...",https://www.zaibuxingjiuchiji.com/atve29,"[www.zaibuxingjiuchiji.com, g9t2mpvx.mfycdn.com]",[xcsnmh6d.n.mfycdn.com]
908,"[154.198.217.75, 154.198.217.76, 154.198.217.7...",https://www.zaibuxingjiuchiji.com/jvwaes,"[www.zaibuxingjiuchiji.com, g9t2mpvx.mfycdn.com]",[xcsnmh6d.n.mfycdn.com]
1073,[159.138.22.8],http://www.323122.com:8930/,,[www.323122.com]


In [364]:
# Save the shared-address rows to CSV without the index column.
mostCommon.to_csv('MostCommonIPs.csv', index=False)

# Get geolocation data for each IP

In [375]:
import requests
import json
import time

In [381]:
# Reload the saved lookup table from CSV and show its shape.
# The lookup loop that consumes it splits each Address value as a string.
pingIPs = pd.read_csv("ScamIPs.csv")
pingIPs.shape

(1085, 4)

In [403]:
# Base URL of the lookup endpoint; the lookup loop appends one IP address per request.
baseurl = 'http://ip-api.com/json/'

In [405]:
# Column names for the IP lookup table, in output order.
header = [
    'status',
    'country', 'countryCode',
    'region', 'regionName',
    'city', 'zip',
    'lat', 'lon',
    'timezone',
    'isp', 'org', 'as',
    'query',
]

In [None]:
# Addresses that have already been looked up, so each is requested only once.
ipCache = []
# Receives one row per lookup response; starts empty with the header columns.
ipdf = pd.DataFrame(columns = header)

In [414]:
# Look up every distinct address once and add the JSON responses to ipdf.
REQUEST_TIMEOUT_SECONDS = 10   # never hang forever on an unresponsive server
# ip-api.com documents a limit of 45 requests per minute on the free JSON
# endpoint; 1.5 s between requests stays below it.
REQUEST_DELAY_SECONDS = 1.5

seenIPs = set(ipCache)   # O(1) membership test; ipCache itself stays a list
lookupRecords = []

try:
    for IP in pingIPs['Address']:
        # Address values are stringified lists such as "['1.2.3.4', '5.6.7.8']".
        for rawAddress in IP.split(','):
            address = rawAddress.strip("[]\t '")

            # An empty address would make the service describe the caller's
            # own IP, so skip it along with anything already looked up.
            if not address or address in seenIPs:
                continue

            try:
                results = requests.get(baseurl + address,
                                       timeout=REQUEST_TIMEOUT_SECONDS)
                results.raise_for_status()
                lookupRecords.append(results.json())
            except (requests.RequestException, ValueError) as error:
                # Report and move on; the address is not cached, so a later
                # run of this cell retries it.
                print(address, "...LOOKUP FAILED:", error)
            else:
                seenIPs.add(address)
                ipCache.append(address)

            time.sleep(REQUEST_DELAY_SECONDS)
finally:
    # Keep whatever was fetched even if the loop is interrupted.
    # DataFrame.append was removed in pandas 2.0, so concatenate once.
    if lookupRecords:
        ipdf = pd.concat([ipdf, pd.DataFrame(lookupRecords)], ignore_index=True)

In [413]:
# Show the current value of address, the variable assigned inside the lookup loop.
address

'104.21.1.190'

In [415]:
# Display the accumulated lookup responses.
ipdf

Unnamed: 0,status,country,countryCode,region,regionName,city,zip,lat,lon,timezone,isp,org,as,query,message
0,success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:1a00:b:e5f8:8100:93a1,
1,success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:7200:b:e5f8:8100:93a1,
2,success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:8a00:b:e5f8:8100:93a1,
3,success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:c400:b:e5f8:8100:93a1,
4,success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:d800:b:e5f8:8100:93a1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,success,Hong Kong,HK,KKC,Kowloon City,Tsimshatsui,96521,22.3014,114.1760,Asia/Hong_Kong,ServiceID-PQWE,Kwaifong Group Limited,AS142403 YISU CLOUD LTD,185.251.248.225,
1808,success,United States,US,NJ,New Jersey,Newark,07175,40.7357,-74.1724,America/New_York,"Cloudflare, Inc.","Cloudflare, Inc.","AS13335 Cloudflare, Inc.",104.21.51.96,
1809,success,United States,US,NJ,New Jersey,Newark,07175,40.7357,-74.1724,America/New_York,"Cloudflare, Inc.","Cloudflare, Inc.","AS13335 Cloudflare, Inc.",172.67.178.133,
1810,success,United States,US,NJ,New Jersey,Newark,07175,40.7357,-74.1724,America/New_York,"Cloudflare, Inc.","Cloudflare, Inc.","AS13335 Cloudflare, Inc.",2606:4700:3031::6815:3360,


In [417]:
# Save the lookup responses to CSV without the index column.
ipdf.to_csv("IPs.csv", index = False)

# Merge DNS lookups with IP geolocation

In [458]:
# Start an empty frame whose columns are df's columns followed by the header fields,
# then display it.
merged = pd.DataFrame(columns = list(df) + header)
merged

Unnamed: 0,Address,Domain,Alias,Name,status,country,countryCode,region,regionName,city,zip,lat,lon,timezone,isp,org,as,query


In [459]:
# Attach to every row of df the lookup record of the first address in its
# Address list.
chosenIPs = df['Address'].map(
    lambda ipList: ipList[0] if len(ipList) > 0 else None
)

# One record per queried address; when an address appears more than once the
# first record is used.
ipLookup = (
    ipdf
    .dropna(subset=['query'])
    .drop_duplicates(subset='query', keep='first')
)

# A left merge keeps every row of df. A row whose address has no record gets
# NaN in the lookup columns instead of raising an IndexError, and no
# row-by-row DataFrame.append (removed in pandas 2.0) is needed.
merged = df.assign(query=chosenIPs).merge(ipLookup, on='query', how='left')

# Column order: df's columns first, then the lookup columns as they appear in ipdf.
merged = merged[list(df) + list(ipdf.columns)]

In [460]:
# Display the merged table.
merged

Unnamed: 0,Address,Domain,Alias,Name,status,country,countryCode,region,regionName,city,zip,lat,lon,timezone,isp,org,as,query,message
0,"[2600:9000:2026:1a00:b:e5f8:8100:93a1, 2600:90...",https://mgmmacau.live/,[mgmmacau.live],[d25jau5ikwq32c.cloudfront.net],success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:1a00:b:e5f8:8100:93a1,
1,"[2600:9000:2026:4800:5:3cb:bb40:93a1, 2600:900...",https://larcla.live/,[larcla.live],[d2sf2uvueylcz1.cloudfront.net],success,United States,US,IL,Illinois,Chicago,60666,41.8781,-87.6298,America/Chicago,"Amazon.com, Inc.",AWS CloudFront (GLOBAL),"AS16509 Amazon.com, Inc.",2600:9000:2026:4800:5:3cb:bb40:93a1,
2,[99.83.191.53],https://down.a51fan.com:8146/f1bu.app,"[down.a51fan.com, bvepfcmbdn.bigbackbone.com]",[bvepfcmbdn.ddos2naive.com],success,United States,US,WA,Washington,Seattle,98108,47.6229,-122.3370,America/Los_Angeles,"Amazon.com, Inc.",AWS Global Accelerator (GLOBAL),"AS16509 Amazon.com, Inc.",99.83.191.53,
3,[99.83.191.53],https://down.a51fan.com:8146/0ka5.app,"[down.a51fan.com, bvepfcmbdn.bigbackbone.com]",[bvepfcmbdn.ddos2naive.com],success,United States,US,WA,Washington,Seattle,98108,47.6229,-122.3370,America/Los_Angeles,"Amazon.com, Inc.",AWS Global Accelerator (GLOBAL),"AS16509 Amazon.com, Inc.",99.83.191.53,
4,[103.61.31.121],http://app.kkyyipa.com/,"[app.kkyyipa.com, kwknasktdu.bigbackbone.com]",[kwknasktdu.ddos2naive.com],success,Hong Kong,HK,KSS,Sham Shui Po,Cheung Sha Wan,,22.3363,114.1580,Asia/Hong_Kong,ULan Network Limited,ULan Network Limited,AS40065 CNSERVERS LLC,103.61.31.121,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,[185.251.248.225],https://bnmcvbn.sfddgf.com/,,[bnmcvbn.sfddgf.com],success,Hong Kong,HK,KKC,Kowloon City,Tsimshatsui,96521,22.3014,114.1760,Asia/Hong_Kong,ServiceID-PQWE,Kwaifong Group Limited,AS142403 YISU CLOUD LTD,185.251.248.225,
1081,"[104.18.8.188, 104.18.9.188, 2606:4700::6812:8...",https://www.ghkm.bet/,"[www.ghkm.bet, www.ghkm.bet.ktr4db5j.edgejoint...",[www.ghkm.bet.cloudflare.ktr4db5j.edgejoint.ne...,success,United States,US,NJ,New Jersey,Newark,07175,40.7357,-74.1724,America/New_York,"Cloudflare, Inc.","Cloudflare, Inc.","AS13335 Cloudflare, Inc.",104.18.8.188,
1082,"[13.75.113.56, 168.63.205.135, 20.40.89.58]",https://d.fexglobag.buzz/?id=686c6cd5115a54082...,"[d.fexglobag.buzz, rjs2nvym-u.funnull.vip]",[bd9xv5hw.n.funnullv12.com],success,Hong Kong,HK,HCW,Central and Western District,Hong Kong,96521,22.2670,114.1880,Asia/Hong_Kong,Microsoft Corporation,Microsoft Azure Cloud (eastasia),AS8075 Microsoft Corporation,13.75.113.56,
1083,"[45.32.70.82, 45.77.3.111]",https://d.app32103.top/?id=a08123dd8a75fc02fca...,[d.app32103.top],[hc06-site-02.cdn-ng.net],success,United States,US,CA,California,Los Angeles,90012,34.0544,-118.2440,America/Los_Angeles,The Constant Company,"Vultr Holdings, LLC","AS20473 The Constant Company, LLC",45.32.70.82,


In [462]:
# Save the merged table to CSV without the index column.
# NOTE(review): 'ScamIPs.csv' is the same path the lookup table was saved to and
# reloaded from earlier in this notebook, so this replaces that file with the
# merged table -- confirm the overwrite is intended.
merged.to_csv('ScamIPs.csv', index = False)