<a href="https://colab.research.google.com/github/dimas-adi-kris/Project-Portofolio/blob/main/HF_MultiThread.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-whois
!pip install waybackpy
!pip install futures
!pip install shodan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import joblib
from whois import whois
from waybackpy import WaybackMachineCDXServerAPI
from socket import gethostbyname
from shodan import Shodan
from requests import get
from urllib.parse import urlparse
from datetime import datetime
from re import compile
from json import dump, loads
from time import sleep

In [3]:
cache = {}

class HostFeatures:
    def __init__(self, url,ip,scheme):
        self.url = url
        self.host = ip
        self.scheme = scheme
        self.urlparse = urlparse(self.url)
        self.now = datetime.now()
        self.init_sub_params = self.initialise_sub_parameters()

    def initialise_sub_parameters(self):
        if self.host not in cache:
            self.whois = self.__get__whois_dict()
            self.shodan = self.__get_shodan_dict()
            self.snapshots = self.__get_site_snapshots()
            return True
        else:
            return False

    def __get__whois_dict(self):
        try:
            whois_dict = whois(self.host)
            return whois_dict
        except:
            return {}

    def __get_shodan_dict(self):
        #api = Shodan('oHXPLkp5UCrFOH0jkKAAGjfzElojeAv5')
        api = Shodan('W6cy1PGcje0jJwKDBTgrqWSZioRpRmzg')
        try:
            host = api.host(self.host)
            return host
        except:
            return {}

    def __parse__before__date(self, date_string):
        month_year = date_string.split()[-1]
        d = '01-{}'.format(month_year)
        d = datetime.strptime(d, '%d-%b-%Y')
        return d

    def __parse_whois_date(self, date_key):
        cdate = self.whois.get(date_key, None)
        if cdate:
            if isinstance(cdate, str) and 'before' in cdate:
                d = self.__parse__before__date(cdate)
            elif isinstance(cdate, list):
                d = cdate[0]
            else:
                d = cdate
        return d if cdate else cdate

    def __get_site_snapshots(self):
        try:
            snapshots = WaybackMachineCDXServerAPI(self.url).snapshots()
            snapshots = [snapshot.datetime_timestamp for snapshot in snapshots]
            return snapshots
        except:
            return []

    def number_of_subdomains(self):
        ln1 = self.whois.get('nets', None)
        ln2 = self.shodan.get('domains', None)
        ln = ln1 or ln2
        return len(ln) if ln else None

    def url_creation_date(self):
        d = self.__parse_whois_date('creation_date')
        return d

    def url_expiration_date(self):
        d = self.__parse_whois_date('expiration_date')
        return d

    def url_last_updated(self):
        d = self.__parse_whois_date('updated_date')
        return d

    def url_age(self):
        try:
            days = (self.now - self.url_creation_date()).days
        except:
            days = None
        return days

    def url_intended_life_span(self):
        try:
            lifespan = (self.url_expiration_date() - self.url_creation_date()).days
        except:
            lifespan = None
        return lifespan

    def url_life_remaining(self):
        try:
            rem = (self.url_expiration_date() - self.now).days
        except:
            rem = None
        return rem

    def url_registrar(self):
        return self.whois.get('registrar', None)

    def url_registration_country(self):
        c = self.whois.get('country', None)
        return c

    def url_host_country(self):
        c = self.shodan.get('country_name', None)
        return c

    def url_open_ports(self):
        ports = self.shodan.get('ports', '')
        return ports if ports != '' else None

    def url_num_open_ports(self):
        ports = self.url_open_ports()
        lp = len(ports) if ports else 0
        return lp

    def url_is_live(self):
        url = '{}://{}'.format(self.scheme, self.url)
        try:
            return get(url).status_code == 200
        except:
            return False

    def url_isp(self):
        return self.shodan.get('isp', '')

    def url_connection_speed(self):
        url = '{}://{}'.format(self.scheme, self.url)
        if self.url_is_live():
            return get(url).elapsed.total_seconds()
        else:
            return None

    def first_seen(self):
        try:
            fs = self.snapshots[0]
            return fs
        except:
            return datetime.now()

    def get_os(self):
        oss = self.shodan.get('os', None)
        return oss

    def last_seen(self):
        try:
            ls = self.snapshots[-1]
            return ls
        except:
            return datetime.now()

    def days_since_last_seen(self):
        dsls = (self.now - self.last_seen()).days
        return dsls

    def days_since_first_seen(self):
        dsfs = (self.now - self.first_seen()).days
        return dsfs

    def average_update_frequency(self):
        snapshots = self.snapshots
        diffs = [(t-s).days for s, t in zip(snapshots, snapshots[1:])]
        l = len(diffs)
        if l > 0:
            return sum(diffs)/l
        else:
            return 0

    def number_of_updates(self):
        return len(self.snapshots)

    def ttl_from_registration(self):
        earliest_date_seen = self.first_seen()
        try:
            ttl_from_reg = (earliest_date_seen - self.url_creation_date()).days
        except:
            ttl_from_reg = None
        return ttl_from_reg

    def run(self):
        if self.init_sub_params:
            try:
                fv = {
                    "url":self.url,            
                    "host": self.host,
                    "num_subdomains": self.number_of_subdomains(),
                    "registration_date": str(self.url_creation_date()),
                    "expiration_date": str(self.url_expiration_date()),
                    "last_updates_dates": str(self.url_last_updated()),
                    "age": self.url_age(),
                    "intended_life_span": self.url_intended_life_span(),
                    "life_remaining": self.url_life_remaining(),
                    "registrar": self.url_registrar(),
                    "reg_country": self.url_registration_country(),
                    "host_country": self.url_host_country(),
                    "open_ports": self.url_open_ports(),
                    "num_open_ports": self.url_num_open_ports(),
                    "is_live": self.url_is_live(),
                    "isp": self.url_isp(),
                    "connection_speed": self.url_connection_speed(),
                    "first_seen": str(self.first_seen()),
                    "last_seen": str(self.last_seen()),
                    "days_since_last_seen": self.days_since_last_seen(),
                    "days_since_first_seen": self.days_since_first_seen(),
                    "avg_update_days": self.average_update_frequency(),
                    "total_updates": self.number_of_updates(),
                    "ttl": self.ttl_from_registration()
                }
                cache[self.host] = fv
                return fv
            except Exception as e:
                print('OOPS')
                print(e)
                return e
        else:
            print(self.host)
            return cache[self.host]

In [4]:
def __get_ip(link_url):
    try:
        ip = link_url if url_host_is_ip(link_url) else gethostbyname(link_url)
        return ip
    except:
        return None

def url_host_is_ip(link_url):
    host = link_url
    pattern = compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
    match = pattern.match(host)
    return match is not None

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset_pdf_raw.csv',index_col=0)
df['hostname'] = df['http'].apply(lambda x:urlparse(x).netloc)
df['scheme'] = df['http'].apply(lambda x:urlparse(x).scheme)
df_hn = pd.DataFrame(df.hostname.unique())
df_hn['ip'] = df_hn[0].apply(lambda x:__get_ip(x))
df_hn.columns = ['hostname','ip']
df_m = pd.merge(df,df_hn,on='hostname')
df_hn = df_m.drop_duplicates(['ip'])[['hostname','scheme','ip']]
df_hn.reset_index(drop=True,inplace=True)

In [None]:
df_hn

In [None]:
ls_res = []
for idx,data in df_hn.iterrows():
  print(idx,data['hostname'])
  ft = HostFeatures(data['hostname'],data['ip'],data['scheme']).run()
  ls_res.append(ft)

In [32]:
active_count()

13

In [7]:
import time
from threading import Lock, Thread, active_count
thread_list = []

res = {}
def testFunction(ind,link_url,ips,schemes):
  ft = HostFeatures(link_url,ips,schemes).run()
  res[ind] = ft
n_threads = 1000 # define max child threads. 
for idx,data in df_hn.iterrows():
    t = Thread(target=testFunction, args=(idx,data['hostname'],data['ip'],data['scheme']))
    thread_list.append(t)
    t.start()
    if idx%500==0:
      print(idx,data['hostname'])
      print ( f"Launching thread with name: {data['hostname']}" )
      print ( '\n == Current active threads ==: ' + str(active_count()-1) )
    while active_count() > n_threads: # max thread count (includes parent thread)
        time.sleep(1)

0 
Launching thread with name: 

 == Current active threads ==: 10
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refusedError trying to connect to socket: closing socket - [Errno 111] Connection refused

Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refusedError trying to connect to socket: closing socket - [Errno 111] Connection refused

Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refusedError trying to connect to socket: closing socket - [Errno 111] Connection refused

Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




Error trying to connect to socket: closing socket - [Errno 111] Connection refused




500 www.sangpencerah.com
Launching thread with name: www.sangpencerah.com

 == Current active threads ==: 377




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/urllib3/connectionpool.py", line 396, in _make_request
    assert_header_parsing(httplib_response.msg)
  File "/usr/local/lib/python3.7/dist-packages/urllib3/util/response.py", line 72, in assert_header_parsing
    raise HeaderParsingError(defects=defects, unparsed_data=unparsed_data)
urllib3.exceptions.HeaderParsingError: [MissingHeaderBodySeparatorDefect()], unparsed data: 'server name: 208\r\nX-POWERED-BY: vietnamlawmagazine.vn\r\nDate: Sat, 22 Oct 2022 21:34:30 GMT\r\nContent-Length: 14272\r\n\r\n'
ERROR:urllib3.connection:Certificate did not match expected hostname: www.jurnal.ar-raniry.ac.id. Certificate: {'subject': ((('commonName', 'jurnal.ar-raniry.ac.id'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '04E2B87E010D35F17017B309F7A9B869276A', 'notBefore': 'Aug 31 16:07:07 2022 GMT', 'notAfter': 'Nov 29 16:

Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused


ERROR:urllib3.connection:Certificate did not match expected hostname: www.ncert.nic.in. Certificate: {'subject': ((('commonName', 'ictschools.ncert.gov.in'),),), 'issuer': ((('countryName', 'US'),), (('stateOrProvinceName', 'Arizona'),), (('localityName', 'Scottsdale'),), (('organizationName', 'GoDaddy.com, Inc.'),), (('organizationalUnitName', 'http://certs.godaddy.com/repository/'),), (('commonName', 'Go Daddy Secure Certificate Authority - G2'),)), 'version': 3, 'serialNumber': '6B3311895B7CF2A0', 'notBefore': 'Jan 25 04:57:44 2022 GMT', 'notAfter': 'Jan  1 04:32:16 2023 GMT', 'subjectAltName': (('DNS', 'ictschools.ncert.gov.in'), ('DNS', 'ciet.nic.in'), ('DNS', 'epathshala.nic.in'), ('DNS', 'osre.ncert.gov.in'), ('DNS', 'itpd.ncert.gov.in'), ('DNS', 'ictcurriculum.gov.in'), ('DNS', 'ncert.nic.in'), ('DNS', 'pindics.ncert.gov.in')), 'OCSP': ('http://ocsp.godaddy.com/',), 'caIssuers': ('http://certificates.godaddy.com/repository/gdig2.crt',), 'crlDistributionPoints': ('http://crl.god

Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


ERROR:urllib3.connection:Certificate did not match expected hostname: www.kemenpar.go.id. Certificate: {'subject': ((('countryName', 'ID'),), (('stateOrProvinceName', 'Daerah Khusus Ibukota Jakarta'),), (('localityName', 'Jakarta Pusat'),), (('organizationName', 'Kementerian Pariwisata dan Ekonomi Kreatif Republik Indonesia'),), (('commonName', '*.kemenparekraf.go.id'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('commonName', 'DigiCert TLS RSA SHA256 2020 CA1'),)), 'version': 3, 'serialNumber': '0AE1B9EF0FB815705D8299FFB42DA9BD', 'notBefore': 'May 31 00:00:00 2022 GMT', 'notAfter': 'Jul  1 23:59:59 2023 GMT', 'subjectAltName': (('DNS', '*.kemenparekraf.go.id'), ('DNS', 'kemenparekraf.go.id')), 'OCSP': ('http://ocsp.digicert.com',), 'caIssuers': ('http://cacerts.digicert.com/DigiCertTLSRSASHA2562020CA1-1.crt',), 'crlDistributionPoints': ('http://crl3.digicert.com/DigiCertTLSRSASHA2562020CA1-4.crl', 'http://crl4.digicert.com/DigiCertTLSRSASHA256202

Error trying to connect to socket: closing socket - timed out




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


ERROR:urllib3.connection:Certificate did not match expected hostname: www.informatika.unsyiah.ac.id. Certificate: {'subject': ((('countryName', 'ID'),), (('stateOrProvinceName', 'Aceh'),), (('organizationName', 'Universitas Syiah Kuala'),), (('commonName', 'usk.ac.id'),)), 'issuer': ((('countryName', 'GB'),), (('stateOrProvinceName', 'Greater Manchester'),), (('localityName', 'Salford'),), (('organizationName', 'Sectigo Limited'),), (('commonName', 'Sectigo RSA Organization Validation Secure Server CA'),)), 'version': 3, 'serialNumber': '69C54617D9B822A3DAB69AB8D5B67E0C', 'notBefore': 'Oct 18 00:00:00 2021 GMT', 'notAfter': 'Nov 18 23:59:59 2022 GMT', 'subjectAltName': (('DNS', 'usk.ac.id'), ('DNS', '*.unsyiah.ac.id'), ('DNS', '*.usk.ac.id')), 'OCSP': ('http://ocsp.sectigo.com',), 'caIssuers': ('http://crt.sectigo.com/SectigoRSAOrganizationValidationSecureServerCA.crt',), 'crlDistributionPoints': ('http://crl.sectigo.com/SectigoRSAOrganizationValidationSecureServerCA.crl',)}


Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refusedError trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - timed out

Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to



Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




1000 eudl.eu
Launching thread with name: eudl.eu

 == Current active threads ==: 316


ERROR:urllib3.connection:Certificate did not match expected hostname: portal.kopertis3.or.id. Certificate: {'subject': ((('commonName', '*.gunadarma.ac.id'),),), 'issuer': ((('countryName', 'BE'),), (('organizationName', 'GlobalSign nv-sa'),), (('commonName', 'AlphaSSL CA - SHA256 - G2'),)), 'version': 3, 'serialNumber': '418FF1EB9173C09969A036A4', 'notBefore': 'Jul 25 01:36:15 2022 GMT', 'notAfter': 'Aug 26 01:36:14 2023 GMT', 'subjectAltName': (('DNS', '*.gunadarma.ac.id'), ('DNS', 'gunadarma.ac.id')), 'OCSP': ('http://ocsp2.globalsign.com/gsalphasha2g2',), 'caIssuers': ('http://secure.globalsign.com/cacert/gsalphasha2g2r1.crt',), 'crlDistributionPoints': ('http://crl.globalsign.com/gs/gsalphasha2g2.crl',)}


Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


ERROR:urllib3.connection:Certificate did not match expected hostname: portal.kopertis3.or.id. Certificate: {'subject': ((('commonName', '*.gunadarma.ac.id'),),), 'issuer': ((('countryName', 'BE'),), (('organizationName', 'GlobalSign nv-sa'),), (('commonName', 'AlphaSSL CA - SHA256 - G2'),)), 'version': 3, 'serialNumber': '418FF1EB9173C09969A036A4', 'notBefore': 'Jul 25 01:36:15 2022 GMT', 'notAfter': 'Aug 26 01:36:14 2023 GMT', 'subjectAltName': (('DNS', '*.gunadarma.ac.id'), ('DNS', 'gunadarma.ac.id')), 'OCSP': ('http://ocsp2.globalsign.com/gsalphasha2g2',), 'caIssuers': ('http://secure.globalsign.com/cacert/gsalphasha2g2r1.crt',), 'crlDistributionPoints': ('http://crl.globalsign.com/gs/gsalphasha2g2.crl',)}
ERROR:urllib3.connection:Certificate did not match expected hostname: www.fitokimiaumi.files.wordpress.com. Certificate: {'subject': ((('commonName', '*.files.wordpress.com'),),), 'issuer': ((('countryName', 'GB'),), (('stateOrProvinceName', 'Greater Manchester'),), (('localityNam

Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Error trying to connect to socket: closing socket - [Errno 111] Connection refused


ERROR:urllib3.connection:Certificate did not match expected hostname: www.fitokimiaumi.files.wordpress.com. Certificate: {'subject': ((('commonName', '*.files.wordpress.com'),),), 'issuer': ((('countryName', 'GB'),), (('stateOrProvinceName', 'Greater Manchester'),), (('localityName', 'Salford'),), (('organizationName', 'Sectigo Limited'),), (('commonName', 'Sectigo RSA Domain Validation Secure Server CA'),)), 'version': 3, 'serialNumber': '4026EBAB0BC127997921D1AD7D6E6287', 'notBefore': 'Dec 28 00:00:00 2021 GMT', 'notAfter': 'Jan 28 23:59:59 2023 GMT', 'subjectAltName': (('DNS', '*.files.wordpress.com'), ('DNS', 'files.wordpress.com')), 'OCSP': ('http://ocsp.sectigo.com',), 'caIssuers': ('http://crt.sectigo.com/SectigoRSADomainValidationSecureServerCA.crt',)}


Error trying to connect to socket: closing socket - [Errno 111] Connection refusedError trying to connect to socket: closing socket - [Errno 111] Connection refused

Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




Error trying to connect to socket: closing socket - timed out


ERROR:urllib3.connection:Certificate did not match expected hostname: www.apaarti.com. Certificate: {'subject': ((('commonName', 'apaarti.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '03AAAE4DDCBA87864CFD97EEE9D5CD0E3D60', 'notBefore': 'Sep 27 14:01:05 2022 GMT', 'notAfter': 'Dec 26 14:01:04 2022 GMT', 'subjectAltName': (('DNS', 'apaarti.com'),), 'OCSP': ('http://r3.o.lencr.org',), 'caIssuers': ('http://r3.i.lencr.org/',)}
ERROR:urllib3.connection:Certificate did not match expected hostname: www.apaarti.com. Certificate: {'subject': ((('commonName', 'apaarti.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '03AAAE4DDCBA87864CFD97EEE9D5CD0E3D60', 'notBefore': 'Sep 27 14:01:05 2022 GMT', 'notAfter': 'Dec 26 14:01:04 2022 GMT', 'subjectAltName': (('DNS', 'apaarti.com'),), 'OCSP': ('http://r3.o.

Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




Error trying to connect to socket: closing socket - [Errno -2] Name or service not known




1500 192.168.1.100
Launching thread with name: 192.168.1.100

 == Current active threads ==: 333


ERROR:urllib3.connection:Certificate did not match expected hostname: www.untad.ac.id. Certificate: {'subject': ((('commonName', 'untad.ac.id'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '043F59BE7ABB308C85D28916E2DA62B4C3C3', 'notBefore': 'Aug 15 05:16:20 2022 GMT', 'notAfter': 'Nov 13 05:16:19 2022 GMT', 'subjectAltName': (('DNS', 'untad.ac.id'),), 'OCSP': ('http://r3.o.lencr.org',), 'caIssuers': ('http://r3.i.lencr.org/',)}
ERROR:urllib3.connection:Certificate did not match expected hostname: www.untad.ac.id. Certificate: {'subject': ((('commonName', 'untad.ac.id'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'R3'),)), 'version': 3, 'serialNumber': '043F59BE7ABB308C85D28916E2DA62B4C3C3', 'notBefore': 'Aug 15 05:16:20 2022 GMT', 'notAfter': 'Nov 13 05:16:19 2022 GMT', 'subjectAltName': (('DNS', 'untad.ac.id'),), 'OCSP': ('http://r3.o.

In [None]:
res.keys()

In [None]:
for idx,data in df_hn.iterrows():
    t = Thread(target=testFunction, args=(idx,data['hostname'],data['ip'],data['scheme']))
    thread_list.append(t)
    t.start()
    if idx%500==0:
      print(idx,data['hostname'])
      print ( f"Launching thread with name: {data['hostname']}" )
      print ( '\n == Current active threads ==: ' + str(active_count()-1) )
    while active_count() > n_threads: # max thread count (includes parent thread)
        time.sleep(1)

In [31]:
print ( '\n == Current active threads ==: ' + str(active_count()-1) )


 == Current active threads ==: 12


In [None]:
res[7]

In [33]:
ls_res = list(res.keys())
idx_res = [i for i in range(len(df_hn))]

In [34]:
ls_res.sort()
print(ls_res)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [29]:
len(ls_res),len(df_hn)

1562

In [35]:
for i in idx_res:
  if i not in ls_res:
    print(i)

1226
1495
1554


In [24]:
res1 = {}
for i in range(len(df_hn)):
  res1[i] = res[i]

KeyError: ignored

In [30]:
ls_res = list(res1.values())

In [34]:
import numpy as np
np.shape(ls_res)

(107,)

In [21]:
n=1226
df_hn.loc[n]

hostname    www.tokopedia.com
scheme                  https
ip               104.87.85.86
Name: 1226, dtype: object

In [None]:
n = 1226
ft = HostFeatures(df_hn.loc[n,'hostname'],df_hn.loc[n,'ip'],df_hn.loc[n,'scheme']).run()
ls_res[n] = ft

In [None]:
df_hn.loc[n]

In [26]:
for i in range(len(df_hn)):
  print(f'{i}    : {len(res[i].keys())}')

0    : 24
1    : 24
2    : 24
3    : 24
4    : 24
5    : 24
6    : 24
7    : 24
8    : 24
9    : 24
10    : 24
11    : 24
12    : 24
13    : 24
14    : 24
15    : 24
16    : 24
17    : 24
18    : 24
19    : 24
20    : 24
21    : 24
22    : 24
23    : 24
24    : 24
25    : 24
26    : 24
27    : 24
28    : 24
29    : 24
30    : 24
31    : 24
32    : 24
33    : 24
34    : 24
35    : 24
36    : 24
37    : 24
38    : 24
39    : 24
40    : 24
41    : 24
42    : 24
43    : 24
44    : 24
45    : 24
46    : 24
47    : 24
48    : 24
49    : 24
50    : 24
51    : 24
52    : 24
53    : 24
54    : 24
55    : 24
56    : 24
57    : 24
58    : 24
59    : 24
60    : 24
61    : 24
62    : 24
63    : 24
64    : 24
65    : 24
66    : 24
67    : 24
68    : 24
69    : 24
70    : 24
71    : 24
72    : 24
73    : 24
74    : 24
75    : 24
76    : 24
77    : 24
78    : 24
79    : 24
80    : 24
81    : 24
82    : 24
83    : 24
84    : 24
85    : 24
86    : 24
87    : 24
88    : 24
89    : 24
90    : 24
91    : 2

KeyError: ignored

In [44]:
df_hnn = pd.DataFrame(ls_res)

In [45]:
joblib.dump(ls_res,'/content/drive/MyDrive/Colab Notebooks/ls_res.lsdf')

['/content/drive/MyDrive/Colab Notebooks/ls_res.lsdf']

In [7]:
ls_res = joblib.load('/content/drive/MyDrive/Colab Notebooks/ls_res.lsdf')

In [46]:
df_hn[df_hn['hostname']=='e-resources.perpusnas.go.id:2071']

Unnamed: 0,hostname,scheme,ip
17,e-resources.perpusnas.go.id:2071,http,


In [50]:
df

Unnamed: 0,filePdf,Obj,http,Label,Status,cp,hostname,scheme
0,1043883..txt,964 0,"<?xml version=""1.0"" encoding=""UTF-8""?><body st...",1,0,1,,
1,604795..txt,356 0,/Differences [1/rho/period/chi/nine/mu/ellipsi...,1,0,1,,
2,604743..txt,1129 0,/Differences [1/rho/period/chi/nine/mu/ellipsi...,1,0,1,,
3,1043874..txt,699 0,"<?xml version=""1.0"" encoding=""UTF-8""?><body st...",1,0,1,,
4,1477174..txt,1707 0,"<?xml version=""1.0"" encoding=""UTF-8""?><body st...",1,0,1,,
...,...,...,...,...,...,...,...,...
757,504116..txt,173 0,http://itis.gbif.net/pls/itisca/taxastep?king=...,"Unrated,Clean",1,0,itis.gbif.net,http
758,995566..txt,86 0,http://satryatama-ekaputra-fisip14.web.unair.a...,"Unrated,Clean",1,0,satryatama-ekaputra-fisip14.web.unair.ac.id,http
759,995566..txt,87 0,http://satryatama-ekaputra-fisip14.web.unair.a...,"Unrated,Clean",1,0,satryatama-ekaputra-fisip14.web.unair.ac.id,http
760,591396..txt,129 0,https://adeshpande3.github.io/adeshpande3.gith...,"Unrated,Clean",1,0,adeshpande3.github.io,https


In [52]:

pd.set_option('display.max_columns', 50)

In [54]:
df_hnnn = df_hn.join(df_hnn)

In [55]:
df

Unnamed: 0,filePdf,Obj,http,Label,Status,cp,hostname,scheme
0,1043883..txt,964 0,"<?xml version=""1.0"" encoding=""UTF-8""?><body st...",1,0,1,,
1,604795..txt,356 0,/Differences [1/rho/period/chi/nine/mu/ellipsi...,1,0,1,,
2,604743..txt,1129 0,/Differences [1/rho/period/chi/nine/mu/ellipsi...,1,0,1,,
3,1043874..txt,699 0,"<?xml version=""1.0"" encoding=""UTF-8""?><body st...",1,0,1,,
4,1477174..txt,1707 0,"<?xml version=""1.0"" encoding=""UTF-8""?><body st...",1,0,1,,
...,...,...,...,...,...,...,...,...
757,504116..txt,173 0,http://itis.gbif.net/pls/itisca/taxastep?king=...,"Unrated,Clean",1,0,itis.gbif.net,http
758,995566..txt,86 0,http://satryatama-ekaputra-fisip14.web.unair.a...,"Unrated,Clean",1,0,satryatama-ekaputra-fisip14.web.unair.ac.id,http
759,995566..txt,87 0,http://satryatama-ekaputra-fisip14.web.unair.a...,"Unrated,Clean",1,0,satryatama-ekaputra-fisip14.web.unair.ac.id,http
760,591396..txt,129 0,https://adeshpande3.github.io/adeshpande3.gith...,"Unrated,Clean",1,0,adeshpande3.github.io,https
