In [None]:
# Install external libraries
!pip install pandas
!pip install requests
!pip install python-whois
!pip install pyopenssl
!pip install beautifulsoup4
!pip install language-tool-python


Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5
Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8.1


In [None]:
import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse, urljoin
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime
import ssl
import socket
import OpenSSL
from bs4 import BeautifulSoup
import language_tool_python

# Replace with your Google API Key
API_KEY = 'API_KEY'
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Function Definitions (Merged)

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return "Suspicious tld: True" if 'matches' in result else "Suspicious tld: False"
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL safety: {e}")
        return "Unknown"

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL redirection: {e}")
        return 0

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1
    else:
        return 0

def check_https_token_in_url(url):
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except Exception as e:
        print(f"Error retrieving domain age: {e}")
        return None

def get_domain_expiration(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except Exception as e:
        print(f"Error retrieving domain expiration: {e}")
        return None

def check_registrar(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        if registrar and any(suspicious in registrar for suspicious in suspicious_registrars):
            return 1
        return 0
    except Exception as e:
        print(f"Error retrieving registrar: {e}")
        return None

def check_whois_privacy_protection(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        if any(indicator in registrant_info for indicator in privacy_protection_indicators):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking WHOIS privacy: {e}")
        return None

def check_owner_details(url):
    try:
        domain = url.split('/')[2]
        if domain.endswith('google.com'):
            return 0
        w = whois.whois(domain)
        if not w.get('registrant_name') or not w.get('registrant_email'):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking owner details: {e}")
        return None

def check_ssl_certificate(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return not_after > datetime.now()
    except Exception as e:
        print(f"Error checking SSL certificate: {e}")
        return False

def get_certificate_validity(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        validity_duration = (not_after - not_before).days
        return validity_duration, not_after > datetime.now()
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return None, False

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return len(matches), matches

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        input_types = {}
        for input_field in input_fields:
            input_type = input_field.get('type', 'text')
            input_types[input_type] = input_types.get(input_type, 0) + 1
        return len(input_fields), input_types
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, {}

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else 0, len(meta_description_tag['content']) if meta_description_tag else 0
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def main():
    url = input("Enter a URL: ")

    total_features = 27
    extracted_features = 0

    # General URL analysis
    print("--- URL Analysis ---")
    try:
        print(f"URL Length: {len(url)}")
        print(f"Number of characters after '.com': {len(url.split('.com')[-1]) if '.com' in url else 0}")
        extracted_features += 1
        print(f"URL Entropy: {calculate_entropy(url):.2f}")
        extracted_features += 1
        print(f"Contains IP Address: {has_ip_address(url)}")
        extracted_features += 1
        print(f"Number of Dots: {count_dots_in_url(url)}")
        extracted_features += 1
        print(f"Number of Hyphens: {count_hyphens_in_domain(url)}")
        extracted_features += 1
        print(f"Special Characters: {count_special_characters(url)}")
        extracted_features += 1
        print(f"Google Safe Browsing: {check_url_safety(url)}")
        extracted_features += 1
        print(f"Number of redirects: {check_url_redirection(url)}")
        extracted_features += 1
        print(f"Suspicious IP in URL: {check_suspicious_ip(url)}")
        extracted_features += 1
        print(f"HTTPS token in URL: {check_https_token_in_url(url)}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in URL Analysis: {e}")

    # WHOIS Analysis
    print("\n--- WHOIS Analysis ---")
    try:
        domain_age = get_domain_age(url)
        domain_expiration = get_domain_expiration(url)
        registrar_check = check_registrar(url)
        whois_privacy = check_whois_privacy_protection(url)
        owner_details = check_owner_details(url)

        print(f"Domain Age: {domain_age if domain_age else 'Unknown'} years")
        extracted_features += 1
        print(f"Domain Expiration Time: {domain_expiration if domain_expiration else 'Unknown'} days")
        extracted_features += 1
        print(f"Registrar check: {registrar_check}")
        extracted_features += 1
        print(f"WHOIS privacy protection: {whois_privacy}")
        extracted_features += 1
        print(f"Owner details check: {owner_details}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in WHOIS Analysis: {e}")

    # SSL Certificate Analysis
    print("\n--- SSL Certificate Analysis ---")
    try:
        ssl_status = check_ssl_certificate(url)
        certificate_validity, certificate_active = get_certificate_validity(url)
        print(f"SSL Certificate Valid: {'Yes' if ssl_status else 'No'}")
        extracted_features += 1
        print(f"SSL Certificate Validity Period: {certificate_validity if certificate_validity else 'Unknown'} days")
        extracted_features += 1
    except Exception as e:
        print(f"Error in SSL Analysis: {e}")

    # Webpage Content and Analysis
    print("\n--- Webpage Analysis ---")
    try:
        content = get_page_content(url)
        if content:
            num_issues, issues = check_grammar_and_spelling(content)
            print(f"Number of grammatical or spelling issues: {num_issues}")
            extracted_features += 1
        else:
            print("Failed to retrieve page content.")

        external_links, internal_links = get_page_links(url)
        print(f"Number of external links: {external_links}")
        extracted_features += 1
        print(f"Number of internal links: {internal_links}")
        extracted_features += 1

        login_forms = detect_login_forms(url)
        input_fields, input_types = detect_input_fields(url)
        print(f"Number of login forms: {login_forms}")
        extracted_features += 1
        print(f"Number of input fields: {input_fields}")
        extracted_features += 1

        js_length = measure_javascript_length(url)
        inline_scripts_length, iframe_count = detect_inline_scripts_and_iframes(url)
        print(f"Total JavaScript length: {js_length} characters")
        extracted_features += 1
        print(f"Total length of inline JavaScript: {inline_scripts_length} characters")
        extracted_features += 1
        print(f"Number of iframe elements: {iframe_count}")
        extracted_features += 1

        title_length, meta_desc_length = extract_title_and_meta_description_length(url)
        print(f"Title Length: {title_length} characters")
        extracted_features += 1
        print(f"Meta Description Length: {meta_desc_length} characters")
        extracted_features += 1
    except Exception as e:
        print(f"Error in Webpage Analysis: {e}")

    print("\n--- Summary ---")
    print(f"Extracted Features: {extracted_features}/{total_features} successfully")

if __name__ == "__main__":
    main()


Enter a URL: https://colab.research.google.com/drive/1dK54kkN9BVhadPLuaQgQ15hhAJIvvomI?authuser=2#scrollTo=Qnx4XsDYHv2f
--- URL Analysis ---
URL Length: 106
Number of characters after '.com': 73
URL Entropy: 5.24
Contains IP Address: False
Number of Dots: 3
Number of Hyphens: 0
Special Characters: 7
Google Safe Browsing: Suspicious tld: False
Number of redirects: 0
Suspicious IP in URL: 0
HTTPS token in URL: 1

--- WHOIS Analysis ---
Domain Age: 27.28219178082192 years
Domain Expiration Time: 1363 days
Registrar check: 0
WHOIS privacy protection: 0
Owner details check: 0

--- SSL Certificate Analysis ---
SSL Certificate Valid: Yes
SSL Certificate Validity Period: 83 days

--- Webpage Analysis ---


Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:03<00:00, 67.4MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpi5t7ljzf.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


Number of grammatical or spelling issues: 1
Number of external links: 1
Number of internal links: 0
Number of login forms: 0
Number of input fields: 0
Total JavaScript length: 64455 characters
Total length of inline JavaScript: 64453 characters
Number of iframe elements: 0
Title Length: 12 characters
Meta Description Length: 0 characters

--- Summary ---
Extracted Features: 27/27 successfully


In [None]:
import math
from collections import Counter
import re
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import whois
from datetime import datetime
import ssl
import socket
from bs4 import BeautifulSoup
import language_tool_python

# Replace with your Google API Key
API_KEY = 'API_KEY'
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return int(bool(ip_pattern.search(url)))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return int('matches' in result)
    except requests.exceptions.RequestException:
        return -1

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException:
        return -1

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    return int(bool(ipv4_pattern.match(host) or ipv6_pattern.match(host)))

def check_https_token_in_url(url):
    return int('https' in urlparse(url).path or 'https' in urlparse(url).query)

def get_domain_age(url):
    try:
        domain = urlparse(url).netloc
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except:
        return -1

def get_domain_expiration(url):
    try:
        domain = urlparse(url).netloc
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except:
        return -1

def check_registrar(url):
    try:
        domain = urlparse(url).netloc
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        return int(registrar and any(suspicious in registrar for suspicious in suspicious_registrars))
    except:
        return -1

def check_whois_privacy_protection(url):
    try:
        domain = urlparse(url).netloc
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        return int(any(indicator in registrant_info for indicator in privacy_protection_indicators))
    except:
        return -1

def check_owner_details(url):
    try:
        domain = urlparse(url).netloc
        w = whois.whois(domain)
        return int(not w.get('registrant_name') or not w.get('registrant_email'))
    except:
        return -1

def check_ssl_certificate(url):
    try:
        domain = urlparse(url).netloc
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return int(not_after > datetime.now())
    except:
        return 0

def get_certificate_validity(url):
    try:
        domain = urlparse(url).netloc
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return (not_after - not_before).days
    except:
        return -1

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except:
        return ""

def check_grammar_and_spelling(text):
    try:
        tool = language_tool_python.LanguageTool('en-US')
        matches = tool.check(text)
        return len(matches)
    except:
        return -1

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except:
        return -1, -1

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except:
        return -1

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        return len(input_fields)
    except:
        return -1

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except:
        return -1

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except:
        return -1, -1

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else -1, len(meta_description_tag['content']) if meta_description_tag else -1
    except:
        return -1, -1

def main():
    url = input("Enter a URL: ")

    # Extract features into a 2D array
    features = [
        ["URL Length", len(url)],
        ["Characters After .com", len(url.split('.com')[-1]) if '.com' in url else 0],
        ["URL Entropy", calculate_entropy(url)],
        ["Contains IP Address", has_ip_address(url)],
        ["Number of Dots", count_dots_in_url(url)],
        ["Number of Hyphens", count_hyphens_in_domain(url)],
        ["Special Characters Count", count_special_characters(url)],
        ["Google Safe Browsing", check_url_safety(url)],
        ["Number of Redirects", check_url_redirection(url)],
        ["Suspicious IP in URL", check_suspicious_ip(url)],
        ["HTTPS Token in URL", check_https_token_in_url(url)],
        ["Domain Age (years)", get_domain_age(url)],
        ["Domain Expiration (days)", get_domain_expiration(url)],
        ["Suspicious Registrar", check_registrar(url)],
        ["WHOIS Privacy Protection", check_whois_privacy_protection(url)],
        ["Missing Owner Details", check_owner_details(url)],
        ["SSL Certificate Valid", check_ssl_certificate(url)],
        ["Certificate Validity Period (days)", get_certificate_validity(url)],
        ["Grammar and Spelling Issues", check_grammar_and_spelling(get_page_content(url))],
        ["Number of External Links", get_page_links(url)[0]],
        ["Number of Internal Links", get_page_links(url)[1]],
        ["Number of Login Forms", detect_login_forms(url)],
        ["Number of Input Fields", detect_input_fields(url)],
        ["JavaScript Length", measure_javascript_length(url)],
        ["Inline Scripts Length", detect_inline_scripts_and_iframes(url)[0]],
        ["Number of Iframes", detect_inline_scripts_and_iframes(url)[1]],
        ["Title Length", extract_title_and_meta_description_length(url)[0]],
        ["Meta Description Length", extract_title_and_meta_description_length(url)[1]],
        ["Page Content Length", len(get_page_content(url))]
    ]

    print("\nExtracted Features:")
    for feature in features:
        print(f"{feature[0]}: {feature[1]}")

if __name__ == "__main__":
    main()


Enter a URL: https://colab.research.google.com/drive/1dK54kkN9BVhadPLuaQgQ15hhAJIvvomI?authuser=2#scrollTo=x-NrMID1LhGQ

Extracted Features:
URL Length: 106
Characters After .com: 73
URL Entropy: 5.1729580736058525
Contains IP Address: 0
Number of Dots: 3
Number of Hyphens: 0
Special Characters Count: 7
Google Safe Browsing: 0
Number of Redirects: 0
Suspicious IP in URL: 0
HTTPS Token in URL: 0
Domain Age (years): 27.28219178082192
Domain Expiration (days): 1363
Suspicious Registrar: 0
WHOIS Privacy Protection: 0
Missing Owner Details: 1
SSL Certificate Valid: 1
Certificate Validity Period (days): 83
Grammar and Spelling Issues: 1
Number of External Links: 1
Number of Internal Links: 0
Number of Login Forms: 0
Number of Input Fields: 0
JavaScript Length: 64465
Inline Scripts Length: 64450
Number of Iframes: 0
Title Length: 12
Meta Description Length: -1
Page Content Length: 19


In [None]:
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

# Load the trained RandomForest model and the scaler
with open('rf_best_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Predict function
def predict_from_features(features):
    # Get only required features based on the scaler's expected input size
    required_feature_count = scaler.mean_.shape[0]
    features = features[:required_feature_count]  # Truncate to required length

    # If fewer features are provided, pad with zeros
    if len(features) < required_feature_count:
        features += [0] * (required_feature_count - len(features))

    # Preprocess the features (e.g., scaling)
    features_scaled = scaler.transform([features])

    # Predict using the trained model
    prediction = rf_model.predict(features_scaled)
    prediction_proba = rf_model.predict_proba(features_scaled)

    # Return the prediction and probabilities
    return prediction[0], prediction_proba[0]

# Example usage
if __name__ == "__main__":
    # Example features array (replace with actual feature extraction logic)
    features = [
        106.0,  # URL length
        5.1729580736058525,   # URL entropy
        0.0,   # Contains IP address
        3.0,   # Count dots
        0.0,   # Contains hyphens
        7.0,   # Count special characters
        0.0,   # Suspicious TLD
        0.0,   # URL redirection
        1.0,   # Contains HTTPS token
        27.28219178082192, # Domain age
        1363.0, # Domain expiration
        1.0,   # Privacy protection
        1.0,  # SSL validity
        1.0,   # External links
        0.0,   # Internal links
        0.0,   # Login forms
        0.0,   # Input fields
        0.0,   # Hidden elements
        64465.0,# JavaScript length
        64450.0, # Inline script length
        0.0    # iFrames
    ]

    # Get the prediction
    label, probabilities = predict_from_features(features)

    # Output the result
    if label == 1:
        print("The URL is classified as: Phishy")
    else:
        print("The URL is classified as: Legitimate")

    print(f"Prediction Probabilities: Legitimate: {probabilities[0]:.2f}, Phishy: {probabilities[1]:.2f}")


The URL is classified as: Phishy
Prediction Probabilities: Legitimate: 0.41, Phishy: 0.59




In [None]:
import pandas as pd

a = pd.read_csv('updated_dataset2.csv')
a.head()

Unnamed: 0,url,url_length,url_entropy,contains_ip_address,count_dots,contains_hyphens,count_special_chars,suspicious_tld,url_redirection,contains_https_token,...,ssl_issuer,external_links,internal_links,login_forms,input_fields,hidden_elements,javascript_length,inline_script_length,iframes,status
0,http://www.crestonwood.com/router.php,37.0,3.787043,0.0,3.0,0.0,7.0,0.0,0.0,0.0,...,Let's Encrypt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,http://rgipt.ac.in,18.0,3.46132,0.0,2.0,0.0,5.0,0.0,0.0,0.0,...,Greater Manchester,62.0,5.0,0.0,6.0,0.0,3179.0,3097.0,0.0,0
2,http://www.mutuo.it,19.0,3.260828,0.0,2.0,0.0,5.0,0.0,0.0,0.0,...,Google Trust Services,76.0,3.0,0.0,12.0,0.0,3569.0,0.0,0.0,0
3,http://vamoaestudiarmedicina.blogspot.com/,42.0,4.19554,0.0,2.0,0.0,6.0,0.0,0.0,0.0,...,Google Trust Services,79.0,39.0,0.0,0.0,0.0,8085.0,8085.0,0.0,0
4,https://www.astrologyonline.eu/Astro_MemoNew/P...,56.0,4.344567,0.0,3.0,0.0,8.0,0.0,0.0,0.0,...,Let's Encrypt,4.0,9.0,0.0,0.0,0.0,1230.0,1230.0,0.0,0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = 'updated_dataset2.csv'
dataset = pd.read_csv(file_path)

# Identify non-numeric columns
non_numeric_columns = dataset.select_dtypes(include=['object']).columns

# Convert non-numeric columns to numeric using Label Encoding
label_encoders = {}
for column in non_numeric_columns:
    if column != 'url':  # Exclude 'url' from encoding as it will not be used for modeling
        le = LabelEncoder()
        dataset[column] = le.fit_transform(dataset[column].astype(str))
        label_encoders[column] = le

# Handle missing values
imputer = SimpleImputer(strategy='mean')
dataset.iloc[:, 1:] = imputer.fit_transform(dataset.iloc[:, 1:])

# Separate features and target variable
X = dataset.drop(columns=['url', 'status'])
y = dataset['status']

# Apply SMOTE for class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='recall', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_model = rf_grid_search.best_estimator_

# Train Logistic Regression
log_reg_classifier = LogisticRegression(random_state=42, max_iter=500)
log_reg_classifier.fit(X_train, y_train)

# Predictions
rf_predictions = rf_best_model.predict(X_test)
log_reg_predictions = log_reg_classifier.predict(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)

    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}\n")

# Evaluate models
evaluate_model(y_test, rf_predictions, "Random Forest (Tuned)")
evaluate_model(y_test, log_reg_predictions, "Logistic Regression")




Random Forest (Tuned) Performance:
Accuracy: 0.94
Precision: 0.94
Recall: 0.95
F1 Score: 0.94

Logistic Regression Performance:
Accuracy: 0.82
Precision: 0.80
Recall: 0.87
F1 Score: 0.83



In [None]:
import pickle

with open('rf_best_model.pkl', 'wb') as model_file:
    pickle.dump(rf_best_model, model_file)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [None]:
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

# Load the trained RandomForest model and the scaler
with open('rf_best_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Example feature extraction function (you should replace this with your real feature extraction logic)
def extract_features(url):
    """
    Placeholder for the URL feature extraction logic.
    Replace this with your actual feature extraction implementation.
    """
    # Example feature values based on the dataset
    return [
        len(url),  # URL length
        3.5,  # URL entropy
        0,  # Contains IP address
        url.count('.'),  # Count dots
        1 if '-' in url else 0,  # Contains hyphens
        sum(not char.isalnum() for char in url),  # Count special characters
        1 if any(tld in url for tld in ['.ru', '.cn']) else 0,  # Suspicious TLD
        1 if '//' in url.split('/')[2] else 0,  # URL redirection
        1 if 'https' in url.split(':')[0] else 0,  # Contains HTTPS token
        365,  # Domain age (default value if unknown)
        200,  # Domain expiration (default value if unknown)
        1,  # Privacy protection (example default)
        50,  # SSL validity (default value)
        0,  # External links (default value)
        5,  # Internal links (default value)
        0,  # Login forms (example default)
        0,  # Input fields (default value)
        0,  # Hidden elements (default value)
        1000,  # JavaScript length (default value)
        500,  # Inline script length (default value)
        0,  # iFrames (default value)
    ]

# Predict function
def predict_url(url):
    # Extract features
    features = extract_features(url)

    # Preprocess the features (e.g., scaling)
    features_scaled = scaler.transform([features])

    # Predict using the trained model
    prediction = rf_model.predict(features_scaled)
    prediction_proba = rf_model.predict_proba(features_scaled)

    # Return the prediction and probabilities
    return prediction[0], prediction_proba[0]

# Example usage
if __name__ == "__main__":
    # Input URL to check
    url_to_check = "http://example.com/login"

    # Get the prediction
    label, probabilities = predict_url(url_to_check)

    # Output the result
    if label == 1:
        print(f"The URL '{url_to_check}' is classified as: Phishy")
    else:
        print(f"The URL '{url_to_check}' is classified as: Legitimate")

    print(f"Prediction Probabilities: Legitimate: {probabilities[0]:.2f}, Phishy: {probabilities[1]:.2f}")




---
Again Complete feature Extraction Code:


In [None]:
!pip install pandas requests whois-python pyopenssl beautifulsoup4 language-tool-python


[31mERROR: Could not find a version that satisfies the requirement whois-python (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for whois-python[0m[31m
[0m

In [None]:
pip install whois-python

[31mERROR: Could not find a version that satisfies the requirement whois-python (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for whois-python[0m[31m
[0m

In [None]:
import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse, urljoin
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime
import ssl
import socket
import OpenSSL
from bs4 import BeautifulSoup
import language_tool_python

# Replace with your Google API Key
API_KEY = 'API_KEY'
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Function Definitions (Merged)

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return "Suspicious tld: True" if 'matches' in result else "Suspicious tld: False"
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL safety: {e}")
        return "Unknown"

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL redirection: {e}")
        return 0

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1
    else:
        return 0

def check_https_token_in_url(url):
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except Exception as e:
        print(f"Error retrieving domain age: {e}")
        return None

def get_domain_expiration(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except Exception as e:
        print(f"Error retrieving domain expiration: {e}")
        return None

def check_registrar(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        if registrar and any(suspicious in registrar for suspicious in suspicious_registrars):
            return 1
        return 0
    except Exception as e:
        print(f"Error retrieving registrar: {e}")
        return None

def check_whois_privacy_protection(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        if any(indicator in registrant_info for indicator in privacy_protection_indicators):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking WHOIS privacy: {e}")
        return None

def check_owner_details(url):
    try:
        domain = url.split('/')[2]
        if domain.endswith('google.com'):
            return 0
        w = whois.whois(domain)
        if not w.get('registrant_name') or not w.get('registrant_email'):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking owner details: {e}")
        return None

def check_ssl_certificate(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return not_after > datetime.now()
    except Exception as e:
        print(f"Error checking SSL certificate: {e}")
        return False

def get_certificate_validity(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        validity_duration = (not_after - not_before).days
        return validity_duration, not_after > datetime.now()
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return None, False

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return len(matches), matches

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        input_types = {}
        for input_field in input_fields:
            input_type = input_field.get('type', 'text')
            input_types[input_type] = input_types.get(input_type, 0) + 1
        return len(input_fields), input_types
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, {}

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else 0, len(meta_description_tag['content']) if meta_description_tag else 0
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def main():
    url = input("Enter a URL: ")

    total_features = 27
    extracted_features = 0

    # General URL analysis
    print("--- URL Analysis ---")
    try:
        print(f"URL Length: {len(url)}")
        print(f"Number of characters after '.com': {len(url.split('.com')[-1]) if '.com' in url else 0}")
        extracted_features += 1
        print(f"URL Entropy: {calculate_entropy(url):.2f}")
        extracted_features += 1
        print(f"Contains IP Address: {has_ip_address(url)}")
        extracted_features += 1
        print(f"Number of Dots: {count_dots_in_url(url)}")
        extracted_features += 1
        print(f"Number of Hyphens: {count_hyphens_in_domain(url)}")
        extracted_features += 1
        print(f"Special Characters: {count_special_characters(url)}")
        extracted_features += 1
        print(f"Google Safe Browsing: {check_url_safety(url)}")
        extracted_features += 1
        print(f"Number of redirects: {check_url_redirection(url)}")
        extracted_features += 1
        print(f"Suspicious IP in URL: {check_suspicious_ip(url)}")
        extracted_features += 1
        print(f"HTTPS token in URL: {check_https_token_in_url(url)}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in URL Analysis: {e}")

    # WHOIS Analysis
    print("\n--- WHOIS Analysis ---")
    try:
        domain_age = get_domain_age(url)
        domain_expiration = get_domain_expiration(url)
        registrar_check = check_registrar(url)
        whois_privacy = check_whois_privacy_protection(url)
        owner_details = check_owner_details(url)

        print(f"Domain Age: {domain_age if domain_age else 'Unknown'} years")
        extracted_features += 1
        print(f"Domain Expiration Time: {domain_expiration if domain_expiration else 'Unknown'} days")
        extracted_features += 1
        print(f"Registrar check: {registrar_check}")
        extracted_features += 1
        print(f"WHOIS privacy protection: {whois_privacy}")
        extracted_features += 1
        print(f"Owner details check: {owner_details}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in WHOIS Analysis: {e}")

    # SSL Certificate Analysis
    print("\n--- SSL Certificate Analysis ---")
    try:
        ssl_status = check_ssl_certificate(url)
        certificate_validity, certificate_active = get_certificate_validity(url)
        print(f"SSL Certificate Valid: {'Yes' if ssl_status else 'No'}")
        extracted_features += 1
        print(f"SSL Certificate Validity Period: {certificate_validity if certificate_validity else 'Unknown'} days")
        extracted_features += 1
    except Exception as e:
        print(f"Error in SSL Analysis: {e}")

    # Webpage Content and Analysis
    print("\n--- Webpage Analysis ---")
    try:
        content = get_page_content(url)
        if content:
            num_issues, issues = check_grammar_and_spelling(content)
            print(f"Number of grammatical or spelling issues: {num_issues}")
            extracted_features += 1
        else:
            print("Failed to retrieve page content.")

        external_links, internal_links = get_page_links(url)
        print(f"Number of external links: {external_links}")
        extracted_features += 1
        print(f"Number of internal links: {internal_links}")
        extracted_features += 1

        login_forms = detect_login_forms(url)
        input_fields, input_types = detect_input_fields(url)
        print(f"Number of login forms: {login_forms}")
        extracted_features += 1
        print(f"Number of input fields: {input_fields}")
        extracted_features += 1

        js_length = measure_javascript_length(url)
        inline_scripts_length, iframe_count = detect_inline_scripts_and_iframes(url)
        print(f"Total JavaScript length: {js_length} characters")
        extracted_features += 1
        print(f"Total length of inline JavaScript: {inline_scripts_length} characters")
        extracted_features += 1
        print(f"Number of iframe elements: {iframe_count}")
        extracted_features += 1

        title_length, meta_desc_length = extract_title_and_meta_description_length(url)
        print(f"Title Length: {title_length} characters")
        extracted_features += 1
        print(f"Meta Description Length: {meta_desc_length} characters")
        extracted_features += 1
    except Exception as e:
        print(f"Error in Webpage Analysis: {e}")

    print("\n--- Summary ---")
    print(f"Extracted Features: {extracted_features}/{total_features} successfully")

if __name__ == "__main__":
    main()


Enter a URL: https://www.snapchat.com/
--- URL Analysis ---
URL Length: 25
Number of characters after '.com': 1
URL Entropy: 3.59
Contains IP Address: False
Number of Dots: 2
Number of Hyphens: 0
Special Characters: 3
Google Safe Browsing: Suspicious tld: False
Number of redirects: 0
Suspicious IP in URL: 0
HTTPS token in URL: 1

--- WHOIS Analysis ---
Domain Age: 12.835616438356164 years
Domain Expiration Time: 428 days
Registrar check: 0
WHOIS privacy protection: 0
Owner details check: 1

--- SSL Certificate Analysis ---
SSL Certificate Valid: Yes
SSL Certificate Validity Period: 364 days

--- Webpage Analysis ---
Error fetching the page: 403 Client Error: Forbidden for url: https://www.snapchat.com/
Failed to retrieve page content.
Error fetching the page: 403 Client Error: Forbidden for url: https://www.snapchat.com/
Number of external links: 0
Number of internal links: 0
Error fetching the page: 403 Client Error: Forbidden for url: https://www.snapchat.com/
Error fetching the page

In [None]:
!apt-get update
!apt-get install -y wget unzip
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -f install -y
!wget -O /tmp/chromedriver.zip https://chromedriver.storage.googleapis.com/$(curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE)/chromedriver_linux64.zip
!unzip /tmp/chromedriver.zip -d /usr/local/bin/


Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,566 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,517 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Pac

In [None]:
!pip install selenium


Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.28.0-py3-none-any.whl (486 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.3/486.3 kB[0m [31m24.5 MB/s

In [None]:
pip install python-whois


Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5


In [None]:
pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8.1


In [None]:
import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse, urljoin
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime
import ssl
import socket
import OpenSSL
from bs4 import BeautifulSoup
import language_tool_python
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


# Replace with your Google API Key
API_KEY = 'API_KEY'
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Function Definitions (Merged)

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return "Suspicious tld: True" if 'matches' in result else "Suspicious tld: False"
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL safety: {e}")
        return "Unknown"

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL redirection: {e}")
        return 0

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1
    else:
        return 0

def check_https_token_in_url(url):
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except Exception as e:
        print(f"Error retrieving domain age: {e}")
        return None

def get_domain_expiration(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except Exception as e:
        print(f"Error retrieving domain expiration: {e}")
        return None

def check_registrar(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        if registrar and any(suspicious in registrar for suspicious in suspicious_registrars):
            return 1
        return 0
    except Exception as e:
        print(f"Error retrieving registrar: {e}")
        return None

def check_whois_privacy_protection(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        if any(indicator in registrant_info for indicator in privacy_protection_indicators):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking WHOIS privacy: {e}")
        return None

def check_owner_details(url):
    try:
        domain = url.split('/')[2]
        if domain.endswith('google.com'):
            return 0
        w = whois.whois(domain)
        if not w.get('registrant_name') or not w.get('registrant_email'):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking owner details: {e}")
        return None

def check_ssl_certificate(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return not_after > datetime.now()
    except Exception as e:
        print(f"Error checking SSL certificate: {e}")
        return False

def get_certificate_validity(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        validity_duration = (not_after - not_before).days
        return validity_duration, not_after > datetime.now()
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return None, False

def get_page_content_with_selenium(url):
    # Configure headless Chrome
    # Configure Chrome options
  options = Options()
  options.add_argument("--headless")  # Run Chrome in headless mode
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-dev-shm-usage")

  # Path to the ChromeDriver
  service = Service("/usr/local/bin/chromedriver")  # Update this path
  driver = webdriver.Chrome(service=service, options=options)
  try:
      driver.get(url)
      print("Page Title:", driver.title)  # Print the page title
      page_content = driver.page_source  # Get the page source
      print("Page content retrieved successfully!")
  except Exception as e:
      print("Error accessing page:", e)
  finally:
      driver.quit()

def check_grammar_and_spelling(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return len(matches), matches

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        input_types = {}
        for input_field in input_fields:
            input_type = input_field.get('type', 'text')
            input_types[input_type] = input_types.get(input_type, 0) + 1
        return len(input_fields), input_types
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, {}

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else 0, len(meta_description_tag['content']) if meta_description_tag else 0
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def main():
    url = input("Enter a URL: ")

    total_features = 27
    extracted_features = 0

    # General URL analysis
    print("--- URL Analysis ---")
    try:
        print(f"URL Length: {len(url)}")
        print(f"Number of characters after '.com': {len(url.split('.com')[-1]) if '.com' in url else 0}")
        extracted_features += 1
        print(f"URL Entropy: {calculate_entropy(url):.2f}")
        extracted_features += 1
        print(f"Contains IP Address: {has_ip_address(url)}")
        extracted_features += 1
        print(f"Number of Dots: {count_dots_in_url(url)}")
        extracted_features += 1
        print(f"Number of Hyphens: {count_hyphens_in_domain(url)}")
        extracted_features += 1
        print(f"Special Characters: {count_special_characters(url)}")
        extracted_features += 1
        print(f"Google Safe Browsing: {check_url_safety(url)}")
        extracted_features += 1
        print(f"Number of redirects: {check_url_redirection(url)}")
        extracted_features += 1
        print(f"Suspicious IP in URL: {check_suspicious_ip(url)}")
        extracted_features += 1
        print(f"HTTPS token in URL: {check_https_token_in_url(url)}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in URL Analysis: {e}")

    # WHOIS Analysis
    print("\n--- WHOIS Analysis ---")
    try:
        domain_age = get_domain_age(url)
        domain_expiration = get_domain_expiration(url)
        registrar_check = check_registrar(url)
        whois_privacy = check_whois_privacy_protection(url)
        owner_details = check_owner_details(url)

        print(f"Domain Age: {domain_age if domain_age else 'Unknown'} years")
        extracted_features += 1
        print(f"Domain Expiration Time: {domain_expiration if domain_expiration else 'Unknown'} days")
        extracted_features += 1
        print(f"Registrar check: {registrar_check}")
        extracted_features += 1
        print(f"WHOIS privacy protection: {whois_privacy}")
        extracted_features += 1
        print(f"Owner details check: {owner_details}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in WHOIS Analysis: {e}")

    # SSL Certificate Analysis
    print("\n--- SSL Certificate Analysis ---")
    try:
        ssl_status = check_ssl_certificate(url)
        certificate_validity, certificate_active = get_certificate_validity(url)
        print(f"SSL Certificate Valid: {'Yes' if ssl_status else 'No'}")
        extracted_features += 1
        print(f"SSL Certificate Validity Period: {certificate_validity if certificate_validity else 'Unknown'} days")
        extracted_features += 1
    except Exception as e:
        print(f"Error in SSL Analysis: {e}")

    # Webpage Content and Analysis
    print("\n--- Webpage Analysis ---")
    try:
        content = get_page_content(url)
        if content:
            num_issues, issues = check_grammar_and_spelling(content)
            print(f"Number of grammatical or spelling issues: {num_issues}")
            extracted_features += 1
        else:
            print("Failed to retrieve page content.")

        external_links, internal_links = get_page_links(url)
        print(f"Number of external links: {external_links}")
        extracted_features += 1
        print(f"Number of internal links: {internal_links}")
        extracted_features += 1

        login_forms = detect_login_forms(url)
        input_fields, input_types = detect_input_fields(url)
        print(f"Number of login forms: {login_forms}")
        extracted_features += 1
        print(f"Number of input fields: {input_fields}")
        extracted_features += 1

        js_length = measure_javascript_length(url)
        inline_scripts_length, iframe_count = detect_inline_scripts_and_iframes(url)
        print(f"Total JavaScript length: {js_length} characters")
        extracted_features += 1
        print(f"Total length of inline JavaScript: {inline_scripts_length} characters")
        extracted_features += 1
        print(f"Number of iframe elements: {iframe_count}")
        extracted_features += 1

        title_length, meta_desc_length = extract_title_and_meta_description_length(url)
        print(f"Title Length: {title_length} characters")
        extracted_features += 1
        print(f"Meta Description Length: {meta_desc_length} characters")
        extracted_features += 1
    except Exception as e:
        print(f"Error in Webpage Analysis: {e}")

    print("\n--- Summary ---")
    print(f"Extracted Features: {extracted_features}/{total_features} successfully")

if __name__ == "__main__":
    main()


Enter a URL: https://www.snapchat.com/
--- URL Analysis ---
URL Length: 25
Number of characters after '.com': 1
URL Entropy: 3.59
Contains IP Address: False
Number of Dots: 2
Number of Hyphens: 0
Special Characters: 3
Google Safe Browsing: Suspicious tld: False
Number of redirects: 0
Suspicious IP in URL: 0
HTTPS token in URL: 1

--- WHOIS Analysis ---
Domain Age: 12.835616438356164 years
Domain Expiration Time: 428 days
Registrar check: 0
WHOIS privacy protection: 0
Owner details check: 1

--- SSL Certificate Analysis ---
SSL Certificate Valid: Yes
SSL Certificate Validity Period: 364 days

--- Webpage Analysis ---
Error in Webpage Analysis: name 'get_page_content' is not defined

--- Summary ---
Extracted Features: 17/27 successfully


In [None]:
!pip install pandas
!pip install requests
!pip install whois
!pip install python-whois
!pip install language-tool-python
!pip install beautifulsoup4
!pip install pyOpenSSL


Collecting whois
  Downloading whois-1.20240129.2-py3-none-any.whl.metadata (1.3 kB)
Downloading whois-1.20240129.2-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: whois
Successfully installed whois-1.20240129.2
Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5
Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8.1


In [None]:
import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse, urljoin
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime
import ssl
import socket
import OpenSSL
from bs4 import BeautifulSoup
import language_tool_python

# Replace with your Google API Key
API_KEY = 'API_KEY'
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Function Definitions (Merged)

def count_characters_after_tld(url, tld=".com"):
    try:
        if tld in url:
            index = url.find(tld) + len(tld)  # Find the position right after the TLD
            return len(url) - index  # Calculate characters after the TLD
        return 0
    except Exception as e:
        print(f"Error counting characters after TLD: {e}")
        return 0

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return "Suspicious tld: True" if 'matches' in result else "Suspicious tld: False"
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL safety: {e}")
        return "Unknown"

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL redirection: {e}")
        return 0

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1
    else:
        return 0

def check_https_token_in_url(url):
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except Exception as e:
        print(f"Error retrieving domain age: {e}")
        return None

def get_domain_expiration(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except Exception as e:
        print(f"Error retrieving domain expiration: {e}")
        return None

def check_registrar(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        if registrar and any(suspicious in registrar for suspicious in suspicious_registrars):
            return 1
        return 0
    except Exception as e:
        print(f"Error retrieving registrar: {e}")
        return None

def check_whois_privacy_protection(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        if any(indicator in registrant_info for indicator in privacy_protection_indicators):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking WHOIS privacy: {e}")
        return None

def check_owner_details(url):
    try:
        domain = url.split('/')[2]
        if domain.endswith('google.com'):
            return 0
        w = whois.whois(domain)
        if not w.get('registrant_name') or not w.get('registrant_email'):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking owner details: {e}")
        return None

def check_ssl_certificate(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return not_after > datetime.now()
    except Exception as e:
        print(f"Error checking SSL certificate: {e}")
        return False

def get_certificate_validity(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        validity_duration = (not_after - not_before).days
        return validity_duration, not_after > datetime.now()
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return None, False

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return len(matches), matches

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        input_types = {}
        for input_field in input_fields:
            input_type = input_field.get('type', 'text')
            input_types[input_type] = input_types.get(input_type, 0) + 1
        return len(input_fields), input_types
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, {}

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else 0, len(meta_description_tag['content']) if meta_description_tag else 0
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def main():
    url = input("Enter a URL: ")

    total_features = 27
    extracted_features = 0

    # General URL analysis
    print("--- URL Analysis ---")
    try:
        print(f"URL Length: {len(url)}")
        print(f"Number of characters after '.com': {count_characters_after_tld(url)}")
        extracted_features += 1
        print(f"URL Entropy: {calculate_entropy(url):.2f}")
        extracted_features += 1
        print(f"Contains IP Address: {has_ip_address(url)}")
        extracted_features += 1
        print(f"Number of Dots: {count_dots_in_url(url)}")
        extracted_features += 1
        print(f"Number of Hyphens: {count_hyphens_in_domain(url)}")
        extracted_features += 1
        print(f"Special Characters: {count_special_characters(url)}")
        extracted_features += 1
        print(f"Google Safe Browsing: {check_url_safety(url)}")
        extracted_features += 1
        print(f"Number of redirects: {check_url_redirection(url)}")
        extracted_features += 1
        print(f"Suspicious IP in URL: {check_suspicious_ip(url)}")
        extracted_features += 1
        print(f"HTTPS token in URL: {check_https_token_in_url(url)}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in URL Analysis: {e}")

    # WHOIS Analysis
    print("\n--- WHOIS Analysis ---")
    try:
        domain_age = get_domain_age(url)
        if domain_age is not None:
            print(f"Domain Age: {domain_age:.2f} years")
            extracted_features += 1

        domain_expiration = get_domain_expiration(url)
        if domain_expiration is not None:
            print(f"Domain Expiration Time: {domain_expiration} days")
            extracted_features += 1

        registrar_check = check_registrar(url)
        if registrar_check is not None:
            print(f"Registrar check: {registrar_check}")
            extracted_features += 1

        whois_privacy = check_whois_privacy_protection(url)
        if whois_privacy is not None:
            print(f"WHOIS privacy protection: {whois_privacy}")
            extracted_features += 1

        owner_details = check_owner_details(url)
        if owner_details is not None:
            print(f"Owner details check: {owner_details}")
            extracted_features += 1
    except Exception as e:
        print(f"Error in WHOIS Analysis: {e}")

    # SSL Certificate Analysis
    print("\n--- SSL Certificate Analysis ---")
    try:
        ssl_status = check_ssl_certificate(url)
        if ssl_status:
            print(f"SSL Certificate Valid: {'Yes' if ssl_status else 'No'}")
            extracted_features += 1

        certificate_validity, certificate_active = get_certificate_validity(url)
        if certificate_validity:
            print(f"SSL Certificate Validity Period: {certificate_validity} days")
            extracted_features += 1
    except Exception as e:
        print(f"Error in SSL Analysis: {e}")

    # Webpage Content and Analysis
    print("\n--- Webpage Analysis ---")
    try:
        content = get_page_content(url)
        if content:
            num_issues, issues = check_grammar_and_spelling(content)
            print(f"Number of grammatical or spelling issues: {num_issues}")
            extracted_features += 1

            external_links, internal_links = get_page_links(url)
            print(f"Number of external links: {external_links}")
            extracted_features += 1
            print(f"Number of internal links: {internal_links}")
            extracted_features += 1

            login_forms = detect_login_forms(url)
            print(f"Number of login forms: {login_forms}")
            extracted_features += 1

            input_fields, input_types = detect_input_fields(url)
            print(f"Number of input fields: {input_fields}")
            extracted_features += 1

            js_length = measure_javascript_length(url)
            print(f"Total JavaScript length: {js_length} characters")
            extracted_features += 1

            inline_scripts_length, iframe_count = detect_inline_scripts_and_iframes(url)
            print(f"Total length of inline JavaScript: {inline_scripts_length} characters")
            extracted_features += 1
            print(f"Number of iframe elements: {iframe_count}")
            extracted_features += 1

            title_length, meta_desc_length = extract_title_and_meta_description_length(url)
            print(f"Title Length: {title_length} characters")
            extracted_features += 1
            print(f"Meta Description Length: {meta_desc_length} characters")
            extracted_features += 1
        else:
            print("Failed to retrieve page content.")
    except Exception as e:
        print(f"Error in Webpage Analysis: {e}")

    print("\n--- Summary ---")
    print(f"Extracted Features: {extracted_features}/{total_features} successfully")

if __name__ == "__main__":
    main()


Enter a URL: https://www.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2F
--- URL Analysis ---
URL Length: 70
Number of characters after '.com': 46
URL Entropy: 4.54
Contains IP Address: False
Number of Dots: 4
Number of Hyphens: 0
Special Characters: 10
Google Safe Browsing: Suspicious tld: False
Number of redirects: 0
Suspicious IP in URL: 0
HTTPS token in URL: 1

--- WHOIS Analysis ---
Domain Age: 27.77 years
Domain Expiration Time: 3013 days
Registrar check: 0
WHOIS privacy protection: 0
Owner details check: 1

--- SSL Certificate Analysis ---
SSL Certificate Valid: Yes
SSL Certificate Validity Period: 90 days

--- Webpage Analysis ---
Number of grammatical or spelling issues: 15
Number of external links: 18
Number of internal links: 29
Number of login forms: 1
Number of input fields: 23
Total JavaScript length: 48735 characters
Total length of inline JavaScript: 48734 characters
Number of iframe elements: 0
Title Length: 11 characters
Meta Description Length: 31 characte

In [None]:
import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse, urljoin
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime
import ssl
import socket
import OpenSSL
from bs4 import BeautifulSoup
import language_tool_python

# Replace with your Google API Key
API_KEY = ''
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Function Definitions (Merged)

def count_characters_after_tld(url, tld=".com"):
    try:
        if tld in url:
            index = url.find(tld) + len(tld)  # Find the position right after the TLD
            return len(url) - index  # Calculate characters after the TLD
        return 0
    except Exception as e:
        print(f"Error counting characters after TLD: {e}")
        return 0

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return "Suspicious tld: True" if 'matches' in result else "Suspicious tld: False"
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL safety: {e}")
        return "Unknown"

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL redirection: {e}")
        return 0

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1
    else:
        return 0

def check_https_token_in_url(url):
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except Exception as e:
        print(f"Error retrieving domain age: {e}")
        return None

def get_domain_expiration(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except Exception as e:
        print(f"Error retrieving domain expiration: {e}")
        return None

def check_registrar(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        if registrar and any(suspicious in registrar for suspicious in suspicious_registrars):
            return 1
        return 0
    except Exception as e:
        print(f"Error retrieving registrar: {e}")
        return None

def check_whois_privacy_protection(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        if any(indicator in registrant_info for indicator in privacy_protection_indicators):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking WHOIS privacy: {e}")
        return None

def check_owner_details(url):
    try:
        domain = url.split('/')[2]
        if domain.endswith('google.com'):
            return 0
        w = whois.whois(domain)
        if not w.get('registrant_name') or not w.get('registrant_email'):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking owner details: {e}")
        return None

def check_ssl_certificate(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return not_after > datetime.now()
    except Exception as e:
        print(f"Error checking SSL certificate: {e}")
        return False

def get_certificate_validity(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        validity_duration = (not_after - not_before).days
        return validity_duration, not_after > datetime.now()
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return None, False

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return len(matches), matches

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        input_types = {}
        for input_field in input_fields:
            input_type = input_field.get('type', 'text')
            input_types[input_type] = input_types.get(input_type, 0) + 1
        return len(input_fields), input_types
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, {}

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else 0, len(meta_description_tag['content']) if meta_description_tag else 0
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def main():
    url = input("Enter a URL: ")

    total_features = 27
    extracted_features = 0

    # General URL analysis
    print("--- URL Analysis ---")
    try:
        print(f"URL Length: {len(url)}")
        print(f"Number of characters after '.com': {count_characters_after_tld(url)}")
        extracted_features += 1
        print(f"URL Entropy: {calculate_entropy(url):.2f}")
        extracted_features += 1
        print(f"Contains IP Address: {has_ip_address(url)}")
        extracted_features += 1
        print(f"Number of Dots: {count_dots_in_url(url)}")
        extracted_features += 1
        print(f"Number of Hyphens: {count_hyphens_in_domain(url)}")
        extracted_features += 1
        print(f"Special Characters: {count_special_characters(url)}")
        extracted_features += 1
        print(f"Google Safe Browsing: {check_url_safety(url)}")
        extracted_features += 1
        print(f"Number of redirects: {check_url_redirection(url)}")
        extracted_features += 1
        print(f"Suspicious IP in URL: {check_suspicious_ip(url)}")
        extracted_features += 1
        print(f"HTTPS token in URL: {check_https_token_in_url(url)}")
        extracted_features += 1
    except Exception as e:
        print(f"Error in URL Analysis: {e}")

    # WHOIS Analysis
    print("\n--- WHOIS Analysis ---")
    try:
        domain_age = get_domain_age(url)
        if domain_age is not None:
            print(f"Domain Age: {domain_age:.2f} years")
            extracted_features += 1

        domain_expiration = get_domain_expiration(url)
        if domain_expiration is not None:
            print(f"Domain Expiration Time: {domain_expiration} days")
            extracted_features += 1

        registrar_check = check_registrar(url)
        if registrar_check is not None:
            print(f"Registrar check: {registrar_check}")
            extracted_features += 1

        whois_privacy = check_whois_privacy_protection(url)
        if whois_privacy is not None:
            print(f"WHOIS privacy protection: {whois_privacy}")
            extracted_features += 1

        owner_details = check_owner_details(url)
        if owner_details is not None:
            print(f"Owner details check: {owner_details}")
            extracted_features += 1
    except Exception as e:
        print(f"Error in WHOIS Analysis: {e}")

    # SSL Certificate Analysis
    print("\n--- SSL Certificate Analysis ---")
    try:
        ssl_status = check_ssl_certificate(url)
        if ssl_status:
            print(f"SSL Certificate Valid: {'Yes' if ssl_status else 'No'}")
            extracted_features += 1

        certificate_validity, certificate_active = get_certificate_validity(url)
        if certificate_validity:
            print(f"SSL Certificate Validity Period: {certificate_validity} days")
            extracted_features += 1
    except Exception as e:
        print(f"Error in SSL Analysis: {e}")

    # Webpage Content and Analysis
    print("\n--- Webpage Analysis ---")
    try:
        content = get_page_content(url)
        if content:
            num_issues, issues = check_grammar_and_spelling(content)
            print(f"Number of grammatical or spelling issues: {num_issues}")
            extracted_features += 1

            external_links, internal_links = get_page_links(url)
            print(f"Number of external links: {external_links}")
            extracted_features += 1
            print(f"Number of internal links: {internal_links}")
            extracted_features += 1

            login_forms = detect_login_forms(url)
            print(f"Number of login forms: {login_forms}")
            extracted_features += 1

            input_fields, input_types = detect_input_fields(url)
            print(f"Number of input fields: {input_fields}")
            extracted_features += 1

            js_length = measure_javascript_length(url)
            print(f"Total JavaScript length: {js_length} characters")
            extracted_features += 1

            inline_scripts_length, iframe_count = detect_inline_scripts_and_iframes(url)
            print(f"Total length of inline JavaScript: {inline_scripts_length} characters")
            extracted_features += 1
            print(f"Number of iframe elements: {iframe_count}")
            extracted_features += 1

            title_length, meta_desc_length = extract_title_and_meta_description_length(url)
            print(f"Title Length: {title_length} characters")
            extracted_features += 1
            print(f"Meta Description Length: {meta_desc_length} characters")
            extracted_features += 1
        else:
            print("Failed to retrieve page content.")
    except Exception as e:
        print(f"Error in Webpage Analysis: {e}")

    print("\n--- Summary ---")
    print(f"Extracted Features: {extracted_features}/{total_features} successfully")

if __name__ == "__main__":
    main()


Enter a URL: http://galorbg.github.io/
--- URL Analysis ---
URL Length: 25
Number of characters after '.com': 0
URL Entropy: 3.67
Contains IP Address: False
Number of Dots: 2
Number of Hyphens: 0
Special Characters: 3
Google Safe Browsing: Suspicious tld: False
Number of redirects: 1
Suspicious IP in URL: 0
HTTPS token in URL: 0

--- WHOIS Analysis ---
Error retrieving domain age: unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'
Error retrieving domain expiration: unsupported operand type(s) for -: 'NoneType' and 'datetime.datetime'
Registrar check: 0
WHOIS privacy protection: 0
Owner details check: 1

--- SSL Certificate Analysis ---
SSL Certificate Valid: Yes
SSL Certificate Validity Period: 364 days

--- Webpage Analysis ---
Number of grammatical or spelling issues: 12
Number of external links: 0
Number of internal links: 2
Number of login forms: 0
Number of input fields: 1
Total JavaScript length: 0 characters
Total length of inline JavaScript: 0 characters
Num

In [None]:
!pip install pandas
!pip install tqdm
!pip install requests
!pip install python-whois
!pip install beautifulsoup4
!pip install language-tool-python


Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5
Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8.1


In [None]:
!pip install whois


Collecting whois
  Downloading whois-1.20240129.2-py3-none-any.whl.metadata (1.3 kB)
Downloading whois-1.20240129.2-py3-none-any.whl (61 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: whois
Successfully installed whois-1.20240129.2


In [None]:
import pandas as pd
from tqdm import tqdm
import time  # Simulates processing time
import os
from collections import Counter
import math
import re
from urllib.parse import urlparse, urljoin
import requests
import whois
from datetime import datetime
import ssl
import socket
from bs4 import BeautifulSoup
import language_tool_python
from tqdm import tqdm

# Replace with your Google API Key
API_KEY = ''
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# Function Definitions (Merged)

def count_characters_after_tld(url, tld=".com"):
    try:
        if tld in url:
            index = url.find(tld) + len(tld)  # Find the position right after the TLD
            return len(url) - index  # Calculate characters after the TLD
        return 0
    except Exception as e:
        print(f"Error counting characters after TLD: {e}")
        return 0

def calculate_entropy(url):
    frequency = Counter(url)
    total_characters = len(url)
    return -sum((count / total_characters) * math.log2(count / total_characters) for count in frequency.values())

def has_ip_address(url):
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    return url.count('.')

def count_hyphens_in_domain(url):
    domain = urlparse(url).netloc
    return domain.count('-')

def count_special_characters(url):
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    payload = {
        "client": {"clientId": "yourClientID", "clientVersion": "1.0"},
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}]
        }
    }
    try:
        response = requests.post(f'{SAFE_BROWSING_URL}?key={API_KEY}', json=payload)
        result = response.json()
        return "Suspicious tld: True" if 'matches' in result else "Suspicious tld: False"
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL safety: {e}")
        return "Unknown"

def check_url_redirection(url):
    try:
        response = requests.get(url, allow_redirects=True)
        return len(response.history)
    except requests.exceptions.RequestException as e:
        print(f"Error checking URL redirection: {e}")
        return 0

def check_suspicious_ip(url):
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1
    else:
        return 0

def check_https_token_in_url(url):
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        return (current_date - creation_date).days / 365
    except Exception as e:
        print(f"Error retrieving domain age: {e}")
        return None

def get_domain_expiration(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        return (expiration_date - current_date).days
    except Exception as e:
        print(f"Error retrieving domain expiration: {e}")
        return None

def check_registrar(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrar = w.registrar
        suspicious_registrars = ["Namecheap Inc.", "GoDaddy.com, LLC", "PublicDomainRegistry.com"]
        if registrar and any(suspicious in registrar for suspicious in suspicious_registrars):
            return 1
        return 0
    except Exception as e:
        print(f"Error retrieving registrar: {e}")
        return None

def check_whois_privacy_protection(url):
    try:
        domain = url.split('/')[2]
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        privacy_protection_indicators = ["Privacy Protected", "Domains by Proxy"]
        if any(indicator in registrant_info for indicator in privacy_protection_indicators):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking WHOIS privacy: {e}")
        return None

def check_owner_details(url):
    try:
        domain = url.split('/')[2]
        if domain.endswith('google.com'):
            return 0
        w = whois.whois(domain)
        if not w.get('registrant_name') or not w.get('registrant_email'):
            return 1
        return 0
    except Exception as e:
        print(f"Error checking owner details: {e}")
        return None

def check_ssl_certificate(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        return not_after > datetime.now()
    except Exception as e:
        print(f"Error checking SSL certificate: {e}")
        return False

def get_certificate_validity(url):
    try:
        domain = url.split('/')[2]
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
        not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")
        validity_duration = (not_after - not_before).days
        return validity_duration, not_after > datetime.now()
    except Exception as e:
        print(f"Error checking certificate validity: {e}")
        return None, False

def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    return len(matches), matches

def get_page_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        base_domain = urlparse(url).netloc
        external_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc != base_domain)
        internal_links = sum(1 for link in links if urlparse(urljoin(url, link['href'])).netloc == base_domain)
        return external_links, internal_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def detect_login_forms(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        forms = soup.find_all('form')
        return sum(1 for form in forms if form.find_all('input', {'name': lambda x: x and ('user' in x or 'password' in x)}))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_input_fields(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        input_fields = soup.find_all('input')
        input_types = {}
        for input_field in input_fields:
            input_type = input_field.get('type', 'text')
            input_types[input_type] = input_types.get(input_type, 0) + 1
        return len(input_fields), input_types
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, {}

def measure_javascript_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tags = soup.find_all('script')
        return sum(len(script.string) for script in script_tags if script.string)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0

def detect_inline_scripts_and_iframes(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        inline_scripts_length = sum(len(script.string) for script in soup.find_all('script') if script.string)
        iframe_count = len(soup.find_all('iframe'))
        return inline_scripts_length, iframe_count
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0

def extract_title_and_meta_description_length(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        meta_description_tag = soup.find('meta', attrs={'name': 'description'})
        return len(title_tag.string) if title_tag else 0, len(meta_description_tag['content']) if meta_description_tag else 0
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return 0, 0
def extract_features(url):
    try:
        return {
            # URL Analysis Features
            "URL Length": len(url),
            "Characters After .com": count_characters_after_tld(url),
            "Entropy": calculate_entropy(url),
            "Contains IP": has_ip_address(url),
            "Number of Dots": count_dots_in_url(url),
            "Number of Hyphens": count_hyphens_in_domain(url),
            "Special Characters": count_special_characters(url),
            "Redirections": check_url_redirection(url),
            "Suspicious IP in URL": check_suspicious_ip(url),
            "HTTPS Token in URL": check_https_token_in_url(url),

            # WHOIS Analysis Features
            "Domain Age": get_domain_age(url),
            "Domain Expiration": get_domain_expiration(url),
            "Registrar Check": check_registrar(url),
            "WHOIS Privacy": check_whois_privacy_protection(url),
            "Owner Details": check_owner_details(url),

            # SSL Certificate Analysis Features
            "SSL Certificate Valid": check_ssl_certificate(url),
            "SSL Certificate Validity": get_certificate_validity(url),

            # Webpage Content Analysis Features
            "Number of Grammar Issues": check_grammar_and_spelling(get_page_content(url))[0],
            "Number of External Links": get_page_links(url)[0],
            "Number of Internal Links": get_page_links(url)[1],
            "Number of Login Forms": detect_login_forms(url),
            "Number of Input Fields": detect_input_fields(url)[0],
            "JavaScript Length": measure_javascript_length(url),
            "Inline JS Length": detect_inline_scripts_and_iframes(url)[0],
            "Number of Iframes": detect_inline_scripts_and_iframes(url)[1],
            "Title Length": extract_title_and_meta_description_length(url)[0],
            "Meta Description Length": extract_title_and_meta_description_length(url)[1],
        }
    except Exception as e:
        print(f"Error extracting features for URL {url}: {e}")
        return {}

def process_csv(input_file, output_file):
    if not os.path.exists(input_file):
        print(f"Error: Input file {input_file} not found.")
        return

    try:
        # Read the input CSV
        df = pd.read_csv(input_file)
        if 'URL' not in df.columns or 'Label' not in df.columns:
            print("Error: Input file must contain 'URL' and 'Label' columns.")
            return

        processed_data = []

        # Process each URL
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing URLs"):
            url = row['URL']
            label = row['Label']

            # Extract features
            features = extract_features(url)
            features.update({"URL": url, "Label": label})

            # Append to processed data
            processed_data.append(features)

        # Convert to DataFrame
        processed_df = pd.DataFrame(processed_data)

        # Ensure all 27 features are present
        expected_features = [
            "URL Length", "Characters After .com", "Entropy", "Contains IP", "Number of Dots", "Number of Hyphens",
            "Special Characters", "Redirections", "Suspicious IP in URL", "HTTPS Token in URL", "Domain Age",
            "Domain Expiration", "Registrar Check", "WHOIS Privacy", "Owner Details", "SSL Certificate Valid",
            "SSL Certificate Validity", "Number of Grammar Issues", "Number of External Links", "Number of Internal Links",
            "Number of Login Forms", "Number of Input Fields", "JavaScript Length", "Inline JS Length", "Number of Iframes",
            "Title Length", "Meta Description Length"
        ]

        for feature in expected_features:
            if feature not in processed_df.columns:
                processed_df[feature] = None  # Add missing feature as None

        # Save to CSV
        processed_df.to_csv(output_file, index=False)
        print(f"Processed data saved to {output_file}.")

    except Exception as e:
        print(f"Error processing CSV: {e}")

if __name__ == "__main__":
    input_csv = "active_urls_async_test.csv"  # Replace with your input file
    output_csv = "Phishy.csv"

    process_csv(input_csv, output_csv)

Error: Input file must contain 'URL' and 'Label' columns.


In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.28.0-py3-none-any.whl (486 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.3/486.3 kB[0m [31m28.9 MB/s

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# Initialize Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)

# Base URL
base_url = "https://phishtank.org/phish_archive.php"

# Store extracted URLs
valid_phish_urls = []

# Open the website
driver.get(base_url)

while len(valid_phish_urls) < 5000:
    try:
        # Wait for the table to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "phish_archive"))
        )

        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find all rows in the table
        rows = soup.select("tr")  # All rows in the table

        for row in rows:
            columns = row.find_all("td")
            if len(columns) >= 4:
                valid_status = columns[2].text.strip()
                online_status = columns[3].text.strip()
                if valid_status == "VALID PHISH" and online_status == "ONLINE":
                    phish_url = columns[1].find("a").text.strip()
                    valid_phish_urls.append(phish_url)

                    # Stop if we've reached 5000 URLs
                    if len(valid_phish_urls) >= 5000:
                        break

        # If there's an "Older" button, click it
        next_button = driver.find_element(By.LINK_TEXT, "Older")
        if next_button:
            next_button.click()
            time.sleep(3)  # Give some time for the page to load
        else:
            break  # No more pages to scrape

    except Exception as e:
        print(f"Error: {e}")
        break

# Close the driver
driver.quit()

# Save the URLs to a CSV file
df = pd.DataFrame(valid_phish_urls, columns=["Phish URL"])
df.to_csv("valid_phish_urls.csv", index=False)

print(f"Scraping completed. {len(valid_phish_urls)} URLs saved to valid_phish_urls.csv.")


Error: Message: 
Stacktrace:
#0 0x55fe1e3308fa <unknown>
#1 0x55fe1de41d20 <unknown>
#2 0x55fe1de90a66 <unknown>
#3 0x55fe1de90d01 <unknown>
#4 0x55fe1ded6184 <unknown>
#5 0x55fe1deb4b1d <unknown>
#6 0x55fe1ded3560 <unknown>
#7 0x55fe1deb4893 <unknown>
#8 0x55fe1de8330d <unknown>
#9 0x55fe1de8432e <unknown>
#10 0x55fe1e2fd00b <unknown>
#11 0x55fe1e300f97 <unknown>
#12 0x55fe1e2e971c <unknown>
#13 0x55fe1e301b17 <unknown>
#14 0x55fe1e2ce6cf <unknown>
#15 0x55fe1e31f6b8 <unknown>
#16 0x55fe1e31f880 <unknown>
#17 0x55fe1e32f776 <unknown>
#18 0x78f4d46a7ac3 <unknown>

Scraping completed. 0 URLs saved to valid_phish_urls.csv.


In [None]:
import csv
import requests
from requests.exceptions import RequestException

def is_url_active(url):
    """Check if a URL is active by making an HTTP GET request."""
    try:
        response = requests.head(url, timeout=5)  # Use HEAD request for faster response
        return response.status_code == 200
    except RequestException:
        return False

def process_urls(input_csv, output_csv):
    """Process URLs to check if active and save active URLs with their labels."""
    active_urls = []

    # Read the input CSV file
    with open(input_csv, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)

        # Check if required columns exist
        if 'url' not in reader.fieldnames or 'label' not in reader.fieldnames:
            raise ValueError("Input CSV must contain 'url' and 'label' columns.")

        for row in reader:
            url = row['url']
            label = row['label']

            if is_url_active(url):
                active_urls.append({'url': url, 'label': label})

    # Write active URLs to the output CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        fieldnames = ['url', 'label']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(active_urls)

if __name__ == "__main__":
    input_csv = "input.csv"  # Replace with your input CSV file path
    output_csv = "active_urls.csv"  # Replace with your desired output CSV file path

    try:
        process_urls(input_csv, output_csv)
        print(f"Active URLs saved to {output_csv}")
    except Exception as e:
        print(f"Error: {e}")


Error: name 'status' is not defined


In [None]:
pip install aiohttp tqdm nest_asyncio




In [None]:
import pandas as pd
from tqdm import tqdm
import os
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Function to extract features (you can include additional feature extraction logic here)
def extract_features(url):
    # Example features; customize as needed
    return {
        "url_length": len(url),
        "contains_https": "https" in url,
    }

# Function to process the CSV
def process_csv(input_file, output_file):
    if not os.path.exists(input_file):
        print(f"Error: Input file {input_file} not found.")
        return

    try:
        # Read the input CSV
        df = pd.read_csv(input_file)
        if 'url' not in df.columns or 'parent_status' not in df.columns:
            print("Error: Input file must contain 'url' and 'parent_status' columns.")
            return

        processed_data = []

        # Create a requests session with retries
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))

        # Process each URL
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing URLs"):
            url = row['url']
            parent_status = row['parent_status']

            try:
                # Send a GET request to the URL
                response = session.get(url, timeout=15)
                if response.status_code == 200:
                    features = extract_features(url)
                    features.update({"url": url, "parent_status": parent_status})
                    processed_data.append(features)
                else:
                    print(f"URL {url} returned status code {response.status_code}")
            except Exception as e:
                print(f"Error with URL {url}: {e}")
                continue

        # Convert processed data to DataFrame
        processed_df = pd.DataFrame(processed_data)

        # Save the processed data to a CSV file
        processed_df.to_csv(output_file, index=False)
        print(f"Processed data saved to {output_file}.")

    except Exception as e:
        print(f"Error processing CSV: {e}")

# Main script
if __name__ == "__main__":
    input_csv = "final_cleaned_with_status.csv"  # Replace with your input file
    output_csv = "Phishy.csv"                # Replace with your desired output file

    process_csv(input_csv, output_csv)


Processing URLs:   0%|          | 1/6006 [00:33<55:26:59, 33.24s/it]

Error with URL http://www.crestonwood.com/router.php: HTTPConnectionPool(host='www.crestonwood.com', port=80): Max retries exceeded with url: /router.php (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:   0%|          | 9/6006 [02:12<29:02:42, 17.44s/it]

Error with URL http://wave.progressfilm.co.uk/time3/?logon=myposte: HTTPConnectionPool(host='wave.progressfilm.co.uk', port=80): Max retries exceeded with url: /time3/?logon=myposte (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa4e0670>: Failed to resolve 'wave.progressfilm.co.uk' ([Errno -2] Name or service not known)"))


Processing URLs:   1%|          | 34/6006 [02:57<11:57:17,  7.21s/it]

Error with URL http://sloaneandhyde.com/drs/new2015/document.php/index.css: HTTPConnectionPool(host='sloaneandhyde.com', port=80): Max retries exceeded with url: /drs/new2015/document.php/index.css (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa4e0910>: Failed to resolve 'sloaneandhyde.com' ([Errno -2] Name or service not known)"))


Processing URLs:   1%|          | 45/6006 [03:04<2:02:05,  1.23s/it]

URL http://mne.edu.vn/wp-includes/Netflix/LoginID/index.php returned status code 403


Processing URLs:   1%|          | 55/6006 [03:11<57:40,  1.72it/s]

URL http://ru.dictionarist.com/iconoclast returned status code 403


Processing URLs:   1%|          | 73/6006 [03:54<15:59:51,  9.71s/it]

Error with URL http://www.routeralley.com/guides/nat.pdf: HTTPConnectionPool(host='no.access', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa50cfd0>: Failed to resolve 'no.access' ([Errno -2] Name or service not known)"))


Processing URLs:   1%|▏         | 82/6006 [04:01<1:49:31,  1.11s/it]

URL https://www.metal-supply.dk/ returned status code 403


Processing URLs:   1%|▏         | 88/6006 [04:08<2:04:06,  1.26s/it]

URL https://www.workiva.com/ returned status code 403


Processing URLs:   2%|▏         | 102/6006 [04:17<1:54:56,  1.17s/it]

URL http://www.scancity.am/tmp/ibxolb/ibxolb/login/index-html/login/ returned status code 404


Processing URLs:   2%|▏         | 111/6006 [04:24<1:27:13,  1.13it/s]

URL http://to-stromatadiko.gr/cache/ returned status code 403




URL http://www.imdb.com/Title?3682448 returned status code 403


Processing URLs:   2%|▏         | 121/6006 [04:59<15:05:31,  9.23s/it]

Error with URL http://baby-lim.com/clovvy/: HTTPConnectionPool(host='baby-lim.com', port=80): Max retries exceeded with url: /clovvy/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa50f790>: Failed to resolve 'baby-lim.com' ([Errno -2] Name or service not known)"))


Processing URLs:   2%|▏         | 133/6006 [05:11<1:13:04,  1.34it/s]

URL http://www.imdb.com/Title?0362165 returned status code 403


Processing URLs:   2%|▏         | 143/6006 [05:16<50:06,  1.95it/s]

URL https://www.ghacks.net/2010/04/13/setting-up-a-content-delivery-network-in-wordpress/ returned status code 403


Processing URLs:   3%|▎         | 164/6006 [07:41<59:26:29, 36.63s/it]

Error with URL http://www.thefreedictionary.com/compact+disc: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /compact+disc (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa3f55d0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:   3%|▎         | 173/6006 [09:47<62:02:10, 38.29s/it]

Error with URL https://hublaalikes.com/login: HTTPSConnectionPool(host='hublaalikes.com', port=443): Max retries exceeded with url: /login (Caused by ReadTimeoutError("HTTPSConnectionPool(host='hublaalikes.com', port=443): Read timed out. (read timeout=15)"))


Processing URLs:   3%|▎         | 183/6006 [09:56<3:28:35,  2.15s/it]

URL http://www.nairaland.com/1208097/meet-cnn-internationals-anchors-reporters returned status code 403


Processing URLs:   3%|▎         | 194/6006 [10:02<52:13,  1.85it/s]

URL http://www.pracadarepublicaembeja.net/men returned status code 403


Processing URLs:   3%|▎         | 195/6006 [10:04<1:16:01,  1.27it/s]

URL http://www.atmi.gr/ returned status code 403


Processing URLs:   3%|▎         | 203/6006 [12:10<59:41:29, 37.03s/it]

Error with URL http://www.latos.co.kr/js/erusr/united/ep/meniu1.htm?theinfored=2718ad761726asdg12765asd7612jgasd87612: HTTPConnectionPool(host='www.latos.co.kr', port=80): Max retries exceeded with url: /js/erusr/united/ep/meniu1.htm?theinfored=2718ad761726asdg12765asd7612jgasd87612 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa369300>, 'Connection to www.latos.co.kr timed out. (connect timeout=15)'))


Processing URLs:   4%|▎         | 221/6006 [12:26<1:26:11,  1.12it/s]

URL http://dasisko.sk returned status code 403


Processing URLs:   4%|▍         | 242/6006 [14:43<58:31:11, 36.55s/it]

Error with URL http://www.thefreedictionary.com/select+committee: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /select+committee (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa43f9d0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:   4%|▍         | 247/6006 [18:15<111:13:35, 69.53s/it]

Error with URL https://www.sriexecutive.com/: HTTPSConnectionPool(host='talent.oneelevate.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7812fa3f64a0>, 'Connection to talent.oneelevate.com timed out. (connect timeout=15)'))


Processing URLs:   5%|▍         | 279/6006 [18:41<2:22:52,  1.50s/it]

URL http://www.nhs.uk/conditions/memory-loss/Pages/Introduction.aspx returned status code 404


Processing URLs:   5%|▍         | 282/6006 [18:42<1:15:59,  1.26it/s]

URL https://life4news.ru:443/ returned status code 403


Processing URLs:   5%|▍         | 290/6006 [19:16<14:15:34,  8.98s/it]

Error with URL http://signin-store-ws.frontieroption.com/: HTTPConnectionPool(host='signin-store-ws.frontieroption.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa28ba00>: Failed to resolve 'signin-store-ws.frontieroption.com' ([Errno -2] Name or service not known)"))


Processing URLs:   5%|▍         | 294/6006 [19:18<4:01:09,  2.53s/it]

Error with URL https://www.elitewriterslab.com/: ('Received response with content-encoding: gzip, but failed to decode it.', error('Error -3 while decompressing data: incorrect header check'))


Processing URLs:   5%|▍         | 299/6006 [19:23<1:43:46,  1.09s/it]

URL https://moz.com/beginners-guide-to-seo returned status code 403


Processing URLs:   5%|▌         | 329/6006 [20:11<13:54:16,  8.82s/it]

Error with URL https://nab.support.fortscratchley.org.au/: HTTPSConnectionPool(host='nab.support.fortscratchley.org.au', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7812fa2f08b0>: Failed to resolve 'nab.support.fortscratchley.org.au' ([Errno -2] Name or service not known)"))


Processing URLs:   6%|▌         | 333/6006 [21:26<37:46:53, 23.98s/it]

Error with URL http://www.samcool.org/90AB705610D8E3CD93C3E28B0C6BEFD0/?sec=Thomas%20Hamann: HTTPConnectionPool(host='www.samcool.org', port=80): Max retries exceeded with url: /90AB705610D8E3CD93C3E28B0C6BEFD0/?sec=Thomas%20Hamann (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa31ec20>: Failed to resolve 'www.samcool.org' ([Errno -2] Name or service not known)"))
URL http://armorgames.com/category/adventure returned status code 403




URL http://www.entrustedauctions.com/ returned status code 403


Processing URLs:   6%|▌         | 337/6006 [21:57<24:04:37, 15.29s/it]

Error with URL http://desdeelamor.com/wp-includes/SimplePie/Parse/hd/demx/index.php: HTTPConnectionPool(host='desdeelamor.com', port=80): Max retries exceeded with url: /wp-includes/SimplePie/Parse/hd/demx/index.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa43fbb0>: Failed to resolve 'desdeelamor.com' ([Errno -2] Name or service not known)"))


Processing URLs:   6%|▌         | 356/6006 [22:16<1:55:12,  1.22s/it]

URL http://erzsebetparkhotel.hu/eje1/ayo1/ayo1/index.html returned status code 447


Processing URLs:   6%|▌         | 367/6006 [22:25<1:00:53,  1.54it/s]

URL http://reaco.com.mx returned status code 406


Processing URLs:   6%|▌         | 373/6006 [23:00<15:06:12,  9.65s/it]

Error with URL http://znrdm73sh.fastestcdn.net/: HTTPConnectionPool(host='znrdm73sh.fastestcdn.net', port=80): Max retries exceeded with url: / (Caused by ResponseError('too many 503 error responses'))


Processing URLs:   6%|▋         | 385/6006 [23:55<21:30:22, 13.77s/it]

Error with URL http://www.productitem.com/~ozonomex/ozonoterapiamexico.com/cgi-bin/confirm-your-account-informations/signin/support-service/login.php?country.x=US-United%20States: HTTPConnectionPool(host='www.productitem.com', port=80): Max retries exceeded with url: /~ozonomex/ozonoterapiamexico.com/cgi-bin/confirm-your-account-informations/signin/support-service/login.php?country.x=US-United%20States (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:   7%|▋         | 396/6006 [24:03<1:47:14,  1.15s/it]

URL http://www.ibtimes.com/switched-birth-season-5-spoilers-episode-6-synopsis-released-what-will-happen-four-2500049 returned status code 403


Processing URLs:   7%|▋         | 419/6006 [24:18<35:27,  2.63it/s]

URL http://m.g2234.com/ returned status code 404


Processing URLs:   7%|▋         | 426/6006 [24:53<14:53:53,  9.61s/it]

Error with URL http://quicklloret.com/wp-admin/includes/Update: HTTPConnectionPool(host='quicklloret.com', port=80): Max retries exceeded with url: /wp-admin/includes/Update (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa31e2c0>: Failed to resolve 'quicklloret.com' ([Errno -2] Name or service not known)"))


Processing URLs:   7%|▋         | 431/6006 [25:00<4:13:42,  2.73s/it]

URL http://www.explainthatstuff.com/laserprinters.html returned status code 403


Processing URLs:   7%|▋         | 437/6006 [25:04<1:07:43,  1.37it/s]

URL http://www.bestofsampleresume.com/graphic-designer-resume-sample/ returned status code 406


Processing URLs:   7%|▋         | 442/6006 [25:08<1:04:29,  1.44it/s]

URL http://www.simon.com/mall/the-florida-mall/stores/traffic-shoes returned status code 403


Processing URLs:   7%|▋         | 447/6006 [25:11<44:14,  2.09it/s]

URL http://www.civiccu.com returned status code 403


Processing URLs:   8%|▊         | 455/6006 [25:45<13:47:59,  8.95s/it]

Error with URL http://bagru.info/t/BySXSWap: HTTPConnectionPool(host='bagru.info', port=80): Max retries exceeded with url: /t/BySXSWap (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa368640>: Failed to resolve 'bagru.info' ([Errno -2] Name or service not known)"))


Processing URLs:   8%|▊         | 462/6006 [26:19<15:37:53, 10.15s/it]

Error with URL http://www.fn-international.com/sendstudio/login.aspx: HTTPConnectionPool(host='www.fn-international.com', port=80): Max retries exceeded with url: /sendstudio/login.aspx (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa34d5a0>: Failed to resolve 'www.fn-international.com' ([Errno -2] Name or service not known)"))


Processing URLs:   8%|▊         | 473/6006 [26:59<15:58:41, 10.40s/it]

Error with URL http://everything.explained.today/electric_keyboards/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /electric_keyboards/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:   8%|▊         | 481/6006 [27:34<15:51:35, 10.33s/it]

Error with URL http://www.astro.com/astrology/tma_article160706_e.htm: HTTPSConnectionPool(host='www.astro.com', port=443): Max retries exceeded with url: /astrology/tma_article160706_e.htm (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7812fa39e3b0>: Failed to establish a new connection: [Errno 111] Connection refused'))


Processing URLs:   8%|▊         | 509/6006 [29:57<50:53:40, 33.33s/it]

Error with URL http://www.thefreedictionary.com/ionosphere: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /ionosphere (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa34cdf0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:   9%|▊         | 520/6006 [30:07<2:18:18,  1.51s/it]

URL https://www.guiadoexcel.com.br/ returned status code 520


Processing URLs:   9%|▊         | 525/6006 [30:09<49:12,  1.86it/s]  

URL https://www.browneyedbaker.com/ returned status code 403


Processing URLs:   9%|▉         | 537/6006 [30:54<10:40:36,  7.03s/it]

Error with URL http://dlakupujacych.allegro.secfence.lesavik.net: HTTPConnectionPool(host='dlakupujacych.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa4e3ca0>: Failed to resolve 'dlakupujacych.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:   9%|▉         | 551/6006 [31:46<16:05:02, 10.61s/it]

Error with URL https://seireshd.com/: HTTPSConnectionPool(host='seireshd.com', port=443): Max retries exceeded with url: / (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))


Processing URLs:   9%|▉         | 552/6006 [31:46<11:21:37,  7.50s/it]

URL https://www.ghacks.net/2010/12/20/microsoft-windows-update-overview-all-you-need-to-know/ returned status code 403


Processing URLs:   9%|▉         | 566/6006 [33:55<55:51:40, 36.97s/it]

Error with URL http://www.criaderodecaracoles.com/: HTTPConnectionPool(host='www.criaderodecaracoles.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa50e7a0>, 'Connection to www.criaderodecaracoles.com timed out. (connect timeout=15)'))


Processing URLs:  10%|▉         | 576/6006 [34:04<3:15:16,  2.16s/it]

URL https://www.strong.tv/en/Home returned status code 404


Processing URLs:  10%|▉         | 596/6006 [36:24<41:44:56, 27.78s/it]

Error with URL http://www.thefreedictionary.com/Luftwaffe: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /Luftwaffe (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa12c850>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  10%|█         | 604/6006 [36:59<17:12:33, 11.47s/it]

Error with URL http://www.savitari.com/: HTTPConnectionPool(host='www.savitari.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa2893f0>: Failed to resolve 'www.savitari.com' ([Errno -2] Name or service not known)"))


Processing URLs:  10%|█         | 608/6006 [37:33<18:35:46, 12.40s/it]

Error with URL http://www.brighant.com/1122?sec=Jochen%20Kuntermann: HTTPConnectionPool(host='www.brighant.com', port=80): Max retries exceeded with url: /1122?sec=Jochen%20Kuntermann (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9f60c70>: Failed to resolve 'www.brighant.com' ([Errno -2] Name or service not known)"))


Processing URLs:  10%|█         | 618/6006 [39:41<46:06:41, 30.81s/it]

Error with URL http://www.samducksports.com/wp-content/envato-backups/.ST/POSBEV/: HTTPConnectionPool(host='www.samducksports.com', port=80): Max retries exceeded with url: /wp-content/envato-backups/.ST/POSBEV/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9fcb070>, 'Connection to www.samducksports.com timed out. (connect timeout=15)'))


Processing URLs:  10%|█         | 629/6006 [39:49<2:19:38,  1.56s/it]

URL https://www.google.com/url?hl=en&amp;q=http://www.rugbylabelsolutions.co.uk/drama1/secure/management/&amp;source=gmail&amp;ust=1537885316769000&amp;usg=AFQjCNFGOMmK92rOTE4CanjtiDPIn_LZTg returned status code 403


Processing URLs:  11%|█         | 640/6006 [40:01<1:10:06,  1.28it/s]

URL https://www.apstudynotes.org/us-government/vocabulary/chapter-11-congress/ returned status code 520


Processing URLs:  11%|█         | 642/6006 [40:05<1:41:29,  1.14s/it]

URL http://www.xyzguyz.com/admin1/G.Docs/index.php returned status code 403


Processing URLs:  11%|█         | 645/6006 [40:06<56:19,  1.59it/s]  

URL http://www.biologyjunction.com/Viruses,%20Viroids,%20and%20Prions.ppt returned status code 403


Processing URLs:  11%|█         | 664/6006 [41:03<11:57:07,  8.05s/it]

Error with URL http://everything.explained.today/bit_error/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /bit_error/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  11%|█         | 667/6006 [41:34<13:30:44,  9.11s/it]

Error with URL http://www.covid-19challengecoin.com: HTTPConnectionPool(host='www.covid-19challengecoin.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa2f0b50>: Failed to resolve 'www.covid-19challengecoin.com' ([Errno -2] Name or service not known)"))


Processing URLs:  11%|█▏        | 677/6006 [43:40<54:16:31, 36.67s/it]

Error with URL http://encyclopedia2.thefreedictionary.com/HDMI+switch: HTTPConnectionPool(host='encyclopedia2.thefreedictionary.com', port=80): Max retries exceeded with url: /HDMI+switch (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa31fdc0>, 'Connection to encyclopedia2.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  12%|█▏        | 697/6006 [43:59<41:26,  2.14it/s]

URL http://www.heightblog.com/wp-content/themes/twentyten/VERIFYDB/Login-DropBox/ returned status code 406


Processing URLs:  12%|█▏        | 706/6006 [44:07<1:24:35,  1.04it/s]

URL https://www.fbautoliker.co/access-token returned status code 403


Processing URLs:  12%|█▏        | 710/6006 [44:56<19:18:39, 13.13s/it]

Error with URL http://brighant.com/1122/?sec=Jochen%20Kuntermann: HTTPConnectionPool(host='brighant.com', port=80): Max retries exceeded with url: /1122/?sec=Jochen%20Kuntermann (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa0270d0>: Failed to resolve 'brighant.com' ([Errno -2] Name or service not known)"))


Processing URLs:  12%|█▏        | 715/6006 [45:02<4:14:31,  2.89s/it]

URL http://herpes.emedtv.com/genital-herpes/incubation-period-for-genital-herpes.html returned status code 403


Processing URLs:  12%|█▏        | 718/6006 [45:33<15:00:04, 10.21s/it]

Error with URL http://sloaneandhyde.com/imm/new2015/pvalidate.html: HTTPConnectionPool(host='sloaneandhyde.com', port=80): Max retries exceeded with url: /imm/new2015/pvalidate.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9d3bf10>: Failed to resolve 'sloaneandhyde.com' ([Errno -2] Name or service not known)"))


Processing URLs:  12%|█▏        | 739/6006 [47:51<52:31:53, 35.91s/it]

Error with URL http://www.thefreedictionary.com/active+transport: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /active+transport (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9f61b40>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  12%|█▏        | 743/6006 [48:23<26:26:18, 18.08s/it]

Error with URL http://www.arlberg-skifuehrer.com/06BFE1092A068A329FD238558EB61EF4: HTTPConnectionPool(host='www.arlberg-skifuehrer.com', port=80): Max retries exceeded with url: /06BFE1092A068A329FD238558EB61EF4 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9f62dd0>: Failed to resolve 'www.arlberg-skifuehrer.com' ([Errno -2] Name or service not known)"))


Processing URLs:  13%|█▎        | 753/6006 [48:34<2:15:37,  1.55s/it]

URL https://www.newegg.com/Product/ProductList.aspx?Description=USB%20Parallel%20port&Submit=ENE returned status code 403


Processing URLs:  13%|█▎        | 760/6006 [50:39<53:12:24, 36.51s/it]

Error with URL http://www.thefreedictionary.com/black+letter: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /black+letter (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9ce5660>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  13%|█▎        | 766/6006 [50:50<9:45:21,  6.70s/it] 

Error with URL http://sp963460.sitebeat.site/: Exceeded 30 redirects.


Processing URLs:  13%|█▎        | 783/6006 [51:04<57:40,  1.51it/s]  

URL http://www.simon.com/mall/town-center-at-cobb/stores/traffic-shoes returned status code 403


Processing URLs:  13%|█▎        | 787/6006 [51:08<1:09:16,  1.26it/s]

URL http://www.opticsplanet.com/cameras.html returned status code 403


Processing URLs:  14%|█▎        | 813/6006 [51:29<51:00,  1.70it/s]

URL https://www.whitecoatinvestor.com/ returned status code 403


Processing URLs:  14%|█▎        | 815/6006 [51:31<1:02:47,  1.38it/s]

URL https://www.eguest.net/goodbye.php returned status code 404


Processing URLs:  14%|█▍        | 834/6006 [51:49<1:18:09,  1.10it/s]

URL http://www.ietf.org/rfc/rfc3031 returned status code 404


Processing URLs:  14%|█▍        | 835/6006 [51:50<1:00:20,  1.43it/s]

URL http://www.boysfuns.com/ returned status code 403


Processing URLs:  14%|█▍        | 843/6006 [53:03<29:38:34, 20.67s/it]

Error with URL http://www.megalyrics.ru/about/spacetime-continuum.htm: HTTPConnectionPool(host='www.megalyrics.ru', port=80): Max retries exceeded with url: /about/spacetime-continuum.htm (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9b6a470>: Failed to resolve 'www.megalyrics.ru' ([Errno -2] Name or service not known)"))


Processing URLs:  15%|█▍        | 875/6006 [55:41<52:44:44, 37.01s/it]

Error with URL http://www.thefreedictionary.com/portal: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /portal (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9fc83d0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  15%|█▍        | 882/6006 [56:24<18:31:51, 13.02s/it]

Error with URL http://ch.net2care.com/ar2/: HTTPConnectionPool(host='ch.net2care.com', port=80): Max retries exceeded with url: /ar2/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa2882b0>: Failed to resolve 'ch.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  15%|█▍        | 890/6006 [57:09<15:58:07, 11.24s/it]

Error with URL https://www.littlerockstore.com/: HTTPSConnectionPool(host='www.littlerockstore.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'www.littlerockstore.com'. (_ssl.c:1007)")))


Processing URLs:  15%|█▍        | 899/6006 [57:13<1:04:48,  1.31it/s]

URL https://www.zillow.com/fontana-ca/ returned status code 403


Processing URLs:  15%|█▌        | 909/6006 [57:21<1:08:40,  1.24it/s]

URL https://www.ajilon.com/ returned status code 403




URL http://www.explainthatstuff.com/streamingmedia.html returned status code 403


Processing URLs:  15%|█▌        | 924/6006 [58:01<9:40:22,  6.85s/it] 

Error with URL https://www.azchords.com/m/machinetranslations-tabs-62768/amnesia1-tabs-864781.html: HTTPSConnectionPool(host='www.azchords.com', port=443): Max retries exceeded with url: /m/machinetranslations-tabs-62768/amnesia1-tabs-864781.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))


Processing URLs:  15%|█▌        | 925/6006 [58:01<6:56:34,  4.92s/it]

URL http://www.kreezcraft.com/ returned status code 302


Processing URLs:  16%|█▌        | 935/6006 [1:00:06<36:09:13, 25.67s/it]

Error with URL http://encyclopedia2.thefreedictionary.com/MultiMediaCard: HTTPConnectionPool(host='encyclopedia2.thefreedictionary.com', port=80): Max retries exceeded with url: /MultiMediaCard (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f8ebf220>, 'Connection to encyclopedia2.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  16%|█▌        | 943/6006 [1:00:56<22:32:55, 16.03s/it]

Error with URL http://87.138.95.150:8080/index.php: HTTPConnectionPool(host='87.138.95.150', port=8080): Max retries exceeded with url: /index.php (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7812f8ebf040>: Failed to establish a new connection: [Errno 113] No route to host'))


Processing URLs:  16%|█▌        | 946/6006 [1:01:29<21:23:15, 15.22s/it]

Error with URL http://outlooz.webeden.co.uk/: HTTPConnectionPool(host='outlooz.webeden.co.uk', port=80): Max retries exceeded with url: / (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  16%|█▌        | 958/6006 [1:01:38<1:31:12,  1.08s/it]

URL https://www.learn4good.com/games/adventure.htm returned status code 403


Processing URLs:  16%|█▌        | 961/6006 [1:02:11<11:42:19,  8.35s/it]

Error with URL http://everything.explained.today/Bridge_scoring/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /Bridge_scoring/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  16%|█▌        | 971/6006 [1:03:06<19:58:50, 14.29s/it]

Error with URL http://site2.20071831.brim.ru/0DysK1/linkedin/linkedIn/linkedIn%20(1)/linkedIn%20(1)/LinkedIn.html?midToken=AQHzsmwIH-erkw&amp;trk=eml-email_m2m_invite_single_01-footer-27-home&amp;trkEmail=eml-email_m2m_invite_single_01-footer-27-home-null-20fl3b~jy16hc4f~59-null-neptune/feed&amp;lipi=urn:li:page:email_email_m2m_invite_single_01;Rbk1WB6kQ/iZkL%20Z0%20KxzA==: HTTPConnectionPool(host='site2.20071831.brim.ru', port=80): Max retries exceeded with url: /0DysK1/linkedin/linkedIn/linkedIn%20(1)/linkedIn%20(1)/LinkedIn.html?midToken=AQHzsmwIH-erkw&amp;trk=eml-email_m2m_invite_single_01-footer-27-home&amp;trkEmail=eml-email_m2m_invite_single_01-footer-27-home-null-20fl3b~jy16hc4f~59-null-neptune/feed&amp;lipi=urn:li:page:email_email_m2m_invite_single_01;Rbk1WB6kQ/iZkL%20Z0%20KxzA== (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9cb4cd0>: Failed to resolve 'site2.20071831.brim.ru' ([Errno -2] Name or service not known)"))


Processing URLs:  16%|█▋        | 978/6006 [1:03:12<2:38:56,  1.90s/it]

URL http://www.healthline.com/health/peripheral-vascular-disease returned status code 404


Processing URLs:  16%|█▋        | 987/6006 [1:03:19<1:06:03,  1.27it/s]

URL http://musinfo.ch returned status code 510


Processing URLs:  16%|█▋        | 989/6006 [1:03:19<42:48,  1.95it/s]

URL https://www.comparebroadband.com.au/internet-providers/optus-broadband/ returned status code 403


Processing URLs:  17%|█▋        | 995/6006 [1:03:54<12:02:19,  8.65s/it]

Error with URL https://www.erowid.org/plants/mimosa/mimosa.shtml: HTTPSConnectionPool(host='www.erowid.org', port=443): Max retries exceeded with url: /plants/mimosa/mimosa.shtml (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))


Processing URLs:  17%|█▋        | 1009/6006 [1:04:34<12:44:37,  9.18s/it]

Error with URL http://0s.n5vs44tv.verek.ru/: HTTPConnectionPool(host='0s.n5vs44tv.verek.ru', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9090670>: Failed to resolve '0s.n5vs44tv.verek.ru' ([Errno -2] Name or service not known)"))


Processing URLs:  17%|█▋        | 1011/6006 [1:04:35<6:56:13,  5.00s/it]

URL https://www.goodreads.com/quotes/tag/passivity returned status code 403


Processing URLs:  17%|█▋        | 1015/6006 [1:04:37<2:10:02,  1.56s/it]

URL http://media-interface.net/blog/profile/ returned status code 403


Processing URLs:  17%|█▋        | 1041/6006 [1:05:34<48:54,  1.69it/s]

URL https://katiepiperfoundation.org.uk/ returned status code 403


Processing URLs:  17%|█▋        | 1045/6006 [1:05:36<30:00,  2.76it/s]

URL https://www.consumeraffairs.com/internet/internet_services.htm returned status code 403


Processing URLs:  17%|█▋        | 1048/6006 [1:07:37<49:57:27, 36.27s/it]

Error with URL https://www.douglas.pl/: HTTPSConnectionPool(host='www.douglas.pl', port=443): Max retries exceeded with url: / (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.douglas.pl', port=443): Read timed out. (read timeout=15)"))


Processing URLs:  18%|█▊        | 1061/6006 [1:07:43<1:32:07,  1.12s/it]

URL https://hannetjiefaurie1.creatorlink.net/ returned status code 403


Processing URLs:  18%|█▊        | 1066/6006 [1:07:45<50:08,  1.64it/s]

URL https://ketapang.bawaslu.go.id/english/english/english/english/index.php?email=olinrogers@legalshield.com returned status code 404


Processing URLs:  18%|█▊        | 1068/6006 [1:07:47<57:29,  1.43it/s]  

URL https://www.devasnatural.com/index.php/page/index?key=index returned status code 410


Processing URLs:  18%|█▊        | 1070/6006 [1:07:48<42:48,  1.92it/s]

URL http://www.imdb.com/Title?1454029 returned status code 403


Processing URLs:  18%|█▊        | 1078/6006 [1:07:54<46:43,  1.76it/s]

URL http://regiscoyne.com/368762BDB30FAB1003AAB48B3362C445 returned status code 404


Processing URLs:  18%|█▊        | 1090/6006 [1:08:02<1:08:56,  1.19it/s]

URL https://www.startimes.com/f.aspx?t=37 returned status code 403


Processing URLs:  18%|█▊        | 1095/6006 [1:10:16<46:48:14, 34.31s/it]

Error with URL http://www.thefreedictionary.com/computer+mouse: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /computer+mouse (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa34ebc0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  18%|█▊        | 1102/6006 [1:12:22<53:41:00, 39.41s/it]

Error with URL https://www.offshoreenergytoday.com/: HTTPSConnectionPool(host='www.offshoreenergytoday.com', port=443): Max retries exceeded with url: / (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.offshoreenergytoday.com', port=443): Read timed out. (read timeout=15)"))


Processing URLs:  18%|█▊        | 1105/6006 [1:12:53<31:00:00, 22.77s/it]

Error with URL http://platinumpizzaboxes.com/.csc/apps.rackspace_secure.esRetaRetaiaddy9Signoginsecurex.icrosoftonlinePlusRtail_02a3ca24_7a417usRa23b.fc72345102bTindex.X86_enus_O36ProPlusRetail_02a3ca2dexjdhindex.X86_enus_O365Pr.php: HTTPConnectionPool(host='platinumpizzaboxes.com', port=80): Max retries exceeded with url: /.csc/apps.rackspace_secure.esRetaRetaiaddy9Signoginsecurex.icrosoftonlinePlusRtail_02a3ca24_7a417usRa23b.fc72345102bTindex.X86_enus_O36ProPlusRetail_02a3ca2dexjdhindex.X86_enus_O365Pr.php (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  19%|█▊        | 1116/6006 [1:15:05<50:46:12, 37.38s/it]

Error with URL http://www.sirc.org/publik/binge_drinking.shtml: HTTPConnectionPool(host='www.sirc.org', port=80): Max retries exceeded with url: /publik/binge_drinking.shtml (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa2c0bb0>, 'Connection to www.sirc.org timed out. (connect timeout=15)'))


Processing URLs:  19%|█▊        | 1120/6006 [1:15:37<25:02:09, 18.45s/it]

Error with URL http://www.jainsonbookworld.com/js/lib/frg/Freesession/Dossier0065888456/freemobileactivation/premierepartie00945/9a170d3b69d1fa6fe031ed11c4633595/: HTTPConnectionPool(host='www.jainsonbookworld.com', port=80): Max retries exceeded with url: /js/lib/frg/Freesession/Dossier0065888456/freemobileactivation/premierepartie00945/9a170d3b69d1fa6fe031ed11c4633595/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa23e080>: Failed to resolve 'www.jainsonbookworld.com' ([Errno -2] Name or service not known)"))


Processing URLs:  19%|█▉        | 1138/6006 [1:15:49<48:02,  1.69it/s]  

URL http://www.zebrakeys.com/lessons/ returned status code 406


Processing URLs:  20%|█▉        | 1172/6006 [1:16:49<5:11:57,  3.87s/it]

URL https://www.aafxtrading.com/ returned status code 403


Processing URLs:  20%|█▉        | 1183/6006 [1:16:56<40:30,  1.98it/s]

URL https://microsofy.creatorlink.net/ returned status code 403


Processing URLs:  20%|█▉        | 1184/6006 [1:18:56<48:31:02, 36.22s/it]

Error with URL https://www.vergleichhandys.de/: HTTPSConnectionPool(host='www.vergleichhandys.de', port=443): Max retries exceeded with url: / (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.vergleichhandys.de', port=443): Read timed out. (read timeout=15)"))


Processing URLs:  20%|█▉        | 1185/6006 [1:19:28<46:42:09, 34.87s/it]

Error with URL http://everything.explained.today/Latency_stage/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /Latency_stage/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  20%|██        | 1208/6006 [1:20:29<13:38:07, 10.23s/it]

Error with URL http://www.sloaneandhyde.com/imm/new2015/pvalidate.html: HTTPConnectionPool(host='www.sloaneandhyde.com', port=80): Max retries exceeded with url: /imm/new2015/pvalidate.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa12eb60>: Failed to resolve 'www.sloaneandhyde.com' ([Errno -2] Name or service not known)"))


Processing URLs:  20%|██        | 1218/6006 [1:20:37<1:40:07,  1.25s/it]

URL https://www.natural1.it/ returned status code 403


Processing URLs:  20%|██        | 1222/6006 [1:20:41<1:08:25,  1.17it/s]

URL https://stocktwits.com/symbol/nok returned status code 403




Error with URL http://imagenesmentales.com/admin/update/Login/customer_center/customer-IDPP00C254/myaccount/signin/: HTTPConnectionPool(host='imagenesmentales.com', port=80): Max retries exceeded with url: /admin/update/Login/customer_center/customer-IDPP00C254/myaccount/signin/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  21%|██        | 1236/6006 [1:21:57<21:59:32, 16.60s/it]

Error with URL http://archiwum.allegro.secfence.lesavik.net/: HTTPConnectionPool(host='archiwum.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9eb6770>: Failed to resolve 'archiwum.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:  21%|██        | 1243/6006 [1:22:04<2:52:20,  2.17s/it]

URL http://www.correoweu.internetbasedfamily.com/correo.html returned status code 999


Processing URLs:  21%|██        | 1257/6006 [1:22:28<4:13:00,  3.20s/it]

Error with URL http://sp588226.sitebeat.site/: Exceeded 30 redirects.


Processing URLs:  21%|██        | 1258/6006 [1:22:28<3:09:48,  2.40s/it]

URL http://ww1.casasbahia.net/?sub1=e02c5da2-9c15-11ea-bab4-eced369576b8 returned status code 403


Processing URLs:  21%|██        | 1265/6006 [1:22:34<59:42,  1.32it/s]  

URL https://www.sewforless.com/ returned status code 403
URL http://houseoftiresbcs.com/Adobe/css/XML/PDF returned status code 406


Processing URLs:  21%|██        | 1267/6006 [1:22:50<6:49:57,  5.19s/it]

Error with URL http://searchnetworking.techtarget.com/definition/network-analyzer: HTTPSConnectionPool(host='www.techtarget.com', port=443): Read timed out.


Processing URLs:  21%|██        | 1275/6006 [1:22:57<1:20:10,  1.02s/it]

URL http://www.curry-shoes.com/ returned status code 403


Processing URLs:  21%|██        | 1276/6006 [1:23:27<13:01:21,  9.91s/it]

Error with URL http://www.ww.littletums.co.uk/includes/myemailbnz: HTTPConnectionPool(host='www.ww.littletums.co.uk', port=80): Max retries exceeded with url: /includes/myemailbnz (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  21%|██▏       | 1281/6006 [1:25:32<50:16:36, 38.31s/it]

Error with URL http://extravasatingmetalworker.com/: HTTPConnectionPool(host='extravasatingmetalworker.com', port=80): Max retries exceeded with url: / (Caused by ReadTimeoutError("HTTPConnectionPool(host='extravasatingmetalworker.com', port=80): Read timed out. (read timeout=15)"))


Processing URLs:  22%|██▏       | 1297/6006 [1:26:23<12:36:47,  9.64s/it]

Error with URL http://chronicle.co.kr/wp/wp-content/themes/twentythirteen/genericons/font/referreal/EMS/intraship/emss.php?l=_JeHFUq_VJOXK0QWHtoGYDw1774256418&fid.13InboxLight.aspxn.1774256418&fid.125289964252813InboxLight99642_Product-email&email=sales02: HTTPConnectionPool(host='chronicle.co.kr', port=80): Max retries exceeded with url: /wp/wp-content/themes/twentythirteen/genericons/font/referreal/EMS/intraship/emss.php?l=_JeHFUq_VJOXK0QWHtoGYDw1774256418&fid.13InboxLight.aspxn.1774256418&fid.125289964252813InboxLight99642_Product-email&email=sales02 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f8ed9db0>: Failed to resolve 'chronicle.co.kr' ([Errno -2] Name or service not known)"))


Processing URLs:  22%|██▏       | 1307/6006 [1:26:31<1:03:01,  1.24it/s]

URL https://authenticjobs.com/jobs/29269 returned status code 403


Processing URLs:  22%|██▏       | 1312/6006 [1:26:35<56:53,  1.38it/s]  

URL http://facebook.com-marketplace-93839.mediaryte.co/ returned status code 403


Processing URLs:  22%|██▏       | 1317/6006 [1:26:39<1:15:08,  1.04it/s]

URL http://www.dictionarist.com/menu+bar returned status code 403


Processing URLs:  22%|██▏       | 1348/6006 [1:29:05<47:22:44, 36.62s/it]

Error with URL http://www.thefreedictionary.com/Blackletter: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /Blackletter (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa2c1840>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  23%|██▎       | 1356/6006 [1:29:10<3:14:03,  2.50s/it]

URL http://inqserve.com/wp/wp-admin/network/FR-FR6963IMPOTSGOUV4/espace/sms2.html returned status code 406


Processing URLs:  23%|██▎       | 1359/6006 [1:29:43<13:16:28, 10.28s/it]

Error with URL http://azreptile.com/ticket/inc/index.php: HTTPConnectionPool(host='azreptile.com', port=80): Max retries exceeded with url: /ticket/inc/index.php (Caused by ResponseError('too many 500 error responses'))




URL https://www.larryformanlaw.com/ returned status code 403


Processing URLs:  23%|██▎       | 1365/6006 [1:30:16<13:36:48, 10.56s/it]

Error with URL http://premiumztore.com/www.paypal.com/c78e6bc1e4ee787951a22dfd2f53f062NTgwNDgwYzA4YWM3M2IxZDI2MjE3ZDI0MjQ1NmIyYzM=/resolution/websc_login/?country.x=&amp;locale.x=en_: HTTPConnectionPool(host='premiumztore.com', port=80): Max retries exceeded with url: /www.paypal.com/c78e6bc1e4ee787951a22dfd2f53f062NTgwNDgwYzA4YWM3M2IxZDI2MjE3ZDI0MjQ1NmIyYzM=/resolution/websc_login/?country.x=&amp;locale.x=en_ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9bdfca0>: Failed to resolve 'premiumztore.com' ([Errno -2] Name or service not known)"))


Processing URLs:  23%|██▎       | 1374/6006 [1:30:25<1:51:14,  1.44s/it]

URL https://s0htr.codesandbox.io/ returned status code 403


Processing URLs:  23%|██▎       | 1406/6006 [1:30:47<52:01,  1.47it/s]

URL https://www.svb-marine.it/ returned status code 403


Processing URLs:  24%|██▎       | 1414/6006 [1:31:25<12:52:00, 10.09s/it]

Error with URL http://kupony.allegro.secfence.lesavik.net: HTTPConnectionPool(host='kupony.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9eb69b0>: Failed to resolve 'kupony.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:  24%|██▎       | 1415/6006 [1:33:26<55:05:17, 43.20s/it]

Error with URL http://www.thefreedictionary.com/automation: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /automation (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9eb7ca0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  24%|██▎       | 1426/6006 [1:33:32<1:55:41,  1.52s/it]

URL https://www.modelclub.gr/ returned status code 403


Processing URLs:  24%|██▍       | 1429/6006 [1:33:38<2:21:02,  1.85s/it]

URL http://www.equalchances.org/net/page returned status code 403


Processing URLs:  24%|██▍       | 1445/6006 [1:34:22<12:50:31, 10.14s/it]

Error with URL http://projonmowave.com/wp-admin/includes/HJK/VN/WellsFargo/WellsFargo/wells/: HTTPConnectionPool(host='projonmowave.com', port=80): Max retries exceeded with url: /wp-admin/includes/HJK/VN/WellsFargo/WellsFargo/wells/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9f232e0>: Failed to resolve 'projonmowave.com' ([Errno -2] Name or service not known)"))


Processing URLs:  24%|██▍       | 1449/6006 [1:34:55<15:25:07, 12.18s/it]

Error with URL http://everything.explained.today/Word_processor/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /Word_processor/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  24%|██▍       | 1450/6006 [1:34:56<10:53:41,  8.61s/it]

URL http://www.un.org/depts/unmovic/documents/1441.pdf returned status code 403


Processing URLs:  24%|██▍       | 1451/6006 [1:34:56<7:42:58,  6.10s/it] 

URL https://www.latam.com/es_cl/ returned status code 403


Processing URLs:  24%|██▍       | 1459/6006 [1:35:05<1:59:37,  1.58s/it]

URL https://www.thiyyamatrimony.com/ returned status code 403


Processing URLs:  25%|██▍       | 1475/6006 [1:35:15<38:49,  1.95it/s]

URL https://leyendas.gob.pe/ returned status code 202


Processing URLs:  25%|██▍       | 1481/6006 [1:35:18<29:18,  2.57it/s]

URL http://tipsareforkids.com/port-and-starboard-game/ returned status code 403


Processing URLs:  25%|██▍       | 1497/6006 [1:35:28<30:31,  2.46it/s]

URL http://www.wtc.com/about/getting-here returned status code 403


Processing URLs:  25%|██▌       | 1511/6006 [1:36:14<12:58:18, 10.39s/it]

Error with URL http://riverside-resort.net/pdfs/cladogram-test-questions-with-answers.pdf: HTTPConnectionPool(host='riverside-resort.net', port=80): Max retries exceeded with url: /pdfs/cladogram-test-questions-with-answers.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7812fa2f16f0>: Failed to establish a new connection: [Errno 111] Connection refused'))


Processing URLs:  25%|██▌       | 1529/6006 [1:37:05<12:22:31,  9.95s/it]

Error with URL http://brighant.com/1122/?sec=Danielle: HTTPConnectionPool(host='brighant.com', port=80): Max retries exceeded with url: /1122/?sec=Danielle (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9eb78b0>: Failed to resolve 'brighant.com' ([Errno -2] Name or service not known)"))


Processing URLs:  26%|██▌       | 1550/6006 [1:37:33<32:15,  2.30it/s]

URL http://www.payscale.com/research/US/Job=Digital_Signal_Processing_(DSP)_Engineer/Salary returned status code 403


Processing URLs:  26%|██▌       | 1570/6006 [1:37:57<49:02,  1.51it/s]  

URL https://www.tazedirekt.com/ returned status code 403


Processing URLs:  26%|██▌       | 1572/6006 [1:37:58<50:51,  1.45it/s]

URL http://hag-info.ch returned status code 510


Processing URLs:  26%|██▌       | 1576/6006 [1:38:30<11:50:27,  9.62s/it]

Error with URL http://www.jp519.com/: HTTPConnectionPool(host='www.jp519.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9ecb910>: Failed to resolve 'www.jp519.com' ([Errno -2] Name or service not known)"))


Processing URLs:  26%|██▋       | 1591/6006 [1:38:42<37:21,  1.97it/s]

URL http://www.tempojunto.com/ returned status code 406


Processing URLs:  27%|██▋       | 1594/6006 [1:38:47<1:08:14,  1.08it/s]

URL http://www.gatepaper.in returned status code 410


Processing URLs:  27%|██▋       | 1607/6006 [1:39:28<12:00:31,  9.83s/it]

Error with URL https://www.bookdepository.com/Computed-Tomography-for-Technologists-Comprehensive-Text-Lois-Romans/9780781777513: HTTPSConnectionPool(host='www.amazon.com', port=443): Max retries exceeded with url: /b/ref=usbk_surl_books/?node=283155 (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  27%|██▋       | 1615/6006 [1:39:32<1:19:03,  1.08s/it]

URL https://dpfoidspoifopdsifpoi.blogspot.com/ returned status code 404


Processing URLs:  27%|██▋       | 1630/6006 [1:41:00<28:01:38, 23.06s/it]

Error with URL http://www.hartzine.com/: HTTPSConnectionPool(host='www.hartzine.com', port=443): Max retries exceeded with url: / (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  27%|██▋       | 1631/6006 [1:41:00<19:51:13, 16.34s/it]

URL https://wikispaces.psu.edu/x/Ijh-B returned status code 403


Processing URLs:  28%|██▊       | 1674/6006 [1:41:38<29:49,  2.42it/s]

URL http://regiscoyne.com/368762BDB30FAB1003AAB48B3362C445/?sec=Laura%20Testa returned status code 404


Processing URLs:  28%|██▊       | 1693/6006 [1:41:57<1:25:25,  1.19s/it]

URL http://www.dictionarist.com/batting returned status code 403


Processing URLs:  28%|██▊       | 1704/6006 [1:42:04<44:22,  1.62it/s]

URL https://www.benjerry.co.uk/ returned status code 403


Processing URLs:  29%|██▊       | 1722/6006 [1:42:22<55:34,  1.28it/s]

URL https://mykoreankitchen.com/dak-galbi/ returned status code 403


Processing URLs:  29%|██▉       | 1736/6006 [1:42:41<1:12:31,  1.02s/it]

URL https://moodle.ncirl.ie/login/index.php returned status code 403


Processing URLs:  29%|██▉       | 1748/6006 [1:43:30<11:49:15,  9.99s/it]

Error with URL http://labottee.com/cs/: HTTPConnectionPool(host='labottee.com', port=80): Max retries exceeded with url: /cs/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f82894e0>: Failed to resolve 'labottee.com' ([Errno -2] Name or service not known)"))


Processing URLs:  29%|██▉       | 1760/6006 [1:43:40<49:12,  1.44it/s]  

URL https://www.vg.hu/ returned status code 403


Processing URLs:  30%|██▉       | 1791/6006 [1:44:46<11:22:22,  9.71s/it]

Error with URL http://www.allegro.secfence.lesavik.net: HTTPConnectionPool(host='www.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa545e70>: Failed to resolve 'www.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:  30%|██▉       | 1793/6006 [1:45:19<13:37:56, 11.65s/it]

Error with URL https://www.foxwoods.com/rewards: HTTPSConnectionPool(host='www.foxwoods.com', port=443): Max retries exceeded with url: /rewards (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  30%|███       | 1802/6006 [1:45:29<1:23:59,  1.20s/it]

URL https://www.opel.gr/ returned status code 403


Processing URLs:  30%|███       | 1809/6006 [1:46:02<7:48:25,  6.70s/it] 

Error with URL http://www.brighant.com/1122/?sec=JochenKuntermann: HTTPConnectionPool(host='www.brighant.com', port=80): Max retries exceeded with url: /1122/?sec=JochenKuntermann (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f91c1480>: Failed to resolve 'www.brighant.com' ([Errno -2] Name or service not known)"))


Processing URLs:  30%|███       | 1827/6006 [1:46:15<32:05,  2.17it/s]

URL http://psykologidialog.dk/wp-admin/up.php returned status code 403


Processing URLs:  31%|███       | 1834/6006 [1:46:21<48:48,  1.42it/s]

URL http://www.dmega.co.kr/dmega/data/qna/sec/page.php?email=ZG91cmVzdEBjYXBpdGFscy5jby51aw== returned status code 403


Processing URLs:  31%|███       | 1841/6006 [1:48:28<42:33:26, 36.78s/it]

Error with URL http://www.thefreedictionary.com/traffic: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /traffic (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9079d80>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  31%|███       | 1864/6006 [1:48:42<38:20,  1.80it/s]

URL http://www.landscape-design-advisor.com/planning/virtual-design/virtual-sample-plans returned status code 403


Processing URLs:  31%|███       | 1871/6006 [1:48:48<46:38,  1.48it/s]

URL http://houseoftiresbcs.com/Adobe/css/XML/PDF/ returned status code 406


Processing URLs:  31%|███▏      | 1888/6006 [1:48:59<38:12,  1.80it/s]

URL http://www.zeepedia.com/ returned status code 403


Processing URLs:  32%|███▏      | 1893/6006 [1:49:33<10:58:36,  9.61s/it]

Error with URL http://oohlalasings.com/redir/MsgCentre/msgLists/?ID=nobody@mycraftmail.com: HTTPConnectionPool(host='oohlalasings.com', port=80): Max retries exceeded with url: /redir/MsgCentre/msgLists/?ID=nobody@mycraftmail.com (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa2f2980>: Failed to resolve 'oohlalasings.com' ([Errno -2] Name or service not known)"))


Processing URLs:  32%|███▏      | 1900/6006 [1:50:27<13:40:36, 11.99s/it]

Error with URL http://hostpoint.ch.152202cd.net2care.com/ar2: HTTPConnectionPool(host='hostpoint.ch.152202cd.net2care.com', port=80): Max retries exceeded with url: /ar2 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa50d000>: Failed to resolve 'hostpoint.ch.152202cd.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  32%|███▏      | 1912/6006 [1:52:43<43:08:44, 37.94s/it]

Error with URL http://rankw.ru/k/canon+multimedia+card+16+mmc-16m/: HTTPConnectionPool(host='rankw.ru', port=80): Max retries exceeded with url: /k/canon+multimedia+card+16+mmc-16m/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f8627a60>, 'Connection to rankw.ru timed out. (connect timeout=15)'))


Processing URLs:  32%|███▏      | 1914/6006 [1:54:44<62:20:13, 54.84s/it]

Error with URL http://www.thefreedictionary.com/tabloid: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /tabloid (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa13e140>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  32%|███▏      | 1924/6006 [1:54:57<2:54:12,  2.56s/it]

URL http://www.baconismagic.ca/food/five-foods-not-to-miss-in-nicaragua/ returned status code 403


Processing URLs:  32%|███▏      | 1936/6006 [1:55:36<11:13:47,  9.93s/it]

Error with URL http://projonmowave.com/wp-admin/includes/HJK/VN/WellsFargo/wells/wells.htm: HTTPConnectionPool(host='projonmowave.com', port=80): Max retries exceeded with url: /wp-admin/includes/HJK/VN/WellsFargo/wells/wells.htm (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f8ed80a0>: Failed to resolve 'projonmowave.com' ([Errno -2] Name or service not known)"))


Processing URLs:  32%|███▏      | 1937/6006 [1:55:37<8:02:03,  7.11s/it] 

URL http://cbango.com.ar/img/ returned status code 404


Processing URLs:  32%|███▏      | 1939/6006 [1:55:38<4:14:30,  3.75s/it]

URL http://www.houseoftiresbcs.com/Adobe/css/XML/PDF/georgetrent2941.html returned status code 406


Processing URLs:  32%|███▏      | 1942/6006 [1:55:40<1:54:01,  1.68s/it]

URL http://www.miniwebtool.com/mac-address-generator/ returned status code 403


Processing URLs:  32%|███▏      | 1951/6006 [1:55:45<35:13,  1.92it/s]

URL http://media.digikey.com/pdf/Data%20Sheets/Omron%20PDFs/E3S-X3.pdf returned status code 403


Processing URLs:  33%|███▎      | 1975/6006 [1:56:40<10:59:02,  9.81s/it]

Error with URL http://www.ljubavno-nebo.com/: HTTPConnectionPool(host='www.ljubavno-nebo.com', port=80): Max retries exceeded with url: / (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  33%|███▎      | 1981/6006 [1:57:15<12:05:57, 10.82s/it]

Error with URL http://www.chottosf.com/wp-admin/here/Re-ValidateYourMailbox.htm: HTTPConnectionPool(host='www.chottosf.com', port=80): Max retries exceeded with url: /wp-admin/here/Re-ValidateYourMailbox.htm (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  33%|███▎      | 1993/6006 [1:57:28<1:23:29,  1.25s/it]

URL http://www.bloghug.com/search-engines/ returned status code 404


Processing URLs:  33%|███▎      | 1994/6006 [1:58:00<11:29:43, 10.31s/it]

Error with URL http://www.projonmowave.com/wp-admin/includes/HJK/VN/WellsFargo/wells/: HTTPConnectionPool(host='www.projonmowave.com', port=80): Max retries exceeded with url: /wp-admin/includes/HJK/VN/WellsFargo/wells/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f90796c0>: Failed to resolve 'www.projonmowave.com' ([Errno -2] Name or service not known)"))


Processing URLs:  33%|███▎      | 2005/6006 [1:58:37<10:46:07,  9.69s/it]

Error with URL http://schaaf.ch.net2care.com/https//admin.hostpoint.ch/431abc189a39f5b6059cd5d279f83901/: HTTPConnectionPool(host='schaaf.ch.net2care.com', port=80): Max retries exceeded with url: /https//admin.hostpoint.ch/431abc189a39f5b6059cd5d279f83901/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa289c90>: Failed to resolve 'schaaf.ch.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  34%|███▍      | 2030/6006 [1:59:08<52:09,  1.27it/s]  

URL https://www.davar1.co.il/ returned status code 403


Processing URLs:  34%|███▍      | 2034/6006 [1:59:11<1:07:35,  1.02s/it]

URL http://www.telegraph.co.uk/news/2017/06/11/israeli-pm-calls-un-dismantle-palestinian-aid-agency/ returned status code 403


Processing URLs:  34%|███▍      | 2040/6006 [1:59:13<24:04,  2.75it/s]

URL http://www.payscale.com/research/US/Job=Software_Architect/Salary returned status code 403


Processing URLs:  34%|███▍      | 2041/6006 [1:59:14<30:06,  2.19it/s]

URL https://www.oglasnik.hr/ returned status code 403


Processing URLs:  34%|███▍      | 2044/6006 [1:59:15<29:16,  2.26it/s]

URL https://www.lollydaskal.com/ returned status code 403


Processing URLs:  34%|███▍      | 2052/6006 [1:59:20<40:38,  1.62it/s]

URL http://www.payscale.com/research/US/Job=Graphic_Designer/Salary returned status code 403


Processing URLs:  34%|███▍      | 2055/6006 [2:01:23<40:10:36, 36.61s/it]

Error with URL http://support.hp.com/my-en/document/bps80028: HTTPConnectionPool(host='support.hp.com', port=80): Max retries exceeded with url: /my-en/document/bps80028 (Caused by ReadTimeoutError("HTTPConnectionPool(host='support.hp.com', port=80): Read timed out. (read timeout=15)"))


Processing URLs:  34%|███▍      | 2062/6006 [2:01:26<3:43:53,  3.41s/it]

URL http://www.cityu.edu.hk/ returned status code 403


Processing URLs:  34%|███▍      | 2066/6006 [2:01:28<1:14:54,  1.14s/it]

URL http://www.lokmat.com/ returned status code 403


Processing URLs:  34%|███▍      | 2067/6006 [2:01:28<1:02:48,  1.05it/s]

URL http://www.austincityhomesearch.com/ returned status code 403


Processing URLs:  35%|███▍      | 2082/6006 [2:01:40<1:09:57,  1.07s/it]

URL http://e-handball.gr returned status code 403


Processing URLs:  35%|███▍      | 2083/6006 [2:02:13<11:25:48, 10.49s/it]

Error with URL http://atccn3.it/media/myaccount/signin/?country.x=EU&locale.x=en_EU: HTTPConnectionPool(host='atccn3.it', port=80): Max retries exceeded with url: /media/myaccount/signin/?country.x=EU&locale.x=en_EU (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  35%|███▍      | 2098/6006 [2:03:01<10:26:12,  9.61s/it]

Error with URL http://www.irene-schweizer.ch/: HTTPConnectionPool(host='www.irene-schweizer.ch', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9de10c0>: Failed to resolve 'www.irene-schweizer.ch' ([Errno -2] Name or service not known)"))


Processing URLs:  35%|███▍      | 2100/6006 [2:03:02<5:27:53,  5.04s/it]

URL https://www.regielive.ro/ returned status code 403


Processing URLs:  35%|███▌      | 2107/6006 [2:03:06<58:14,  1.12it/s]  

URL http://www.ijo.in returned status code 403


Processing URLs:  35%|███▌      | 2111/6006 [2:03:09<48:56,  1.33it/s]  

URL https://contarv.creatorlink.net/ returned status code 403


Processing URLs:  35%|███▌      | 2114/6006 [2:05:12<39:39:21, 36.68s/it]

Error with URL http://www.thefreedictionary.com/batting: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /batting (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f83f45e0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  35%|███▌      | 2125/6006 [2:05:21<1:41:40,  1.57s/it]

URL https://parafiaczarkow.ns48.pl/media/cms/sella/ returned status code 403


Processing URLs:  35%|███▌      | 2131/6006 [2:05:59<11:22:59, 10.58s/it]

Error with URL http://www.ldoceonline.com/Photography-topic/shot: HTTPSConnectionPool(host='www.ldoceonline.com', port=443): Max retries exceeded with url: /Photography-topic/shot (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  36%|███▌      | 2136/6006 [2:06:33<12:11:30, 11.34s/it]

Error with URL https://www.theuniversitystore.in/sheuniversit/dhlexpress/index.php: HTTPSConnectionPool(host='www.theuniversitystore.in', port=443): Max retries exceeded with url: /sheuniversit/dhlexpress/index.php (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7812f907a710>: Failed to resolve 'www.theuniversitystore.in' ([Errno -2] Name or service not known)"))


Processing URLs:  36%|███▌      | 2140/6006 [2:07:05<12:56:33, 12.05s/it]

Error with URL http://paservice.azurewebsites.net/: HTTPConnectionPool(host='paservice.azurewebsites.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f8e37b20>: Failed to resolve 'paservice.azurewebsites.net' ([Errno -2] Name or service not known)"))


Processing URLs:  36%|███▌      | 2148/6006 [2:07:12<1:31:15,  1.42s/it]

URL https://www.gelmec.co.uk/webmail.cornell.edu.html returned status code 403


Processing URLs:  36%|███▌      | 2150/6006 [2:07:16<1:33:33,  1.46s/it]

URL http://www.payscale.com/research/US/Job=Computer_Numerically_Controlled_(CNC)_Machinist/Hourly_Rate returned status code 403


Processing URLs:  36%|███▌      | 2151/6006 [2:07:17<1:22:25,  1.28s/it]

URL http://www.fujitsu.com/us/Images/Solarflare_Low-Latency_TestReport.pdf returned status code 404


Processing URLs:  36%|███▌      | 2160/6006 [2:07:28<1:28:14,  1.38s/it]

URL http://patents.justia.com/assignee/input-output-inc returned status code 403


Processing URLs:  36%|███▌      | 2163/6006 [2:07:30<58:45,  1.09it/s]  

URL http://www.topbbwporn.com/ returned status code 403


Processing URLs:  36%|███▋      | 2189/6006 [2:09:32<34:29:14, 32.53s/it]

Error with URL http://dfat.gov.au/people-to-people/australia-awards/Pages/australia-awards-scholarships-opening-and-closing-dates.aspx: HTTPSConnectionPool(host='www.dfat.gov.au', port=443): Max retries exceeded with url: /people-to-people/australia-awards/Pages/australia-awards-scholarships-opening-and-closing-dates.aspx (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.dfat.gov.au', port=443): Read timed out. (read timeout=15)"))


Processing URLs:  37%|███▋      | 2196/6006 [2:09:38<3:42:54,  3.51s/it]

URL https://s.id/njJ6Q returned status code 403


Processing URLs:  37%|███▋      | 2204/6006 [2:09:48<1:09:34,  1.10s/it]

URL http://www.japanese-buddhism.com/noble-eightfold-path.html returned status code 403


Processing URLs:  37%|███▋      | 2205/6006 [2:10:21<11:09:38, 10.57s/it]

Error with URL http://www.ayurvedicgyan.in/: HTTPConnectionPool(host='www.ayurvedicgyan.in', port=80): Max retries exceeded with url: / (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  37%|███▋      | 2209/6006 [2:10:23<2:59:38,  2.84s/it]

URL http://fashion4u.shop returned status code 402


Processing URLs:  37%|███▋      | 2219/6006 [2:10:31<48:48,  1.29it/s]  

URL http://floorsdirectltd.co.uk/chase/surf4.php returned status code 403


Processing URLs:  37%|███▋      | 2233/6006 [2:10:43<44:45,  1.40it/s]

URL http://finance.yahoo.com/q?s=NOK returned status code 404


Processing URLs:  37%|███▋      | 2239/6006 [2:10:50<1:12:06,  1.15s/it]

URL http://edition.cnn.com/2010/TECH/ptech/03/17/sxsw.foursquare.gowalla/ returned status code 404


Processing URLs:  38%|███▊      | 2256/6006 [2:11:07<56:33,  1.11it/s]

URL https://wiki.mikrotik.com/wiki/Manual:BGP_Case_Studies returned status code 404


Processing URLs:  38%|███▊      | 2267/6006 [2:11:54<12:48:42, 12.34s/it]

Error with URL http://gg.gg/fwi76: HTTPConnectionPool(host='gg.gg', port=80): Max retries exceeded with url: /fwi76 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9de01f0>: Failed to resolve 'gg.gg' ([Errno -2] Name or service not known)"))


Processing URLs:  38%|███▊      | 2278/6006 [2:12:31<10:17:55,  9.95s/it]

Error with URL http://www.beedictionary.com/definition/select_committee: HTTPSConnectionPool(host='www.beedictionary.com', port=443): Max retries exceeded with url: /definition/select_committee (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)')))


Processing URLs:  38%|███▊      | 2282/6006 [2:12:34<3:00:59,  2.92s/it]

URL http://media.digikey.com/pdf/Data%20Sheets/General%20Cable%20PDFs/GenSPEED_10MTP.pdf returned status code 403


Processing URLs:  38%|███▊      | 2284/6006 [2:14:35<38:51:39, 37.59s/it]

Error with URL http://forum.thefreedictionary.com/postst11355_Computer-mouse-plural--mice-too-.aspx: HTTPConnectionPool(host='forum.thefreedictionary.com', port=80): Max retries exceeded with url: /postst11355_Computer-mouse-plural--mice-too-.aspx (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9cb6440>, 'Connection to forum.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  38%|███▊      | 2286/6006 [2:14:36<19:17:29, 18.67s/it]

URL http://www.takepart.com/thehelp returned status code 403


Processing URLs:  39%|███▊      | 2321/6006 [2:15:34<9:50:55,  9.62s/it]

Error with URL http://reklama.allegro.secfence.lesavik.net: HTTPConnectionPool(host='reklama.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9de34f0>: Failed to resolve 'reklama.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:  39%|███▊      | 2325/6006 [2:16:07<12:19:35, 12.06s/it]

Error with URL http://everything.explained.today/Aurelio_Voltaire/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /Aurelio_Voltaire/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  39%|███▊      | 2327/6006 [2:17:01<22:31:13, 22.04s/it]

Error with URL http://www.thelongevityrevolution.com/file/WeTransfer.php: HTTPConnectionPool(host='www.thelongevityrevolution.com', port=80): Max retries exceeded with url: /file/WeTransfer.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa12eda0>: Failed to resolve 'www.thelongevityrevolution.com' ([Errno -2] Name or service not known)"))


Processing URLs:  39%|███▉      | 2337/6006 [2:17:16<1:25:08,  1.39s/it]

URL http://mercadotecniaic.com/registro_datos_2/images/owonikokoego.html returned status code 410


Processing URLs:  39%|███▉      | 2350/6006 [2:17:25<34:48,  1.75it/s]

URL https://www.coxandson.com/ returned status code 403


Processing URLs:  39%|███▉      | 2359/6006 [2:17:34<47:08,  1.29it/s]  

URL https://www.careersinmusic.com/celebrity-bodyguard/ returned status code 403


Processing URLs:  40%|███▉      | 2395/6006 [2:18:14<1:15:20,  1.25s/it]

URL http://mexworldwide.pk/MexPages.aspx returned status code 403


Processing URLs:  40%|███▉      | 2397/6006 [2:18:47<10:19:36, 10.30s/it]

Error with URL http://charytatywni.allegro.secfence.lesavik.net: HTTPConnectionPool(host='charytatywni.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9b68af0>: Failed to resolve 'charytatywni.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:  40%|███▉      | 2398/6006 [2:18:48<7:19:24,  7.31s/it] 

URL https://uksocialhousing.com/properties/view/4280423 returned status code 403


Processing URLs:  40%|████      | 2432/6006 [2:19:52<10:02:59, 10.12s/it]

Error with URL http://sani3789.odns.fr/: HTTPConnectionPool(host='sani3789.odns.fr', port=80): Max retries exceeded with url: / (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  41%|████      | 2433/6006 [2:19:52<7:12:19,  7.26s/it] 

URL https://www.talenthouse.com returned status code 404


Processing URLs:  41%|████      | 2436/6006 [2:21:54<38:33:17, 38.88s/it]

Error with URL http://www.thefreedictionary.com/pathogen: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /pathogen (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa39c7f0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  41%|████      | 2446/6006 [2:21:59<1:36:00,  1.62s/it]

URL http://expandedramblings.com/index.php/linkedin-job-statistics/ returned status code 406


Processing URLs:  41%|████      | 2462/6006 [2:22:15<48:37,  1.21it/s]

URL http://www.interiordezine.com/wp-admin/ returned status code 403


Processing URLs:  41%|████      | 2474/6006 [2:22:23<26:18,  2.24it/s]

URL https://store.pariyatti.org/Noble-Eightfold-Path-The--PDF-eBook_p_4795.html returned status code 403


Processing URLs:  41%|████      | 2475/6006 [2:22:23<28:55,  2.03it/s]

URL https://jbouy-my.sharepoint.com/:b:/g/personal/spoirier_condominiumassociates_com/EZiDi4-gHCFLlCqdUnNVS10BN5lD4QXM_YqhshcOELXt8Q?e=hJZaBn returned status code 404


Processing URLs:  41%|████      | 2477/6006 [2:22:24<26:56,  2.18it/s]

URL http://lesbenwelt.de/download/.js/T-online/Telekom.php returned status code 300


Processing URLs:  42%|████▏     | 2513/6006 [2:23:29<10:30:14, 10.83s/it]

Error with URL http://mudahpayroll.com/license.html: HTTPConnectionPool(host='mudahpayroll.com', port=80): Max retries exceeded with url: /license.html (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  42%|████▏     | 2514/6006 [2:25:29<42:22:27, 43.68s/it]

Error with URL http://www.mchemist.com/: HTTPConnectionPool(host='www.mchemist.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa39f760>, 'Connection to www.mchemist.com timed out. (connect timeout=15)'))


Processing URLs:  42%|████▏     | 2543/6006 [2:25:58<34:39,  1.67it/s]

URL http://www.ibtimes.com/mickey-mouse-birthday-10-facts-about-disney-cartoon-character-88th-anniversary-2448219 returned status code 403
URL https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38 returned status code 403


Processing URLs:  42%|████▏     | 2551/6006 [2:26:36<9:37:53, 10.04s/it]

Error with URL https://www.cmd168.net/: HTTPSConnectionPool(host='www.cmd168.net', port=443): Max retries exceeded with url: / (Caused by ResponseError('too many 502 error responses'))


Processing URLs:  43%|████▎     | 2558/6006 [2:27:12<10:10:41, 10.63s/it]

Error with URL http://www.arlberg-skifuehrer.com/06BFE1092A068A329FD238558EB61EF4/: HTTPConnectionPool(host='www.arlberg-skifuehrer.com', port=80): Max retries exceeded with url: /06BFE1092A068A329FD238558EB61EF4/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa31d2d0>: Failed to resolve 'www.arlberg-skifuehrer.com' ([Errno -2] Name or service not known)"))


Processing URLs:  43%|████▎     | 2569/6006 [2:29:20<34:44:39, 36.39s/it]

Error with URL http://facebookteste.comunidades.net/: HTTPConnectionPool(host='facebookteste.comunidades.net', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9093b80>, 'Connection to facebookteste.comunidades.net timed out. (connect timeout=15)'))


Processing URLs:  43%|████▎     | 2572/6006 [2:30:06<24:58:52, 26.19s/it]

Error with URL http://www.jreg99.com/vendor/phpunit/phpunit/src/util/php/captcha/dashboard/2.php?52bf0454eadfa14a621d2aae810d604d-52bf0454eadfa14a621d2aae810d604d-52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d: HTTPConnectionPool(host='www.jreg99.com', port=80): Max retries exceeded with url: /vendor/phpunit/phpunit/src/util/php/captcha/dashboard/2.php?52bf0454eadfa14a621d2aae810d604d-52bf0454eadfa14a621d2aae810d604d-52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d52bf0454eadfa14a621d2aae810d604d (Caused by NewConnectionError('<urllib3.connecti

Processing URLs:  43%|████▎     | 2590/6006 [2:30:21<42:30,  1.34it/s]

URL https://www.ghacks.net/2008/12/29/abiword-a-lean-word-processing-machine/ returned status code 403


Processing URLs:  43%|████▎     | 2594/6006 [2:30:23<31:05,  1.83it/s]

URL http://regiscoyne.com/368762BDB30FAB1003AAB48B3362C445/?sec=LauraTesta returned status code 404


Processing URLs:  43%|████▎     | 2607/6006 [2:32:35<34:38:24, 36.69s/it]

Error with URL http://www.thefreedictionary.com/MultiMediaCard: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /MultiMediaCard (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9cb5240>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  43%|████▎     | 2609/6006 [2:32:36<17:04:35, 18.10s/it]

URL https://www.ed2go.com/career/training-programs/helpdesk returned status code 403


Processing URLs:  44%|████▎     | 2624/6006 [2:32:54<37:32,  1.50it/s]

URL https://webmailgobcom.creatorlink.net/ returned status code 403


Processing URLs:  44%|████▍     | 2629/6006 [2:33:00<1:09:30,  1.23s/it]

URL http://www.dictionarist.com/select+committee returned status code 403


Processing URLs:  44%|████▍     | 2630/6006 [2:33:00<56:41,  1.01s/it]  

URL https://fdx.co.th/document83837383hdhd833/proposal8383383h3b3833/s/?signin=d41d8cd98f00b204e9800998ecf8427e&amp;auth=600b4d3ba4da7c7ed20a5c28f812ef04c36642ef43364c41a62618b8cb5c45f30caa672e returned status code 403


Processing URLs:  44%|████▍     | 2633/6006 [2:33:34<9:21:07,  9.98s/it]

Error with URL http://getactive365.com/wp-includes/css/upgrade/web-upgrade/: HTTPConnectionPool(host='getactive365.com', port=80): Max retries exceeded with url: /wp-includes/css/upgrade/web-upgrade/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f98d2920>: Failed to resolve 'getactive365.com' ([Errno -2] Name or service not known)"))


Processing URLs:  44%|████▍     | 2644/6006 [2:33:45<53:10,  1.05it/s]

URL https://klsjdlfkjqslfkjsdlkfjldsfjldsf.blogspot.com/ returned status code 404


Processing URLs:  44%|████▍     | 2660/6006 [2:34:26<8:51:48,  9.54s/it]

Error with URL http://174.138.36.47/banks/ATB/: HTTPConnectionPool(host='174.138.36.47', port=80): Max retries exceeded with url: /banks/ATB/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7812f910a110>: Failed to establish a new connection: [Errno 111] Connection refused'))


Processing URLs:  44%|████▍     | 2661/6006 [2:34:58<15:07:06, 16.27s/it]

Error with URL http://mjgh.hyperphp.com/az: HTTPConnectionPool(host='mjgh.hyperphp.com', port=80): Max retries exceeded with url: /az (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  44%|████▍     | 2669/6006 [2:37:06<35:05:41, 37.86s/it]

Error with URL http://medical-dictionary.thefreedictionary.com/latency+stage: HTTPConnectionPool(host='medical-dictionary.thefreedictionary.com', port=80): Max retries exceeded with url: /latency+stage (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f8573100>, 'Connection to medical-dictionary.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  44%|████▍     | 2671/6006 [2:37:39<26:22:47, 28.48s/it]

Error with URL http://paypal.com.0.confirmation.account-security.7741d16fef9571be97716a958700fe4d7741d16fef9571be97716a958700f.3233.privado.info/: HTTPConnectionPool(host='paypal.com.0.confirmation.account-security.7741d16fef9571be97716a958700fe4d7741d16fef9571be97716a958700f.3233.privado.info', port=80): Max retries exceeded with url: / (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  44%|████▍     | 2672/6006 [2:37:41<18:46:07, 20.27s/it]

URL http://designation.io/blog/digital-and-graphic-designer-salary/ returned status code 403


Processing URLs:  45%|████▍     | 2673/6006 [2:37:43<13:45:04, 14.85s/it]

URL https://www.iskysoft.com/phone-transfer/top-5-android-device-manager-for-mac.html returned status code 404


Processing URLs:  45%|████▍     | 2680/6006 [2:37:46<1:30:19,  1.63s/it]

URL http://www.payscale.com/research/US/Job=Computed_Tomography_(CT)_Technologist/Hourly_Rate returned status code 403


Processing URLs:  45%|████▍     | 2686/6006 [2:37:52<49:07,  1.13it/s]  

URL http://www.payscale.com/research/US/Job=MRI_Technologist/Hourly_Rate returned status code 403


Processing URLs:  45%|████▍     | 2691/6006 [2:38:37<10:25:40, 11.32s/it]

Error with URL https://www.foxwoods.com/shows: HTTPSConnectionPool(host='www.foxwoods.com', port=443): Max retries exceeded with url: /shows (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  45%|████▌     | 2704/6006 [2:40:46<33:33:58, 36.60s/it]

Error with URL http://www.thefreedictionary.com/image+scanner: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /image+scanner (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9108a60>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  45%|████▌     | 2705/6006 [2:40:46<23:44:05, 25.88s/it]

URL http://www.explainthatstuff.com/cdplayers.html returned status code 403


Processing URLs:  45%|████▌     | 2716/6006 [2:41:00<1:12:07,  1.32s/it]

URL http://www.consumerreports.org/cro/digital-cameras/buying-guide returned status code 403


Processing URLs:  45%|████▌     | 2717/6006 [2:41:00<54:43,  1.00it/s]  

URL https://www.newegg.com/Product/Product.aspx?Item=N82E16833588001 returned status code 403


Processing URLs:  45%|████▌     | 2724/6006 [2:41:02<21:49,  2.51it/s]

URL https://vzrew.creatorlink.net/ returned status code 403


Processing URLs:  45%|████▌     | 2730/6006 [2:41:08<37:28,  1.46it/s]

URL https://www.lparetail.com/ returned status code 403


Processing URLs:  46%|████▌     | 2753/6006 [2:43:23<33:03:37, 36.59s/it]

Error with URL http://www.thefreedictionary.com/firewall: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /firewall (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f91c3220>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  46%|████▌     | 2759/6006 [2:44:11<15:36:25, 17.30s/it]

Error with URL http://gg.gg/PrimeroTuSalud: HTTPConnectionPool(host='gg.gg', port=80): Max retries exceeded with url: /PrimeroTuSalud (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f83b6470>: Failed to resolve 'gg.gg' ([Errno -2] Name or service not known)"))


Processing URLs:  46%|████▋     | 2787/6006 [2:45:06<7:51:23,  8.79s/it]

Error with URL http://www.theuniversitystore.in/sheuniversit/dhlexpress/autofil/id.php: HTTPConnectionPool(host='www.theuniversitystore.in', port=80): Max retries exceeded with url: /sheuniversit/dhlexpress/autofil/id.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f8ebc4f0>: Failed to resolve 'www.theuniversitystore.in' ([Errno -2] Name or service not known)"))


Processing URLs:  46%|████▋     | 2789/6006 [2:45:07<4:14:54,  4.75s/it]

URL http://www.iranhiv.com/ returned status code 403


Processing URLs:  47%|████▋     | 2796/6006 [2:45:40<6:14:19,  7.00s/it]

Error with URL http://heppler.ch.net2care.com/https//admin.hostpoint.ch/a7759c7e6734e26f4b78cf6526eb95cf/: HTTPConnectionPool(host='heppler.ch.net2care.com', port=80): Max retries exceeded with url: /https//admin.hostpoint.ch/a7759c7e6734e26f4b78cf6526eb95cf/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9d3bcd0>: Failed to resolve 'heppler.ch.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  47%|████▋     | 2805/6006 [2:45:51<1:19:34,  1.49s/it]

URL http://www.santamariadelmar.es/ returned status code 403


Processing URLs:  47%|████▋     | 2811/6006 [2:45:57<51:55,  1.03it/s]  

URL http://app.dialoginsight.com/T/OFC4/L2S/3888/B2685608/XRKO/737015/27934048/Ehc0TG/1/1627685/VdR2nPao/I/749903/y3ULZ4.html returned status code 202


Processing URLs:  47%|████▋     | 2821/6006 [2:46:02<22:22,  2.37it/s]

URL https://www.radiantlogic.com/ returned status code 403


Processing URLs:  48%|████▊     | 2854/6006 [2:47:17<8:15:23,  9.43s/it]

Error with URL http://likss-updat-schb.demopage.co/: HTTPConnectionPool(host='likss-updat-schb.demopage.co', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f838e650>: Failed to resolve 'likss-updat-schb.demopage.co' ([Errno -2] Name or service not known)"))


Processing URLs:  48%|████▊     | 2870/6006 [2:47:29<26:15,  1.99it/s]

URL http://www.foodnetwork.com/recipes/korean-pork-chops-with-gochujang-marinade returned status code 403


Processing URLs:  48%|████▊     | 2888/6006 [2:47:42<28:24,  1.83it/s]

URL http://www.revistaclipa.com/wp-includes/js/message/ returned status code 403


Processing URLs:  48%|████▊     | 2896/6006 [2:48:19<8:39:10, 10.02s/it]

Error with URL http://sonofabridge.com.net2care.com/https//admin.hostpoint.ch: HTTPConnectionPool(host='sonofabridge.com.net2care.com', port=80): Max retries exceeded with url: /https//admin.hostpoint.ch (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa13fe20>: Failed to resolve 'sonofabridge.com.net2care.com' ([Errno -2] Name or service not known)"))




Error with URL http://www.jreg99.com/vendor/phpunit/phpunit/src/util/php/captcha/dashboard/2.php?ad8cb174fbc3fdf20e146ca78f3e5036-ad8cb174fbc3fdf20e146ca78f3e5036-ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036: HTTPConnectionPool(host='www.jreg99.com', port=80): Max retries exceeded with url: /vendor/phpunit/phpunit/src/util/php/captcha/dashboard/2.php?ad8cb174fbc3fdf20e146ca78f3e5036-ad8cb174fbc3fdf20e146ca78f3e5036-ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036ad8cb174fbc3fdf20e146ca78f3e5036 (Caused by NewConnectionError('<urllib3.connecti

Processing URLs:  48%|████▊     | 2906/6006 [2:49:44<11:58:36, 13.91s/it]

Error with URL http://hostpoint.ch.0f79025d.net2care.com/ar2: HTTPConnectionPool(host='hostpoint.ch.0f79025d.net2care.com', port=80): Max retries exceeded with url: /ar2 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa0fb370>: Failed to resolve 'hostpoint.ch.0f79025d.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  48%|████▊     | 2911/6006 [2:50:20<11:08:26, 12.96s/it]

Error with URL http://www.eawibp.org/themes/ow/auth/logon.aspx: HTTPConnectionPool(host='www.eawibp.org', port=80): Max retries exceeded with url: /themes/ow/auth/logon.aspx (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))


Processing URLs:  49%|████▊     | 2920/6006 [2:50:27<59:17,  1.15s/it]  

URL http://www.biglittlegeek.com/best-antivirus-apps-for-iphone-ipad/ returned status code 403


Processing URLs:  49%|████▊     | 2921/6006 [2:50:28<45:38,  1.13it/s]

URL https://www.bryanhansel.com/ returned status code 406


Processing URLs:  49%|████▉     | 2944/6006 [2:51:19<5:53:22,  6.92s/it]

Error with URL https://www.instagram.com/imdbpro/: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /accounts/login/?next=https%3A%2F%2Fwww.instagram.com%2Fimdbpro%2F&is_from_rle (Caused by ResponseError('too many 429 error responses'))
URL http://cookieandkate.com/2015/thai-red-curry-recipe/ returned status code 403


Processing URLs:  49%|████▉     | 2950/6006 [2:51:54<8:49:43, 10.40s/it]

Error with URL http://brighant.com/1122/?sec=Hendrik: HTTPConnectionPool(host='brighant.com', port=80): Max retries exceeded with url: /1122/?sec=Hendrik (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f951e5f0>: Failed to resolve 'brighant.com' ([Errno -2] Name or service not known)"))


Processing URLs:  49%|████▉     | 2951/6006 [2:51:55<6:18:35,  7.44s/it]

URL https://www.google.com/url?q=https://boobs-tits.com/wp-includes/fonts/ba/OneDrive_Solo/&amp;source=gmail&amp;ust=1587210002367000&amp;usg=AFQjCNECu6XKMm7Kzz63zfwqutEGvNACLg returned status code 403


Processing URLs:  49%|████▉     | 2954/6006 [2:52:27<10:19:09, 12.17s/it]

Error with URL http://www.tutorialscollection.com/javascript-switch-how-to-use-javascript-switch-case-with-examples/: HTTPConnectionPool(host='www.tutorialscollection.com', port=80): Max retries exceeded with url: /javascript-switch-how-to-use-javascript-switch-case-with-examples/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f838f100>: Failed to resolve 'www.tutorialscollection.com' ([Errno -5] No address associated with hostname)"))


Processing URLs:  49%|████▉     | 2957/6006 [2:52:33<4:32:26,  5.36s/it]

Error with URL http://www.tutorialspoint.com/dbms/: Exceeded 30 redirects.
URL https://www.filedeo.com/ returned status code 403


Processing URLs:  49%|████▉     | 2969/6006 [2:54:49<31:12:55, 37.00s/it]

Error with URL http://acronyms.thefreedictionary.com/Transaction+Account+Guarantee+Program: HTTPConnectionPool(host='acronyms.thefreedictionary.com', port=80): Max retries exceeded with url: /Transaction+Account+Guarantee+Program (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f83b5240>, 'Connection to acronyms.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  50%|████▉     | 2974/6006 [2:54:54<5:44:19,  6.81s/it]

URL http://houseoftiresbcs.com/Adobe/css/XML/PDF/georgetrent2941.html returned status code 406


Processing URLs:  50%|████▉     | 2979/6006 [2:55:30<9:04:25, 10.79s/it]

Error with URL http://www.irfantrading.com/Style/logi/userverify/loginspf/login.html: HTTPConnectionPool(host='www.irfantrading.com', port=80): Max retries exceeded with url: /Style/logi/userverify/loginspf/login.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa288fd0>: Failed to resolve 'www.irfantrading.com' ([Errno -2] Name or service not known)"))


Processing URLs:  50%|████▉     | 2988/6006 [2:56:09<6:49:21,  8.14s/it]

Error with URL http://www.mp3juices.cc/: HTTPConnectionPool(host='www.mp3juices.cc', port=80): Max retries exceeded with url: / (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  50%|█████     | 3014/6006 [2:58:30<30:41:10, 36.92s/it]

Error with URL http://legal-dictionary.thefreedictionary.com/Incorporation+Doctrine: HTTPConnectionPool(host='legal-dictionary.thefreedictionary.com', port=80): Max retries exceeded with url: /Incorporation+Doctrine (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa23c5b0>, 'Connection to legal-dictionary.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  50%|█████     | 3021/6006 [2:58:40<3:14:22,  3.91s/it]

URL https://perluna-detyam.com.ua/ returned status code 403


Processing URLs:  50%|█████     | 3029/6006 [3:00:46<30:21:19, 36.71s/it]

Error with URL https://www.acronymfinder.com/Transaction-Account-Guarantee-(TAG).html: HTTPSConnectionPool(host='www.acronymfinder.com', port=443): Max retries exceeded with url: /Transaction-Account-Guarantee-(TAG).html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7812f91c2920>, 'Connection to www.acronymfinder.com timed out. (connect timeout=15)'))


Processing URLs:  51%|█████     | 3040/6006 [3:01:26<8:36:00, 10.44s/it]

Error with URL https://www.instagram.com/unicard.us: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /accounts/login/?next=https%3A%2F%2Fwww.instagram.com%2Funicard.us%2F&is_from_rle (Caused by ResponseError('too many 429 error responses'))


Processing URLs:  51%|█████     | 3054/6006 [3:01:52<1:38:23,  2.00s/it]

URL http://indianhealthyrecipes.com/recipes/veg-curry/ returned status code 403


Processing URLs:  51%|█████     | 3057/6006 [3:01:54<52:20,  1.06s/it]  

URL http://www.fuhyo-bengoshicafe.com/ returned status code 403


Processing URLs:  51%|█████▏    | 3084/6006 [3:02:54<8:41:53, 10.72s/it]

Error with URL http://projonmowave.com/wp-admin/includes/HJK/VN/WellsFargo/WellsFargo/wells/wells.htm: HTTPConnectionPool(host='projonmowave.com', port=80): Max retries exceeded with url: /wp-admin/includes/HJK/VN/WellsFargo/WellsFargo/wells/wells.htm (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9069420>: Failed to resolve 'projonmowave.com' ([Errno -2] Name or service not known)"))


Processing URLs:  51%|█████▏    | 3089/6006 [3:02:58<1:55:15,  2.37s/it]

URL https://www.google.com/url?hl=en&amp;q=http://cmffltd.com/wp-zbxb/zbxbx/mobile/exx/Account/index.php?email%3D%5B%5B-Email-%5D%5D&amp;source=gmail&amp;ust=1523355231217000&amp;usg=AFQjCNFBC5DLonStz7Nxpw6Hp_V9soeAPQ returned status code 403


Processing URLs:  51%|█████▏    | 3091/6006 [3:03:00<1:20:13,  1.65s/it]

URL https://andresjorge.com.mx/wp-content/themes/seminario/swiss-post%20-tracking/manage/?view=login&appIdKey=fcd00c0656cc490&country returned status code 404




Error with URL http://www.thefreedictionary.com/collision: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /collision (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9fd5840>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  52%|█████▏    | 3121/6006 [3:05:58<27:50:45, 34.75s/it]

Error with URL http://www.beedictionary.com/definition/image_scanner: HTTPSConnectionPool(host='www.beedictionary.com', port=443): Max retries exceeded with url: /definition/image_scanner (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)')))


Processing URLs:  52%|█████▏    | 3129/6006 [3:06:02<1:55:27,  2.41s/it]

URL http://www.techrepublic.com/article/node-red/ returned status code 406


Processing URLs:  52%|█████▏    | 3131/6006 [3:06:50<12:15:54, 15.36s/it]

Error with URL http://www.jreg99.com/vendor/phpunit/phpunit/src/util/php/captcha/dashboard/INFO.php?fc7463aec7263479e0eceecba6356478-fc7463aec7263479e0eceecba6356478-fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478: HTTPConnectionPool(host='www.jreg99.com', port=80): Max retries exceeded with url: /vendor/phpunit/phpunit/src/util/php/captcha/dashboard/INFO.php?fc7463aec7263479e0eceecba6356478-fc7463aec7263479e0eceecba6356478-fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478fc7463aec7263479e0eceecba6356478 (Caused by NewConnectionError('<urllib3.co

Processing URLs:  52%|█████▏    | 3134/6006 [3:07:21<9:23:21, 11.77s/it] 

Error with URL http://174.138.36.47/banks/ATB/last.html: HTTPConnectionPool(host='174.138.36.47', port=80): Max retries exceeded with url: /banks/ATB/last.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7812fa2c0d90>: Failed to establish a new connection: [Errno 111] Connection refused'))


Processing URLs:  52%|█████▏    | 3137/6006 [3:07:22<3:24:20,  4.27s/it]

URL http://regiscoyne.com/368762BDB30FAB1003AAB48B3362C445/ returned status code 404


Processing URLs:  52%|█████▏    | 3140/6006 [3:07:53<8:29:19, 10.66s/it]

Error with URL https://www.mkup.com.mx/: HTTPSConnectionPool(host='www.mkup.com.mx', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7812f9d0e7a0>: Failed to resolve 'www.mkup.com.mx' ([Errno -2] Name or service not known)"))


Processing URLs:  53%|█████▎    | 3157/6006 [3:08:48<10:19:13, 13.04s/it]

Error with URL http://images2.imgbox.com/cf/29/yD1aZyU5_o.png: HTTPConnectionPool(host='images2.imgbox.com', port=80): Read timed out.


Processing URLs:  53%|█████▎    | 3167/6006 [3:08:58<56:35,  1.20s/it]  

URL http://exclusive.mk.ua returned status code 403


Processing URLs:  53%|█████▎    | 3177/6006 [3:11:06<29:00:55, 36.92s/it]

Error with URL http://encyclopedia2.thefreedictionary.com/help+desk+analyst: HTTPConnectionPool(host='encyclopedia2.thefreedictionary.com', port=80): Max retries exceeded with url: /help+desk+analyst (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f8571b70>, 'Connection to encyclopedia2.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  53%|█████▎    | 3185/6006 [3:11:15<2:09:58,  2.76s/it]

URL https://www.newegg.com/Memory/Category/ID-17 returned status code 403


Processing URLs:  53%|█████▎    | 3195/6006 [3:11:20<22:48,  2.05it/s]

URL https://www.francealzheimer.org/ returned status code 403


Processing URLs:  53%|█████▎    | 3202/6006 [3:11:27<29:54,  1.56it/s]

URL https://www.cvedetails.com/ returned status code 403


Processing URLs:  54%|█████▎    | 3216/6006 [3:11:38<30:47,  1.51it/s]

URL http://www.infosecwriters.com/text_resources/pdf/SKapoor_SessionHijacking.pdf returned status code 404


Processing URLs:  54%|█████▍    | 3256/6006 [3:13:08<5:27:38,  7.15s/it]

Error with URL http://www.solvusoft.com/en/update/drivers/processor/smsc/most-network-interface-controller-os8104a/sub-models/: HTTPSConnectionPool(host='www.solvusoft.com', port=443): Max retries exceeded with url: /en/update/drivers/processor/smsc/most-network-interface-controller-os8104a/sub-models/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))
URL https://www.cbinsights.com/ returned status code 403


Processing URLs:  54%|█████▍    | 3259/6006 [3:13:43<9:35:01, 12.56s/it]

Error with URL http://www.urdupdfbooks.com: HTTPConnectionPool(host='www.urdupdfbooks.com', port=80): Max retries exceeded with url: / (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  55%|█████▍    | 3280/6006 [3:14:04<16:44,  2.71it/s]

URL http://www.connectingthreads.com/batting/quilt_batting.html returned status code 403


Processing URLs:  55%|█████▍    | 3283/6006 [3:14:09<49:18,  1.09s/it]

URL http://verinet.dk/media/aufcu/red.php returned status code 555


Processing URLs:  55%|█████▍    | 3302/6006 [3:14:29<40:26,  1.11it/s]

URL http://www.megavideosdesexo.com/ returned status code 403


Processing URLs:  55%|█████▌    | 3310/6006 [3:14:36<37:21,  1.20it/s]

URL http://www.llewellyn.com/product.php?ean=9780738713373 returned status code 403


Processing URLs:  55%|█████▌    | 3317/6006 [3:14:41<28:49,  1.56it/s]

URL http://www.consumerreports.org/portable-air-conditioners/are-portable-air-conditioners-a-lot-of-hot-air/ returned status code 403


Processing URLs:  55%|█████▌    | 3318/6006 [3:14:42<28:14,  1.59it/s]

URL http://www.imdb.com/company/co0073417/ returned status code 403


Processing URLs:  55%|█████▌    | 3322/6006 [3:14:44<25:13,  1.77it/s]

URL http://www.ibtimes.com/australian-open-2017-prize-money-how-much-could-rafael-nadal-roger-federer-andy-2476598 returned status code 403


Processing URLs:  55%|█████▌    | 3325/6006 [3:15:14<5:24:29,  7.26s/it]

Error with URL http://kecmanijada.com/wp-includes/theme-compat/bbwyspmpge/action.php: HTTPConnectionPool(host='kecmanijada.com', port=80): Max retries exceeded with url: /wp-includes/theme-compat/bbwyspmpge/action.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9d0dc00>: Failed to resolve 'kecmanijada.com' ([Errno -2] Name or service not known)"))


Processing URLs:  55%|█████▌    | 3328/6006 [3:17:18<27:00:51, 36.31s/it]

Error with URL https://j.mp/39oczg7: HTTPConnectionPool(host='dizainforma.rsvpu.ru', port=80): Max retries exceeded with url: /wp-content/uploads/2019/05/njag/browsetag/m20_order_list.php/durzd/ncwzv/?drive=k11z20tban0z (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9d0f160>, 'Connection to dizainforma.rsvpu.ru timed out. (connect timeout=15)'))


Processing URLs:  55%|█████▌    | 3332/6006 [3:17:20<7:10:39,  9.66s/it] 

URL https://www.beatport.com/label/input-output-inc/4606 returned status code 403


Processing URLs:  56%|█████▌    | 3337/6006 [3:17:23<1:46:28,  2.39s/it]

URL https://www.anema-santorini.com/images/dropbox/page returned status code 406


Processing URLs:  56%|█████▌    | 3356/6006 [3:18:04<4:48:01,  6.52s/it]

Error with URL http://sloaneandhyde.com/imm/new2015/document.php: HTTPConnectionPool(host='sloaneandhyde.com', port=80): Max retries exceeded with url: /imm/new2015/document.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f838d270>: Failed to resolve 'sloaneandhyde.com' ([Errno -2] Name or service not known)"))
URL https://uk0qx.codesandbox.io/ returned status code 403


Processing URLs:  56%|█████▌    | 3363/6006 [3:18:41<7:51:16, 10.70s/it]

Error with URL http://sanjuandelsur.org/nicaragua-currency-and-banks/: HTTPConnectionPool(host='sanjuandelsur.org', port=80): Max retries exceeded with url: /nicaragua-currency-and-banks/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f951eda0>: Failed to resolve 'sanjuandelsur.org' ([Errno -2] Name or service not known)"))


Processing URLs:  56%|█████▌    | 3376/6006 [3:18:51<38:06,  1.15it/s]

URL http://www.iplocationfinder.com/mp3clan.com returned status code 403


Processing URLs:  56%|█████▋    | 3388/6006 [3:21:06<27:00:51, 37.15s/it]

Error with URL http://www.thefreedictionary.com/corpora: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /corpora (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa13f940>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  57%|█████▋    | 3399/6006 [3:21:51<5:50:34,  8.07s/it]

Error with URL http://security-mappl-information-account.piiquarry.com/nextstep.php: HTTPConnectionPool(host='security-mappl-information-account.piiquarry.com', port=80): Max retries exceeded with url: /nextstep.php (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))
URL http://www.soaprecipes101.com/homemade-soap-recipes/insecticidal-soap-recipe/ returned status code 404


Processing URLs:  57%|█████▋    | 3400/6006 [3:21:51<4:08:39,  5.73s/it]

URL http://www.payscale.com/research/US/Employer=Help_at_Home%2C_Inc./Hourly_Rate returned status code 403


Processing URLs:  57%|█████▋    | 3408/6006 [3:21:58<44:01,  1.02s/it]

URL http://www.payscale.com/research/US/Job=Help_Desk_Analyst/Hourly_Rate returned status code 403


Processing URLs:  57%|█████▋    | 3409/6006 [3:23:58<26:33:21, 36.81s/it]

Error with URL http://www.thefreedictionary.com/Savers: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /Savers (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f910bfa0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  57%|█████▋    | 3413/6006 [3:24:00<6:37:43,  9.20s/it]

URL http://www.solarwinds.com/topics/packet-analyzer returned status code 403


Processing URLs:  57%|█████▋    | 3416/6006 [3:24:03<2:35:39,  3.61s/it]

URL http://www.tablefortwoblog.com/ returned status code 403


Processing URLs:  57%|█████▋    | 3420/6006 [3:26:06<27:02:33, 37.65s/it]

Error with URL https://www.iranejra.com/: HTTPSConnectionPool(host='www.iranejra.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7812f85024d0>, 'Connection to www.iranejra.com timed out. (connect timeout=15)'))


Processing URLs:  57%|█████▋    | 3431/6006 [3:26:15<58:29,  1.36s/it]  

URL http://whatsapp-join.zyns.com/ returned status code 404


Processing URLs:  57%|█████▋    | 3442/6006 [3:26:51<6:49:48,  9.59s/it]

Error with URL https://www.ecomparemo.com/insurance/charter-ping-an-insurance: HTTPSConnectionPool(host='www.ecomparemo.com', port=443): Max retries exceeded with url: /insurance/charter-ping-an-insurance (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)')))


Processing URLs:  57%|█████▋    | 3448/6006 [3:26:55<1:15:17,  1.77s/it]

URL http://www.nhs.uk/Conditions/peripheralarterialdisease/Pages/Treatment.aspx returned status code 404


Processing URLs:  57%|█████▋    | 3449/6006 [3:28:56<26:34:18, 37.41s/it]

Error with URL http://www.thefreedictionary.com/resolution: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /resolution (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f82895a0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  57%|█████▋    | 3452/6006 [3:29:29<15:55:22, 22.44s/it]

Error with URL http://rencon.ch.net2care.com/https//admin.hostpoint.ch/024b107bf11224c40000: HTTPConnectionPool(host='rencon.ch.net2care.com', port=80): Max retries exceeded with url: /https//admin.hostpoint.ch/024b107bf11224c40000 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f8624490>: Failed to resolve 'rencon.ch.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  58%|█████▊    | 3461/6006 [3:29:35<1:06:41,  1.57s/it]

URL http://05.my03.com/login.php returned status code 404


Processing URLs:  58%|█████▊    | 3468/6006 [3:29:40<34:46,  1.22it/s]

URL http://www.facebook.com-marketplace-93839.mediaryte.co/ returned status code 403


Processing URLs:  58%|█████▊    | 3477/6006 [3:29:48<31:32,  1.34it/s]

URL http://www.shivji.in/ returned status code 403


Processing URLs:  58%|█████▊    | 3478/6006 [3:30:19<6:44:59,  9.61s/it]

Error with URL http://spontan.ch.net2care.com/https//admin.hostpoint.ch/bd2658fa4bbebca8b225419a25d152b0/: HTTPConnectionPool(host='spontan.ch.net2care.com', port=80): Max retries exceeded with url: /https//admin.hostpoint.ch/bd2658fa4bbebca8b225419a25d152b0/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9cb7ca0>: Failed to resolve 'spontan.ch.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  58%|█████▊    | 3486/6006 [3:30:26<44:38,  1.06s/it]  

URL https://www.visa.co.il/ returned status code 403


Processing URLs:  58%|█████▊    | 3496/6006 [3:32:35<25:35:42, 36.71s/it]

Error with URL http://www.thefreedictionary.com/copywriter: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /copywriter (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9d78cd0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  58%|█████▊    | 3501/6006 [3:34:41<29:51:53, 42.92s/it]

Error with URL http://www.thefreedictionary.com/insecticide: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /insecticide (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9c8e080>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  58%|█████▊    | 3505/6006 [3:35:16<13:59:25, 20.14s/it]

Error with URL http://174.138.36.47/banks/ATB: HTTPConnectionPool(host='174.138.36.47', port=80): Max retries exceeded with url: /banks/ATB (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7812f62166b0>: Failed to establish a new connection: [Errno 111] Connection refused'))


Processing URLs:  58%|█████▊    | 3508/6006 [3:37:17<29:56:22, 43.15s/it]

Error with URL http://www.thefreedictionary.com/gateway: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /gateway (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa0aab30>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  58%|█████▊    | 3513/6006 [3:37:19<5:09:35,  7.45s/it]

URL https://smsorangephonemail.myfreesites.net/ returned status code 403


Processing URLs:  59%|█████▊    | 3522/6006 [3:37:24<30:46,  1.35it/s]

URL http://www.colognehotel.net/ returned status code 410


Processing URLs:  59%|█████▉    | 3529/6006 [3:37:58<6:46:45,  9.85s/it]

Error with URL http://www.ldoceonline.com/search/?q=provenance: HTTPSConnectionPool(host='www.ldoceonline.com', port=443): Max retries exceeded with url: /search/?q=provenance (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  59%|█████▉    | 3539/6006 [3:40:10<25:22:01, 37.02s/it]

Error with URL http://51.255.64.58/: HTTPConnectionPool(host='51.255.64.58', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f856bb50>, 'Connection to 51.255.64.58 timed out. (connect timeout=15)'))


Processing URLs:  59%|█████▉    | 3546/6006 [3:40:14<2:25:22,  3.55s/it]

URL http://www.telegraph.co.uk/tennis/0/australian-open-2017-prize-money-much-will-players-earn-melbourne/ returned status code 403


Processing URLs:  59%|█████▉    | 3552/6006 [3:40:19<38:36,  1.06it/s]

URL http://www.imdb.com/title/tt0231035/ returned status code 403


Processing URLs:  59%|█████▉    | 3556/6006 [3:40:21<25:44,  1.59it/s]

URL https://aascu-my.sharepoint.com/:o:/g/personal/crawfordr_aascu_org/EsZr8X-bvWVDjWtqdo8Md1sBrZdwDjctHj-bz8aOv9NYLQ?e=0IOPqv returned status code 404


Processing URLs:  59%|█████▉    | 3560/6006 [3:40:25<36:19,  1.12it/s]

URL http://mail.neve-indoor.nl/~u43730p39181/pki-validation returned status code 404


Processing URLs:  59%|█████▉    | 3561/6006 [3:40:25<30:20,  1.34it/s]

URL https://hideuri.com/xdLz3V returned status code 404


Processing URLs:  59%|█████▉    | 3568/6006 [3:40:29<25:52,  1.57it/s]

URL https://baynhe.vn/ returned status code 403


Processing URLs:  59%|█████▉    | 3569/6006 [3:40:29<26:31,  1.53it/s]

URL https://www.ushl.com/ returned status code 404


Processing URLs:  60%|█████▉    | 3586/6006 [3:41:11<6:25:19,  9.55s/it]

Error with URL http://maconnerieamp.ca/Goldbook/auth/view/document/: HTTPConnectionPool(host='maconnerieamp.ca', port=80): Max retries exceeded with url: /Goldbook/auth/view/document/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f85007c0>: Failed to resolve 'maconnerieamp.ca' ([Errno -2] Name or service not known)"))


Processing URLs:  60%|█████▉    | 3594/6006 [3:41:15<38:14,  1.05it/s]

URL http://www.evasai.com/feeds/posts/default?orderby=updated returned status code 404


Processing URLs:  60%|██████    | 3610/6006 [3:41:31<41:47,  1.05s/it]

URL http://www.toancaupumps.com/BK/v.html returned status code 404


Processing URLs:  60%|██████    | 3612/6006 [3:41:38<1:17:01,  1.93s/it]

URL https://chulyonfr.creatorlink.net/ returned status code 403


Processing URLs:  60%|██████    | 3616/6006 [3:41:41<39:50,  1.00s/it]

URL https://www.mssqltips.com/sqlservertip/3683/new-features-in-sql-server-management-studio-for-sql-server-2016/ returned status code 403


Processing URLs:  60%|██████    | 3629/6006 [3:41:52<27:26,  1.44it/s]

URL http://www.heritagedaily.com/category/archaeology-news returned status code 403


Processing URLs:  60%|██████    | 3631/6006 [3:41:53<25:13,  1.57it/s]

URL http://www.payscale.com/research/US/Job=Software_Engineer/Salary returned status code 403


Processing URLs:  61%|██████    | 3643/6006 [3:42:33<6:36:27, 10.07s/it]

Error with URL http://preventionsystem2021.com.ve/cHUg/login.php: HTTPConnectionPool(host='preventionsystem2021.com.ve', port=80): Max retries exceeded with url: /cHUg/login.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa0fa860>: Failed to resolve 'preventionsystem2021.com.ve' ([Errno -2] Name or service not known)"))


Processing URLs:  61%|██████    | 3645/6006 [3:44:35<27:17:30, 41.61s/it]

Error with URL http://bonyad.blog.ir: HTTPConnectionPool(host='bonyad.blog.ir', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f83b7a00>, 'Connection to bonyad.blog.ir timed out. (connect timeout=15)'))


Processing URLs:  61%|██████    | 3656/6006 [3:44:45<1:02:23,  1.59s/it]

URL http://implan.com/ returned status code 403


Processing URLs:  61%|██████    | 3659/6006 [3:46:47<24:04:57, 36.94s/it]

Error with URL http://www.sakhtemanchi.com/: HTTPConnectionPool(host='www.sakhtemanchi.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812fa0f8460>, 'Connection to www.sakhtemanchi.com timed out. (connect timeout=15)'))


Processing URLs:  61%|██████    | 3664/6006 [3:47:57<17:21:07, 26.67s/it]

Error with URL http://www.samcool.org/90AB705610D8E3CD93C3E28B0C6BEFD0: HTTPConnectionPool(host='www.samcool.org', port=80): Max retries exceeded with url: /90AB705610D8E3CD93C3E28B0C6BEFD0 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f9f63eb0>: Failed to resolve 'www.samcool.org' ([Errno -2] Name or service not known)"))


Processing URLs:  61%|██████▏   | 3679/6006 [3:48:37<6:13:51,  9.64s/it]

Error with URL http://open-tube.com/top-5-open-source-uml-tools/: HTTPConnectionPool(host='open-tube.com', port=80): Max retries exceeded with url: /top-5-open-source-uml-tools/ (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))


Processing URLs:  62%|██████▏   | 3709/6006 [3:49:35<6:19:38,  9.92s/it]

Error with URL http://everything.explained.today/Internet_radio_device/: HTTPConnectionPool(host='everything.explained.today', port=80): Max retries exceeded with url: /Internet_radio_device/ (Caused by ResponseError('too many 503 error responses'))


Processing URLs:  62%|██████▏   | 3710/6006 [3:49:35<4:28:14,  7.01s/it]

URL http://www.invite-grub-whatsappsex-chat.2waky.com/ returned status code 404


Processing URLs:  62%|██████▏   | 3716/6006 [3:49:47<1:42:37,  2.69s/it]

URL http://www.dsubluehawks.com/ returned status code 403


Processing URLs:  62%|██████▏   | 3720/6006 [3:51:52<23:51:42, 37.58s/it]

Error with URL http://www.thefreedictionary.com/latency+stage: HTTPConnectionPool(host='www.thefreedictionary.com', port=80): Max retries exceeded with url: /latency+stage (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f60984f0>, 'Connection to www.thefreedictionary.com timed out. (connect timeout=15)'))


Processing URLs:  62%|██████▏   | 3725/6006 [3:53:57<27:24:48, 43.27s/it]

Error with URL http://www.tpasargad.ir/: HTTPConnectionPool(host='www.tpasargad.ir', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f8ebfcd0>, 'Connection to www.tpasargad.ir timed out. (connect timeout=15)'))


Processing URLs:  62%|██████▏   | 3729/6006 [3:54:02<7:01:34, 11.11s/it] 

URL http://www.iconcinemas.com/ returned status code 403


Processing URLs:  62%|██████▏   | 3732/6006 [3:54:03<2:36:40,  4.13s/it]

URL http://www.lastingredient.com/ returned status code 403


Processing URLs:  62%|██████▏   | 3737/6006 [3:54:40<6:44:22, 10.69s/it]

Error with URL http://raportcsr.allegro.secfence.lesavik.net: HTTPConnectionPool(host='raportcsr.allegro.secfence.lesavik.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812fa368e20>: Failed to resolve 'raportcsr.allegro.secfence.lesavik.net' ([Errno -2] Name or service not known)"))


Processing URLs:  62%|██████▏   | 3742/6006 [3:55:15<7:25:40, 11.81s/it]

Error with URL http://interoptika.hu/mail/: HTTPConnectionPool(host='interoptika.hu', port=80): Max retries exceeded with url: /mail/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing URLs:  62%|██████▏   | 3747/6006 [3:57:21<24:35:12, 39.18s/it]

Error with URL http://www.psarena.ir/: HTTPConnectionPool(host='www.psarena.ir', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f9eb6ef0>, 'Connection to www.psarena.ir timed out. (connect timeout=15)'))


Processing URLs:  63%|██████▎   | 3760/6006 [3:57:34<42:39,  1.14s/it]

URL https://www.forestessentialsindia.com/ returned status code 403


Processing URLs:  63%|██████▎   | 3768/6006 [3:58:11<4:19:41,  6.96s/it]

Error with URL http://www.getmefranchise.info/office20.php: HTTPConnectionPool(host='www.getmefranchise.info', port=80): Max retries exceeded with url: /office20.php (Caused by ResponseError('too many 500 error responses'))


Processing URLs:  63%|██████▎   | 3778/6006 [3:58:20<31:15,  1.19it/s]

URL http://www.imdb.com/title/tt3294732/ returned status code 403


Processing URLs:  63%|██████▎   | 3790/6006 [3:58:34<59:55,  1.62s/it]  

URL http://intelligence-informatique.fr.nf/SAMI/JAVA/ returned status code 404


Processing URLs:  63%|██████▎   | 3806/6006 [3:59:18<6:03:31,  9.91s/it]

Error with URL https://www.joserobles.com/: HTTPSConnectionPool(host='www.joserobles.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'www.joserobles.com'. (_ssl.c:1007)")))


Processing URLs:  63%|██████▎   | 3811/6006 [3:59:54<7:12:05, 11.81s/it]

Error with URL https://waikowhaiauto.co.nz/login/: HTTPSConnectionPool(host='waikowhaiauto.co.nz', port=443): Max retries exceeded with url: /login/ (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'waikowhaiauto.co.nz'. (_ssl.c:1007)")))


Processing URLs:  64%|██████▎   | 3821/6006 [4:00:07<44:32,  1.22s/it]  

URL https://www.google.com/url?q=https://rodriguesdemelo.adv.br/cli/aut/index.php?email%3D25&amp;source=gmail&amp;ust=1542750734646000&amp;usg=AFQjCNFkEg5_ic5zp-sXRkKZ1Wr8G97BAQ returned status code 403


Processing URLs:  64%|██████▍   | 3833/6006 [4:00:15<20:34,  1.76it/s]

URL http://fast-joingroup1.itsaol.com/ returned status code 404


Processing URLs:  64%|██████▍   | 3836/6006 [4:00:19<33:53,  1.07it/s]

URL https://www.dkoldies.com/ returned status code 403




Error with URL https://www.yves-rocher.ru: HTTPSConnectionPool(host='www.yves-rocher.ru', port=443): Max retries exceeded with url: / (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.yves-rocher.ru', port=443): Read timed out. (read timeout=15)"))


Processing URLs:  64%|██████▍   | 3847/6006 [4:02:41<19:03:16, 31.77s/it]

Error with URL http://hostpoint.ch.1200028f.net2care.com/ar2/: HTTPConnectionPool(host='hostpoint.ch.1200028f.net2care.com', port=80): Max retries exceeded with url: /ar2/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f86245b0>: Failed to resolve 'hostpoint.ch.1200028f.net2care.com' ([Errno -2] Name or service not known)"))


Processing URLs:  64%|██████▍   | 3860/6006 [4:02:48<32:03,  1.12it/s]

URL http://www.explainthatstuff.com/computermouse.html returned status code 403


Processing URLs:  64%|██████▍   | 3873/6006 [4:03:00<26:39,  1.33it/s]

URL https://www.latam.com/en_us/ returned status code 403


Processing URLs:  65%|██████▍   | 3881/6006 [4:03:39<5:52:42,  9.96s/it]

Error with URL https://www.instagram.com/graphicsfairy/: HTTPSConnectionPool(host='www.instagram.com', port=443): Max retries exceeded with url: /accounts/login/?next=https%3A%2F%2Fwww.instagram.com%2Fgraphicsfairy%2F&is_from_rle (Caused by ResponseError('too many 429 error responses'))


Processing URLs:  65%|██████▍   | 3888/6006 [4:03:43<43:25,  1.23s/it]

URL https://www.goodreads.com/search?query=0399157913 returned status code 403


Processing URLs:  65%|██████▍   | 3892/6006 [4:03:46<30:21,  1.16it/s]

URL https://www.ghacks.net/2011/08/08/microsoft-keyboard-layout-creator/ returned status code 403


Processing URLs:  65%|██████▌   | 3923/6006 [4:04:13<44:23,  1.28s/it]

URL http://www.dictionarist.com/tabloid returned status code 403


Processing URLs:  65%|██████▌   | 3929/6006 [4:06:20<21:28:17, 37.22s/it]

Error with URL http://physics.uctm.edu/ss/sites/default/files/styles/medium/pubilc/index.htm: HTTPConnectionPool(host='physics.uctm.edu', port=80): Max retries exceeded with url: /ss/sites/default/files/styles/medium/pubilc/index.htm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7812f86243a0>, 'Connection to physics.uctm.edu timed out. (connect timeout=15)'))


Processing URLs:  66%|██████▌   | 3938/6006 [4:07:00<6:26:28, 11.21s/it]

Error with URL https://www.eayso.org/: HTTPSConnectionPool(host='www.eayso.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:1007)')))


Processing URLs:  66%|██████▌   | 3940/6006 [4:07:31<8:28:35, 14.77s/it]

Error with URL http://cmffunding.com/wp-admin/includes/AT&T/Att.Yahoo/confirmation.html: HTTPConnectionPool(host='cmffunding.com', port=80): Max retries exceeded with url: /wp-admin/includes/AT&T/Att.Yahoo/confirmation.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7812f933e650>: Failed to resolve 'cmffunding.com' ([Errno -2] Name or service not known)"))


Processing URLs:  66%|██████▌   | 3946/6006 [4:07:35<1:19:03,  2.30s/it]