In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np

# Function to extract features from the URL
def extract_features_from_url(url):
    features = {
        'url_length': 0,
        'has_suspicious_keyword': 0,
        'is_numeric_domain': 0,
        'has_suspicious_tld': 0,
        'has_https': 0,
        'has_javascript': 0,
        'has_hidden_elements': 0,
        'has_iframe': 0,
        'number_of_links': 0,
        'number_of_input_fields': 0,
        'has_login_form': 0
    }

    # Suspicious keywords and TLDs (can be expanded)
    suspicious_keywords = ['login', 'signin', 'account', 'verify', 'secure']
    suspicious_tlds = ['.xyz', '.top', '.club']

    try:
        # 1. URL-based features
        features['url_length'] = len(url)
        features['has_suspicious_keyword'] = int(any(keyword in url for keyword in suspicious_keywords))
        domain = url.split('//')[1].split('/')[0]
        features['is_numeric_domain'] = int(any(char.isdigit() for char in domain))
        features['has_suspicious_tld'] = int(any(url.endswith(tld) for tld in suspicious_tlds))
        features['has_https'] = int(url.startswith('https'))

        # 2. HTML content-based features
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        features['has_javascript'] = int('javascript' in str(soup))
        features['has_hidden_elements'] = int('display:none' in str(soup))
        features['has_iframe'] = int('<iframe' in str(soup))
        features['number_of_links'] = len(soup.find_all('a'))
        features['number_of_input_fields'] = len(soup.find_all('input'))
        features['has_login_form'] = int('login' in str(soup).lower())

    except Exception as e:
        # In case of error fetching the URL
        print(f"Error fetching {url}: {e}")

    return features

# Function to extract features for a range of rows in the dataset
def extract_features_for_range(df, start_idx, end_idx, output_file):
    # Initialize the list to store extracted features
    extracted_data = []

    # Loop through the dataset in the given range
    for idx in tqdm(range(start_idx, end_idx), desc="Extracting Features", unit="URLs"):
        url = df.iloc[idx]['URL']  # Get the URL from the dataset
        features = extract_features_from_url(url)  # Extract features from the URL
        features['URL'] = url  # Add the URL itself to the features
        extracted_data.append(features)  # Append to the list

    # Convert the list to a DataFrame
    features_df = pd.DataFrame(extracted_data)

    # Save the result to a new CSV file
    features_df.to_csv(output_file, index=False)
    print(f"Feature extraction completed. The results are saved in {output_file}")

# Load the original dataset (adjust the file name as needed)
input_file = "combined_data_randomized.csv"  # Replace with your dataset file path
df = pd.read_csv(input_file)

# Ask the user to input the starting and ending record indices
start_idx = int(input("Enter the starting record index: "))
end_idx = int(input("Enter the ending record index: "))

# Define the output file name where the results will be saved
output_file = "features_extracted.csv"  # Replace with the desired output file name

# Extract features for the specified range of records
extract_features_for_range(df, start_idx, end_idx, output_file)


Enter the starting record index: 1
Enter the ending record index: 500


Extracting Features:   3%|▎         | 13/499 [00:10<05:39,  1.43URLs/s]

Error fetching http://www.macsarerd.n9uv92.icu/page1.php: HTTPConnectionPool(host='www.macsarerd.n9uv92.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6875030>: Failed to resolve 'www.macsarerd.n9uv92.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macesaoeod.8p0sos.icu/page1.php: HTTPConnectionPool(host='www.macesaoeod.8p0sos.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6874fd0>: Failed to resolve 'www.macesaoeod.8p0sos.icu' ([Errno -2] Name or service not known)"))
Error fetching https://uniswapreward.org/: HTTPSConnectionPool(host='uniswapreward.org', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ff7c663e980>: Failed to resolve 'uniswapreward.org' ([Errno -2] Name or service not known)"))


Extracting Features:   4%|▍         | 19/499 [00:15<07:19,  1.09URLs/s]

Error fetching http://www.macesarrod.ciiwd0.icu/page1.php: HTTPConnectionPool(host='www.macesarrod.ciiwd0.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7df3280>: Failed to resolve 'www.macesarrod.ciiwd0.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   4%|▍         | 21/499 [00:16<05:52,  1.36URLs/s]

Error fetching http://www.vivcsiivcaia.vicsvesai.lphv98.icu/page1.php: HTTPConnectionPool(host='www.vivcsiivcaia.vicsvesai.lphv98.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7d21a80>: Failed to resolve 'www.vivcsiivcaia.vicsvesai.lphv98.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aescceeesaas.txgkcn.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aescceeesaas.txgkcn.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e64550>: Failed to resolve 'www.eki-net.con-aescceeesaas.txgkcn.top' ([Errno -2] Name or service not known)"))


Extracting Features:   5%|▌         | 25/499 [00:18<05:16,  1.50URLs/s]

Error fetching http://www.vivcsviveai.vicsvesai.09vz0h.icu/page1.php: HTTPConnectionPool(host='www.vivcsviveai.vicsvesai.09vz0h.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e64cd0>: Failed to resolve 'www.vivcsviveai.vicsvesai.09vz0h.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccoy.aeeseaccaocmeoy.zjynwc.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccoy.aeeseaccaocmeoy.zjynwc.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6934070>: Failed to resolve 'www.au-paccoy.aeeseaccaocmeoy.zjynwc.top' ([Errno -2] Name or service not known)"))


Extracting Features:   6%|▋         | 32/499 [00:19<02:14,  3.47URLs/s]

Error fetching http://www.macsaeord.w5p6z9.icu/page1.php: HTTPConnectionPool(host='www.macsaeord.w5p6z9.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63d9a20>: Failed to resolve 'www.macsaeord.w5p6z9.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccy.aceeseaoraocmeoy.zeekfn.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aceeseaoraocmeoy.zeekfn.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63db400>: Failed to resolve 'www.au-poccy.aceeseaoraocmeoy.zeekfn.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccy.aceeseaeraocmeoy.otuxec.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aceeseaeraocmeoy.otuxec.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConn

Extracting Features:   8%|▊         | 39/499 [00:22<02:17,  3.35URLs/s]

Error fetching http://www.maceseoeod.1bmro5.icu/page1.php: HTTPConnectionPool(host='www.maceseoeod.1bmro5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c597f190>: Failed to resolve 'www.maceseoeod.1bmro5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   8%|▊         | 41/499 [00:25<06:32,  1.17URLs/s]

Error fetching http://www.macesarrod.f6553s.icu/page1.php: HTTPConnectionPool(host='www.macesarrod.f6553s.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6183580>: Failed to resolve 'www.macesarrod.f6553s.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeseaceaocmeoy.mlufzb.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseaceaocmeoy.mlufzb.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c61835e0>: Failed to resolve 'www.au-poccay.aeeseaceaocmeoy.mlufzb.top' ([Errno -2] Name or service not known)"))


Extracting Features:   9%|▉         | 44/499 [00:26<04:02,  1.88URLs/s]

Error fetching http://www.vivcsviaaaiecc.vicsvesai.gh43qf.icu/page1.php: HTTPConnectionPool(host='www.vivcsviaaaiecc.vicsvesai.gh43qf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c597f4f0>: Failed to resolve 'www.vivcsviaaaiecc.vicsvesai.gh43qf.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   9%|▉         | 46/499 [00:26<03:07,  2.42URLs/s]

Error fetching http://www.macesareod.zobca6.icu/page1.php: HTTPConnectionPool(host='www.macesareod.zobca6.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5adb700>: Failed to resolve 'www.macesareod.zobca6.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviveai.vicsvesai.bwlrdn.icu/page1.php: HTTPConnectionPool(host='www.vivcsviveai.vicsvesai.bwlrdn.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a41cc0>: Failed to resolve 'www.vivcsviveai.vicsvesai.bwlrdn.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsisvcai.vicsvesai.lmc2ry.icu/page1.php: HTTPConnectionPool(host='www.vivcsisvcai.vicsvesai.lmc2ry.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63d81

Extracting Features:  10%|█         | 50/499 [00:26<01:51,  4.04URLs/s]

Error fetching http://www.au-paccey.aeeauasoaocmeoy.axnnut.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeauasoaocmeoy.axnnut.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a42e60>: Failed to resolve 'www.au-paccey.aeeauasoaocmeoy.axnnut.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccey.aeeauasoaocmeoy.jfwjev.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeauasoaocmeoy.jfwjev.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a42110>: Failed to resolve 'www.au-paccey.aeeauasoaocmeoy.jfwjev.top' ([Errno -2] Name or service not known)"))


Extracting Features:  12%|█▏        | 58/499 [00:28<01:41,  4.36URLs/s]

Error fetching http://www.vivcsiiscaias.vicsvesai.n9uv92.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiscaias.vicsvesai.n9uv92.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a43fd0>: Failed to resolve 'www.vivcsiiscaias.vicsvesai.n9uv92.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviavaiec.vicsvesai.kesgc9.icu/page1.php: HTTPConnectionPool(host='www.vivcsviavaiec.vicsvesai.kesgc9.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6ad9960>: Failed to resolve 'www.vivcsviavaiec.vicsvesai.kesgc9.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccey.aeeseasoaocmeoy.shfxjv.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeseasoaocmeoy.shfxjv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError

Extracting Features:  12%|█▏        | 62/499 [00:31<02:54,  2.51URLs/s]

Error fetching http://top100.ru: HTTPConnectionPool(host='top100.ru', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff7c6797d90>: Failed to establish a new connection: [Errno 111] Connection refused'))
Error fetching http://www.vivcsviiaaieca.vicsvesai.n9k5yv.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiaaieca.vicsvesai.n9k5yv.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60f6230>: Failed to resolve 'www.vivcsviiaaieca.vicsvesai.n9k5yv.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  13%|█▎        | 64/499 [00:32<03:09,  2.29URLs/s]

Error fetching http://www.au-pcccny.aueseacaomceoy.wxtxex.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aueseacaomceoy.wxtxex.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60f6800>: Failed to resolve 'www.au-pcccny.aueseacaomceoy.wxtxex.top' ([Errno -2] Name or service not known)"))


Extracting Features:  13%|█▎        | 66/499 [02:44<2:42:28, 22.51s/URLs]

Error fetching http://vgtrk.com: HTTPConnectionPool(host='vgtrk.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ff7c65d4c10>, 'Connection to vgtrk.com timed out. (connect timeout=None)'))


Extracting Features:  13%|█▎        | 67/499 [02:44<2:10:58, 18.19s/URLs]

Error fetching http://www.au-paccny.aeeauaceaocmeoy.aoxxdp.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccny.aeeauaceaocmeoy.aoxxdp.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c65d55a0>: Failed to resolve 'www.au-paccny.aeeauaceaocmeoy.aoxxdp.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccy.aceeseacraocmeoy.flfoew.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aceeseacraocmeoy.flfoew.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60f6440>: Failed to resolve 'www.au-poccy.aceeseacraocmeoy.flfoew.top' ([Errno -2] Name or service not known)"))


Extracting Features:  15%|█▍        | 73/499 [02:47<31:48,  4.48s/URLs]

Error fetching http://www.au-pacccy.aeeseacoaocmeoy.kplwvi.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aeeseacoaocmeoy.kplwvi.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66101f0>: Failed to resolve 'www.au-pacccy.aeeseacoaocmeoy.kplwvi.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macesarrod.c23ios.icu/page1.php: HTTPConnectionPool(host='www.macesarrod.c23ios.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7df38e0>: Failed to resolve 'www.macesarrod.c23ios.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  16%|█▌        | 81/499 [02:53<08:11,  1.18s/URLs]

Error fetching http://www.ekl-net.com-asccecceaas.zttpjx.top/jp.php: HTTPConnectionPool(host='www.ekl-net.com-asccecceaas.zttpjx.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60b5840>: Failed to resolve 'www.ekl-net.com-asccecceaas.zttpjx.top' ([Errno -2] Name or service not known)"))


Extracting Features:  17%|█▋        | 85/499 [02:55<04:41,  1.47URLs/s]

Error fetching http://www.au-poccay.aeeauaccaocmeoy.vrxppt.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaccaocmeoy.vrxppt.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6838340>: Failed to resolve 'www.au-poccay.aeeauaccaocmeoy.vrxppt.top' ([Errno -2] Name or service not known)"))


Extracting Features:  18%|█▊        | 90/499 [03:01<08:40,  1.27s/URLs]

Error fetching http://www.macsaeerd.n8zehe.icu/page1.php: HTTPConnectionPool(host='www.macsaeerd.n8zehe.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7b0b790>: Failed to resolve 'www.macsaeerd.n8zehe.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  19%|█▉        | 97/499 [03:02<02:12,  3.04URLs/s]

Error fetching http://www.vivcsviacveaiao.visvsai.7v560g.icu/page1.php: HTTPConnectionPool(host='www.vivcsviacveaiao.visvsai.7v560g.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e9fd30>: Failed to resolve 'www.vivcsviacveaiao.visvsai.7v560g.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiaaieca.vicsvesai.c23ios.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiaaieca.vicsvesai.c23ios.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e9e650>: Failed to resolve 'www.vivcsviiaaieca.vicsvesai.c23ios.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiacvaieca.visvsai.ruw0bh.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.ruw0bh.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<ur

Extracting Features:  21%|██        | 104/499 [03:08<03:08,  2.10URLs/s]

Error fetching http://www.ekl-net.com-asccecceaas.ucwxyw.top/jp.php: HTTPConnectionPool(host='www.ekl-net.com-asccecceaas.ucwxyw.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5a82e30>: Failed to resolve 'www.ekl-net.com-asccecceaas.ucwxyw.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccy.aeueseacaomceoy.pvnoai.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aeueseacaomceoy.pvnoai.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5a833d0>: Failed to resolve 'www.au-poccy.aeueseacaomceoy.pvnoai.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pcccny.aeeseasaaocmeoy.qeptvv.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseasaaocmeoy.qeptvv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionEr

Extracting Features:  22%|██▏       | 110/499 [03:09<01:44,  3.72URLs/s]

Error fetching http://www.vivcsiiacvcaiae.visvsai.9qwfvb.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiacvcaiae.visvsai.9qwfvb.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c64b6c50>: Failed to resolve 'www.vivcsiiacvcaiae.visvsai.9qwfvb.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  22%|██▏       | 111/499 [05:19<2:56:12, 27.25s/URLs]

Error fetching https://www.idavivienda.de/login.php?app-token=a9i6j3b4g8270d%201%20ce5fh3%20N6b7D%20B%20%20Srw285wMuTZHESGIKApoXg9PmJ1%20OYgFLk4Cfa%20%20a23246985320: HTTPSConnectionPool(host='www.idavivienda.de', port=443): Max retries exceeded with url: /login.php?app-token=a9i6j3b4g8270d%201%20ce5fh3%20N6b7D%20B%20%20Srw285wMuTZHESGIKApoXg9PmJ1%20OYgFLk4Cfa%20%20a23246985320 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7ff7c5ff4d30>, 'Connection to www.idavivienda.de timed out. (connect timeout=None)'))


Extracting Features:  23%|██▎       | 114/499 [05:21<1:23:06, 12.95s/URLs]

Error fetching http://www.au-paccy.aceeseaeraocmeoy.pafzaw.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseaeraocmeoy.pafzaw.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5ff56f0>: Failed to resolve 'www.au-paccy.aceeseaeraocmeoy.pafzaw.top' ([Errno -2] Name or service not known)"))


Extracting Features:  24%|██▍       | 120/499 [05:25<21:30,  3.40s/URLs]

Error fetching http://www.au-paccoy.aueseacaemceoy.zlxcfy.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccoy.aueseacaemceoy.zlxcfy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60f56c0>: Failed to resolve 'www.au-paccoy.aueseacaemceoy.zlxcfy.top' ([Errno -2] Name or service not known)"))


Extracting Features:  24%|██▍       | 121/499 [05:25<17:03,  2.71s/URLs]

Error fetching http://www.au-pcccny.aueseasaomceoy.qeptvv.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aueseasaomceoy.qeptvv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60f54b0>: Failed to resolve 'www.au-pcccny.aueseasaomceoy.qeptvv.top' ([Errno -2] Name or service not known)"))


Extracting Features:  25%|██▌       | 127/499 [05:26<04:48,  1.29URLs/s]

Error fetching http://www.vivcsiisacvaicca.visvsai.wa0hmg.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.wa0hmg.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6611ed0>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.wa0hmg.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiiscaias.vicsvesai.3xju6c.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiscaias.vicsvesai.3xju6c.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66106a0>: Failed to resolve 'www.vivcsiiscaias.vicsvesai.3xju6c.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macesareod.j8vn1q.icu/page1.php: HTTPConnectionPool(host='www.macesareod.j8vn1q.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnect

Extracting Features:  26%|██▌       | 129/499 [05:26<03:27,  1.79URLs/s]

Error fetching http://www.au-pacccy.aeeseacoaocmeoy.tzqbcg.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aeeseacoaocmeoy.tzqbcg.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6610340>: Failed to resolve 'www.au-pacccy.aeeseacoaocmeoy.tzqbcg.top' ([Errno -2] Name or service not known)"))
Error fetching https://freefirespinsgfree.dnsme.eu.org/: HTTPSConnectionPool(host='freefirespinsgfree.dnsme.eu.org', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ff7c66117e0>: Failed to resolve 'freefirespinsgfree.dnsme.eu.org' ([Errno -2] Name or service not known)"))


Extracting Features:  26%|██▋       | 132/499 [07:39<2:28:12, 24.23s/URLs]

Error fetching http://elon.edu: HTTPConnectionPool(host='elon.edu', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ff7c6610ee0>, 'Connection to elon.edu timed out. (connect timeout=None)'))


Extracting Features:  27%|██▋       | 135/499 [07:45<1:13:29, 12.11s/URLs]

Error fetching http://www.au-pcccny.aeeseacaaocmeoy.boufbt.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseacaaocmeoy.boufbt.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7b0b7c0>: Failed to resolve 'www.au-pcccny.aeeseacaaocmeoy.boufbt.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviaaaiecc.vicsvesai.8jyshd.icu/page1.php: HTTPConnectionPool(host='www.vivcsviaaaiecc.vicsvesai.8jyshd.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7d22a40>: Failed to resolve 'www.vivcsviaaaiecc.vicsvesai.8jyshd.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  28%|██▊       | 139/499 [07:46<30:01,  5.00s/URLs]

Error fetching http://www.au-poccay.aeeauaceaocmeoy.hcfugu.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaceaocmeoy.hcfugu.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7d20dc0>: Failed to resolve 'www.au-poccay.aeeauaceaocmeoy.hcfugu.top' ([Errno -2] Name or service not known)"))


Extracting Features:  28%|██▊       | 141/499 [07:47<19:53,  3.33s/URLs]

Error fetching http://www.vivcsviiacvaieca.visvsai.7n6v2v.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.7n6v2v.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a400a0>: Failed to resolve 'www.vivcsviiacvaieca.visvsai.7n6v2v.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  29%|██▉       | 144/499 [07:48<11:39,  1.97s/URLs]

Error fetching http://www.au-pacccy.aeeauacoaocmeoy.wzyrmk.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aeeauacoaocmeoy.wzyrmk.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c62a6e30>: Failed to resolve 'www.au-pacccy.aeeauacoaocmeoy.wzyrmk.top' ([Errno -2] Name or service not known)"))


Extracting Features:  30%|██▉       | 148/499 [07:50<06:22,  1.09s/URLs]

Error fetching http://www.macesarerd.fi03rf.icu/page1.php: HTTPConnectionPool(host='www.macesarerd.fi03rf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63c96f0>: Failed to resolve 'www.macesarerd.fi03rf.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  31%|███       | 154/499 [07:54<04:41,  1.22URLs/s]

Error fetching http://www.au-poccay.aeeseacoaocmeoy.yrwtop.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseacoaocmeoy.yrwtop.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5b0f3a0>: Failed to resolve 'www.au-poccay.aeeseacoaocmeoy.yrwtop.top' ([Errno -2] Name or service not known)"))
Error fetching http://pearsoncmg.com: HTTPConnectionPool(host='pearsoncmg.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c684b490>: Failed to resolve 'pearsoncmg.com' ([Errno -5] No address associated with hostname)"))
Error fetching http://www.vivcsiisaaicca.vicsvesai.3wj9fr.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaaicca.vicsvesai.3wj9fr.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c684bd60>: Failed to res

Extracting Features:  32%|███▏      | 160/499 [07:56<02:23,  2.35URLs/s]

Error fetching http://www.au-poccy.aceeseacraocmeoy.otuxec.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aceeseacraocmeoy.otuxec.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6058c40>: Failed to resolve 'www.au-poccy.aceeseacraocmeoy.otuxec.top' ([Errno -2] Name or service not known)"))


Extracting Features:  33%|███▎      | 165/499 [07:58<02:51,  1.95URLs/s]

Error fetching http://www.macesareod.9a8k92.icu/page1.php: HTTPConnectionPool(host='www.macesareod.9a8k92.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6058c10>: Failed to resolve 'www.macesareod.9a8k92.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  33%|███▎      | 167/499 [07:59<02:06,  2.62URLs/s]

Error fetching http://www.vivcsiisacvaicca.visvsai.ey2duy.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.ey2duy.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6646fe0>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.ey2duy.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  34%|███▍      | 169/499 [07:59<01:39,  3.32URLs/s]

Error fetching http://www.vivcsviavaiec.vicsvesai.usytdje.icu/page1.php: HTTPConnectionPool(host='www.vivcsviavaiec.vicsvesai.usytdje.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5a21ba0>: Failed to resolve 'www.vivcsviavaiec.vicsvesai.usytdje.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  35%|███▌      | 175/499 [08:03<03:22,  1.60URLs/s]

Error fetching http://www.au-paccey.aeeauasoaocmeoy.islxvm.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeauasoaocmeoy.islxvm.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5b0c190>: Failed to resolve 'www.au-paccey.aeeauasoaocmeoy.islxvm.top' ([Errno -2] Name or service not known)"))


Extracting Features:  36%|███▌      | 178/499 [08:04<02:08,  2.49URLs/s]

Error fetching http://www.au-poccay.aeeauasoaocmeoy.qriyir.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauasoaocmeoy.qriyir.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5bde380>: Failed to resolve 'www.au-poccay.aeeauasoaocmeoy.qriyir.top' ([Errno -2] Name or service not known)"))


Extracting Features:  36%|███▌      | 180/499 [08:06<03:43,  1.43URLs/s]

Error fetching http://www.vivcsviveai.vicsvesai.c18y03.icu/page1.php: HTTPConnectionPool(host='www.vivcsviveai.vicsvesai.c18y03.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5c8aa70>: Failed to resolve 'www.vivcsviveai.vicsvesai.c18y03.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiacvaieca.visvsai.1iui33.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.1iui33.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c61e6050>: Failed to resolve 'www.vivcsviiacvaieca.visvsai.1iui33.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  37%|███▋      | 185/499 [08:09<03:48,  1.37URLs/s]

Error fetching http://www.au-poccay.aueseasaomceoy.yrwtop.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseasaomceoy.yrwtop.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7df2c80>: Failed to resolve 'www.au-poccay.aueseasaomceoy.yrwtop.top' ([Errno -2] Name or service not known)"))


Extracting Features:  37%|███▋      | 187/499 [08:10<03:05,  1.68URLs/s]

Error fetching http://www.au-poccay.aueseasaomceoy.boufbt.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseasaomceoy.boufbt.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7df27d0>: Failed to resolve 'www.au-poccay.aueseasaomceoy.boufbt.top' ([Errno -2] Name or service not known)"))


Extracting Features:  38%|███▊      | 189/499 [08:11<03:15,  1.59URLs/s]

Error fetching http://www.vivcsiivcaia.vicsvesai.3wuhd3.icu/page1.php: HTTPConnectionPool(host='www.vivcsiivcaia.vicsvesai.3wuhd3.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7df2e00>: Failed to resolve 'www.vivcsiivcaia.vicsvesai.3wuhd3.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  38%|███▊      | 191/499 [08:12<03:03,  1.68URLs/s]

Error fetching http://www.macsarord.n8zehe.icu/page1.php: HTTPConnectionPool(host='www.macsarord.n8zehe.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6514d30>: Failed to resolve 'www.macsarord.n8zehe.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  39%|███▉      | 194/499 [08:14<02:43,  1.87URLs/s]

Error fetching http://www.vivcsiiscvcaias.visvsai.htof1s.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiscvcaias.visvsai.htof1s.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5db6140>: Failed to resolve 'www.vivcsiiscvcaias.visvsai.htof1s.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  40%|███▉      | 198/499 [08:29<16:18,  3.25s/URLs]

Error fetching http://www.macesaoeod.se9ijc.icu/page1.php: HTTPConnectionPool(host='www.macesaoeod.se9ijc.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6537880>: Failed to resolve 'www.macesaoeod.se9ijc.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  40%|████      | 200/499 [08:37<17:42,  3.55s/URLs]

Error fetching http://diigo.com: HTTPConnectionPool(host='diigo.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c65354b0>: Failed to resolve 'diigo.com' ([Errno -2] Name or service not known)"))


Extracting Features:  41%|████▏     | 207/499 [08:41<03:54,  1.25URLs/s]

Error fetching http://www.au-poccay.aeeauacoaocmeoy.egakdz.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauacoaocmeoy.egakdz.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6537430>: Failed to resolve 'www.au-poccay.aeeauacoaocmeoy.egakdz.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesceeccesoas.tddonu.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesceeccesoas.tddonu.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7ea8760>: Failed to resolve 'www.eki-net.con-aesceeccesoas.tddonu.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsaorod.15tf68.icu/page1.php: HTTPConnectionPool(host='www.macsaorod.15tf68.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTP

Extracting Features:  42%|████▏     | 209/499 [08:43<04:06,  1.18URLs/s]

Error fetching http://www.au-pcccoy.aueseacaemceoy.wfclkk.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccoy.aueseacaemceoy.wfclkk.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6537430>: Failed to resolve 'www.au-pcccoy.aueseacaemceoy.wfclkk.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiiaaaicca.vicsvesai.s6vx9m.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.s6vx9m.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c683bfa0>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.s6vx9m.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  43%|████▎     | 213/499 [08:45<03:29,  1.36URLs/s]

Error fetching http://www.macsaeord.x7qbjf.icu/page1.php: HTTPConnectionPool(host='www.macsaeord.x7qbjf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e64fa0>: Failed to resolve 'www.macsaeord.x7qbjf.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  43%|████▎     | 215/499 [08:46<02:48,  1.68URLs/s]

Error fetching http://www.macesareod.2jld0p.icu/page1.php: HTTPConnectionPool(host='www.macesareod.2jld0p.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6058910>: Failed to resolve 'www.macesareod.2jld0p.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  44%|████▍     | 221/499 [08:47<01:17,  3.59URLs/s]

Error fetching http://www.vivcsiiaaaicca.vicsvesai.bf2o5x.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.bf2o5x.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c61906a0>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.bf2o5x.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsarerd.09vz0h.icu/page1.php: HTTPConnectionPool(host='www.macsarerd.09vz0h.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5e980d0>: Failed to resolve 'www.macsarerd.09vz0h.icu' ([Errno -2] Name or service not known)"))
Error fetching https://connecct-login.884santarita.com/: HTTPSConnectionPool(host='connecct-login.884santarita.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ff7c5e98a60>: Failed to res

Extracting Features:  45%|████▍     | 224/499 [08:49<02:15,  2.03URLs/s]

Error fetching http://www.macsaeerd.n9k5yv.icu/page1.php: HTTPConnectionPool(host='www.macsaeerd.n9k5yv.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5e99090>: Failed to resolve 'www.macsaeerd.n9k5yv.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeauaccaocmeoy.pafzaw.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaccaocmeoy.pafzaw.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5e99630>: Failed to resolve 'www.au-poccay.aeeauaccaocmeoy.pafzaw.top' ([Errno -2] Name or service not known)"))


Extracting Features:  46%|████▌     | 229/499 [08:51<02:40,  1.68URLs/s]

Error fetching https://connecct-login.aidanharold.com/: HTTPSConnectionPool(host='connecct-login.aidanharold.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ff7c7df2b00>: Failed to resolve 'connecct-login.aidanharold.com' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesceeosesoas.okhusg.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesceeosesoas.okhusg.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7b40160>: Failed to resolve 'www.eki-net.con-aesceeosesoas.okhusg.top' ([Errno -2] Name or service not known)"))


Extracting Features:  46%|████▋     | 232/499 [08:52<02:07,  2.09URLs/s]

Error fetching http://www.au-paccy.aceeseacraocmeoy.glxpxy.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.glxpxy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7b43df0>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.glxpxy.top' ([Errno -2] Name or service not known)"))


Extracting Features:  47%|████▋     | 234/499 [08:54<02:36,  1.70URLs/s]

Error fetching http://www.macsaeerd.v72weu.icu/page1.php: HTTPConnectionPool(host='www.macsaeerd.v72weu.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7b423b0>: Failed to resolve 'www.macsaeerd.v72weu.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  47%|████▋     | 236/499 [08:55<02:06,  2.07URLs/s]

Error fetching http://www.macesaorod.a7w3p7.icu/page1.php: HTTPConnectionPool(host='www.macesaorod.a7w3p7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7d5d150>: Failed to resolve 'www.macesaorod.a7w3p7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  48%|████▊     | 238/499 [08:55<01:57,  2.22URLs/s]

Error fetching http://www.macessrerd.7v560g.icu/page1.php: HTTPConnectionPool(host='www.macessrerd.7v560g.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60f63e0>: Failed to resolve 'www.macessrerd.7v560g.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  48%|████▊     | 242/499 [09:00<03:51,  1.11URLs/s]

Error fetching http://www.vivcsiiaaaicca.vicsvesai.q4nb0o.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.q4nb0o.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63d8e20>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.q4nb0o.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  49%|████▉     | 244/499 [09:00<02:43,  1.56URLs/s]

Error fetching http://www.vivcsiisaviccai.vicsvesai.4awkri.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaviccai.vicsvesai.4awkri.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c62403d0>: Failed to resolve 'www.vivcsiisaviccai.vicsvesai.4awkri.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  50%|█████     | 250/499 [09:07<05:41,  1.37s/URLs]

Error fetching http://www.au-pcccny.aueseacaomceoy.juvnbj.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aueseacaomceoy.juvnbj.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e64e20>: Failed to resolve 'www.au-pcccny.aueseacaomceoy.juvnbj.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pcccny.aeeseacaaocmeoy.gjaxjw.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseacaaocmeoy.gjaxjw.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e642e0>: Failed to resolve 'www.au-pcccny.aeeseacaaocmeoy.gjaxjw.top' ([Errno -2] Name or service not known)"))


Extracting Features:  51%|█████     | 253/499 [11:16<1:34:33, 23.06s/URLs]

Error fetching http://xtec.cat: HTTPConnectionPool(host='xtec.cat', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ff7c7e64490>, 'Connection to xtec.cat timed out. (connect timeout=None)'))
Error fetching http://www.macsarord.zobca6.icu/page1.php: HTTPConnectionPool(host='www.macsarord.zobca6.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e65060>: Failed to resolve 'www.macsarord.zobca6.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  51%|█████     | 255/499 [13:27<2:31:06, 37.16s/URLs]

Error fetching http://torob.com: HTTPConnectionPool(host='torob.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ff7c6711090>, 'Connection to torob.com timed out. (connect timeout=None)'))


Extracting Features:  52%|█████▏    | 259/499 [13:27<1:06:11, 16.55s/URLs]

Error fetching http://www.vivcsiavai.vicsvesai.yc0xn8.icu/page1.php: HTTPConnectionPool(host='www.vivcsiavai.vicsvesai.yc0xn8.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e670a0>: Failed to resolve 'www.vivcsiavai.vicsvesai.yc0xn8.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aueseacaomceoy.nvwszn.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseacaomceoy.nvwszn.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5ff47f0>: Failed to resolve 'www.au-poccay.aueseacaomceoy.nvwszn.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiivcaia.vicsvesai.c2yfkr.icu/page1.php: HTTPConnectionPool(host='www.vivcsiivcaia.vicsvesai.c2yfkr.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.c

Extracting Features:  53%|█████▎    | 263/499 [13:30<26:38,  6.77s/URLs]

Error fetching http://www.vivcsiisacvaicca.visvsai.of6jh4.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.of6jh4.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e9d240>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.of6jh4.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  54%|█████▎    | 268/499 [13:34<08:53,  2.31s/URLs]

Error fetching http://www.vivcsviiaaieca.vicsvesai.cx03ta.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiaaieca.vicsvesai.cx03ta.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c60fe980>: Failed to resolve 'www.vivcsviiaaieca.vicsvesai.cx03ta.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  54%|█████▍    | 271/499 [13:34<04:30,  1.19s/URLs]

Error fetching http://www.vivcsiisacvaicca.visvsai.0rv6h8.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.0rv6h8.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66f5b70>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.0rv6h8.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  55%|█████▍    | 273/499 [13:35<03:01,  1.25URLs/s]

Error fetching http://www.au-paccey.aeeauasoaocmeoy.bmzcgy.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeauasoaocmeoy.bmzcgy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c664ee60>: Failed to resolve 'www.au-paccey.aeeauasoaocmeoy.bmzcgy.top' ([Errno -2] Name or service not known)"))


Extracting Features:  55%|█████▌    | 276/499 [13:36<02:27,  1.52URLs/s]

Error fetching http://www.au-poccoy.aueseacaemceoy.yunpgg.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccoy.aueseacaemceoy.yunpgg.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c664e9e0>: Failed to resolve 'www.au-poccoy.aueseacaemceoy.yunpgg.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiacvaieca.visvsai.jqsqm7.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.jqsqm7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7b95ed0>: Failed to resolve 'www.vivcsviiacvaieca.visvsai.jqsqm7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  57%|█████▋    | 282/499 [13:37<01:00,  3.57URLs/s]

Error fetching http://www.au-poccy.aeueseaeaomceoy.islxvm.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aeueseaeaomceoy.islxvm.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c664f460>: Failed to resolve 'www.au-poccy.aeueseaeaomceoy.islxvm.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pacccy.aueseacaomceoy.jrvsgv.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aueseacaomceoy.jrvsgv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7ea46d0>: Failed to resolve 'www.au-pacccy.aueseacaomceoy.jrvsgv.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeseaceaocmeoy.vhypzf.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseaceaocmeoy.vhypzf.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by N

Extracting Features:  57%|█████▋    | 284/499 [13:37<00:58,  3.68URLs/s]

Error fetching http://www.vivcsiiaacvaicca.visvsai.no9sh7.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaacvaicca.visvsai.no9sh7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c663e050>: Failed to resolve 'www.vivcsiiaacvaicca.visvsai.no9sh7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  58%|█████▊    | 288/499 [13:38<00:45,  4.64URLs/s]

Error fetching http://www.macesaoeod.usytdje.icu/page1.php: HTTPConnectionPool(host='www.macesaoeod.usytdje.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c663d990>: Failed to resolve 'www.macesaoeod.usytdje.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aueseasaomceoy.tzqbcg.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseasaomceoy.tzqbcg.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5dbef20>: Failed to resolve 'www.au-poccay.aueseasaomceoy.tzqbcg.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesccosesaas.okhusg.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesccosesaas.okhusg.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection obje

Extracting Features:  59%|█████▊    | 292/499 [13:40<01:12,  2.84URLs/s]

Error fetching http://www.au-poccay.aueseasaomceoy.axnnut.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseasaomceoy.axnnut.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6710340>: Failed to resolve 'www.au-poccay.aueseasaomceoy.axnnut.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiisacvaicca.visvsai.yvey51.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.yvey51.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6711630>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.yvey51.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  59%|█████▉    | 296/499 [13:41<00:59,  3.39URLs/s]

Error fetching http://www.au-pcccny.aeeauasaaocmeoy.nwbezf.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeauasaaocmeoy.nwbezf.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6535150>: Failed to resolve 'www.au-pcccny.aeeauasaaocmeoy.nwbezf.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macessrord.q4nb0o.icu/page1.php: HTTPConnectionPool(host='www.macessrord.q4nb0o.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b1d50>: Failed to resolve 'www.macessrord.q4nb0o.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  60%|█████▉    | 299/499 [13:42<00:43,  4.59URLs/s]

Error fetching http://www.au-poccay.aeeauaccaocmeoy.wsejfh.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaccaocmeoy.wsejfh.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b25f0>: Failed to resolve 'www.au-poccay.aeeauaccaocmeoy.wsejfh.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesceeeeesas.wiphfb.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesceeeeesas.wiphfb.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b2e00>: Failed to resolve 'www.eki-net.con-aesceeeeesas.wiphfb.top' ([Errno -2] Name or service not known)"))


Extracting Features:  60%|██████    | 301/499 [13:42<00:43,  4.58URLs/s]

Error fetching http://gaoqingw.com: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


Extracting Features:  61%|██████    | 302/499 [13:43<01:16,  2.57URLs/s]

Error fetching http://www.macsaoeod.45ml1t.icu/page1.php: HTTPConnectionPool(host='www.macsaoeod.45ml1t.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6534e80>: Failed to resolve 'www.macsaoeod.45ml1t.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  61%|██████    | 305/499 [13:46<02:13,  1.45URLs/s]

Error fetching http://www.au-paccy.aceeseacraocmeoy.nwgjza.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.nwgjza.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5af9030>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.nwgjza.top' ([Errno -2] Name or service not known)"))


Extracting Features:  62%|██████▏   | 307/499 [13:47<01:59,  1.60URLs/s]

Error fetching http://www.au-pcccoy.aueseacaemceoy.shfxjv.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccoy.aueseacaemceoy.shfxjv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5dd0a00>: Failed to resolve 'www.au-pcccoy.aueseacaemceoy.shfxjv.top' ([Errno -2] Name or service not known)"))


Extracting Features:  62%|██████▏   | 311/499 [13:49<01:38,  1.91URLs/s]

Error fetching http://www.macesaeord.yvey51.icu/page1.php: HTTPConnectionPool(host='www.macesaeord.yvey51.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5af9150>: Failed to resolve 'www.macesaeord.yvey51.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  63%|██████▎   | 312/499 [13:50<01:38,  1.91URLs/s]

Error fetching http://www.au-poccay.aeeauacoaocmeoy.auomwo.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauacoaocmeoy.auomwo.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e3b2e0>: Failed to resolve 'www.au-poccay.aeeauacoaocmeoy.auomwo.top' ([Errno -2] Name or service not known)"))


Extracting Features:  64%|██████▎   | 317/499 [13:51<00:53,  3.40URLs/s]

Error fetching http://www.vivcsiisaviccai.vicsvesai.e5susf.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaviccai.vicsvesai.e5susf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6266f50>: Failed to resolve 'www.vivcsiisaviccai.vicsvesai.e5susf.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pcccny.aeeseacaaocmeoy.wfclkk.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseacaaocmeoy.wfclkk.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6941c00>: Failed to resolve 'www.au-pcccny.aeeseacaaocmeoy.wfclkk.top' ([Errno -2] Name or service not known)"))


Extracting Features:  64%|██████▍   | 321/499 [13:52<01:07,  2.64URLs/s]

Error fetching http://www.au-paccy.aceeseacraocmeoy.ohdlut.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.ohdlut.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6942b00>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.ohdlut.top' ([Errno -2] Name or service not known)"))


Extracting Features:  65%|██████▍   | 324/499 [13:54<01:23,  2.09URLs/s]

Error fetching http://www.macsaoeod.tepyl0.icu/page1.php: HTTPConnectionPool(host='www.macsaoeod.tepyl0.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7ea6f80>: Failed to resolve 'www.macsaoeod.tepyl0.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  66%|██████▌   | 327/499 [13:57<02:00,  1.43URLs/s]

Error fetching http://www.vivcsiiaaviccai.vicsvesai.lzvsi5.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaviccai.vicsvesai.lzvsi5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5fe0970>: Failed to resolve 'www.vivcsiiaaviccai.vicsvesai.lzvsi5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  66%|██████▌   | 330/499 [13:58<01:29,  1.89URLs/s]

Error fetching http://www.au-poccay.aueseacaomceoy.jfwjev.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseacaomceoy.jfwjev.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7ea5ff0>: Failed to resolve 'www.au-poccay.aueseacaomceoy.jfwjev.top' ([Errno -2] Name or service not known)"))


Extracting Features:  67%|██████▋   | 335/499 [14:00<01:34,  1.74URLs/s]

Error fetching http://www.au-pacccy.aueseacaomceoy.qvgxtq.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aueseacaomceoy.qvgxtq.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b2d40>: Failed to resolve 'www.au-pacccy.aueseacaomceoy.qvgxtq.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.maceseoeod.4161t5.icu/page1.php: HTTPConnectionPool(host='www.maceseoeod.4161t5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b0f40>: Failed to resolve 'www.maceseoeod.4161t5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  68%|██████▊   | 338/499 [14:02<01:20,  2.00URLs/s]

Error fetching http://www.vivcsiiscaias.vicsvesai.1iui33.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiscaias.vicsvesai.1iui33.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b0640>: Failed to resolve 'www.vivcsiiscaias.vicsvesai.1iui33.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviveai.vicsvesai.4161t5.icu/page1.php: HTTPConnectionPool(host='www.vivcsviveai.vicsvesai.4161t5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c663e7a0>: Failed to resolve 'www.vivcsviveai.vicsvesai.4161t5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  68%|██████▊   | 341/499 [14:03<01:03,  2.49URLs/s]

Error fetching http://www.vivcsicisvcai.visvsai.ncbhdjei.icu/page1.php: HTTPConnectionPool(host='www.vivcsicisvcai.visvsai.ncbhdjei.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c663e4d0>: Failed to resolve 'www.vivcsicisvcai.visvsai.ncbhdjei.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  69%|██████▉   | 344/499 [14:05<01:36,  1.60URLs/s]

Error fetching http://www.vivcsiiaaaicca.vicsvesai.45ml1t.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.45ml1t.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63d8340>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.45ml1t.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  70%|██████▉   | 348/499 [14:08<01:31,  1.66URLs/s]

Error fetching http://www.vivcsiavcai.vicsvesai.j328u2.icu/page1.php: HTTPConnectionPool(host='www.vivcsiavcai.vicsvesai.j328u2.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63da980>: Failed to resolve 'www.vivcsiavcai.vicsvesai.j328u2.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  70%|███████   | 351/499 [16:19<1:09:26, 28.15s/URLs]

Error fetching http://karnataka.gov.in: HTTPConnectionPool(host='karnataka.gov.in', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ff7c6515990>, 'Connection to karnataka.gov.in timed out. (connect timeout=None)'))


Extracting Features:  71%|███████   | 352/499 [16:20<53:46, 21.95s/URLs]  

Error fetching http://www.macsaorod.vhvjv7.icu/page1.php: HTTPConnectionPool(host='www.macsaorod.vhvjv7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5e4f550>: Failed to resolve 'www.macsaorod.vhvjv7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  71%|███████   | 354/499 [16:23<33:03, 13.68s/URLs]

Error fetching http://www.macesarrod.q4nb0o.icu/page1.php: HTTPConnectionPool(host='www.macesarrod.q4nb0o.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c5e4f850>: Failed to resolve 'www.macesarrod.q4nb0o.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  72%|███████▏  | 359/499 [16:27<10:49,  4.64s/URLs]

Error fetching http://www.vivcsiiaacvaicca.visvsai.9u60bt.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaacvaicca.visvsai.9u60bt.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c635bbb0>: Failed to resolve 'www.vivcsiiaacvaicca.visvsai.9u60bt.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  73%|███████▎  | 363/499 [16:29<04:26,  1.96s/URLs]

Error fetching http://www.au-pcccoy.aueseacaemceoy.nwbezf.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccoy.aueseacaemceoy.nwbezf.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6877b20>: Failed to resolve 'www.au-pcccoy.aueseacaemceoy.nwbezf.top' ([Errno -2] Name or service not known)"))


Extracting Features:  74%|███████▎  | 368/499 [16:32<02:00,  1.09URLs/s]

Error fetching http://www.macsaorod.208e6a.icu/page1.php: HTTPConnectionPool(host='www.macsaorod.208e6a.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b11e0>: Failed to resolve 'www.macsaorod.208e6a.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  74%|███████▍  | 371/499 [16:36<02:29,  1.17s/URLs]

Error fetching http://www.au-poccay.aeeseaccaocmeoy.sdnrfu.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseaccaocmeoy.sdnrfu.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c62e84c0>: Failed to resolve 'www.au-poccay.aeeseaccaocmeoy.sdnrfu.top' ([Errno -2] Name or service not known)"))


Extracting Features:  75%|███████▍  | 373/499 [16:36<01:51,  1.13URLs/s]

Error fetching http://www.macesaoeod.lmc2ry.icu/page1.php: HTTPConnectionPool(host='www.macesaoeod.lmc2ry.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c62687f0>: Failed to resolve 'www.macesaoeod.lmc2ry.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  75%|███████▌  | 375/499 [16:37<01:15,  1.64URLs/s]

Error fetching http://www.au-paccy.aceeseaeraocmeoy.iqlnii.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseaeraocmeoy.iqlnii.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6816020>: Failed to resolve 'www.au-paccy.aceeseaeraocmeoy.iqlnii.top' ([Errno -2] Name or service not known)"))


Extracting Features:  75%|███████▌  | 376/499 [16:38<01:41,  1.21URLs/s]

Error fetching http://www.vivcsicisvcai.visvsai.tcv93y.icu/page1.php: HTTPConnectionPool(host='www.vivcsicisvcai.visvsai.tcv93y.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6816ce0>: Failed to resolve 'www.vivcsicisvcai.visvsai.tcv93y.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  77%|███████▋  | 382/499 [16:41<00:57,  2.05URLs/s]

Error fetching http://www.macsarerd.xg5mqq.icu/page1.php: HTTPConnectionPool(host='www.macsarerd.xg5mqq.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c68170d0>: Failed to resolve 'www.macsarerd.xg5mqq.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  77%|███████▋  | 384/499 [16:42<00:44,  2.61URLs/s]

Error fetching http://www.au-poccy.aceeseaoraocmeoy.ekijve.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aceeseaoraocmeoy.ekijve.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6874f40>: Failed to resolve 'www.au-poccy.aceeseaoraocmeoy.ekijve.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macessrord.ciiwd0.icu/page1.php: HTTPConnectionPool(host='www.macessrord.ciiwd0.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c69a9810>: Failed to resolve 'www.macessrord.ciiwd0.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  78%|███████▊  | 388/499 [16:44<00:52,  2.12URLs/s]

Error fetching http://www.vivcsviiacvaieca.visvsai.65xfer.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.65xfer.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c61d71f0>: Failed to resolve 'www.vivcsviiacvaieca.visvsai.65xfer.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  79%|███████▊  | 392/499 [16:46<00:51,  2.08URLs/s]

Error fetching http://www.vivcsiivai.vicsvesai.45v2lh.icu/page1.php: HTTPConnectionPool(host='www.vivcsiivai.vicsvesai.45v2lh.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c69abeb0>: Failed to resolve 'www.vivcsiivai.vicsvesai.45v2lh.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  79%|███████▉  | 394/499 [16:46<00:39,  2.64URLs/s]

Error fetching http://www.vivcsiisaviccai.vicsvesai.w5p6z9.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaviccai.vicsvesai.w5p6z9.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c68d3220>: Failed to resolve 'www.vivcsiisaviccai.vicsvesai.w5p6z9.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsareod.k6yxxb.icu/page1.php: HTTPConnectionPool(host='www.macsareod.k6yxxb.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c68d2920>: Failed to resolve 'www.macsareod.k6yxxb.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  81%|████████  | 403/499 [16:54<01:28,  1.09URLs/s]

Error fetching http://www.vivcsiiaaviccai.vicsvesai.urgggv.shop/page1.php: HTTPConnectionPool(host='www.vivcsiiaaviccai.vicsvesai.urgggv.shop', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c68d0c40>: Failed to resolve 'www.vivcsiiaaviccai.vicsvesai.urgggv.shop' ([Errno -2] Name or service not known)"))


Extracting Features:  81%|████████  | 405/499 [16:54<00:58,  1.62URLs/s]

Error fetching http://www.au-pacccy.aueseacaomceoy.gsqzpy.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aueseacaomceoy.gsqzpy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e641f0>: Failed to resolve 'www.au-pacccy.aueseacaomceoy.gsqzpy.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsarrod.208e6a.icu/page1.php: HTTPConnectionPool(host='www.macsarrod.208e6a.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a17be0>: Failed to resolve 'www.macsarrod.208e6a.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  82%|████████▏ | 408/499 [16:55<00:35,  2.59URLs/s]

Error fetching http://www.macsaeord.ueyjdiks.icu/page1.php: HTTPConnectionPool(host='www.macsaeord.ueyjdiks.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63565c0>: Failed to resolve 'www.macsaeord.ueyjdiks.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  82%|████████▏ | 411/499 [16:57<00:53,  1.65URLs/s]

Error fetching http://www.macesaeerd.1bmro5.icu/page1.php: HTTPConnectionPool(host='www.macesaeerd.1bmro5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c68756f0>: Failed to resolve 'www.macesaeerd.1bmro5.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pcccny.aeeseasaaocmeoy.wzyrmk.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseasaaocmeoy.wzyrmk.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63db6d0>: Failed to resolve 'www.au-pcccny.aeeseasaaocmeoy.wzyrmk.top' ([Errno -2] Name or service not known)"))


Extracting Features:  83%|████████▎ | 416/499 [17:00<00:52,  1.59URLs/s]

Error fetching http://www.au-pcccny.aueseacaomceoy.vrxppt.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aueseacaomceoy.vrxppt.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63d8ac0>: Failed to resolve 'www.au-pcccny.aueseacaomceoy.vrxppt.top' ([Errno -2] Name or service not known)"))
Error fetching https://banco-.piichiincha.repl.co/: HTTPSConnectionPool(host='banco-.piichiincha.repl.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ff7c5af1b70>: Failed to resolve 'banco-.piichiincha.repl.co' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pcccny.aeeseasaaocmeoy.gjaxjw.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseasaaocmeoy.gjaxjw.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection obje

Extracting Features:  84%|████████▍ | 420/499 [17:01<00:31,  2.48URLs/s]

Error fetching http://ohnizwl.htswyx.cn/: HTTPConnectionPool(host='ohnizwl.htswyx.cn', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63db730>: Failed to resolve 'ohnizwl.htswyx.cn' ([Errno -2] Name or service not known)"))


Extracting Features:  85%|████████▍ | 423/499 [17:04<01:00,  1.25URLs/s]

Error fetching http://www.au-poccay.aeeseacoaocmeoy.vcdolz.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseacoaocmeoy.vcdolz.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c637f070>: Failed to resolve 'www.au-poccay.aeeseacoaocmeoy.vcdolz.top' ([Errno -2] Name or service not known)"))


Extracting Features:  85%|████████▌ | 425/499 [17:06<01:13,  1.01URLs/s]

Error fetching http://www.macesaoeod.fi03rf.icu/page1.php: HTTPConnectionPool(host='www.macesaoeod.fi03rf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c637ec80>: Failed to resolve 'www.macesaoeod.fi03rf.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiacvaieca.visvsai.k6yxxb.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.k6yxxb.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c619bb80>: Failed to resolve 'www.vivcsviiacvaieca.visvsai.k6yxxb.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  86%|████████▌ | 429/499 [17:07<00:43,  1.62URLs/s]

Error fetching http://emis.gov.eg: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Extracting Features:  87%|████████▋ | 432/499 [17:10<00:57,  1.17URLs/s]

Error fetching http://www.vivcsiivcaia.vicsvesai.ugu66t.icu/page1.php: HTTPConnectionPool(host='www.vivcsiivcaia.vicsvesai.ugu66t.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6a6dd20>: Failed to resolve 'www.vivcsiivcaia.vicsvesai.ugu66t.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  88%|████████▊ | 440/499 [17:18<01:00,  1.02s/URLs]

Error fetching http://www.au-paccny.aueseacaomceoy.nwbezf.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccny.aueseacaomceoy.nwbezf.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c63554e0>: Failed to resolve 'www.au-paccny.aueseacaomceoy.nwbezf.top' ([Errno -2] Name or service not known)"))


Extracting Features:  89%|████████▊ | 442/499 [17:18<00:37,  1.53URLs/s]

Error fetching http://www.vivcsiavcai.vicsvesai.9qwfvb.icu/page1.php: HTTPConnectionPool(host='www.vivcsiavcai.vicsvesai.9qwfvb.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7ea4760>: Failed to resolve 'www.vivcsiavcai.vicsvesai.9qwfvb.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccey.aeeseasoaocmeoy.cufyoh.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeseasoaocmeoy.cufyoh.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c61d5150>: Failed to resolve 'www.au-paccey.aeeseasoaocmeoy.cufyoh.top' ([Errno -2] Name or service not known)"))


Extracting Features:  90%|████████▉ | 447/499 [17:20<00:20,  2.54URLs/s]

Error fetching http://www.vivcsvciiveai.visvsai.lmc2ry.icu/page1.php: HTTPConnectionPool(host='www.vivcsvciiveai.visvsai.lmc2ry.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c61d52a0>: Failed to resolve 'www.vivcsvciiveai.visvsai.lmc2ry.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  90%|████████▉ | 449/499 [17:24<00:49,  1.00URLs/s]

Error fetching http://www.macesaeord.gly7z6.icu/page1.php: HTTPConnectionPool(host='www.macesaeord.gly7z6.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c64edd20>: Failed to resolve 'www.macesaeord.gly7z6.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsarrod.bp7uq2.icu/page1.php: HTTPConnectionPool(host='www.macsarrod.bp7uq2.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e3afe0>: Failed to resolve 'www.macsarrod.bp7uq2.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  91%|█████████▏| 456/499 [17:25<00:15,  2.73URLs/s]

Error fetching http://www.au-paccy.aceeseaoraocmeoy.hcfugu.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseaoraocmeoy.hcfugu.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e3a260>: Failed to resolve 'www.au-paccy.aceeseaoraocmeoy.hcfugu.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.ekl-net.com-asoceecceaas.wdjsvq.top/jp.php: HTTPConnectionPool(host='www.ekl-net.com-asoceecceaas.wdjsvq.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c671a5f0>: Failed to resolve 'www.ekl-net.com-asoceecceaas.wdjsvq.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccy.aceeseaeraocmeoy.cjjfec.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aceeseaeraocmeoy.cjjfec.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolu

Extracting Features:  92%|█████████▏| 457/499 [17:25<00:13,  3.02URLs/s]

Error fetching http://www.vivcsiicsvcaia.visvsai.0a67vu.icu/page1.php: HTTPConnectionPool(host='www.vivcsiicsvcaia.visvsai.0a67vu.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7e3af50>: Failed to resolve 'www.vivcsiicsvcaia.visvsai.0a67vu.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  93%|█████████▎| 463/499 [17:28<00:14,  2.55URLs/s]

Error fetching http://www.macsareod.n9k5yv.icu/page1.php: HTTPConnectionPool(host='www.macsareod.n9k5yv.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b3c70>: Failed to resolve 'www.macsareod.n9k5yv.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiaaiecc.vicsvesai.2t8hvj.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiaaiecc.vicsvesai.2t8hvj.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b1d20>: Failed to resolve 'www.vivcsviiaaiecc.vicsvesai.2t8hvj.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  93%|█████████▎| 466/499 [17:29<00:11,  2.97URLs/s]

Error fetching http://www.au-poccay.aeeauaceaocmeoy.acebmv.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaceaocmeoy.acebmv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c66b2980>: Failed to resolve 'www.au-poccay.aeeauaceaocmeoy.acebmv.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccoy.aueseacaemceoy.kplwvi.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccoy.aueseacaemceoy.kplwvi.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c671b700>: Failed to resolve 'www.au-paccoy.aueseacaemceoy.kplwvi.top' ([Errno -2] Name or service not known)"))


Extracting Features:  94%|█████████▍| 471/499 [17:29<00:05,  4.84URLs/s]

Error fetching http://www.vivcsiisaviccai.vicsvesai.n8zehe.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaviccai.vicsvesai.n8zehe.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6719900>: Failed to resolve 'www.vivcsiisaviccai.vicsvesai.n8zehe.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccoy.aueseacaemceoy.pvnoai.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccoy.aueseacaemceoy.pvnoai.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7d5fb80>: Failed to resolve 'www.au-paccoy.aueseacaemceoy.pvnoai.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeauasoaocmeoy.qoepuz.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauasoaocmeoy.qoepuz.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by Name

Extracting Features:  95%|█████████▍| 474/499 [17:31<00:07,  3.17URLs/s]

Error fetching http://www.au-poccay.aeeseaceaocmeoy.wsejfh.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseaceaocmeoy.wsejfh.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c7d5ffd0>: Failed to resolve 'www.au-poccay.aeeseaceaocmeoy.wsejfh.top' ([Errno -2] Name or service not known)"))


Extracting Features:  96%|█████████▌| 477/499 [17:32<00:10,  2.14URLs/s]

Error fetching http://www.au-poccay.aeeseacoaocmeoy.zlxcfy.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseacoaocmeoy.zlxcfy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c6877c70>: Failed to resolve 'www.au-poccay.aeeseacoaocmeoy.zlxcfy.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-paccy.aceeseacraocmeoy.mgpisy.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.mgpisy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c683bbe0>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.mgpisy.top' ([Errno -2] Name or service not known)"))


Extracting Features:  96%|█████████▋| 481/499 [17:35<00:09,  1.82URLs/s]

Error fetching http://www.vivcsviavaiec.vicsvesai.w5p6z9.icu/page1.php: HTTPConnectionPool(host='www.vivcsviavaiec.vicsvesai.w5p6z9.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c683ba00>: Failed to resolve 'www.vivcsviavaiec.vicsvesai.w5p6z9.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  99%|█████████▊| 492/499 [17:49<00:10,  1.48s/URLs]

Error fetching http://www.macsarrod.of6jh4.icu/page1.php: HTTPConnectionPool(host='www.macsarrod.of6jh4.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c641b130>: Failed to resolve 'www.macsarrod.of6jh4.icu' ([Errno -2] Name or service not known)"))


Extracting Features: 100%|█████████▉| 497/499 [17:53<00:01,  1.12URLs/s]

Error fetching http://www.vivcsiciavcai.visvsai.bd7tlz.icu/page1.php: HTTPConnectionPool(host='www.vivcsiciavcai.visvsai.bd7tlz.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c641a710>: Failed to resolve 'www.vivcsiciavcai.visvsai.bd7tlz.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macesaeord.gd4365.icu/page1.php: HTTPConnectionPool(host='www.macesaeord.gd4365.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c67a0700>: Failed to resolve 'www.macesaeord.gd4365.icu' ([Errno -2] Name or service not known)"))


Extracting Features: 100%|██████████| 499/499 [17:54<00:00,  2.15s/URLs]

Error fetching http://www.eki-net.con-aesceoseesas.wzpclp.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesceoseesas.wzpclp.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ff7c641a620>: Failed to resolve 'www.eki-net.con-aesceoseesas.wzpclp.top' ([Errno -2] Name or service not known)"))
Feature extraction completed. The results are saved in features_extracted.csv





In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np

# Function to extract features from the URL
def extract_features_from_url(url):
    features = {
        'url_length': 0,
        'has_suspicious_keyword': 0,
        'is_numeric_domain': 0,
        'has_suspicious_tld': 0,
        'has_https': 0,
        'has_javascript': 0,
        'has_hidden_elements': 0,
        'has_iframe': 0,
        'number_of_links': 0,
        'number_of_input_fields': 0,
        'has_login_form': 0
    }

    # Suspicious keywords and TLDs (can be expanded)
    suspicious_keywords = ['login', 'signin', 'account', 'verify', 'secure']
    suspicious_tlds = ['.xyz', '.top', '.club']

    try:
        # 1. URL-based features
        features['url_length'] = len(url)
        features['has_suspicious_keyword'] = int(any(keyword in url for keyword in suspicious_keywords))
        domain = url.split('//')[1].split('/')[0]
        features['is_numeric_domain'] = int(any(char.isdigit() for char in domain))
        features['has_suspicious_tld'] = int(any(url.endswith(tld) for tld in suspicious_tlds))
        features['has_https'] = int(url.startswith('https'))

        # 2. HTML content-based features
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        features['has_javascript'] = int('javascript' in str(soup))
        features['has_hidden_elements'] = int('display:none' in str(soup))
        features['has_iframe'] = int('<iframe' in str(soup))
        features['number_of_links'] = len(soup.find_all('a'))
        features['number_of_input_fields'] = len(soup.find_all('input'))
        features['has_login_form'] = int('login' in str(soup).lower())

    except Exception as e:
        # In case of error fetching the URL
        print(f"Error fetching {url}: {e}")

    return features

# Function to extract features for a range of rows in the dataset
def extract_features_for_range(df, start_idx, end_idx, output_file):
    # Initialize the list to store extracted features
    extracted_data = []

    # Loop through the dataset in the given range
    for idx in tqdm(range(start_idx, end_idx), desc="Extracting Features", unit="URLs"):
        # Get the URL and the label from the dataset
        url = df.iloc[idx]['URL']
        label = df.iloc[idx]['label']  # Assuming 'label' is the target column

        # Extract features from the URL
        features = extract_features_from_url(url)

        # Add the URL and label to the extracted features
        features['URL'] = url
        features['label'] = label

        # Append to the list
        extracted_data.append(features)

    # Convert the list to a DataFrame
    features_df = pd.DataFrame(extracted_data)

    # Save the result to a new CSV file
    features_df.to_csv(output_file, index=False)
    print(f"Feature extraction completed. The results are saved in {output_file}")

# Load the original dataset (adjust the file name as needed)
input_file = "combined_data_randomized.csv"  # Replace with your dataset file path
df = pd.read_csv(input_file)

# Ask the user to input the starting and ending record indices
start_idx = int(input("Enter the starting record index: "))
end_idx = int(input("Enter the ending record index: "))

# Define the output file name where the results will be saved
output_file = "features_extracted_with_labels_201_1000.csv"  # Replace with the desired output file name

# Extract features for the specified range of records
extract_features_for_range(df, start_idx, end_idx, output_file)


Enter the starting record index: 200
Enter the ending record index: 600


Extracting Features:   1%|▏         | 5/400 [00:03<05:06,  1.29URLs/s]

Error fetching http://www.au-poccay.aeeauacoaocmeoy.egakdz.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauacoaocmeoy.egakdz.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bc0e8d30>: Failed to resolve 'www.au-poccay.aeeauacoaocmeoy.egakdz.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesceeccesoas.tddonu.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesceeccesoas.tddonu.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd857910>: Failed to resolve 'www.eki-net.con-aesceeccesoas.tddonu.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsaorod.15tf68.icu/page1.php: HTTPConnectionPool(host='www.macsaorod.15tf68.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTP

Extracting Features:   3%|▎         | 11/400 [00:05<02:41,  2.40URLs/s]

Error fetching http://www.au-pcccoy.aueseacaemceoy.wfclkk.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccoy.aueseacaemceoy.wfclkk.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd855f60>: Failed to resolve 'www.au-pcccoy.aueseacaemceoy.wfclkk.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiiaaaicca.vicsvesai.s6vx9m.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.s6vx9m.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad67c2b0>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.s6vx9m.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   4%|▎         | 14/400 [00:07<03:31,  1.82URLs/s]

Error fetching http://www.macsaeord.x7qbjf.icu/page1.php: HTTPConnectionPool(host='www.macsaeord.x7qbjf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad96ef80>: Failed to resolve 'www.macsaeord.x7qbjf.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   4%|▍         | 16/400 [00:08<02:58,  2.15URLs/s]

Error fetching http://www.macesareod.2jld0p.icu/page1.php: HTTPConnectionPool(host='www.macesareod.2jld0p.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd8568f0>: Failed to resolve 'www.macesareod.2jld0p.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   5%|▍         | 19/400 [00:08<02:08,  2.97URLs/s]

Error fetching http://www.vivcsiiaaaicca.vicsvesai.bf2o5x.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.bf2o5x.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae01b2e0>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.bf2o5x.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.macsarerd.09vz0h.icu/page1.php: HTTPConnectionPool(host='www.macsarerd.09vz0h.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2addc7fd0>: Failed to resolve 'www.macsarerd.09vz0h.icu' ([Errno -2] Name or service not known)"))
Error fetching https://connecct-login.884santarita.com/: HTTPSConnectionPool(host='connecct-login.884santarita.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ca2addc68f0>: Failed to res

Extracting Features:   6%|▌         | 24/400 [00:10<01:43,  3.63URLs/s]

Error fetching http://www.au-paccoy.aeeseaccaocmeoy.nfrtli.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccoy.aeeseaccaocmeoy.nfrtli.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae0183a0>: Failed to resolve 'www.au-paccoy.aeeseaccaocmeoy.nfrtli.top' ([Errno -2] Name or service not known)"))


Extracting Features:   6%|▋         | 25/400 [00:11<03:08,  1.98URLs/s]

Error fetching http://www.macsaeerd.n9k5yv.icu/page1.php: HTTPConnectionPool(host='www.macsaeerd.n9k5yv.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae018640>: Failed to resolve 'www.macsaeerd.n9k5yv.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeauaccaocmeoy.pafzaw.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaccaocmeoy.pafzaw.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae661750>: Failed to resolve 'www.au-poccay.aeeauaccaocmeoy.pafzaw.top' ([Errno -2] Name or service not known)"))


Extracting Features:   8%|▊         | 30/400 [00:12<02:26,  2.53URLs/s]

Error fetching https://connecct-login.aidanharold.com/: HTTPSConnectionPool(host='connecct-login.aidanharold.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7ca2ae661660>: Failed to resolve 'connecct-login.aidanharold.com' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesceeosesoas.okhusg.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesceeosesoas.okhusg.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae047790>: Failed to resolve 'www.eki-net.con-aesceeosesoas.okhusg.top' ([Errno -2] Name or service not known)"))


Extracting Features:   8%|▊         | 33/400 [00:13<02:14,  2.73URLs/s]

Error fetching http://www.au-paccy.aceeseacraocmeoy.glxpxy.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.glxpxy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae047a30>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.glxpxy.top' ([Errno -2] Name or service not known)"))


Extracting Features:   9%|▉         | 35/400 [00:15<03:19,  1.83URLs/s]

Error fetching http://www.macsaeerd.v72weu.icu/page1.php: HTTPConnectionPool(host='www.macsaeerd.v72weu.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae06bee0>: Failed to resolve 'www.macsaeerd.v72weu.icu' ([Errno -2] Name or service not known)"))


Extracting Features:   9%|▉         | 37/400 [00:16<02:30,  2.41URLs/s]

Error fetching http://www.macesaorod.a7w3p7.icu/page1.php: HTTPConnectionPool(host='www.macesaorod.a7w3p7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae06beb0>: Failed to resolve 'www.macesaorod.a7w3p7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  10%|▉         | 39/400 [00:16<02:25,  2.49URLs/s]

Error fetching http://www.macessrerd.7v560g.icu/page1.php: HTTPConnectionPool(host='www.macessrerd.7v560g.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae06aa40>: Failed to resolve 'www.macessrerd.7v560g.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  11%|█         | 43/400 [00:23<06:42,  1.13s/URLs]

Error fetching http://www.vivcsiiaaaicca.vicsvesai.q4nb0o.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.q4nb0o.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd841b70>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.q4nb0o.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  11%|█▏        | 45/400 [00:23<05:03,  1.17URLs/s]

Error fetching http://www.vivcsiisaviccai.vicsvesai.4awkri.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaviccai.vicsvesai.4awkri.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd842ef0>: Failed to resolve 'www.vivcsiisaviccai.vicsvesai.4awkri.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  13%|█▎        | 52/400 [00:29<04:40,  1.24URLs/s]

Error fetching http://www.au-pcccny.aueseacaomceoy.juvnbj.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aueseacaomceoy.juvnbj.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad96dc00>: Failed to resolve 'www.au-pcccny.aueseacaomceoy.juvnbj.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-pcccny.aeeseacaaocmeoy.gjaxjw.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseacaaocmeoy.gjaxjw.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad96d330>: Failed to resolve 'www.au-pcccny.aeeseacaaocmeoy.gjaxjw.top' ([Errno -2] Name or service not known)"))


Extracting Features:  14%|█▎        | 54/400 [02:41<2:47:22, 29.02s/URLs]

Error fetching http://xtec.cat: HTTPConnectionPool(host='xtec.cat', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ca2bd5d8f40>, 'Connection to xtec.cat timed out. (connect timeout=None)'))
Error fetching http://www.macsarord.zobca6.icu/page1.php: HTTPConnectionPool(host='www.macsarord.zobca6.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae372f80>: Failed to resolve 'www.macsarord.zobca6.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  14%|█▍        | 56/400 [04:52<4:06:19, 42.96s/URLs]

Error fetching http://torob.com: HTTPConnectionPool(host='torob.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ca2ae3726b0>, 'Connection to torob.com timed out. (connect timeout=None)'))


Extracting Features:  14%|█▍        | 57/400 [04:52<3:13:18, 33.82s/URLs]

Error fetching http://www.vivcsiavai.vicsvesai.yc0xn8.icu/page1.php: HTTPConnectionPool(host='www.vivcsiavai.vicsvesai.yc0xn8.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad96f940>: Failed to resolve 'www.vivcsiavai.vicsvesai.yc0xn8.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aueseacaomceoy.nvwszn.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseacaomceoy.nvwszn.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae045a80>: Failed to resolve 'www.au-poccay.aueseacaomceoy.nvwszn.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiivcaia.vicsvesai.c2yfkr.icu/page1.php: HTTPConnectionPool(host='www.vivcsiivcaia.vicsvesai.c2yfkr.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.c

Extracting Features:  16%|█▌        | 64/400 [04:54<46:47,  8.35s/URLs]

Error fetching http://www.vivcsiisacvaicca.visvsai.of6jh4.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.of6jh4.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd7c6560>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.of6jh4.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  18%|█▊        | 71/400 [04:57<09:01,  1.65s/URLs]

Error fetching http://www.vivcsviiaaieca.vicsvesai.cx03ta.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiaaieca.vicsvesai.cx03ta.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2b4019d50>: Failed to resolve 'www.vivcsviiaaieca.vicsvesai.cx03ta.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  18%|█▊        | 72/400 [04:58<07:29,  1.37s/URLs]

Error fetching http://www.vivcsiisacvaicca.visvsai.0rv6h8.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.0rv6h8.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd69c250>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.0rv6h8.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  18%|█▊        | 74/400 [04:58<04:57,  1.10URLs/s]

Error fetching http://www.au-paccey.aeeauasoaocmeoy.bmzcgy.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccey.aeeauasoaocmeoy.bmzcgy.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad44d690>: Failed to resolve 'www.au-paccey.aeeauasoaocmeoy.bmzcgy.top' ([Errno -2] Name or service not known)"))


Extracting Features:  19%|█▉        | 77/400 [05:00<03:54,  1.38URLs/s]

Error fetching http://www.au-poccoy.aueseacaemceoy.yunpgg.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccoy.aueseacaemceoy.yunpgg.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae6603d0>: Failed to resolve 'www.au-poccoy.aueseacaemceoy.yunpgg.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviiacvaieca.visvsai.jqsqm7.icu/page1.php: HTTPConnectionPool(host='www.vivcsviiacvaieca.visvsai.jqsqm7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad8ebca0>: Failed to resolve 'www.vivcsviiacvaieca.visvsai.jqsqm7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  20%|██        | 80/400 [05:00<02:36,  2.04URLs/s]

Error fetching http://www.au-poccy.aeueseaeaomceoy.islxvm.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccy.aeueseaeaomceoy.islxvm.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad78c0a0>: Failed to resolve 'www.au-poccy.aeueseaeaomceoy.islxvm.top' ([Errno -2] Name or service not known)"))


Extracting Features:  20%|██        | 82/400 [05:02<02:39,  2.00URLs/s]

Error fetching http://www.au-pacccy.aueseacaomceoy.jrvsgv.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aueseacaomceoy.jrvsgv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad78faf0>: Failed to resolve 'www.au-pacccy.aueseacaomceoy.jrvsgv.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeseaceaocmeoy.vhypzf.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeseaceaocmeoy.vhypzf.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad78ce80>: Failed to resolve 'www.au-poccay.aeeseaceaocmeoy.vhypzf.top' ([Errno -2] Name or service not known)"))


Extracting Features:  21%|██        | 84/400 [05:02<02:06,  2.51URLs/s]

Error fetching http://www.vivcsiiaacvaicca.visvsai.no9sh7.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaacvaicca.visvsai.no9sh7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad78e230>: Failed to resolve 'www.vivcsiiaacvaicca.visvsai.no9sh7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  22%|██▏       | 86/400 [05:03<02:07,  2.47URLs/s]

Error fetching http://www.macesaoeod.usytdje.icu/page1.php: HTTPConnectionPool(host='www.macesaoeod.usytdje.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae06bfd0>: Failed to resolve 'www.macesaoeod.usytdje.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aueseasaomceoy.tzqbcg.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseasaomceoy.tzqbcg.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad36a8c0>: Failed to resolve 'www.au-poccay.aueseasaomceoy.tzqbcg.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.eki-net.con-aesccosesaas.okhusg.top/jp.php: HTTPConnectionPool(host='www.eki-net.con-aesccosesaas.okhusg.top', port=80): Max retries exceeded with url: /jp.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection obje

Extracting Features:  24%|██▎       | 94/400 [05:05<01:51,  2.73URLs/s]

Error fetching http://www.au-poccay.aueseasaomceoy.axnnut.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseasaomceoy.axnnut.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2b5a9cbe0>: Failed to resolve 'www.au-poccay.aueseasaomceoy.axnnut.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsiisacvaicca.visvsai.yvey51.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisacvaicca.visvsai.yvey51.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2b5a9f400>: Failed to resolve 'www.vivcsiisacvaicca.visvsai.yvey51.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  24%|██▍       | 97/400 [05:06<01:29,  3.39URLs/s]

Error fetching http://www.au-pcccny.aeeauasaaocmeoy.nwbezf.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeauasaaocmeoy.nwbezf.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2b5a9f010>: Failed to resolve 'www.au-pcccny.aeeauasaaocmeoy.nwbezf.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.macessrord.q4nb0o.icu/page1.php: HTTPConnectionPool(host='www.macessrord.q4nb0o.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd6db310>: Failed to resolve 'www.macessrord.q4nb0o.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.au-poccay.aeeauaccaocmeoy.wsejfh.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauaccaocmeoy.wsejfh.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPC

Extracting Features:  26%|██▌       | 102/400 [05:06<00:54,  5.48URLs/s]

Error fetching http://gaoqingw.com: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


Extracting Features:  26%|██▌       | 103/400 [05:07<01:34,  3.13URLs/s]

Error fetching http://www.macsaoeod.45ml1t.icu/page1.php: HTTPConnectionPool(host='www.macsaoeod.45ml1t.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2b5a9fc10>: Failed to resolve 'www.macsaoeod.45ml1t.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  26%|██▋       | 106/400 [05:09<02:05,  2.35URLs/s]

Error fetching http://www.au-paccy.aceeseacraocmeoy.nwgjza.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.nwgjza.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd9e2c80>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.nwgjza.top' ([Errno -2] Name or service not known)"))


Extracting Features:  27%|██▋       | 108/400 [05:10<02:06,  2.30URLs/s]

Error fetching http://www.au-pcccoy.aueseacaemceoy.shfxjv.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccoy.aueseacaemceoy.shfxjv.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae046200>: Failed to resolve 'www.au-pcccoy.aueseacaemceoy.shfxjv.top' ([Errno -2] Name or service not known)"))


Extracting Features:  28%|██▊       | 112/400 [05:11<01:44,  2.76URLs/s]

Error fetching http://www.macesaeord.yvey51.icu/page1.php: HTTPConnectionPool(host='www.macesaeord.yvey51.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad47e2c0>: Failed to resolve 'www.macesaeord.yvey51.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  29%|██▉       | 115/400 [05:12<01:30,  3.16URLs/s]

Error fetching http://www.au-poccay.aeeauacoaocmeoy.auomwo.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aeeauacoaocmeoy.auomwo.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad427460>: Failed to resolve 'www.au-poccay.aeeauacoaocmeoy.auomwo.top' ([Errno -2] Name or service not known)"))


Extracting Features:  29%|██▉       | 116/400 [05:13<01:32,  3.08URLs/s]

Error fetching http://www.vivcsiisaviccai.vicsvesai.e5susf.icu/page1.php: HTTPConnectionPool(host='www.vivcsiisaviccai.vicsvesai.e5susf.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad4275e0>: Failed to resolve 'www.vivcsiisaviccai.vicsvesai.e5susf.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  30%|██▉       | 119/400 [05:14<01:37,  2.88URLs/s]

Error fetching http://www.au-pcccny.aeeseacaaocmeoy.wfclkk.top/AU/page1.php: HTTPConnectionPool(host='www.au-pcccny.aeeseacaaocmeoy.wfclkk.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad424eb0>: Failed to resolve 'www.au-pcccny.aeeseacaaocmeoy.wfclkk.top' ([Errno -2] Name or service not known)"))


Extracting Features:  30%|███       | 122/400 [05:15<01:45,  2.65URLs/s]

Error fetching http://www.au-paccy.aceeseacraocmeoy.ohdlut.top/AU/page1.php: HTTPConnectionPool(host='www.au-paccy.aceeseacraocmeoy.ohdlut.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2bd6c5390>: Failed to resolve 'www.au-paccy.aceeseacraocmeoy.ohdlut.top' ([Errno -2] Name or service not known)"))


Extracting Features:  31%|███▏      | 125/400 [05:16<01:54,  2.40URLs/s]

Error fetching http://www.macsaoeod.tepyl0.icu/page1.php: HTTPConnectionPool(host='www.macsaoeod.tepyl0.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae1af580>: Failed to resolve 'www.macsaoeod.tepyl0.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  32%|███▏      | 128/400 [05:19<02:46,  1.63URLs/s]

Error fetching http://www.vivcsiiaaviccai.vicsvesai.lzvsi5.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaviccai.vicsvesai.lzvsi5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2b5b9c640>: Failed to resolve 'www.vivcsiiaaviccai.vicsvesai.lzvsi5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  33%|███▎      | 131/400 [05:19<01:58,  2.27URLs/s]

Error fetching http://www.au-poccay.aueseacaomceoy.jfwjev.top/AU/page1.php: HTTPConnectionPool(host='www.au-poccay.aueseacaomceoy.jfwjev.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2addc6470>: Failed to resolve 'www.au-poccay.aueseacaomceoy.jfwjev.top' ([Errno -2] Name or service not known)"))


Extracting Features:  34%|███▍      | 136/400 [05:22<01:51,  2.36URLs/s]

Error fetching http://www.au-pacccy.aueseacaomceoy.qvgxtq.top/AU/page1.php: HTTPConnectionPool(host='www.au-pacccy.aueseacaomceoy.qvgxtq.top', port=80): Max retries exceeded with url: /AU/page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2adfc3100>: Failed to resolve 'www.au-pacccy.aueseacaomceoy.qvgxtq.top' ([Errno -2] Name or service not known)"))
Error fetching http://www.maceseoeod.4161t5.icu/page1.php: HTTPConnectionPool(host='www.maceseoeod.4161t5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae0477c0>: Failed to resolve 'www.maceseoeod.4161t5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  35%|███▍      | 139/400 [05:23<01:28,  2.95URLs/s]

Error fetching http://www.vivcsiiscaias.vicsvesai.1iui33.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiscaias.vicsvesai.1iui33.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae045ea0>: Failed to resolve 'www.vivcsiiscaias.vicsvesai.1iui33.icu' ([Errno -2] Name or service not known)"))
Error fetching http://www.vivcsviveai.vicsvesai.4161t5.icu/page1.php: HTTPConnectionPool(host='www.vivcsviveai.vicsvesai.4161t5.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae047dc0>: Failed to resolve 'www.vivcsviveai.vicsvesai.4161t5.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  36%|███▌      | 142/400 [05:23<01:20,  3.22URLs/s]

Error fetching http://www.vivcsicisvcai.visvsai.ncbhdjei.icu/page1.php: HTTPConnectionPool(host='www.vivcsicisvcai.visvsai.ncbhdjei.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ae046fb0>: Failed to resolve 'www.vivcsicisvcai.visvsai.ncbhdjei.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  36%|███▋      | 145/400 [05:26<02:54,  1.46URLs/s]

Error fetching http://www.vivcsiiaaaicca.vicsvesai.45ml1t.icu/page1.php: HTTPConnectionPool(host='www.vivcsiiaaaicca.vicsvesai.45ml1t.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2adfc3880>: Failed to resolve 'www.vivcsiiaaaicca.vicsvesai.45ml1t.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  37%|███▋      | 148/400 [05:29<03:05,  1.36URLs/s]

Error fetching http://www.vivcsiavcai.vicsvesai.j328u2.icu/page1.php: HTTPConnectionPool(host='www.vivcsiavcai.vicsvesai.j328u2.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2adfc15d0>: Failed to resolve 'www.vivcsiavcai.vicsvesai.j328u2.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  38%|███▊      | 152/400 [07:40<1:35:42, 23.16s/URLs]

Error fetching http://karnataka.gov.in: HTTPConnectionPool(host='karnataka.gov.in', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7ca2b5aba6e0>, 'Connection to karnataka.gov.in timed out. (connect timeout=None)'))


Extracting Features:  38%|███▊      | 153/400 [07:41<1:17:27, 18.82s/URLs]

Error fetching http://www.macsaorod.vhvjv7.icu/page1.php: HTTPConnectionPool(host='www.macsaorod.vhvjv7.icu', port=80): Max retries exceeded with url: /page1.php (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7ca2ad4552a0>: Failed to resolve 'www.macsaorod.vhvjv7.icu' ([Errno -2] Name or service not known)"))


Extracting Features:  38%|███▊      | 154/400 [1:05:30<1:44:38, 25.52s/URLs]


KeyboardInterrupt: 

# **New Dataset, New Approach**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Step 1: Load the datasets
# Phishing dataset
phishing_data_path = 'PhiUSIIL_Phishing_URL_Dataset.csv'
phishing_data = pd.read_csv(phishing_data_path)

# Whitelist dataset
whitelist_data_path = 'top-1m.csv'
whitelist_data = pd.read_csv(whitelist_data_path, header=None, names=['URL'])

# Display dataset information
phishing_data.info(), whitelist_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39984 entries, 0 to 39983
Data columns (total 56 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILENAME                    39984 non-null  object 
 1   URL                         39984 non-null  object 
 2   URLLength                   39984 non-null  int64  
 3   Domain                      39984 non-null  object 
 4   DomainLength                39984 non-null  int64  
 5   IsDomainIP                  39983 non-null  float64
 6   TLD                         39983 non-null  object 
 7   URLSimilarityIndex          39983 non-null  float64
 8   CharContinuationRate        39983 non-null  float64
 9   TLDLegitimateProb           39983 non-null  float64
 10  URLCharProb                 39983 non-null  float64
 11  TLDLength                   39983 non-null  float64
 12  NoOfSubDomain               39983 non-null  float64
 13  HasObfuscation              399

(None, None)

In [None]:
# Step 2: Preprocessing
# Whitelist dataset - convert to lowercase and remove duplicates
whitelist_data['URL'] = whitelist_data['URL'].str.lower().str.strip()
whitelist_data = whitelist_data.drop_duplicates()

# Phishing dataset - ensure all URLs are in lowercase and clean any invalid data
phishing_data['URL'] = phishing_data['URL'].str.lower().str.strip()
phishing_data = phishing_data.dropna()

# Separate features and target labels for phishing dataset
features = phishing_data.drop(['label', 'FILENAME', 'URL', 'Domain', 'TLD', 'Title'], axis=1)
labels = phishing_data['label']


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 3: Train the model
# Using a Random Forest Classifier for simplicity
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Save the trained model for future use
model_path = 'phishing_detection_model.pkl'
joblib.dump(model, model_path)

# Save the whitelist for lookup
whitelist_path = 'whitelist_urls.csv'
whitelist_data.to_csv(whitelist_path, index=False)

# Output model evaluation results
#accuracy, classification_rep, model_path, whitelist_path
print(f"Model accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print(f"Model saved to: {model_path}")
print(f"Whitelist saved to: {whitelist_path}")


Model accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3281
         1.0       1.00      1.00      1.00      4716

    accuracy                           1.00      7997
   macro avg       1.00      1.00      1.00      7997
weighted avg       1.00      1.00      1.00      7997

Model saved to: phishing_detection_model.pkl
Whitelist saved to: whitelist_urls.csv


In [None]:
# Step 2: Optimize preprocessing by reducing features to most relevant ones
relevant_features = [
    'URLLength', 'DomainLength', 'IsDomainIP', 'TLDLength', 'NoOfSubDomain',
    'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL',
    'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'SpacialCharRatioInURL',
    'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'DomainTitleMatchScore', 'URLTitleMatchScore',
    'NoOfURLRedirect', 'HasExternalFormSubmit', 'HasPasswordField', 'NoOfJS', 'NoOfImage'
]

# Retain only the relevant features for training
features = phishing_data[relevant_features]

# Retry splitting the data
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train the model again with optimized features
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Save the trained model
model_path = 'phishing_detection_model_optimized.pkl'
joblib.dump(model, model_path)

# Save the whitelist for lookup
whitelist_path = 'whitelist_urls_optimized.csv'
whitelist_data.to_csv(whitelist_path, index=False)

# Output model evaluation results
accuracy, classification_rep, model_path, whitelist_path
print(f"Model accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print(f"Model saved to: {model_path}")
print(f"Whitelist saved to: {whitelist_path}")


Model accuracy: 0.999124671751907
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3281
         1.0       1.00      1.00      1.00      4716

    accuracy                           1.00      7997
   macro avg       1.00      1.00      1.00      7997
weighted avg       1.00      1.00      1.00      7997

Model saved to: phishing_detection_model_optimized.pkl
Whitelist saved to: whitelist_urls_optimized.csv


In [None]:
pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [None]:
import joblib
import pandas as pd
from urllib.parse import urlparse
import tldextract

# Load trained model and whitelist
model_path = 'phishing_detection_model_optimized.pkl'  # Update with your model path
whitelist_path = 'whitelist_urls_optimized.csv'  # Update with your whitelist path

# Load the trained model and whitelist dataset
model = joblib.load(model_path)
whitelist_data = pd.read_csv(whitelist_path)
whitelist = whitelist_data['URL'].str.lower().str.strip().tolist()

# Function to extract the base domain from a URL
def extract_base_domain(url):
    parsed_url = urlparse(url)
    base_domain = parsed_url.netloc  # Extract the domain with subdomain
    return base_domain.lower().strip()

# Preprocessing function to extract features from the URL
def extract_features(url):
    url = url.lower().strip()

    # Extract basic features
    url_length = len(url)
    domain_length = len(url.split('/')[0])
    is_domain_ip = int(any(char.isdigit() for char in url.split('/')[0]))
    tld_length = len(url.split('.')[-1]) if '.' in url else 0
    no_of_subdomains = url.count('.') - 1
    has_obfuscation = int('%' in url or '@' in url)
    no_of_js = 0  # Mock value, can be enhanced for JS checks

    # Create a feature vector based on the selected features
    feature_vector = [
        url_length, domain_length, is_domain_ip, tld_length, no_of_subdomains,
        has_obfuscation, 0, 0.0, 0, 0.0, 0, 0.0, 0.0, 1, 0, 0, 0.0, 0.0, 0, 0, 0, no_of_js, 0
    ]
    return feature_vector

# Function to dynamically predict a URL
def predict_url(url):
    # Extract the base domain from the input URL
    input_base_domain = extract_base_domain(url)

    # Check against the whitelist for base domain match
    matching_record = [record for record in whitelist if input_base_domain.startswith(record)]
    if matching_record:
        return {
            "classification": "Legitimate (Found in Whitelist)",
            "reason": f"URL base domain '{input_base_domain}' matches the whitelist record '{matching_record[0]}'.",
            "phishy_score": 0,
            "legitimate_score": 1,
            "whitelist_match": True,
            "whitelist_record": matching_record[0]
        }

    # Step 2: Extract features for prediction
    feature_vector = extract_features(url)

    # Step 3: Predict using the trained model
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    # Get probability scores for both classes
    phishy_score = prediction_proba[0][1]  # Probability for class 1 (phishing)
    legitimate_score = prediction_proba[0][0]  # Probability for class 0 (legitimate)

    # Step 4: Construct detailed reasoning
    reasons = []
    if feature_vector[0] > 100:
        reasons.append("Long URL length, commonly used in phishing.")
    else:
        reasons.append("Short URL length suggests legitimacy.")

    if feature_vector[1] > 10:
        reasons.append("Long domain length, often used in phishing.")
    else:
        reasons.append("Normal domain length suggests legitimacy.")

    if feature_vector[4] > 1:
        reasons.append("Multiple subdomains found, common in phishing URLs.")
    else:
        reasons.append("Normal subdomain count suggests legitimacy.")

    if feature_vector[5] == 1:
        reasons.append("Obfuscation detected (e.g., % or @), common in phishing attempts.")
    else:
        reasons.append("No obfuscation detected, indicating legitimacy.")

    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    return {
        "classification": classification,
        "reason": " ".join(reasons),
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "whitelist_match": False,
        "whitelist_record": None
    }

    # Step 2: Extract features for prediction
    feature_vector = extract_features(url)

    # Step 3: Predict using the trained model
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    # Get probability scores for both classes
    phishy_score = prediction_proba[0][1]  # Probability for class 1 (phishing)
    legitimate_score = prediction_proba[0][0]  # Probability for class 0 (legitimate)

    # Step 4: Construct output
    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    reason = f"Model prediction: {classification}. Phishy score: {phishy_score:.4f}, Legitimate score: {legitimate_score:.4f}."

    return {
        "classification": classification,
        "reason": reason,
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "whitelist_match": False,
        "whitelist_record": None
    }

# Example usage (input URL dynamically)
input_url = input("Enter the URL to check: ")
result = predict_url(input_url)

# Display the result
print("\nResult:")
print(f"Classification: {result['classification']}")
print(f"Reason: {result['reason']}")
print(f"Phishy Score: {result['phishy_score']:.4f}")
print(f"Legitimate Score: {result['legitimate_score']:.4f}")
print(f"Whitelisted: {'Yes' if result['whitelist_match'] else 'No'}")
if result['whitelist_match']:
    print(f"Whitelist Record: {result['whitelist_record']}")


Enter the URL to check: https://dossier-de-contravention.info/

Result:
Classification: Legitimate
Reason: Short URL length suggests legitimacy. Normal domain length suggests legitimacy. Normal subdomain count suggests legitimacy. No obfuscation detected, indicating legitimacy.
Phishy Score: 0.0100
Legitimate Score: 0.9900
Whitelisted: No




In [None]:
# Function to dynamically predict a URL
def predict_url(url):
    # Extract the domain from the input URL
    input_domain = extract_domain(url)

    # Check against the whitelist
    if any(domain in input_domain for domain in whitelist):
        return {
            "classification": "Legitimate (Found in Whitelist)",
            "reason": f"URL domain '{input_domain}' matches a whitelisted domain.",
            "phishy_score": 0,
            "legitimate_score": 1,
            "whitelist_match": True
        }

    # Step 2: Extract features for prediction
    feature_vector = extract_features(url)

    # Step 3: Predict using the trained model
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    # Get probability scores for both classes
    phishy_score = prediction_proba[0][1]  # Probability for class 1 (phishing)
    legitimate_score = prediction_proba[0][0]  # Probability for class 0 (legitimate)

    # Construct detailed reasoning
    reasons = []
    if feature_vector[0] > 100:
        reasons.append("Long URL length, commonly used in phishing.")
    else:
        reasons.append("Short URL length suggests legitimacy.")

    if feature_vector[1] > 10:
        reasons.append("Long domain length, often used in phishing.")
    else:
        reasons.append("Normal domain length suggests legitimacy.")

    if feature_vector[4] > 1:
        reasons.append("Multiple subdomains found, common in phishing URLs.")
    else:
        reasons.append("Normal subdomain count suggests legitimacy.")

    if feature_vector[5] == 1:
        reasons.append("Obfuscation detected (e.g., % or @), common in phishing attempts.")
    else:
        reasons.append("No obfuscation detected, indicating legitimacy.")

    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    return {
        "classification": classification,
        "reason": " ".join(reasons),
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "whitelist_match": False
    }


In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load trained model and whitelist
model_path = 'phishing_detection_model_optimized.pkl'  # Update with your model path
whitelist_path = 'whitelist_urls_optimized.csv'  # Update with your whitelist path

# Load the trained model and whitelist dataset
model = joblib.load(model_path)
whitelist = pd.read_csv(whitelist_path)['URL'].tolist()

# Preprocessing function to extract features from the URL (simplified)
def extract_features(url):
    url = url.lower().strip()

    # Extract basic features (you can expand this as needed)
    url_length = len(url)
    domain_length = len(url.split('/')[0])
    is_domain_ip = int(any(char.isdigit() for char in url.split('/')[0]))
    tld_length = len(url.split('.')[-1]) if '.' in url else 0
    no_of_subdomains = url.count('.') - 1
    has_obfuscation = int('%' in url or '@' in url)
    no_of_js = 0  # Mock value, you can add actual JS script checks

    # Create a feature vector based on the selected features
    feature_vector = [
        url_length, domain_length, is_domain_ip, tld_length, no_of_subdomains,
        has_obfuscation, 0, 0.0, 0, 0.0, 0, 0.0, 0.0, 1, 0, 0, 0.0, 0.0, 0, 0, 0, no_of_js, 0
    ]

    return feature_vector

# Function to dynamically predict a URL
def predict_url(url):
    # Step 1: Check against the whitelist
    if url in whitelist:
        return {
            "classification": "Legitimate (Found in Whitelist)",
            "reason": "URL found in whitelist.",
            "phishy_score": 0,
            "legitimate_score": 1,
            "whitelist_match": True
        }

    # Step 2: Extract features for prediction
    feature_vector = extract_features(url)

    # Step 3: Predict using the trained model
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    # Get probability scores for both classes
    phishy_score = prediction_proba[0][1]  # Probability for class 1 (phishing)
    legitimate_score = prediction_proba[0][0]  # Probability for class 0 (legitimate)

    # Step 4: Construct output
    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    reason = f"Model prediction: {classification}. Phishy score: {phishy_score:.4f}, Legitimate score: {legitimate_score:.4f}."

    return {
        "classification": classification,
        "reason": reason,
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "whitelist_match": False
    }

# Example usage (input URL dynamically)
input_url = input("Enter the URL to check: ")
result = predict_url(input_url)

# Display the result
print("\nResult:")
print(f"Classification: {result['classification']}")
print(f"Reason: {result['reason']}")
print(f"Phishy Score: {result['phishy_score']:.4f}")
print(f"Legitimate Score: {result['legitimate_score']:.4f}")
print(f"Whitelisted: {'Yes' if result['whitelist_match'] else 'No'}")


Enter the URL to check: https://wetransfer-file634.serwer-p8317.online

Result:
Classification: Legitimate
Reason: Model prediction: Legitimate. Phishy score: 0.0000, Legitimate score: 1.0000.
Phishy Score: 0.0000
Legitimate Score: 1.0000
Whitelisted: No




In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load trained model and whitelist
model_path = 'phishing_detection_model_optimized.pkl'  # Update with your model path
whitelist_path = 'whitelist_urls_optimized.csv'  # Update with your whitelist path

# Load the trained model and whitelist dataset
model = joblib.load(model_path)
whitelist = pd.read_csv(whitelist_path)['URL'].tolist()

# Preprocessing function to extract features from the URL (simplified)
def extract_features(url):
    url = url.lower().strip()

    # Extract basic features (you can expand this as needed)
    url_length = len(url)
    domain_length = len(url.split('/')[0])
    is_domain_ip = int(any(char.isdigit() for char in url.split('/')[0]))
    tld_length = len(url.split('.')[-1]) if '.' in url else 0
    no_of_subdomains = url.count('.') - 1
    has_obfuscation = int('%' in url or '@' in url)
    no_of_js = 0  # Mock value, you can add actual JS script checks

    # Create a feature vector based on the selected features
    feature_vector = [
        url_length, domain_length, is_domain_ip, tld_length, no_of_subdomains,
        has_obfuscation, 0, 0.0, 0, 0.0, 0, 0.0, 0.0, 1, 0, 0, 0.0, 0.0, 0, 0, 0, no_of_js, 0
    ]

    return feature_vector

# Function to dynamically predict a URL
def predict_url(url):
    # Step 1: Check against the whitelist
    if url in whitelist:
        return {
            "classification": "Legitimate (Found in Whitelist)",
            "reason": "URL found in whitelist, so classified as legitimate.",
            "phishy_score": 0,
            "legitimate_score": 1,
            "phishy_features_count": 0,
            "legitimate_features_count": len(features),  # Total features are considered legitimate
            "whitelist_match": True
        }

    # Step 2: Extract features for prediction
    feature_vector = extract_features(url)

    # Step 3: Predict using the trained model
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    # Get probability scores for both classes
    phishy_score = prediction_proba[0][1]  # Probability for class 1 (phishing)
    legitimate_score = prediction_proba[0][0]  # Probability for class 0 (legitimate)

    # Step 4: Feature matching analysis
    phishy_features_count = 0
    legitimate_features_count = 0
    reason = []

    # Define conditions for phishing and legitimate features
    if feature_vector[0] > 100:  # URL length long for phishing
        phishy_features_count += 1
        reason.append("Long URL length detected, often a phishing characteristic.")
    else:
        legitimate_features_count += 1

    if feature_vector[1] > 10:  # Domain length long for phishing
        phishy_features_count += 1
        reason.append("Long domain length detected, often a phishing characteristic.")
    else:
        legitimate_features_count += 1

    if feature_vector[2] == 1:  # If it's an IP address for the domain
        phishy_features_count += 1
        reason.append("IP address used in the domain, which is common in phishing URLs.")
    else:
        legitimate_features_count += 1

    if feature_vector[3] > 3:  # TLD length (long TLD might indicate phishing)
        phishy_features_count += 1
        reason.append("Unusual TLD length detected, a potential phishing signal.")
    else:
        legitimate_features_count += 1

    if feature_vector[4] > 1:  # Multiple subdomains (often phishing)
        phishy_features_count += 1
        reason.append("Multiple subdomains found, which is a common phishing characteristic.")
    else:
        legitimate_features_count += 1

    if feature_vector[5] == 1:  # Obfuscation (like % or @)
        phishy_features_count += 1
        reason.append("URL contains obfuscation (e.g., % or @), typical of phishing attempts.")
    else:
        legitimate_features_count += 1

    # Step 5: Construct output
    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    reason = " ".join(reason)

    return {
        "classification": classification,
        "reason": reason,
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "phishy_features_count": phishy_features_count,
        "legitimate_features_count": legitimate_features_count,
        "whitelist_match": False
    }

# Example usage (input URL dynamically)
input_url = input("Enter the URL to check: ")
result = predict_url(input_url)

# Display the result
print("\nResult:")
print(f"Classification: {result['classification']}")
print(f"Reason: {result['reason']}")
print(f"Phishy Score: {result['phishy_score']:.4f}")
print(f"Legitimate Score: {result['legitimate_score']:.4f}")
print(f"Phishy Features Count: {result['phishy_features_count']}")
print(f"Legitimate Features Count: {result['legitimate_features_count']}")
print(f"Whitelisted: {'Yes' if result['whitelist_match'] else 'No'}")


FileNotFoundError: [Errno 2] No such file or directory: 'phishing_detection_model_optimized.pkl'

In [None]:
pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load trained model and whitelist
model_path = 'phishing_detection_model_optimized.pkl'  # Update with your model path
whitelist_path = 'whitelist_urls_optimized.csv'  # Update with your whitelist path

# Load the trained model and whitelist dataset
model = joblib.load(model_path)
whitelist = pd.read_csv(whitelist_path)['URL'].tolist()

# Preprocessing function to extract features from the URL (simplified)
def extract_features(url):
    url = url.lower().strip()

    # Extract basic features (you can expand this as needed)
    url_length = len(url)
    domain_length = len(url.split('/')[0])
    is_domain_ip = int(any(char.isdigit() for char in url.split('/')[0]))
    tld_length = len(url.split('.')[-1]) if '.' in url else 0
    no_of_subdomains = url.count('.') - 1
    has_obfuscation = int('%' in url or '@' in url)
    no_of_js = 0  # Mock value, you can add actual JS script checks

    # Create a feature vector based on the selected features
    feature_vector = [
        url_length, domain_length, is_domain_ip, tld_length, no_of_subdomains,
        has_obfuscation, 0, 0.0, 0, 0.0, 0, 0.0, 0.0, 1, 0, 0, 0.0, 0.0, 0, 0, 0, no_of_js, 0
    ]

    return feature_vector

# Function to dynamically predict a URL
def predict_url(url):
    # Step 1: Check against the whitelist
    if url in whitelist:
        return {
            "classification": "Legitimate (Found in Whitelist)",
            "reason": "URL found in whitelist, so classified as legitimate.",
            "phishy_score": 0,
            "legitimate_score": 1,
            "phishy_features_count": 0,
            "legitimate_features_count": 6,  # Total features considered legitimate
            "whitelist_match": True
        }

    # Step 2: Extract features for prediction
    feature_vector = extract_features(url)

    # Step 3: Predict using the trained model
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    # Get probability scores for both classes
    phishy_score = prediction_proba[0][1]  # Probability for class 1 (phishing)
    legitimate_score = prediction_proba[0][0]  # Probability for class 0 (legitimate)

    # Step 4: Feature matching analysis
    phishy_features_count = 0
    legitimate_features_count = 0
    reason = []

    # Define conditions for phishing and legitimate features
    if feature_vector[0] > 100:  # URL length long for phishing
        phishy_features_count += 1
        reason.append("Long URL length, commonly used in phishing.")
    else:
        legitimate_features_count += 1

    if feature_vector[1] > 10:  # Domain length long for phishing
        phishy_features_count += 1
        reason.append("Long domain length, often used for phishing.")
    else:
        legitimate_features_count += 1

    if feature_vector[2] == 1:  # If it's an IP address for the domain
        phishy_features_count += 1
        reason.append("IP address used in the domain, which is common in phishing URLs.")
    else:
        legitimate_features_count += 1

    if feature_vector[3] > 3:  # TLD length (long TLD might indicate phishing)
        phishy_features_count += 1
        reason.append("Short TLD detected, often used for phishing.")
    else:
        legitimate_features_count += 1

    if feature_vector[4] > 1:  # Multiple subdomains (often phishing)
        phishy_features_count += 1
        reason.append("Multiple subdomains found, common in phishing URLs.")
    else:
        legitimate_features_count += 1

    if feature_vector[5] == 1:  # Obfuscation (like % or @)
        phishy_features_count += 1
        reason.append("Obfuscation detected (e.g., % or @), common in phishing attempts.")
    else:
        legitimate_features_count += 1

    # Step 5: Construct output
    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    reason = " ".join(reason)

    return {
        "classification": classification,
        "reason": reason,
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "phishy_features_count": phishy_features_count,
        "legitimate_features_count": legitimate_features_count,
        "whitelist_match": False
    }

# Example usage (input URL dynamically)
input_url = input("Enter the URL to check: ")
result = predict_url(input_url)

# Display the result
print("\nResult:")
print(f"Classification: {result['classification']}")
print(f"Reason: {result['reason']}")
print(f"Phishy Score: {result['phishy_score']:.4f}")
print(f"Legitimate Score: {result['legitimate_score']:.4f}")
print(f"Phishy Features Count: {result['phishy_features_count']}")
print(f"Legitimate Features Count: {result['legitimate_features_count']}")
print(f"Whitelisted: {'Yes' if result['whitelist_match'] else 'No'}")


Enter the URL to check: 	https://wetransfer-file634.serwer-p8317.online

Result:
Classification: Legitimate
Reason: Short TLD detected, often used for phishing.
Phishy Score: 0.0000
Legitimate Score: 1.0000
Phishy Features Count: 1
Legitimate Features Count: 5
Whitelisted: No




# Complete different approach of the very first **dataset**

In [None]:
import pandas as pd

# Load the dataset
dataset = pd.read_csv('final_cleaned_dataset.csv')

# Define the threshold values for labeling
thresholds = {
    "url_length": 75,
    "url_entropy": 4.5,
    "contains_ip_address": 1,
    "count_dots": 4,
    "contains_hyphens": 1,
    "count_special_chars": 3,
    "suspicious_tld": 1,
    "url_redirection": 1,
    "contains_https_token": 0,
    "domain_age": 1,  # in years
    "domain_expiration": 1,  # in years
    "privacy_protection": 1,
    "ssl_validity": 0,
    "ssl_issuer": "Unknown issuer",
    "external_links": 50,
    "internal_links": 50,
    "login_forms": 1,
    "input_fields": 5,
    "hidden_elements": 5,
    "javascript_length": 1000,
    "inline_script_length": 500,
    "iframes": 1
}

# Function to label URLs based on thresholds
def label_url(row):
    if (row['url_length'] > thresholds['url_length'] or
        row['url_entropy'] > thresholds['url_entropy'] or
        row['contains_ip_address'] == thresholds['contains_ip_address'] or
        row['count_dots'] > thresholds['count_dots'] or
        row['contains_hyphens'] == thresholds['contains_hyphens'] or
        row['count_special_chars'] > thresholds['count_special_chars'] or
        row['suspicious_tld'] == thresholds['suspicious_tld'] or
        row['url_redirection'] == thresholds['url_redirection'] or
        row['contains_https_token'] == thresholds['contains_https_token'] or
        row['domain_age'] < thresholds['domain_age'] or
        row['domain_expiration'] < thresholds['domain_expiration'] or
        row['privacy_protection'] == thresholds['privacy_protection'] or
        row['ssl_validity'] == thresholds['ssl_validity'] or
        row['ssl_issuer'] == thresholds['ssl_issuer'] or
        row['external_links'] > thresholds['external_links'] or
        row['internal_links'] > thresholds['internal_links'] or
        row['login_forms'] > thresholds['login_forms'] or
        row['input_fields'] > thresholds['input_fields'] or
        row['hidden_elements'] > thresholds['hidden_elements'] or
        row['javascript_length'] > thresholds['javascript_length'] or
        row['inline_script_length'] > thresholds['inline_script_length'] or
        row['iframes'] == thresholds['iframes']):
        return 1  # Phishy
    else:
        return 0  # Legitimate

# Apply the labeling function to each row
dataset['label'] = dataset.apply(label_url, axis=1)

# Save the labeled dataset to a new file
dataset.to_csv('labeled_dataset.csv', index=False)

# Display the updated dataset
print(dataset[['url', 'label']].head())


                                                 url  label
0              http://www.crestonwood.com/router.php      1
1                                 http://rgipt.ac.in      1
2                                http://www.mutuo.it      1
3         http://vamoaestudiarmedicina.blogspot.com/      1
4  https://www.astrologyonline.eu/Astro_MemoNew/P...      1


------------------------------------------------------------------------------------------------------

In [None]:
pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import tldextract
from urllib.parse import urlparse

# Step 1: Load the datasets
phishing_data_path = 'phishing_site_urls.csv'  # Update the path
whitelist_data_path = 'top-1m.csv'  # Update the path

# Load the phishing dataset
phishing_data = pd.read_csv(phishing_data_path)

# Load the whitelist dataset
whitelist_data = pd.read_csv(whitelist_data_path, header=None, names=['URL'])


# Display dataset information
print(phishing_data.info())
print(whitelist_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 1 to 1000000
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   URL     1000000 non-null  object
dtypes: object(1)
memory usage: 15.3+ MB
None


In [None]:
import pandas as pd
# Step 1: Load the datasets
phishing = '/content/PhiUSIIL_Phishing_URL_Dataset.csv'  # Update the path
phishing_a = pd.read_csv(phishing)

print(phishing_a.columns)

Index(['FILENAME', 'URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
       'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
       'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain',
       'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
       'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL',
       'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
       'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore',
       'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
       'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame',
       'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton',
       'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto',
       'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
       'NoOfEmptyRef', 'NoOf

In [None]:
print(phishing_data.columns)


Index(['URL', 'Label'], dtype='object')


In [None]:
# Step 2: Preprocessing
# Preprocess the whitelist dataset - convert to lowercase and remove duplicates
whitelist_data['URL'] = whitelist_data['URL'].str.lower().str.strip()
whitelist_data = whitelist_data.drop_duplicates()

# Preprocess phishing dataset - convert URLs to lowercase and remove invalid data
phishing_data['URL'] = phishing_data['URL'].str.lower().str.strip()
phishing_data = phishing_data.dropna()

# Separate features and target labels
features = phishing_data.drop(['label', 'FILENAME', 'URL', 'Domain', 'TLD', 'Title'], axis=1)
labels = phishing_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 3: Train the model
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)



KeyError: "['label', 'FILENAME', 'Domain', 'TLD', 'Title'] not found in axis"

In [None]:
# Save the trained model and whitelist for future use
model_path = 'phishing_detection_model.pkl'
joblib.dump(model, model_path)

whitelist_path = 'whitelist_urls.csv'
whitelist_data.to_csv(whitelist_path, index=False)

# Output model evaluation results
print(f"Model accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print(f"Model saved to: {model_path}")
print(f"Whitelist saved to: {whitelist_path}")

# Step 4: Optimize preprocessing by reducing features to the most relevant ones
relevant_features = [
    'URLLength', 'DomainLength', 'IsDomainIP', 'TLDLength', 'NoOfSubDomain',
    'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL',
    'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'SpacialCharRatioInURL',
    'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'DomainTitleMatchScore', 'URLTitleMatchScore',
    'NoOfURLRedirect', 'HasExternalFormSubmit', 'HasPasswordField', 'NoOfJS', 'NoOfImage'
]

# Retain only the relevant features for training
features = phishing_data[relevant_features]

# Retry splitting the data
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train the model again with optimized features
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model again
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Save the trained model again
model_path = 'phishing_detection_model_optimized.pkl'
joblib.dump(model, model_path)

whitelist_path = 'whitelist_urls_optimized.csv'
whitelist_data.to_csv(whitelist_path, index=False)

# Output model evaluation results
print(f"Optimized Model accuracy: {accuracy}")
print("Optimized Classification Report:\n", classification_rep)
print(f"Optimized Model saved to: {model_path}")
print(f"Optimized Whitelist saved to: {whitelist_path}")



In [None]:
# Step 5: Function to dynamically predict a URL
def extract_base_domain(url):
    parsed_url = urlparse(url)
    base_domain = parsed_url.netloc  # Extract domain with subdomain
    return base_domain.lower().strip()

def extract_features(url):
    url = url.lower().strip()

    # Extract features for model
    url_length = len(url)
    domain_length = len(url.split('/')[0])
    is_domain_ip = int(any(char.isdigit() for char in url.split('/')[0]))
    tld_length = len(url.split('.')[-1]) if '.' in url else 0
    no_of_subdomains = url.count('.') - 1
    has_obfuscation = int('%' in url or '@' in url)

    # Feature vector (can be expanded or adjusted based on available features)
    feature_vector = [
        url_length, domain_length, is_domain_ip, tld_length, no_of_subdomains,
        has_obfuscation, 0, 0.0, 0, 0.0, 0, 0.0, 0.0, 1, 0, 0, 0.0, 0.0, 0, 0, 0, 0, 0
    ]
    return feature_vector

def predict_url(url):
    # Check whitelist first
    input_base_domain = extract_base_domain(url)

    matching_record = [record for record in whitelist if input_base_domain.startswith(record)]
    if matching_record:
        return {
            "classification": "Legitimate (Found in Whitelist)",
            "reason": f"URL base domain '{input_base_domain}' matches the whitelist record '{matching_record[0]}'.",
            "phishy_score": 0,
            "legitimate_score": 1,
            "whitelist_match": True,
            "whitelist_record": matching_record[0]
        }

    # If not in whitelist, check with model
    feature_vector = extract_features(url)
    prediction = model.predict([feature_vector])
    prediction_proba = model.predict_proba([feature_vector])

    phishy_score = prediction_proba[0][1]
    legitimate_score = prediction_proba[0][0]

    classification = "Phishing" if prediction[0] == 1 else "Legitimate"
    reason = f"Model prediction: {classification}. Phishy score: {phishy_score:.4f}, Legitimate score: {legitimate_score:.4f}."

    return {
        "classification": classification,
        "reason": reason,
        "phishy_score": phishy_score,
        "legitimate_score": legitimate_score,
        "whitelist_match": False,
        "whitelist_record": None
    }

# Example usage
input_url = input("Enter the URL to check: ")
result = predict_url(input_url)

# Display the result
print("\nResult:")
print(f"Classification: {result['classification']}")
print(f"Reason: {result['reason']}")
print(f"Phishy Score: {result['phishy_score']:.4f}")
print(f"Legitimate Score: {result['legitimate_score']:.4f}")
print(f"Whitelisted: {'Yes' if result['whitelist_match'] else 'No'}")


In [None]:
pip install requests beautifulsoup4 tldextract numpy pandas




In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import tldextract
import re
import numpy as np
import socket
from urllib.parse import urlparse
from urllib import robotparser
from tqdm import tqdm  # Import tqdm for progress bar

# Function to calculate URL length
def get_url_length(url):
    return len(url)

# Function to get domain details
def get_domain_details(url):
    ext = tldextract.extract(url)
    domain = ext.domain
    subdomains = ext.subdomain.split('.') if ext.subdomain else []
    tld = ext.suffix
    return domain, len(domain), tld, len(tld), len(subdomains)

# Function to check if domain is an IP address
def is_domain_ip(domain):
    try:
        socket.inet_aton(domain)
        return True
    except socket.error:
        return False

# Function to calculate the similarity index of the URL (based on some heuristic)
def get_url_similarity_index(url):
    return np.random.rand()  # Placeholder for a real similarity function

# Function to calculate character continuation rate
def get_char_continuation_rate(url):
    char_count = len(url)
    repeated_chars = len(re.findall(r'([a-zA-Z])\1+', url))
    return repeated_chars / char_count if char_count else 0

# Function to detect obfuscation (simple heuristic based on symbols)
def detect_obfuscation(url):
    obfuscated_chars = re.findall(r'%[0-9A-Fa-f]{2}', url)
    return len(obfuscated_chars), len(obfuscated_chars) / len(url) if len(url) else 0

# Function to extract page title, description, and check if it has a favicon
def extract_html_metadata(url):
    title = ''
    description = ''
    favicon = False
    robots = None  # Initialize robots to None

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.title.string if soup.title else ''
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = description_tag['content'] if description_tag else ''

        favicon_tag = soup.find('link', rel='icon')
        favicon = bool(favicon_tag)

        # Checking robots.txt
        rp = robotparser.RobotFileParser()
        rp.set_url(url + "/robots.txt")
        rp.read()
        robots = rp

    except requests.RequestException:
        pass  # In case of connection issues or invalid URL

    return title, description, favicon, robots

# Main function to process each URL from the dataset
def process_urls_from_csv(input_csv, start_idx, end_idx, output_csv):
    # Read the input CSV file
    df = pd.read_csv(input_csv)

    # Get the URLs from the 'URL' column
    urls = df['URL'].tolist()

    result_data = []

    # Limit the records based on user input
    selected_urls = urls[start_idx:end_idx]

    # Initialize tqdm to visualize the progress
    for url in tqdm(selected_urls, desc="Processing URLs", unit="URL"):
        url_data = {}
        url_data['URL'] = url
        url_data['URLLength'] = get_url_length(url)

        domain, domain_len, tld, tld_len, subdomain_count = get_domain_details(url)
        url_data.update({
            'Domain': domain,
            'DomainLength': domain_len,
            'IsDomainIP': is_domain_ip(domain),
            'TLD': tld,
            'TLDLength': tld_len,
            'NoOfSubDomain': subdomain_count
        })

        url_data['URLSimilarityIndex'] = get_url_similarity_index(url)
        url_data['CharContinuationRate'] = get_char_continuation_rate(url)

        # Obfuscation
        obfuscated_char_count, obfuscation_ratio = detect_obfuscation(url)
        url_data.update({
            'HasObfuscation': bool(obfuscated_char_count),
            'NoOfObfuscatedChar': obfuscated_char_count,
            'ObfuscationRatio': obfuscation_ratio
        })

        # Character statistics
        url_data['NoOfLettersInURL'] = len(re.findall(r'[a-zA-Z]', url))
        url_data['LetterRatioInURL'] = url_data['NoOfLettersInURL'] / url_data['URLLength']
        url_data['NoOfDegitsInURL'] = len(re.findall(r'\d', url))
        url_data['DegitRatioInURL'] = url_data['NoOfDegitsInURL'] / url_data['URLLength']

        url_data['NoOfEqualsInURL'] = len(re.findall(r'=', url))
        url_data['NoOfQMarkInURL'] = len(re.findall(r'\?', url))
        url_data['NoOfAmpersandInURL'] = len(re.findall(r'&', url))
        url_data['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9&=?]', url))
        url_data['SpacialCharRatioInURL'] = url_data['NoOfOtherSpecialCharsInURL'] / url_data['URLLength']

        # Check for HTTPS
        url_data['IsHTTPS'] = url.lower().startswith('https://')

        # Extract HTML metadata
        title, description, favicon, robots = extract_html_metadata(url)
        url_data.update({
            'HasTitle': bool(title),
            'Title': title,
            'HasFavicon': favicon,
            'HasDescription': bool(description),
            'Robots': robots
        })

        # Other page-specific data can be extracted as needed...
        # For now, I will add a placeholder for others
        url_data.update({
            'LineOfCode': 0,  # Placeholder: Count of lines in the page source
            'LargestLineLength': 0,  # Placeholder for max line length in source
            'DomainTitleMatchScore': 0,  # Placeholder, can implement more complex logic
            'URLTitleMatchScore': 0,  # Placeholder for match score
            'NoOfURLRedirect': 0,  # Placeholder
            'NoOfSelfRedirect': 0,  # Placeholder
            'NoOfPopup': 0,  # Placeholder for popup count
            'NoOfiFrame': 0,  # Placeholder for iframe count
            'HasExternalFormSubmit': 0,  # Placeholder
            'HasSocialNet': 0,  # Placeholder
            'HasSubmitButton': 0,  # Placeholder
            'HasHiddenFields': 0,  # Placeholder
            'HasPasswordField': 0,  # Placeholder
            'Bank': 0,  # Placeholder for bank-related info
            'Pay': 0,  # Placeholder for payment-related info
            'Crypto': 0,  # Placeholder for cryptocurrency-related info
            'HasCopyrightInfo': 0,  # Placeholder
            'NoOfImage': 0,  # Placeholder for image count
            'NoOfCSS': 0,  # Placeholder for CSS file count
            'NoOfJS': 0,  # Placeholder for JS file count
            'NoOfSelfRef': 0,  # Placeholder for self-references
            'NoOfEmptyRef': 0,  # Placeholder for empty references
            'NoOfExternalRef': 0,  # Placeholder for external references
            'label': 0  # Placeholder for label
        })

        result_data.append(url_data)

    # Convert the result data to DataFrame
    result_df = pd.DataFrame(result_data)

    # Retain original columns and merge with extracted data
    output_df = pd.concat([df, result_df], axis=1)

    # Save the new DataFrame to CSV
    output_df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

# Example usage
input_csv = '/content/phishing_site_urls.csv'  # Replace with your input CSV file path
start_idx = int(input("Enter the starting record number: ")) - 1  # Convert to zero-indexed
end_idx = int(input("Enter the ending record number: "))
output_csv = f'url_data_{start_idx + 1}_to_{end_idx}.csv'  # Output file name

process_urls_from_csv(input_csv, start_idx, end_idx, output_csv)


Enter the starting record number: 56
Enter the ending record number: 500


Processing URLs: 100%|██████████| 445/445 [00:00<00:00, 3182.00URL/s]


Data saved to url_data_56_to_500.csv


In [None]:
import ssl
import socket
from pprint import pprint
from urllib.parse import urlparse

def get_ssl_certificate(url):
    # Extract host and port from the URL
    parsed_url = urlparse(url)
    host = parsed_url.hostname
    port = parsed_url.port if parsed_url.port else 443

    # Establish a socket connection and fetch the certificate
    context = ssl.create_default_context()
    with socket.create_connection((host, port)) as sock:
        with context.wrap_socket(sock, server_hostname=host) as ssock:
            cert = ssock.getpeercert()

    return cert

if __name__ == '__main__':
    # Get URL input from user
    url = input("Enter the URL (e.g., https://example.com): ")

    try:
        cert_details = get_ssl_certificate(url)
        print("\nSSL Certificate Details:")
        pprint(cert_details)
    except Exception as e:
        print(f"Error retrieving certificate details: {e}")


Enter the URL (e.g., https://example.com): https://colab.research.google.com/drive/11cn9C6GFXKTwkEURlIPxh3C7YsinEpUI?authuser=2#scrollTo=SMLlA7gc2rK4&uniqifier=1

SSL Certificate Details:
{'OCSP': ('http://o.pki.goog/wr2',),
 'caIssuers': ('http://i.pki.goog/wr2.crt',),
 'crlDistributionPoints': ('http://c.pki.goog/wr2/oQ6nyr8F0m0.crl',),
 'issuer': ((('countryName', 'US'),),
            (('organizationName', 'Google Trust Services'),),
            (('commonName', 'WR2'),)),
 'notAfter': 'Jan 13 08:37:12 2025 GMT',
 'notBefore': 'Oct 21 08:37:13 2024 GMT',
 'serialNumber': '01136AFADC160EBF10437D056E6DE700',
 'subject': ((('commonName', 'misc-sni.google.com'),),),
 'subjectAltName': (('DNS', 'misc-sni.google.com'),
                    ('DNS', '*.aiplatform-notebook.cloud.google.com'),
                    ('DNS', '*.aiplatform-training.cloud.google.com'),
                    ('DNS', '*.backupdr.cloud.google.com'),
                    ('DNS', '*.backupdr.cloud.google'),
                 

Phishing_site_URLS_Complete features


In [None]:
pip install pandas tldextract whois requests python-whois scikit-learn


Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting whois
  Downloading whois-1.20240129.2-py3-none-any.whl.metadata (1.3 kB)
Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading whois-1.20240129.2-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installin

In [None]:
pip install dnspython


Collecting dnspython
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/313.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython
Successfully installed dnspython-2.7.0


In [None]:
pip install python-Levenshtein


Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [None]:
pip install tqdm



In [None]:
import tldextract
import whois
import requests
import re
import hashlib
from urllib.parse import urlparse
import pandas as pd
from collections import Counter
import dns.resolver
import dns.exception
import math
from Levenshtein import distance as levenshtein_distance
from tqdm import tqdm

# Helper function to calculate entropy
def calculate_entropy(url):
    url = url.lower()
    p, l = Counter(url), float(len(url))
    return -sum(count / l * math.log2(count / l) for count in p.values())

# Enhanced function to extract more features (including additional ones)
def extract_enhanced_url_features(url, known_legitimate_domains):
    features = {}
    parsed_url = urlparse(url)

    # Existing URL-based features
    features['url_length'] = len(url)
    features['is_https'] = 1 if parsed_url.scheme == 'https' else 0
    domain_info = tldextract.extract(url)
    features['domain'] = domain_info.domain
    features['suffix'] = domain_info.suffix
    features['subdomain'] = domain_info.subdomain
    features['has_ip_address'] = 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0
    features['subdomain_count'] = len(domain_info.subdomain.split('.')) if domain_info.subdomain else 0
    features['special_char_count'] = len(re.findall(r'[^a-zA-Z0-9]', url))

    # Additional features
    # WHOIS Information
    try:
        w = whois.whois(url)
        features['whois_creation_date'] = w.creation_date if isinstance(w.creation_date, str) else None
        features['whois_domain_age'] = (pd.to_datetime('today') - pd.to_datetime(w.creation_date)).days if w.creation_date else None
    except:
        features['whois_creation_date'] = None
        features['whois_domain_age'] = None

    # SSL Certificate check
    try:
        req = requests.get(url, timeout=5)
        features['is_ssl_valid'] = 1 if req.status_code == 200 else 0
    except:
        features['is_ssl_valid'] = 0

    # WHOIS: Private registration
    try:
        if 'privacy' in str(w.registrar).lower():
            features['whois_private_registration'] = 1
        else:
            features['whois_private_registration'] = 0
    except:
        features['whois_private_registration'] = None

    # Suspicious keywords in URL
    suspicious_keywords = ['login', 'verify', 'update', 'secure', 'account', 'signin', 'payment']
    features['has_suspicious_keywords'] = any(keyword in url.lower() for keyword in suspicious_keywords)

    # Redirect count
    try:
        response = requests.get(url, allow_redirects=False)
        features['redirect_count'] = len(response.history)
    except:
        features['redirect_count'] = 0

    # New additional features

    # Entropy of the URL (randomness measure)
    features['url_entropy'] = calculate_entropy(url)

    # Levenshtein distance to known legitimate domains (check similarity)
    def domain_similarity(url):
        domain = tldextract.extract(url).domain
        min_distance = min([levenshtein_distance(domain, legit_domain) for legit_domain in known_legitimate_domains])
        return min_distance

    features['levenshtein_distance'] = domain_similarity(url)

    # DNS TTL value (Time-to-live for the domain)
    try:
        resolver = dns.resolver.Resolver()
        answers = resolver.resolve(domain_info.domain, 'A')
        # Assuming we want the TTL of the first record
        features['dns_ttl'] = answers.response.answer[0].ttl if answers.response.answer else None
    except dns.exception.DNSException:
        features['dns_ttl'] = None

    # Ratio of digits to characters in URL
    features['digit_char_ratio'] = sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0

    # Check for presence of encoded characters in URL
    features['has_encoded_chars'] = 1 if re.search(r'%[0-9A-Fa-f]{2}', url) else 0

    # Count of slashes in the path (more slashes could indicate more complicated fake paths)
    path_slash_count = urlparse(url).path.count('/')
    features['path_slash_count'] = path_slash_count

    return features

# Function to process a subset of the dataset and extract enhanced features with a progress bar
def analyze_enhanced_urls_with_progress(dataset, known_legitimate_domains, start_idx, end_idx):
    extracted_features = []

    # Use tqdm to visualize the progress
    for _, row in tqdm(dataset.iloc[start_idx:end_idx].iterrows(), total=end_idx-start_idx, desc="Processing URLs", unit="URL"):
        url = row['URL']  # Extract the URL from the 'URL' column
        label = row['Label']  # Extract the label from the 'Label' column
        features = extract_enhanced_url_features(url, known_legitimate_domains)
        features['label'] = label
        features['url'] = url  # Add the complete URL to the features
        extracted_features.append(features)

    return pd.DataFrame(extracted_features)

# Function to get the starting and ending records from the user and save the file
def process_and_save_features(dataset, known_legitimate_domains):
    # Ask user for starting and ending record indices
    start_idx = int(input("Enter the starting index (0-based): "))
    end_idx = int(input(f"Enter the ending index (max {len(dataset)-1}): "))

    # Validate the input range
    if start_idx < 0 or end_idx >= len(dataset) or start_idx > end_idx:
        print("Invalid range! Please make sure the indices are within the dataset bounds.")
        return

    # Process the data and extract features for the specified range
    analysis_result = analyze_enhanced_urls_with_progress(dataset, known_legitimate_domains, start_idx, end_idx)

    # Create a file name based on the starting and ending record indices
    filename = f"url_analysis_result_{start_idx}_{end_idx}.csv"

    # Save the new dataset with the extracted features
    analysis_result.to_csv(filename, index=False)

    print(f"Feature extraction complete. Results saved to '{filename}'.")

# Load your dataset (make sure to include your list of known legitimate domains)
dataset = pd.read_csv('phishing_site_urls.csv')  # Replace with your dataset path

# Known legitimate domains (sample; expand this list based on your use case)
known_legitimate_domains = ["google", "paypal", "facebook", "amazon", "twitter"]

# Call the function to process and save the extracted features
process_and_save_features(dataset, known_legitimate_domains)


Processing URLs:   2%|▏         | 216/9999 [03:43<1:03:22,  2.57URL/s]2024-12-12 17:18:16,884 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
Processing URLs:   3%|▎         | 250/9999 [04:35<2:18:12,  1.18URL/s]2024-12-12 17:19:18,714 - whois.whois - ERROR - Error trying to connect to socket: closing socket - timed out
ERROR:whois.whois:Error trying to connect to socket: closing socket - timed out
Processing URLs:   3%|▎         | 284/9999 [05:16<1:37:58,  1.65URL/s]2024-12-12 17:19:49,876 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno 111] Connection refused
Processing URLs:   3%|▎         | 285/9999 [05:16<1:22:21,  1.97URL/s]2024-12-12 17:19:50,142 - whois.whois - ERROR - Error 

Feature extraction complete. Results saved to 'url_analysis_result_50001_60000.csv'.


In [None]:
print(dataset.columns)


Index(['URL', 'Label'], dtype='object')


In [None]:
#Part 1

import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime

# Replace with your Google API Key
API_KEY = 'YOUR_API_KEY'

# Google Safe Browsing API URL
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

def calculate_entropy(url):
    """Calculate the Shannon entropy of a given URL."""
    frequency = Counter(url)
    total_characters = len(url)
    entropy = -sum((count / total_characters) * math.log2(count / total_characters)
                   for count in frequency.values())
    return entropy

def has_ip_address(url):
    """Check if the URL contains an IP address."""
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    """Count the number of dots (.) in the URL."""
    return url.count('.')

def count_hyphens_in_domain(url):
    """Count the number of hyphens (-) in the domain part of the URL."""
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    return domain.count('-')

def count_special_characters(url):
    """Count the number of special characters in the URL."""
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    """Check if the URL is safe using Google Safe Browsing API."""
    payload = {
        "client": {
            "clientId": "yourClientID",
            "clientVersion": "1.0"
        },
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [
                {"url": url}
            ]
        }
    }

    # Send request to Google Safe Browsing API
    response = requests.post(
        f'{SAFE_BROWSING_URL}?key={API_KEY}',
        json=payload
    )

    result = response.json()

    if 'matches' in result:
        # If matches are found, return the URL as unsafe
        return "Suspicious tld: True"
    else:
        return "Suspicious tld: False"

def check_url_redirection(url):
    """Check for number of redirects in the URL."""
    try:
        # Send a GET request with allow_redirects=True to follow redirects
        response = requests.get(url, allow_redirects=True)

        # Get the number of redirects followed
        redirect_count = len(response.history)

        # Return the redirect count
        return redirect_count

    except requests.exceptions.RequestException as e:
        print(f"Error accessing the URL: {e}")
        return 0

def check_suspicious_ip(url):
    """Check if the URL contains a suspicious IP address instead of a domain name."""

    # Parse the URL to extract the netloc (domain or IP address part)
    parsed_url = urlparse(url)

    # Regex pattern for matching an IPv4 address
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')

    # Regex pattern for matching an IPv6 address
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')

    # Extract the host (domain or IP address) from the URL
    host = parsed_url.hostname

    # Check if the host is an IP address (either IPv4 or IPv6)
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1  # Suspicious IP address found
    else:
        return 0  # No suspicious IP address found

def check_https_token_in_url(url):
    """Check if 'HTTPS' token is present in the URL path, query parameters, or scheme."""

    # Parse the URL to extract components
    parsed_url = urlparse(url)

    # Check if 'https' is in the scheme, path, or query parameters
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1  # 'HTTPS' token found in any component (scheme, path, or query)
    else:
        return 0  # No 'HTTPS' token found

def get_domain_age(url):
    """Calculate the domain age using WHOIS data."""

    # Extract the domain from the URL
    domain = url.split('/')[2]

    try:
        # Fetch WHOIS data for the domain
        w = whois.whois(domain)

        # Extract the domain creation date from the WHOIS data
        creation_date = w.creation_date

        if isinstance(creation_date, list):
            # In some cases, the creation date is returned as a list. We take the first element.
            creation_date = creation_date[0]

        # Calculate the domain's age in years
        current_date = datetime.now()
        domain_age = (current_date - creation_date).days / 365

        # Return the domain age as an integer (rounded down)
        return int(domain_age)

    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def main():
    # Prompt user to enter a URL
    url = input("Enter a URL: ")

    # Check the length of the entered URL
    url_length = len(url)

    # Find and count the characters after the main URL (e.g., after '.com')
    main_domain = ".com"
    if main_domain in url:
        after_main_url = url.split(main_domain, 1)[-1]
        characters_after_main = len(after_main_url)
    else:
        characters_after_main = 0

    # Calculate entropy
    url_entropy = calculate_entropy(url)

    # Check for IP address presence
    ip_found = has_ip_address(url)
    ip_result = 1 if ip_found else 0

    # Count the number of dots in the URL
    num_dots = count_dots_in_url(url)

    # Count the number of hyphens in the domain
    num_hyphens = count_hyphens_in_domain(url)

    # Count the number of special characters in the URL
    num_special_chars = count_special_characters(url)

    # Check the URL safety using Google Safe Browsing API
    safety_status = check_url_safety(url)

    # Check the number of redirects for the URL
    redirect_count = check_url_redirection(url)

    # Check for suspicious IP address in the URL
    suspicious_ip = check_suspicious_ip(url)

    # Check for 'HTTPS' token presence
    https_token_status = check_https_token_in_url(url)

    # Get domain age
    domain_age = get_domain_age(url)

    # Display the results
    print(f"The length of the entered URL is: {url_length}")
    print(f"The number of characters after '{main_domain}' is: {characters_after_main}")
    print(f"The entropy of the entered URL is: {url_entropy}")
    print(f"IP Address Found: {'Yes' if ip_found else 'No'} (Value: {ip_result})")
    print(f"The number of dots in the entered URL '{url}' is: {num_dots}")
    print(f"The number of hyphens in the domain part of the URL is: {num_hyphens}")
    print(f"The number of special characters in the entered URL is: {num_special_chars}")
    print(f"Google Safe Browsing API Status: {safety_status}")
    print(f"Number of redirections: {redirect_count}")
    print(f"Suspicious IP Address Found: {'Yes' if suspicious_ip else 'No'}")
    print(f"HTTPS token found in the URL: {'Yes' if https_token_status else 'No'}")
    print(f"Domain age: {domain_age} years" if domain_age is not None else "Could not retrieve domain age.")

    # Add to a DataFrame for display
    df = pd.DataFrame({
        'URL': [url],
        'Length': [url_length],
        'Characters After .com': [characters_after_main],
        'Entropy': [url_entropy],
        'IP Found': [ip_result],
        'Number of Dots': [num_dots],
        'Number of Hyphens in Domain': [num_hyphens],
        'Number of Special Characters': [num_special_chars],
        'Safe Browsing Status': [safety_status],
        'Number of Redirections': [redirect_count],
        'Suspicious IP': [suspicious_ip],
        'HTTPS Token Found': [https_token_status],
        'Domain Age': [domain_age if domain_age is not None else 'N/A']
    })

    print("\nUpdated DataFrame:")
    print(df)

if __name__ == "__main__":
    main()

In [None]:
#part 2

import whois
import re
import ssl
import socket
import datetime
import OpenSSL
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import language_tool_python

# List of known suspicious registrars (this can be expanded with more examples)
suspicious_registrars = [
    "Namecheap Inc.",
    "GoDaddy.com, LLC",
    "PublicDomainRegistry.com",
    "eNom, Inc.",
    "PDR Ltd.",
    "Tucows Domains Inc.",
    "BigRock Solutions",
    "Dynadot LLC"
]

# List of common privacy protection services
privacy_protection_indicators = [
    "Private Registration",
    "Contact Privacy",
    "Privacy Protected",
    "Domains by Proxy",
    "Privacy Protection",
    "Registrar Privacy"
]

# List of trusted domains and subdomains (e.g., Google)
trusted_domains = ['google.com', 'research.google.com', 'colab.research.google.com']

def get_domain_expiration(url):
    """Calculate the domain expiration time from WHOIS data."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        expiration_date = w.expiration_date

        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]

        current_date = datetime.now()
        time_remaining = expiration_date - current_date
        return time_remaining.days
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_registrar(url):
    """Check the domain registrar and evaluate its reputation."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        registrar = w.registrar
        if isinstance(registrar, list):
            registrar = registrar[0]

        if any(suspicious_registrar.lower() in registrar.lower() for suspicious_registrar in suspicious_registrars):
            return 1  # Suspicious registrar found
        else:
            return 0  # Not a suspicious registrar
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_whois_privacy_protection(url):
    """Check if the domain uses WHOIS privacy protection."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')

        if any(indicator.lower() in registrant_info.lower() for indicator in privacy_protection_indicators):
            return 1  # Privacy protection detected
        else:
            return 0  # No privacy protection
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_owner_details(url):
    """Check the domain owner's details and validate them for suspicious data."""
    domain = url.split('/')[2]
    if domain in trusted_domains or domain.endswith('google.com'):
        return 0  # Not suspicious, skip checks

    try:
        w = whois.whois(domain)
        registrant_name = w.get('registrant_name', '')
        registrant_email = w.get('registrant_email', '')
        registrant_phone = w.get('registrant_phone', '')

        if not registrant_name or not registrant_email or not registrant_phone:
            return 1  # Suspicious due to missing details

        suspicious_email_pattern = r"(admin|contact|info|support|webmaster|postmaster)@.*"
        if re.match(suspicious_email_pattern, registrant_email):
            return 1  # Suspicious email

        suspicious_phone_pattern = r"(000|111|123|999|555)\d*"
        if re.match(suspicious_phone_pattern, registrant_phone):
            return 1  # Suspicious phone number

        return 0  # No suspicious details found
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_ssl_certificate(url):
    """Check if the website uses HTTPS (SSL certificate presence)."""
    if url.lower().startswith("https://"):
        return 1  # SSL certificate present
    else:
        return 0  # SSL certificate not present

def get_certificate_validity(url):
    """Check the SSL certificate validity period and its issuer."""
    domain = url.replace("https://", "").replace("http://", "").split('/')[0]

    try:
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()

        not_before = datetime.datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")

        validity_duration = (not_after - not_before).days
        is_short_validity = validity_duration < 365

        is_self_signed = cert['issuer'] == cert['subject']

        return is_short_validity, is_self_signed, validity_duration
    except Exception as e:
        print(f"Error retrieving certificate: {e}")
        return None, None, None

def get_ssl_certificate_issuer(url):
    """Check the SSL certificate issuer."""
    domain = url.split("//")[-1].split("/")[0]

    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443)) as conn:
            with context.wrap_socket(conn, server_hostname=domain) as secure_conn:
                cert = secure_conn.getpeercert()
                issuer = dict(x[0] for x in cert['issuer'])
                return issuer.get('organizationName', 'Issuer details not found')
    except Exception as e:
        print(f"Error retrieving SSL certificate issuer: {e}")
        return None

def get_page_content(url):
    """Fetch the content of the web page."""
    try:
        # Send GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text from the page (can be adjusted for specific elements)
        text = soup.get_text()
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    """Check for grammatical errors and spelling mistakes."""
    tool = language_tool_python.LanguageTool('en-US')

    # Check the text for issues
    matches = tool.check(text)

    # Return the number of issues
    return len(matches), matches

# Dynamically enter a URL
url = input("Enter a URL: ")

# Get domain expiration time
expiration_time = get_domain_expiration(url)

# Check the registrar's reputation
registrar_reputation = check_registrar(url)

# Check if WHOIS privacy protection is used
privacy_protection_status = check_whois_privacy_protection(url)

# Check the owner details for suspicious patterns
owner_details_status = check_owner_details(url)

# Check if SSL certificate is present
ssl_status = check_ssl_certificate(url)

# Check SSL certificate validity
short_validity, self_signed, validity_duration = get_certificate_validity(url)

# Get SSL certificate issuer details
issuer_info = get_ssl_certificate_issuer(url)

# Fetch page content
content = get_page_content(url)

if content:
    # Check for grammar and spelling issues
    num_issues, issues = check_grammar_and_spelling(content)
    print(f"Number of grammatical or spelling issues: {num_issues}")
    if num_issues > 0:
        print(f"Details of issues found:")
        for issue in issues:
            print(f"- {issue.message} at position {issue.offset}")

# Display the time remaining until domain expiration
if expiration_time is not None:
    if expiration_time < 0:
        print("Domain has already expired.")
    else:
        print(f"Time remaining until domain expiration: {expiration_time} days")
else:
    print("Could not retrieve domain expiration information.")

# Display the result for registrar reputation (1 for suspicious, 0 for not suspicious)
if registrar_reputation is not None:
    print(f"Registrar reputation: {'Suspicious' if registrar_reputation == 1 else 'Not Suspicious'}")
else:
    print("Could not retrieve registrar information.")

# Display the result for WHOIS privacy protection (1 for privacy protection, 0 for no protection)
if privacy_protection_status is not None:
    print(f"WHOIS Privacy Protection: {'Yes' if privacy_protection_status == 1 else 'No'}")
else:
    print("Could not retrieve WHOIS data.")

# Display the result for owner details suspiciousness (1 for suspicious, 0 for no suspicious details)
if owner_details_status is not None:
    print(f"Owner details suspicious: {'Yes' if owner_details_status == 1 else 'No'}")
else:
    print("Could not retrieve owner details.")

# Display the result for SSL certificate presence (1 for present, 0 for not present)
if ssl_status is not None:
    print(f"SSL Certificate present: {'Yes' if ssl_status == 1 else 'No'}")
else:
    print("Could not check SSL certificate.")

# Display the result for SSL certificate validity (short, normal) and validity duration
if short_validity is not None:
        print(f"SSL Certificate Validity: {'Short' if short_validity else 'Normal'} ({validity_duration} days)")

    # Check if the certificate is self-signed
    if self_signed:
        print("SSL Certificate is self-signed.")
    else:
        print("SSL Certificate is not self-signed.")
else:
    print("Could not check SSL certificate validity.")

# Display SSL certificate issuer
if issuer_info:
    print(f"SSL Certificate Issuer: {issuer_info}")
else:
    print("Could not retrieve SSL certificate issuer.")

In [None]:
import math
from collections import Counter
import re
import pandas as pd
from urllib.parse import urlparse
import requests
import hashlib
import base64
import json
import whois
from datetime import datetime
import ssl
import socket
import OpenSSL
from bs4 import BeautifulSoup
import language_tool_python

# Replace with your Google API Key
API_KEY = 'YOUR_API_KEY'

# Google Safe Browsing API URL
SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"

# List of known suspicious registrars (this can be expanded with more examples)
suspicious_registrars = [
    "Namecheap Inc.",
    "GoDaddy.com, LLC",
    "PublicDomainRegistry.com",
    "eNom, Inc.",
    "PDR Ltd.",
    "Tucows Domains Inc.",
    "BigRock Solutions",
    "Dynadot LLC"
]

# List of common privacy protection services
privacy_protection_indicators = [
    "Private Registration",
    "Contact Privacy",
    "Privacy Protected",
    "Domains by Proxy",
    "Privacy Protection",
    "Registrar Privacy"
]

# List of trusted domains and subdomains (e.g., Google)
trusted_domains = ['google.com', 'research.google.com', 'colab.research.google.com']

# Function Definitions (from Part 1)

def calculate_entropy(url):
    """Calculate the Shannon entropy of a given URL."""
    frequency = Counter(url)
    total_characters = len(url)
    entropy = -sum((count / total_characters) * math.log2(count / total_characters)
                   for count in frequency.values())
    return entropy

def has_ip_address(url):
    """Check if the URL contains an IP address."""
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    return bool(ip_pattern.search(url))

def count_dots_in_url(url):
    """Count the number of dots (.) in the URL."""
    return url.count('.')

def count_hyphens_in_domain(url):
    """Count the number of hyphens (-) in the domain part of the URL."""
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    return domain.count('-')

def count_special_characters(url):
    """Count the number of special characters in the URL."""
    special_characters = r'[@%$&!*^+=?/<>|~]'
    matches = re.findall(special_characters, url)
    return len(matches)

def check_url_safety(url):
    """Check if the URL is safe using Google Safe Browsing API."""
    payload = {
        "client": {
            "clientId": "yourClientID",
            "clientVersion": "1.0"
        },
        "threatInfo": {
            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [
                {"url": url}
            ]
        }
    }

    # Send request to Google Safe Browsing API
    response = requests.post(
        f'{SAFE_BROWSING_URL}?key={API_KEY}',
        json=payload
    )

    result = response.json()

    if 'matches' in result:
        # If matches are found, return the URL as unsafe
        return "Suspicious tld: True"
    else:
        return "Suspicious tld: False"

def check_url_redirection(url):
    """Check for number of redirects in the URL."""
    try:
        response = requests.get(url, allow_redirects=True)
        redirect_count = len(response.history)
        return redirect_count
    except requests.exceptions.RequestException as e:
        print(f"Error accessing the URL: {e}")
        return 0

def check_suspicious_ip(url):
    """Check if the URL contains a suspicious IP address instead of a domain name."""
    parsed_url = urlparse(url)
    ipv4_pattern = re.compile(r'^(?:\d{1,3}\.){3}\d{1,3}$')
    ipv6_pattern = re.compile(r'^[0-9a-fA-F:]{2,39}$')
    host = parsed_url.hostname
    if ipv4_pattern.match(host) or ipv6_pattern.match(host):
        return 1  # Suspicious IP address found
    else:
        return 0  # No suspicious IP address found

def check_https_token_in_url(url):
    """Check if 'HTTPS' token is present in the URL path, query parameters, or scheme."""
    parsed_url = urlparse(url)
    if 'https' in parsed_url.scheme or 'https' in parsed_url.path or 'https' in parsed_url.query:
        return 1
    else:
        return 0

def get_domain_age(url):
    """Calculate the domain age using WHOIS data."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        current_date = datetime.now()
        domain_age = (current_date - creation_date).days / 365
        return int(domain_age)
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

# Function Definitions (from Part 2)

def get_domain_expiration(url):
    """Calculate the domain expiration time from WHOIS data."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        expiration_date = w.expiration_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        current_date = datetime.now()
        time_remaining = expiration_date - current_date
        return time_remaining.days
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_registrar(url):
    """Check the domain registrar and evaluate its reputation."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        registrar = w.registrar
        if isinstance(registrar, list):
            registrar = registrar[0]
        if any(suspicious_registrar.lower() in registrar.lower() for suspicious_registrar in suspicious_registrars):
            return 1  # Suspicious registrar found
        else:
            return 0  # Not a suspicious registrar
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_whois_privacy_protection(url):
    """Check if the domain uses WHOIS privacy protection."""
    domain = url.split('/')[2]
    try:
        w = whois.whois(domain)
        registrant_info = w.get('registrant', '') + ' ' + w.get('contact', '')
        if any(indicator.lower() in registrant_info.lower() for indicator in privacy_protection_indicators):
            return 1  # Privacy protection detected
        else:
            return 0  # No privacy protection
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_owner_details(url):
    """Check the domain owner's details and validate them for suspicious data."""
    domain = url.split('/')[2]
    if domain in trusted_domains or domain.endswith('google.com'):
        return 0  # Not suspicious, skip checks
    try:
        w = whois.whois(domain)
        registrant_name = w.get('registrant_name', '')
        registrant_email = w.get('registrant_email', '')
        registrant_phone = w.get('registrant_phone', '')
        if not registrant_name or not registrant_email or not registrant_phone:
            return 1  # Suspicious due to missing details
        suspicious_email_pattern = r"(admin|contact|info|support|webmaster|postmaster)@.*"
        if re.match(suspicious_email_pattern, registrant_email):
            return 1  # Suspicious email
        suspicious_phone_pattern = r"(000|111|123|999|555)\d*"
        if re.match(suspicious_phone_pattern, registrant_phone):
            return 1  # Suspicious phone number
        return 0  # No suspicious details found
    except Exception as e:
        print(f"Error retrieving WHOIS data: {e}")
        return None

def check_ssl_certificate(url):
    """Check if the website uses HTTPS (SSL certificate presence)."""
    if url.lower().startswith("https://"):
        return 1  # SSL certificate present
    else:
        return 0  # SSL certificate not present

def get_certificate_validity(url):
    """Check the SSL certificate validity period and its issuer."""
    domain = url.replace("https://", "").replace("http://", "").split('/')[0]
    try:
        conn = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=domain)
        conn.connect((domain, 443))
        cert = conn.getpeercert()
                not_before = datetime.datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y GMT")
        not_after = datetime.datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y GMT")

        validity_duration = (not_after - not_before).days
        is_short_validity = validity_duration < 365

        is_self_signed = cert['issuer'] == cert['subject']

        return is_short_validity, is_self_signed, validity_duration
    except Exception as e:
        print(f"Error retrieving certificate: {e}")
        return None, None, None

def get_ssl_certificate_issuer(url):
    """Check the SSL certificate issuer."""
    domain = url.split("//")[-1].split("/")[0]

    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443)) as conn:
            with context.wrap_socket(conn, server_hostname=domain) as secure_conn:
                cert = secure_conn.getpeercert()
                issuer = dict(x[0] for x in cert['issuer'])
                return issuer.get('organizationName', 'Issuer details not found')
    except Exception as e:
        print(f"Error retrieving SSL certificate issuer: {e}")
        return None

def get_page_content(url):
    """Fetch the content of the web page."""
    try:
        # Send GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text from the page (can be adjusted for specific elements)
        text = soup.get_text()
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return ""

def check_grammar_and_spelling(text):
    """Check for grammatical errors and spelling mistakes."""
    tool = language_tool_python.LanguageTool('en-US')

    # Check the text for issues
    matches = tool.check(text)

    # Return the number of issues
    return len(matches), matches

# Dynamically enter a URL
url = input("Enter a URL: ")

# Get domain expiration time
expiration_time = get_domain_expiration(url)

# Check the registrar's reputation
registrar_reputation = check_registrar(url)

# Check if WHOIS privacy protection is used
privacy_protection_status = check_whois_privacy_protection(url)

# Check the owner details for suspicious patterns
owner_details_status = check_owner_details(url)

# Check if SSL certificate is present
ssl_status = check_ssl_certificate(url)

# Check SSL certificate validity
short_validity, self_signed, validity_duration = get_certificate_validity(url)

# Get SSL certificate issuer details
issuer_info = get_ssl_certificate_issuer(url)

# Fetch page content
content = get_page_content(url)

if content:
    # Check for grammar and spelling issues
    num_issues, issues = check_grammar_and_spelling(content)
    print(f"Number of grammatical or spelling issues: {num_issues}")
    if num_issues > 0:
        print(f"Details of issues found:")
        for issue in issues:
            print(f"- {issue.message} at position {issue.offset}")

# Display the time remaining until domain expiration
if expiration_time is not None:
    if expiration_time < 0:
        print("Domain has already expired.")
    else:
        print(f"Time remaining until domain expiration: {expiration_time} days")
else:
    print("Could not retrieve domain expiration information.")

# Display the result for registrar reputation (1 for suspicious, 0 for not suspicious)
if registrar_reputation is not None:
    print(f"Registrar reputation: {'Suspicious' if registrar_reputation == 1 else 'Not Suspicious'}")
else:
    print("Could not retrieve registrar information.")

# Display the result for WHOIS privacy protection (1 for privacy protection, 0 for no protection)
if privacy_protection_status is not None:
    print(f"WHOIS Privacy Protection: {'Yes' if privacy_protection_status == 1 else 'No'}")
else:
    print("Could not retrieve WHOIS data.")

# Display the result for owner details suspiciousness (1 for suspicious, 0 for no suspicious details)
if owner_details_status is not None:
    print(f"Owner details suspicious: {'Yes' if owner_details_status == 1 else 'No'}")
else:
    print("Could not retrieve owner details.")

# Display the result for SSL certificate presence (1 for present, 0 for not present)
if ssl_status is not None:
    print(f"SSL Certificate present: {'Yes' if ssl_status == 1 else 'No'}")
else:
    print("Could not check SSL certificate.")

# Display the result for SSL certificate validity (short, normal) and validity duration
if short_validity is not None:
        print(f"SSL Certificate Validity: {'Short' if short_validity else 'Normal'} ({validity_duration} days)")

    # Check if the certificate is self-signed
    if self_signed:
        print("SSL Certificate is self-signed.")
    else:
        print("SSL Certificate is not self-signed.")
else:
    print("Could not check SSL certificate validity.")

# Display SSL certificate issuer
if issuer_info:
    print(f"SSL Certificate Issuer: {issuer_info}")
else:
    print("Could not retrieve SSL certificate issuer.")
# Display the result for SSL certificate validity (short, normal) and validity duration
if short_validity is not None:
    print(f"SSL Certificate Validity: {'Short' if short_validity else 'Normal'} ({validity_duration} days)")

    # Check if the certificate is self-signed
    if self_signed:
        print("SSL Certificate is self-signed.")
    else:
        print("SSL Certificate is not self-signed.")
else:
    print("Could not check SSL certificate validity.")

# Display SSL certificate issuer
if issuer_info:
    print(f"SSL Certificate Issuer: {issuer_info}")
else:
    print("Could not retrieve SSL certificate issuer.")

# After completing the checks and displaying the results, you can also store this information
# in a data structure like a dictionary or pandas DataFrame for further analysis or saving to a file.
# Here's an example of storing results in a dictionary:

result = {
    'URL': url,
    'Domain Expiration Time (days)': expiration_time if expiration_time is not None else 'N/A',
    'Registrar Reputation': 'Suspicious' if registrar_reputation == 1 else 'Not Suspicious',
    'WHOIS Privacy Protection': 'Yes' if privacy_protection_status == 1 else 'No',
    'Owner Details Suspicious': 'Yes' if owner_details_status == 1 else 'No',
    'SSL Certificate Present': 'Yes' if ssl_status == 1 else 'No',
    'SSL Certificate Validity': f"{'Short' if short_validity else 'Normal'} ({validity_duration} days)" if short_validity is not None else 'N/A',
    'SSL Certificate Self-Signed': 'Yes' if self_signed else 'No',
    'SSL Certificate Issuer': issuer_info if issuer_info else 'N/A',
    'Grammar and Spelling Issues': num_issues if content else 'N/A',
}

# Optionally, store the result in a DataFrame
df = pd.DataFrame([result])
print("\nResult DataFrame:")
print(df)

# You can also save the results to a CSV file for record-keeping
df.to_csv('url_analysis_results.csv', index=False)



IndentationError: unindent does not match any outer indentation level (<tokenize>, line 366)