In [12]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

def isSimilar(sitename, similarity_threshold, whitelist_path="./domain-names.csv", max_results=10):
    """Print the whitelist domains that most resemble ``sitename``.

    The whitelist is a CSV file with a ``Domain`` column. If ``sitename`` is
    itself listed, a "<sitename> is present" message is printed and nothing
    else happens. Otherwise every listed domain is scored against
    ``sitename`` with ``difflib.SequenceMatcher.ratio()`` and the best
    matches are printed, highest score first.

    Parameters
    ----------
    sitename : str
        Domain name to look up.
    similarity_threshold : float
        Minimum ratio (inclusive) a domain needs to be printed.
    whitelist_path : str, optional
        Path of the whitelist CSV. Defaults to ``"./domain-names.csv"``.
    max_results : int, optional
        Maximum number of rows printed. Defaults to 10.

    Returns
    -------
    None
        Results are printed, one ``[domain score]`` row per line.
    """
    # Read whitelist; drop missing cells so SequenceMatcher never receives NaN.
    whitelist = pd.read_csv(whitelist_path)
    domains = whitelist["Domain"].dropna().astype(str).str.strip().tolist()

    # If legit, exit; if not, continue.
    if sitename in domains:
        print(sitename + " is present")
        return

    # SequenceMatcher caches its analysis of the *second* sequence, so keep
    # sitename there and only swap the first sequence for each candidate.
    matcher = SequenceMatcher(None, "", sitename)

    # 2-column object array: domain name and similarity ratio.
    scored = np.empty((len(domains), 2), dtype=object)
    for index, domain in enumerate(domains):
        matcher.set_seq1(domain)
        scored[index, 0] = domain
        scored[index, 1] = matcher.ratio()

    # Filter by threshold, then sort by similarity (stable for ties).
    matches = [row for row in scored if row[1] >= similarity_threshold]
    matches.sort(key=lambda row: row[1], reverse=True)

    # Print the best matches that meet the threshold.
    for row in matches[:max_results]:
        print(row)

# Example usage: 0.5 is the similarity threshold; change it to tighten or loosen matching
isSimilar("sbi.co.in", similarity_threshold=0.5)


['slbit.com' 0.6666666666666666]
['tsbid.com' 0.6666666666666666]
['bilcod.online' 0.6363636363636364]
['sgvbzi.com' 0.631578947368421]
['sbjwio.vip' 0.631578947368421]
['sbjoio.vip' 0.631578947368421]
['schmov.ing' 0.631578947368421]
['bionx.info' 0.631578947368421]
['vsbiow.vip' 0.631578947368421]
['sonic55go.info' 0.6086956521739131]


In [17]:
import tldextract

def isSimilar(sitename, similarity_threshold, whitelist_path="./domain-names.csv", max_results=10):
    """Print whitelist domains similar to ``sitename``, preferring shared names.

    The registrable name of ``sitename`` (``tldextract.extract(...).domain``)
    is used as the "main part". Every domain in the ``Domain`` column of the
    whitelist CSV is scored against ``sitename`` with
    ``difflib.SequenceMatcher.ratio()``. Domains whose own main part contains
    the main part of ``sitename`` are listed first; if they do not fill
    ``max_results`` rows, the remaining slots are filled with the
    best-scoring other domains. Only domains scoring at least
    ``similarity_threshold`` are ever printed.

    Parameters
    ----------
    sitename : str
        Domain name to look up.
    similarity_threshold : float
        Minimum ratio (inclusive) a domain needs to be printed.
    whitelist_path : str, optional
        Path of the whitelist CSV. Defaults to ``"./domain-names.csv"``.
    max_results : int, optional
        Maximum number of rows printed. Defaults to 10.

    Returns
    -------
    None
        Results are printed, one ``[domain score]`` row per line.
    """
    # Extract the main part of the domain.
    main_part = tldextract.extract(sitename).domain

    # Read whitelist; drop missing cells so SequenceMatcher never receives NaN.
    whitelist = pd.read_csv(whitelist_path)
    domains = whitelist["Domain"].dropna().astype(str).str.strip().tolist()

    # SequenceMatcher caches its analysis of the *second* sequence, so keep
    # sitename there and only swap the first sequence for each candidate.
    matcher = SequenceMatcher(None, "", sitename)

    # Score the whole whitelist once and split it into two disjoint groups.
    # The original fallback read an undefined name (new_whitelist_array) and
    # could re-add domains that were already listed.
    containing_main_part = []
    other_domains = []
    for domain in domains:
        matcher.set_seq1(domain)
        score = matcher.ratio()
        if score < similarity_threshold:
            continue  # below threshold: skip before the costlier extract call
        row = np.array([domain, score], dtype=object)
        if main_part in tldextract.extract(domain).domain:
            containing_main_part.append(row)
        else:
            other_domains.append(row)

    # Sort each group by similarity (stable for ties).
    containing_main_part.sort(key=lambda row: row[1], reverse=True)
    other_domains.sort(key=lambda row: row[1], reverse=True)

    # Domains sharing the main part come first; top up from the rest.
    results = containing_main_part[:max_results]
    results.extend(other_domains[:max_results - len(results)])

    for row in results:
        print(row)

# Example usage: 0.5 is the similarity threshold; change it to tighten or loosen matching
isSimilar("sbi.co.in", similarity_threshold=0.5)


['tsbid.com' 0.6666666666666666]
['vsbiow.vip' 0.631578947368421]
['kasbile.com' 0.6]
['gtsbill.com' 0.6]
['sbitkit.com' 0.6]
['fsbio.org' 0.5555555555555556]
['storesbit.com' 0.5454545454545454]
['swissbity.com' 0.5454545454545454]
['kosbizm.click' 0.5454545454545454]
['1wsbit.top' 0.5263157894736842]


In [16]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

def isSimilar(sitename, similarity_threshold=0.57, file_path="domain-names.txt"):
    """Print whitelist domains whose similarity to ``sitename`` beats a threshold.

    The whitelist is a text file with one domain per line. If ``sitename``
    is listed, "<sitename> is present" is printed and the function returns.
    Otherwise "absent" is printed, every listed domain is scored against
    ``sitename`` with ``difflib.SequenceMatcher.ratio()``, and each
    ``[domain, score]`` pair scoring strictly above the threshold is printed,
    highest score first.

    Parameters
    ----------
    sitename : str
        Domain name to look up.
    similarity_threshold : float, optional
        Exclusive lower bound on the ratio. Defaults to 0.57.
    file_path : str, optional
        Path of the whitelist text file. Defaults to ``"domain-names.txt"``.

    Returns
    -------
    None
        Results are printed.
    """
    # Read whitelist: one domain per line, blank lines ignored.
    with open(file_path, "r") as file:
        domains = [line.strip() for line in file]
    domains = [domain for domain in domains if domain]

    # If legit, stop here. Compare against the names themselves (the original
    # tested the string against [name, score] pairs, which never matched) and
    # return instead of exit(), which would shut down the notebook kernel.
    if sitename in domains:
        print(sitename + " is present")
        return
    print("absent")

    # SequenceMatcher caches its analysis of the *second* sequence, so keep
    # sitename there and only swap the first sequence for each candidate.
    matcher = SequenceMatcher(None, "", sitename)
    scored = []
    for domain in domains:
        matcher.set_seq1(domain)
        scored.append([domain, matcher.ratio()])

    # Highest similarity first (stable for ties).
    scored.sort(key=lambda pair: pair[1], reverse=True)

    for pair in scored:
        if pair[1] > similarity_threshold:
            print(pair)

# Run the lookup for each example site, with a blank line between reports.
example_lookups = [
    ("the similar website to irctc can be:", "irctc.gov.in"),
    ("the similar website to icici can be:", "icicibank.co.in"),
]
for position, (heading, site) in enumerate(example_lookups):
    if position > 0:
        print()
    print(heading)
    isSimilar(site)

the similar website to irctc can be:
absent
['irctc2023.co.in', 0.7407407407407407]
['iirtc.in', 0.7]
['beirut.vin', 0.6363636363636364]
['girlchat.online', 0.5925925925925926]
['firsatca.online', 0.5925925925925926]
['first-tac.online', 0.5714285714285714]
['icac.bond', 0.5714285714285714]

the similar website to icici can be:
absent
['iciicibanking.co', 0.7741935483870968]
['1cicicbnk.in', 0.7407407407407407]
['citicsbanks.com', 0.6666666666666666]
['iiiak.com', 0.6666666666666666]
['pricobank.com', 0.6428571428571429]
['morccbank.com', 0.6428571428571429]
['vicban.com', 0.64]
['ivorycbank.com', 0.6206896551724138]
['icoastbank.com', 0.6206896551724138]
['irkbank.com', 0.6153846153846154]
['viiiasn.com', 0.6153846153846154]
['xiiiasn.com', 0.6153846153846154]
['cmcrank.com', 0.6153846153846154]
['iconicb.com', 0.6153846153846154]
['irqbank.com', 0.6153846153846154]
['xiiiask.com', 0.6153846153846154]
['kiwiban.com', 0.6153846153846154]
['irctc2023.co.in', 0.6]
['iridian.company', 0.6