### Imports et Functions utiles 

In [1]:
# Import de packages / Librairies 
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from wordcloud import WordCloud

from datetime import datetime
import xml.etree.ElementTree as ET

from fuzzywuzzy import fuzz
from fuzzywuzzy import process


In [2]:
# Variable 
XML_URL = 'https://red.flag.domains/index.xml'

In [3]:
def download_xml(url):
    response = requests.get(url)
    return response.text

In [4]:
def get_metadata(url): 
    data = download_xml(url)
    root = ET.fromstring(data)
    
    items = root.findall('.//item')
    titles = []
    links = []
    dates = []
    descriptions = []
    for item in items:
        #print('=============')
        #print(title)
        #print(item)
        titles.append(item.find('title').text)
        links.append(item.find('link').text)
        dates.append(item.find('pubDate').text)
        descriptions.append(item.find('description').text)
        
    data = {'date': dates, 'url': links, 'info': titles, "descriptions": descriptions}
    df = pd.DataFrame(data)
    return df

In [5]:
def get_all_domain(url): 
    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')

    p_list = []
    for p in paragraphs:
        p_list.append(p.text)
        
    p_list = p_list[1:]


    p_list = [el.split('\n') for el in p_list]
    
    
    flat_list = [item for sublist in p_list for item in sublist]
    
    return flat_list

In [6]:
def clean_domain(x): 
    x = x.split()[0]
    x = x.replace('[', '').replace(']', '')
    return x

In [7]:
def convert_date(date): 
    return datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %z')

### Recuperation des données 

In [8]:
df_xml = get_metadata(XML_URL)

In [9]:
df_xml.head()

Unnamed: 0,date,url,info,descriptions
0,"Tue, 25 Jun 2024 00:00:00 +0000",https://red.flag.domains/posts/2024-06-25/,List of 30 new domains,.fr adista-groupe[.fr] (registrar: IONOS SE)\n...
1,"Mon, 24 Jun 2024 00:00:00 +0000",https://red.flag.domains/posts/2024-06-24/,List of 13 new domains,.fr administration-vitale[.fr] (registrar: KEY...
2,"Sun, 23 Jun 2024 00:00:00 +0000",https://red.flag.domains/posts/2024-06-23/,List of 4 new domains,.fr gemo-services[.fr] (registrar: Hosting Con...
3,"Sat, 22 Jun 2024 00:00:00 +0000",https://red.flag.domains/posts/2024-06-22/,List of 15 new domains,.fr assurancemaladie-vital[.fr] (registrar: On...
4,"Fri, 21 Jun 2024 00:00:00 +0000",https://red.flag.domains/posts/2024-06-21/,List of 18 new domains,.fr bourseramabanque[.fr] (registrar: )\ncaf-l...


In [10]:
# Suppresion des 3 derniers lignes 
df_xml = df_xml[:-3]

In [11]:
# Scrapping jour par jour
df_xml['domains'] = df_xml['url'].apply(lambda url: get_all_domain(url))

KeyboardInterrupt: 

In [None]:
df_xml.head()

Unnamed: 0,date,url,info,descriptions,domains
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,"[11securiteabofr[.fr] (registrar: GANDI), 12se..."
1,"Wed, 24 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-24/,List of 22 new domains,.fr 1sendsecuritefr[.fr] (registrar: GANDI)\n2...,"[1sendsecuritefr[.fr] (registrar: GANDI), 2sen..."
2,"Tue, 23 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-23/,List of 13 new domains,.fr amende-antai[.fr] (registrar: KEY-SYSTEMS ...,[amende-antai[.fr] (registrar: KEY-SYSTEMS Gmb...
3,"Mon, 22 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-22/,List of 16 new domains,.fr chronopost-colistracking[.fr] (registrar: ...,[chronopost-colistracking[.fr] (registrar: KEY...
4,"Sun, 21 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-21/,List of 27 new domains,.fr ameli-verifs[.fr] (registrar: KEY-SYSTEMS ...,[ameli-verifs[.fr] (registrar: KEY-SYSTEMS Gmb...


In [None]:
df_xml['nb of domain'] = df_xml['domains'].apply(lambda x: len(x))

In [None]:
df_xml.head()

Unnamed: 0,date,url,info,descriptions,domains,nb of domain
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,"[11securiteabofr[.fr] (registrar: GANDI), 12se...",29
1,"Wed, 24 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-24/,List of 22 new domains,.fr 1sendsecuritefr[.fr] (registrar: GANDI)\n2...,"[1sendsecuritefr[.fr] (registrar: GANDI), 2sen...",22
2,"Tue, 23 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-23/,List of 13 new domains,.fr amende-antai[.fr] (registrar: KEY-SYSTEMS ...,[amende-antai[.fr] (registrar: KEY-SYSTEMS Gmb...,13
3,"Mon, 22 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-22/,List of 16 new domains,.fr chronopost-colistracking[.fr] (registrar: ...,[chronopost-colistracking[.fr] (registrar: KEY...,16
4,"Sun, 21 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-21/,List of 27 new domains,.fr ameli-verifs[.fr] (registrar: KEY-SYSTEMS ...,[ameli-verifs[.fr] (registrar: KEY-SYSTEMS Gmb...,27


In [None]:
# Explode de la DB pour avoir une ligne par domaine 
df_explode = df_xml.explode('domains')

In [None]:
df_domains = df_xml.explode('domains')

In [None]:
df_domains.head()

Unnamed: 0,date,url,info,descriptions,domains,nb of domain
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,11securiteabofr[.fr] (registrar: GANDI),29
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,12securiteabofr[.fr] (registrar: GANDI),29
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,13securitebro[.fr] (registrar: KEY-SYSTEMS GmbH),29
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,1sfcpart-impotsgouv[.fr] (registrar: 1API GmbH),29
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,actualisationinfo[.fr] (registrar: KEY-SYSTEMS...,29


In [None]:
df_domains.drop_duplicates('domains', inplace = True)

In [None]:
# Clean des noms de domains 
df_domains['clean_dom'] = df_domains['domains'].apply(lambda x: clean_domain(x))

In [None]:
df_domains[['date', 'url', 'info', 'domains','nb of domain', 'clean_dom']].head()

Unnamed: 0,date,url,info,domains,nb of domain,clean_dom
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,11securiteabofr[.fr] (registrar: GANDI),29,11securiteabofr.fr
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,12securiteabofr[.fr] (registrar: GANDI),29,12securiteabofr.fr
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,13securitebro[.fr] (registrar: KEY-SYSTEMS GmbH),29,13securitebro.fr
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,1sfcpart-impotsgouv[.fr] (registrar: 1API GmbH),29,1sfcpart-impotsgouv.fr
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,actualisationinfo[.fr] (registrar: KEY-SYSTEMS...,29,actualisationinfo.fr


In [None]:
# split des mots contenue dans un domain 
df_domains['words'] = df_domains['clean_dom'].apply(lambda x: x.split('.')[0])
df_domains['words'] = df_domains['words'].apply(lambda x: x.split('-'))

In [None]:
df_domains.head()

Unnamed: 0,date,url,info,descriptions,domains,nb of domain,clean_dom,words
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,11securiteabofr[.fr] (registrar: GANDI),29,11securiteabofr.fr,[11securiteabofr]
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,12securiteabofr[.fr] (registrar: GANDI),29,12securiteabofr.fr,[12securiteabofr]
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,13securitebro[.fr] (registrar: KEY-SYSTEMS GmbH),29,13securitebro.fr,[13securitebro]
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,1sfcpart-impotsgouv[.fr] (registrar: 1API GmbH),29,1sfcpart-impotsgouv.fr,"[1sfcpart, impotsgouv]"
0,"Thu, 25 May 2023 00:00:00 +0000",https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,actualisationinfo[.fr] (registrar: KEY-SYSTEMS...,29,actualisationinfo.fr,[actualisationinfo]


In [None]:
# Transformation de la date 
df_domains['date'] = df_domains['date'].apply(lambda x: convert_date(x))
df_domains['date'] = df_domains['date'].apply(lambda x: str(x).split()[0])

In [None]:
df_domains.to_excel('data.xlsx')

### Analyse des mots 

In [None]:
mots_populaire = Counter(all_words).most_common(2000)

NameError: name 'all_words' is not defined

In [None]:
mots_populaire = [mot for mot, _ in mots_populaire]


In [None]:
print(mots_populaire)

['ameli', 'chronopost', 'info', 'netflix', 'service', '', 'xn', 'support', 'antai', 'espace', 'client', 'vitale', 'france', 'renouvellement', 'amendes', 'compte', 'colis', 'mon', 'amende', 'secure', 'suivi', 'carte', 'sfr', 'gouv', 'chrono', 'assurance', 'sg', 'fr', 'paiement', 'societegenerale', 'bnpparibas', 'critair', 'livraison', 'connexion', 'sante', 'aide', 'abonnement', 'infos', 'cartevitale', 'assistance', 'cybertek', 'securite', 'particuliers', 'amazon', 'formulaire', 'caisse', 'ma', 'maladie', 'netfiix', 'cpam', 'clients', 'auth', 'particulier', 'ca', 'colissimo', 'contravention', 'air', 'epargne', 'orange', 'moncompte', 'verif', 'regularisation', 'verification', 'services', 'leboncoin', 'banque', 'paypal', 'assure', 'certificat', 'renouv', 'credit', 'monespace', 'login', 'activation', 'edf', 'amelie', 'authentification', 'disney', 'bnp', 'facturation', 'esim', 'espaceclient', 'informations', 'contact', 'secu', 'vignette', 'acheminement', 'reglement', 'generale', 'information

In [None]:
mots_a_supprimer = ['compte','espace','mon','client','assistance','info', 'service', '', 'xn', 'support', 'secure', 'suivi', 'fr', 'connexion', 'aide', 'abonnement', 'infos', 'securite', 'particuliers', 'formulaire', 'ma', 'clients', 'auth', 'particulier', 'epargne', 'moncompte', 'verif', 'regularisation', 'verification', 'services', 'assure', 'certificat', 'renouv', 'credit', 'monespace', 'login', 'activation', 'authentification', 'facturation', 'espaceclient', 'informations', 'contact', 'vignette', 'acheminement', 'reglement', 'information', 'mes', 'account', 'ac', 'app', 'portail', 'acces', 'suivis', 'www', 'dossier', 'pass', 'connect', 'paiements', 'redirection', 'groupe', 'sms', 'web', 'help', 'e', 'annulation', 'nord', 'instruction', 'paris', 'du', 'confirmation', 'abonnements', 'votre', 'suivre', 'scurit', 'actualisation', 'demande', 'supports', 'remboursement', 'express', '2023', 'cledigitale', 'la', 'shop', 'a', 'securpass', 'digitale', 'relivraison', 'securisation', 'infractions', 'gestion', 'messagerie', 'nouvelle', 'aides', 'accueil', 'mobile', 'espaceclients', 'commande', 'u', 'reactivation', 'online', 'publique', 'comptes', 'pro', 'securise', 'group', 'livraisons', 'store', 'identification', 'jour' ]
word_list = list(set(mots_populaire).difference(mots_a_supprimer))
print(word_list)

NameError: name 'mots_populaire' is not defined

TEST MMK

In [None]:
df_domains['match'] = "NA"

In [None]:
#filtered_df = df[df['match'].str.contains('instagram')]

In [None]:
process.extract("cpf", liste_mots, scorer=fuzz.ratio)

NameError: name 'liste_mots' is not defined

In [None]:
#keywords = ['netflix','amazon','instagram','snapchat','facebook','chronopost','colissimo','ameli','sfr','orange','bouygues','free','societegenerale','sg','bnpparibas','bnp','lcl','ca','laposte','cpam','caf','ameli','gouv','securitesociale','secu','cartevitale','vitale','cartevitale','antai','ants', 'amendes','assurance','cybertek','caisseepargne','cpf','assurancemaladie','assurance','carrefour','contraventions','labanquepostale','creditmutuelle','boursorama','nike','navigo','urssaf','banquedefrance','critair','gendarmerie','franprix','sncf','castorama','timberland','google','leboncoin']

In [None]:
def find_similar_words(keyword, dataframe, threshold=80):
    similar_words = []
    
    # Rechercher les occurrences similaires dans la colonne "clean_dom"
    matches = process.extract(keyword, dataframe['clean_dom'], scorer=fuzz.partial_ratio, limit=None)
    
    # Filtrer les résultats en fonction du seuil de similarité
    matches = [match for match in matches if match[1] >= threshold]
    
    # Ajouter les dérivés similaires à la liste des mots similaires
    for match in matches:
        similar_words.append(match[0])
    
    return similar_words

# Liste de mots clés à rechercher
keywords = ['netflix','amazon','instagram','snapchat','facebook','chronopost','colissimo','ameli','sfr','orange','bouygues','free','societegenerale','sg','bnpparibas','bnp','lcl','creditagricole','caissedepargne','paypal','microsoft','youtube','tiktok','bershka','laposte','cpam','caf','ameli','gouv','securitesocial','cartevitale','vitale','cartevitale','antai','ants', 'amendes','assurance','cybertek','caisseepargne','cpf','assurancemaladie','assurance','carrefour','contraventions','labanquepostale','creditmutuelle','boursorama','nike','navigo','urssaf','banquedefrance','critair','gendarmerie','franprix','sncf','castorama','timberland','google','leboncoin','dhl','airbnb','bershka','sfr','fnac','disneyplus','maladie']

# Créer un dictionnaire pour stocker les mots similaires pour chaque mot clé
similar_words_dict = {}

# Parcourir les mots clés et trouver les dérivés similaires
for keyword in keywords:
    similar_words = find_similar_words(keyword, df_domains, threshold=80)
    similar_words_dict[keyword] = similar_words

# Mettre à jour la colonne "clean_dom" du DataFrame avec les mots similaires correspondants
for keyword, similar_words in similar_words_dict.items():
    df_domains.loc[df_domains['clean_dom'].isin(similar_words), 'match'] = keyword

# Afficher le DataFrame mis à jour
#print(df_domains)

In [None]:
test = df_domains[df_domains['match'].str.contains('bnp')]
test

Unnamed: 0,date,url,info,descriptions,domains,nb of domain,clean_dom,words,match
39,2023-04-16,https://red.flag.domains/posts/2023-04-16/,List of 55 new domains,.fr adoctolib[.fr] (registrar: OVH)\nagencenat...,bnp-paiement-annulation[.fr] (registrar: KEY-S...,55,bnp-paiement-annulation.fr,"[bnp, paiement, annulation]",bnp
52,2023-04-03,https://red.flag.domains/posts/2023-04-03/,List of 20 new domains,.fr acheter-viagra[.fr] (registrar: EPAG Domai...,www-bnpparibas[.fr] (registrar: Realtime Regis...,20,www-bnpparibas.fr,"[www, bnpparibas]",bnp
57,2023-03-29,https://red.flag.domains/posts/2023-03-29/,List of 48 new domains,.fr acceuil-amendes[.fr] (registrar: KEY-SYSTE...,support-techniqueassistancebnpp[.fr] (registra...,48,support-techniqueassistancebnpp.fr,"[support, techniqueassistancebnpp]",bnp
76,2023-03-09,https://red.flag.domains/posts/2023-03-09/,List of 40 new domains,.fr ameli-assurance-maladie[.fr] (registrar: K...,monespaceclientbnp[.fr] (registrar: KEY-SYSTEM...,40,monespaceclientbnp.fr,[monespaceclientbnp],bnp
81,2023-03-04,https://red.flag.domains/posts/2023-03-04/,List of 33 new domains,.fr amendes-information[.fr] (registrar: KEY-S...,bnp-espace-particuliers[.fr] (registrar: KEY-S...,33,bnp-espace-particuliers.fr,"[bnp, espace, particuliers]",bnp
...,...,...,...,...,...,...,...,...,...
431,2022-03-18,https://red.flag.domains/posts/2022-03-18/,List of 15 new domains,.fr assurance-maladie-assistance[.fr]\nbnp-gro...,bnp-group[.fr],15,bnp-group.fr,"[bnp, group]",bnp
431,2022-03-18,https://red.flag.domains/posts/2022-03-18/,List of 15 new domains,.fr assurance-maladie-assistance[.fr]\nbnp-gro...,bnpparibas-securisation[.fr],15,bnpparibas-securisation.fr,"[bnpparibas, securisation]",bnp
431,2022-03-18,https://red.flag.domains/posts/2022-03-18/,List of 15 new domains,.fr assurance-maladie-assistance[.fr]\nbnp-gro...,cib-bnpparibas[.fr],15,cib-bnpparibas.fr,"[cib, bnpparibas]",bnp
436,2022-03-13,https://red.flag.domains/posts/2022-03-13/,List of 6 new domains,.fr caisse-dep[.fr]\nlabanquepopulaires[.fr]\n...,service-mabanque-bnpparibas[.fr],6,service-mabanque-bnpparibas.fr,"[service, mabanque, bnpparibas]",bnp


In [None]:
df_domains

Unnamed: 0,date,url,info,descriptions,domains,nb of domain,clean_dom,words,match
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,11securiteabofr[.fr] (registrar: GANDI),29,11securiteabofr.fr,[11securiteabofr],
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,12securiteabofr[.fr] (registrar: GANDI),29,12securiteabofr.fr,[12securiteabofr],
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,13securitebro[.fr] (registrar: KEY-SYSTEMS GmbH),29,13securitebro.fr,[13securitebro],
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,1sfcpart-impotsgouv[.fr] (registrar: 1API GmbH),29,1sfcpart-impotsgouv.fr,"[1sfcpart, impotsgouv]",gouv
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,actualisationinfo[.fr] (registrar: KEY-SYSTEMS...,29,actualisationinfo.fr,[actualisationinfo],
...,...,...,...,...,...,...,...,...,...
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,secure-snapchat[.fr],10,secure-snapchat.fr,"[secure, snapchat]",snapchat
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,verificatio-dsp2-secure[.fr],10,verificatio-dsp2-secure.fr,"[verificatio, dsp2, secure]",
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,verification-secu-secure[.fr],10,verification-secu-secure.fr,"[verification, secu, secure]",
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,votreconseillerorangeenvisio[.fr],10,votreconseillerorangeenvisio.fr,[votreconseillerorangeenvisio],orange


In [None]:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Supposons que vous ayez un DataFrame nommé df contenant la colonne "match"

# Filtrer les lignes où la valeur de la colonne "match" est "NA"
#df_filtered = df_domains[df_domains["match"] == "NA"]
df_filtered = df_domains["match"]

# Concaténer toutes les valeurs de la colonne "match" dans une seule chaîne de caractères
text = " ".join(df_filtered["clean_dom"])

# Créer un objet WordCloud avec les paramètres souhaités
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

# Afficher le nuage de mots
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

KeyError: 'clean_dom'

In [None]:
import requests

def verifier_urls(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return "oui"
    except requests.exceptions.RequestException:
        return "non"

df_domains['is_accessible'] = df_domains['clean_dom'].apply(verifier_urls)

In [None]:
print(df_domains['is_accessible']=="oui")

0      False
0      False
0      False
0      False
0      False
       ...  
448    False
448    False
448    False
448    False
448    False
Name: is_accessible, Length: 17437, dtype: bool


In [None]:
df_domains

Unnamed: 0,date,url,info,descriptions,domains,nb of domain,clean_dom,words,match,is_accessible
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,11securiteabofr[.fr] (registrar: GANDI),29,11securiteabofr.fr,[11securiteabofr],,non
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,12securiteabofr[.fr] (registrar: GANDI),29,12securiteabofr.fr,[12securiteabofr],,non
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,13securitebro[.fr] (registrar: KEY-SYSTEMS GmbH),29,13securitebro.fr,[13securitebro],,non
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,1sfcpart-impotsgouv[.fr] (registrar: 1API GmbH),29,1sfcpart-impotsgouv.fr,"[1sfcpart, impotsgouv]",gouv,non
0,2023-05-25,https://red.flag.domains/posts/2023-05-25/,List of 29 new domains,.fr 11securiteabofr[.fr] (registrar: GANDI)\n1...,actualisationinfo[.fr] (registrar: KEY-SYSTEMS...,29,actualisationinfo.fr,[actualisationinfo],,non
...,...,...,...,...,...,...,...,...,...,...
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,secure-snapchat[.fr],10,secure-snapchat.fr,"[secure, snapchat]",snapchat,non
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,verificatio-dsp2-secure[.fr],10,verificatio-dsp2-secure.fr,"[verificatio, dsp2, secure]",,non
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,verification-secu-secure[.fr],10,verification-secu-secure.fr,"[verification, secu, secure]",,non
448,2022-03-01,https://red.flag.domains/posts/2022-03-01/,List of 10 new domains,.fr amazon-regist[.fr]\nauth-leboncoin[.fr]\np...,votreconseillerorangeenvisio[.fr],10,votreconseillerorangeenvisio.fr,[votreconseillerorangeenvisio],orange,non


In [None]:
af = df_domains

In [None]:
len(df_domains) 

17437

In [None]:
############## FONCTION DE BASE
def find_similar_words(keyword, dataframe, threshold=80):
    similar_words = set()
    
    # Rechercher les occurrences similaires dans la colonne "clean_dom"
    matches = process.extract(keyword, dataframe['clean_dom'], scorer=fuzz.partial_ratio, limit=None)
    
    # Filtrer les résultats en fonction du seuil de similarité
    matches = [match for match in matches if match[1] >= threshold]
    
    # Ajouter les dérivés similaires à l'ensemble des mots similaires
    for match in matches:
        similar_words.add(match[0])
    
    return similar_words

# Mot clé à rechercher
keyword = 'chronopost'

# Appeler la fonction pour trouver les dérivés similaires dans la colonne "clean_dom"
similar_words = find_similar_words(keyword, df_domains, threshold=80)

# Afficher les dérivés similaires
print(f"Similar words for '{keyword}':")
for word in similar_words: 
    print(word)

Similar words for 'chronopost':
infoscolis-chronopost.fr
service-livraison-chronopost.fr
colissimo-chronopost.fr
chronopost-remise.fr
chronopost-facturations.fr
services-chronopost.fr
chronopostpickup.fr
chronopost-suivi-express.fr
chronopostt-service.fr
chronopost-ma-livraison.fr
livraison-commande-chronopost.fr
chronopost-aide-colis.fr
chronopost-restrictions.fr
centrelivraison-chronopost.fr
chronopost-suivis-colis.fr
colis-chronopost-livraison.fr
chronopost-collect.fr
chronopost-suivit.fr
chronopost-clients-suivi.fr
chronopostenligne.fr
chronopostpickups.fr
livraison-chrono-post.fr
chronopostale.fr
chronopost-supportcolis.fr
chronopostcolisprobleme.fr
chronopostsuivre.fr
chronopost-douanes.fr
chronopost-suivremoncolis.fr
chronopost-malivraison.fr
chronopost-supports.fr
contact-chronopost.fr
coiis-chronopost.fr
relaispickup-chronopost.fr
chronopostgestion.fr
chronopost-info-livraison.fr
chronopost-suivi-coiis.fr
votre-colis-chronopost.fr
espacechronopost.fr
mon-suivi-chronopost-clien

In [None]:
df_domains.to_csv('v2_pishing.csv', index=False)

### Analyse des tendances 

In [None]:
df_domains['is_chonopost'] = df_domains['words'].apply(lambda x: "chronopost" in x)
df_domains['is_ameli'] = df_domains['words'].apply(lambda x: "ameli" in x)

In [None]:
df_domains[df_domains['is_chonopost'] == True].head()

Unnamed: 0,date,url,info,descriptions,domains,nb of domain,clean_dom,words,is_chonopost
1,2023-05-22,https://red.flag.domains/posts/2023-05-22/,List of 16 new domains,.fr chronopost-colistracking[.fr] (registrar: ...,chronopost-colistracking[.fr] (registrar: KEY-...,16,chronopost-colistracking.fr,"[chronopost, colistracking]",True
1,2023-05-22,https://red.flag.domains/posts/2023-05-22/,List of 16 new domains,.fr chronopost-colistracking[.fr] (registrar: ...,erreur-acheminement-chronopost[.fr] (registrar...,16,erreur-acheminement-chronopost.fr,"[erreur, acheminement, chronopost]",True
3,2023-05-20,https://red.flag.domains/posts/2023-05-20/,List of 26 new domains,.fr amende-en-attente[.fr] (registrar: KEY-SYS...,reglement-chronopost[.fr] (registrar: KEY-SYST...,26,reglement-chronopost.fr,"[reglement, chronopost]",True
4,2023-05-19,https://red.flag.domains/posts/2023-05-19/,List of 58 new domains,.fr adcco[.fr] (registrar: OVH)\namende-servic...,chronopost-relivraisons[.fr] (registrar: OVH),58,chronopost-relivraisons.fr,"[chronopost, relivraisons]",True
4,2023-05-19,https://red.flag.domains/posts/2023-05-19/,List of 58 new domains,.fr adcco[.fr] (registrar: OVH)\namende-servic...,relivraisons-chronopost[.fr] (registrar: OVH),58,relivraisons-chronopost.fr,"[relivraisons, chronopost]",True


In [None]:
len(df_domains)

17386

In [None]:
len(df_domains[df_domains['is_ameli'] == True] + df_domains[df_domains['is_chonopost'] == True])

1535

In [None]:
dict_association = {
  "ameli": ["declarationameli", "proameli", "rediaameli", "ameli-renouvellements-vitale", "espace-carte-ameli", "ameli-gestion", "portail-ameli", "espace-ameli-portail", "avertissement-ameli", "accueil-ameli", "ameli-clientcartevitale", "ameli-mobile", "assistance-compte-ameli", "amelicpam", "amelicartevitale", "amelis-france", "ameliss-france"],
  "bnpparibas": ["espace-mabanque-bnpparibas", "mabanque-bnp-paribas-digitale", "secure-mabanque-bnpparibas", "servicesecurebnpparimacledigital", "supportmacleditalemiseajour"],
  "netflix": ["netflix-aides", "netflix-maj", "netflix-verification-login", "netflix-checksecu", "aides-netflix"],
  "paypal": ["support-paypal-securisation", "paypal-reconfiguration"],
  "caisse-epargne": ["caisse-epargne-particuliers", "espace-caisse", "la-caisse-epargne"],
  "fnacdarty": ["fnacdarty-groupe"],
  "orange": ["connexion-orange", "reseauorange-verif-rio"],
  "ag2rlemondiale": ["ag2rlemondiale"],
  "airbnb": ["airbnb-support", "airbnb-suspension", "airbnb-team", "airbnb-verif"],
  "apple": ["apple-assistance", "apple-findmypone", "apple-localisation", "apple-support", "apple-localiser"],
  "carrefour": ["carrefour-banque-service", "carrefour-connexion-clients", "carrefoour"],
  "cpam": ["cpamcartevitale"],
  "finances-gouv": ["financesgouv", "finances-gouv-info"],
  "instagram": ["instagramfrance"],
  "labanquepostale": ["la-banquepostales", "labqpostale"],
  "leboncoin": ["leboncoin-p2pdealsecure-payin"],
  "microsoft": ["microsoft-pro"],
  "pornhub": ["pornhubfr"],
  "spotify": ["spotifypayment"],
  "ca": ["client-caisse-epargne", "ca-connexion-clients", "caissesepargne", "caissee-epargnes"],
  "caf": ["votrecaissecaf", "auth-caf"],
  "gouv": ["retraitegouv", "culturegouv", "defense-gov", "etudiantgouv"],
  "groupedarty": ["group-darty"],
  "intermarche": ["intermarche-express"],
  "microsoft": ["microsoft-pro"],
  "orangebank": ["orangebank-immobilier"],
  "airbnb": ["airbnb-suspension"],
  "bnpparibas": ["bnpparibas-securisation", "infosecubanquenationaledeparis", "sg-conexion-particuliers"]
}
