# **Web Scraping**

In [None]:
!pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from bs4 import BeautifulSoup

html_code = '''
<table class="matrix side">
    <thead>
        <tr>
            <td class="tactic name"><a href="/tactics/TA0043" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0043">Reconnaissance</a></td>
            <td class="tactic name"><a href="/tactics/TA0042" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0042">Resource Development</a></td>
            <td class="tactic name"><a href="/tactics/TA0001" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0001">Initial Access</a></td>
            <td class="tactic name"><a href="/tactics/TA0002" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0002">Execution</a></td>
            <td class="tactic name"><a href="/tactics/TA0003" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0003">Persistence</a></td>
            <td class="tactic name"><a href="/tactics/TA0004" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0004">Privilege Escalation</a></td>
            <td class="tactic name"><a href="/tactics/TA0005" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0005">Defense Evasion</a></td>
            <td class="tactic name"><a href="/tactics/TA0006" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0006">Credential Access</a></td>
            <td class="tactic name"><a href="/tactics/TA0007" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0007">Discovery</a></td>
            <td class="tactic name"><a href="/tactics/TA0008" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0008">Lateral Movement</a></td>
            <td class="tactic name"><a href="/tactics/TA0009" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0009">Collection</a></td>
            <td class="tactic name"><a href="/tactics/TA0011" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0011">Command and Control</a></td>
            <td class="tactic name"><a href="/tactics/TA0010" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0010">Exfiltration</a></td>
            <td class="tactic name"><a href="/tactics/TA0040" data-toggle="tooltip" data-placement="top" title="" data-original-title="TA0040">Impact</a></td>
        </tr>
    </thead>
</table>
'''

# Parse the static matrix snippet and collect the anchor of every header cell.
soup = BeautifulSoup(html_code, 'html.parser')
matrix_table = soup.find('table', class_='matrix side')
tactic_anchors = [td.find('a') for td in matrix_table.find('tr').find_all('td')]

# links_dict maps each tactic name to its relative URL;
# link_text_list keeps the tactic names in the order they appear in the row.
links_dict = {}
link_text_list = []
for anchor in tactic_anchors:
    tactic_label = anchor.text.strip()
    links_dict[tactic_label] = anchor['href']
    link_text_list.append(tactic_label)

# Show the full mapping, then each relative URL in column order, then the count.
print(links_dict)
for tactic_label in link_text_list:
    print(links_dict[tactic_label])
print(len(link_text_list))


{'Reconnaissance': '/tactics/TA0043', 'Resource Development': '/tactics/TA0042', 'Initial Access': '/tactics/TA0001', 'Execution': '/tactics/TA0002', 'Persistence': '/tactics/TA0003', 'Privilege Escalation': '/tactics/TA0004', 'Defense Evasion': '/tactics/TA0005', 'Credential Access': '/tactics/TA0006', 'Discovery': '/tactics/TA0007', 'Lateral Movement': '/tactics/TA0008', 'Collection': '/tactics/TA0009', 'Command and Control': '/tactics/TA0011', 'Exfiltration': '/tactics/TA0010', 'Impact': '/tactics/TA0040'}
/tactics/TA0043
/tactics/TA0042
/tactics/TA0001
/tactics/TA0002
/tactics/TA0003
/tactics/TA0004
/tactics/TA0005
/tactics/TA0006
/tactics/TA0007
/tactics/TA0008
/tactics/TA0009
/tactics/TA0011
/tactics/TA0010
/tactics/TA0040
14


# **Individual Data CSV**

In [None]:
import csv
import requests
from bs4 import BeautifulSoup


# Position in link_text_list of the tactic to export. The header row parsed
# earlier yields 14 names, so 13 selects the last one.
TACTIC_INDEX = 13
tactic_name = link_text_list[TACTIC_INDEX]

# Send a GET request to the tactic page; fail fast on HTTP errors and do not
# hang forever on an unresponsive server.
url = "https://attack.mitre.org" + links_dict[tactic_name]
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.text, "html.parser")

# Find the table body, raising a readable error instead of an AttributeError
# on None when the expected table is missing.
table = soup.find('table', class_='table-techniques')
table_body = table.find('tbody') if table else None
if table_body is None:
    raise ValueError(f"No 'table-techniques' table body found at {url}")

# Write the CSV inside a context manager so the file is closed even if a row
# fails to parse; UTF-8 keeps non-ASCII description text intact on any platform.
with open(tactic_name + '.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['ID', 'Technique', 'Description'])

    # ID of the most recent technique row; carried over to 3-cell rows that
    # are not marked with the 'technique' class.
    id_value = ''

    # Iterate over the rows in the table body
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        # row.get('class') is None for rows without a class attribute.
        row_classes = row.get('class') or []

        if len(cells) == 3:
            if 'technique' in row_classes:
                # Get the value from the first <a> tag in the technique row
                id_link = cells[0].find('a')
                id_value = id_link.text.strip() if id_link else ''

            # Extract the text from the cells
            name_link = cells[1].find('a')
            technique = name_link.text.strip() if name_link else ''
            description = cells[2].text.strip()

        elif 'sub' in row_classes and len(cells) >= 4:
            # 'sub' rows are read from cells 1..3: ID suffix, name, description.
            id_link = cells[1].find('a')
            name_link = cells[2].find('a')
            id_value = id_link.text.strip() if id_link else ''
            # Guard on the cell actually being read (the original checked cells[1]).
            technique = name_link.text.strip() if name_link else ''
            description = cells[3].text.strip()

        else:
            # Unrecognised row layout: skip it rather than re-writing the
            # previous row's values (or failing on undefined names).
            continue

        # Write the extracted data to the CSV file
        csv_writer.writerow([id_value, technique, description])


# **Combined Data CSV**

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Prepare the CSV file inside a context manager so it is closed even if a
# request or a row fails; UTF-8 matches the default encoding pandas.read_csv
# uses when the file is loaded again later.
with open('combined_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['ID', 'Technique', 'Description'])

    # Iterate over the tactic names in link_text_list
    for url_key in link_text_list:
        # Get the URL from links_dict
        url = "https://attack.mitre.org" + links_dict[url_key]

        # Send a GET request to the website; fail fast on HTTP errors and do
        # not hang forever on an unresponsive server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse the HTML content using Beautiful Soup
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the table body, raising a readable error instead of an
        # AttributeError on None when the expected table is missing.
        table = soup.find('table', class_='table-techniques')
        table_body = table.find('tbody') if table else None
        if table_body is None:
            raise ValueError(f"No 'table-techniques' table body found at {url}")

        # ID of the most recent technique row on this page; carried over to
        # 3-cell rows that are not marked with the 'technique' class.
        id_value = ''

        # Iterate over the rows in the table body
        for row in table_body.find_all('tr'):
            cells = row.find_all('td')
            # row.get('class') is None for rows without a class attribute.
            row_classes = row.get('class') or []

            if len(cells) == 3:
                if 'technique' in row_classes:
                    # Get the value from the first <a> tag in the technique row
                    id_link = cells[0].find('a')
                    id_value = id_link.text.strip() if id_link else ''

                # Extract the text from the cells
                name_link = cells[1].find('a')
                technique = name_link.text.strip() if name_link else ''
                description = cells[2].text.strip()

            elif 'sub' in row_classes and len(cells) >= 4:
                # 'sub' rows are read from cells 1..3: ID suffix, name, description.
                id_link = cells[1].find('a')
                name_link = cells[2].find('a')
                id_value = id_link.text.strip() if id_link else ''
                # Guard on the cell actually being read (the original checked cells[1]).
                technique = name_link.text.strip() if name_link else ''
                description = cells[3].text.strip()

            else:
                # Unrecognised row layout: skip it rather than re-writing the
                # previous row's values (or failing on undefined names).
                continue

            # Write the extracted data to the CSV file
            csv_writer.writerow([id_value, technique, description])


In [None]:
import re

def replace_wanted(string):
    """Return `string` with e-mail addresses and dotted-quad numbers masked.

    Every e-mail-shaped token is replaced by the literal 'email' first; then
    every run of four dot-separated 1-3 digit groups is replaced by 'ip'.
    """
    # (pattern, placeholder) pairs, applied in this order.
    substitutions = (
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'email'),
        (r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', 'ip'),
    )
    masked = string
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)
    return masked


# Example usage



In [None]:
import pandas as pd
import numpy as np


In [None]:
# Read the scraped rows back in. Force 'ID' to string so suffixes such as
# '.001' can never be inferred as floats; later cells index into each ID as text.
data = pd.read_csv('combined_data.csv', dtype={'ID': str})

In [None]:
data

Unnamed: 0,ID,Technique,Description
0,T1595,Active Scanning,Adversaries may execute active reconnaissance ...
1,.001,Scanning IP Blocks,Adversaries may scan victim IP blocks to gathe...
2,.002,Vulnerability Scanning,Adversaries may scan victims for vulnerabiliti...
3,.003,Wordlist Scanning,Adversaries may iteratively probe infrastructu...
4,T1592,Gather Victim Host Information,Adversaries may gather information about the v...
...,...,...,...
780,.001,Direct Network Flood,Adversaries may attempt to cause a denial of s...
781,.002,Reflection Amplification,Adversaries may attempt to cause a denial of s...
782,T1496,Resource Hijacking,Adversaries may leverage the resources of co-o...
783,T1489,Service Stop,Adversaries may stop or disable services on a ...


In [None]:
# Mask e-mail addresses and IP-like numbers in every description.
data['Description'] = data['Description'].map(replace_wanted)

In [None]:
# Independent copy of the ID column as a NumPy array, safe to modify in place.
id_clear = data['ID'].to_numpy(copy=True)

In [None]:
def id_cleaner(string):
  """Expand suffix-only IDs by prefixing the most recent ID that starts with 'T'.

  Walks `string` (a mutable, index-addressable sequence of IDs such as a list
  or NumPy array) in order. An entry starting with 'T' is remembered as the
  current parent; any other non-empty text entry is replaced by
  parent + entry (e.g. 'T1595' followed by '.001' yields 'T1595.001').

  Empty strings and non-string entries (such as NaN) are left unchanged
  instead of raising.

  The sequence is modified in place and returned. The one exception is a
  fixed-width NumPy unicode array: it is converted to an object array first,
  so a new array is returned in that case.
  """
  # Fixed-width unicode arrays silently truncate longer strings on assignment,
  # which would cut 'T1595.001' back to 'T1595'; object arrays do not.
  dtype = getattr(string, 'dtype', None)
  if dtype is not None and getattr(dtype, 'kind', None) == 'U':
    string = string.astype(object)

  add = ""
  for i in range(len(string)):
    value = string[i]

    # Skip missing values and empty strings; there is nothing to expand.
    if not isinstance(value, str) or not value:
      continue

    if value.startswith('T'):
      add = value
    else:
      string[i] = add + value
  return string


In [None]:
# Assign the cleaned IDs positionally. Wrapping them in a DataFrame first
# would make the assignment depend on index alignment with `data`.
data['ID'] = id_cleaner(id_clear)

In [None]:
data

Unnamed: 0,ID,Technique,Description
0,T1595,Active Scanning,Adversaries may execute active reconnaissance ...
1,T1595.001,Scanning IP Blocks,Adversaries may scan victim IP blocks to gathe...
2,T1595.002,Vulnerability Scanning,Adversaries may scan victims for vulnerabiliti...
3,T1595.003,Wordlist Scanning,Adversaries may iteratively probe infrastructu...
4,T1592,Gather Victim Host Information,Adversaries may gather information about the v...
...,...,...,...
780,T1498.001,Direct Network Flood,Adversaries may attempt to cause a denial of s...
781,T1498.002,Reflection Amplification,Adversaries may attempt to cause a denial of s...
782,T1496,Resource Hijacking,Adversaries may leverage the resources of co-o...
783,T1489,Service Stop,Adversaries may stop or disable services on a ...


In [None]:
# Expose the ID and Description values under the column names
# 'Tactic' and 'Procedure' (appended after the existing columns).
data = data.assign(Tactic=data['ID'], Procedure=data['Description'])

In [None]:
# Drop the original columns; their values were copied to 'Tactic' and 'Procedure'.
data=data.drop(columns=['ID','Description'])

In [None]:
# Column order for the final frame.
order=['Tactic','Technique','Procedure']

In [None]:
# Keep exactly the columns listed in `order`, in that order.
data=data.reindex(columns=order)

In [None]:
# Load the second dataset. No visible cell writes ttpmitreWhole.csv, so it
# presumably has to exist in the working directory already — TODO confirm its source.
data2=pd.read_csv('ttpmitreWhole.csv')

In [None]:
data2

Unnamed: 0,Tactic,Technique,Procedure
0,T0800,Activate Firmware Update Mode,Adversaries may activate firmware update mode ...
1,T0830,Adversary-in-the-Middle,Adversaries with privileged network access may...
2,T0878,Alarm Suppression,Adversaries may target protection function ala...
3,T0802,Automated Collection,Adversaries may automate collection of industr...
4,T0803,Block Command Message,Adversaries may block a command message from r...
...,...,...,...
551,M0918,User Account Management,"Manage the creation, modification, use, and pe..."
552,M0917,User Training,Train users to be aware of access or manipulat...
553,M0818,Validate Program Inputs,Devices and programs designed to interact with...
554,M0916,Vulnerability Scanning,Vulnerability scanning is used to find potenti...


In [None]:
# Stack the rows of `data` above those of `data2` and renumber the index from 0.
concatenated_df = pd.concat([data, data2], ignore_index=True)


In [None]:
concatenated_df

Unnamed: 0,Tactic,Technique,Procedure
0,T1595,Active Scanning,Adversaries may execute active reconnaissance ...
1,T1595.001,Scanning IP Blocks,Adversaries may scan victim IP blocks to gathe...
2,T1595.002,Vulnerability Scanning,Adversaries may scan victims for vulnerabiliti...
3,T1595.003,Wordlist Scanning,Adversaries may iteratively probe infrastructu...
4,T1592,Gather Victim Host Information,Adversaries may gather information about the v...
...,...,...,...
1336,M0918,User Account Management,"Manage the creation, modification, use, and pe..."
1337,M0917,User Training,Train users to be aware of access or manipulat...
1338,M0818,Validate Program Inputs,Devices and programs designed to interact with...
1339,M0916,Vulnerability Scanning,Vulnerability scanning is used to find potenti...


In [None]:
# Write the combined frame to disk; index=False leaves the row index out of the file.
concatenated_df.to_csv("concatenated_df.csv", index=False)