# Web Scraping

In [1]:
import os

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
# Download the hero statistics page and parse it into a BeautifulSoup tree.
url = 'https://www.unitstatistics.com/dota2/'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page.
page.raise_for_status()
# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser is installed, so the parse tree could differ by machine.
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
# `find` returns the first match, which is enough because the page has only one table.
table = soup.find('table')
# `find` returns None when nothing matches; fail here with a clear message
# instead of hitting an AttributeError on None in a later cell.
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

In [5]:
# Collect every header cell (<th> element) of the table; their text is used as the column titles.
html_titles = table.find_all('th')

In [6]:
# Take the text of each header cell and strip the surrounding whitespace.
table_titles = [header.get_text().strip() for header in html_titles]

In [7]:
table_titles[0]

'Hero\nHero Name'

In [8]:
# Split each scraped header into its short title and the description lines that follow it.
def extract_titles(html_list):
    """Separate header strings into titles and descriptions.

    Each item is split on newline characters. The text before the first
    newline is the title; everything after it is kept as a list of the
    remaining lines (an empty list when the item contains no newline).

    Args:
        html_list: iterable of header strings such as 'Hero\\nHero Name'.

    Returns:
        A tuple ``(titles, description)`` of two lists with one entry per
        input item: ``titles`` holds strings, ``description`` holds lists
        of strings.
    """
    titles = []
    description = []
    for header in html_list:
        # split() always yields at least one piece, so `head` is always bound.
        head, *rest = header.split('\n')
        titles.append(head)
        description.append(rest)
    return titles, description

In [9]:
# Split the cleaned header strings into column titles and their descriptions.
titles, description = extract_titles(table_titles)

In [10]:
# Keep each title next to its description in a DataFrame so it can be looked up
# later if needed (the titles themselves are mostly self-explanatory).
Hero_attributes_description = pd.DataFrame(dict(Title=titles, Description=description))

In [11]:
# Empty DataFrame whose columns are the scraped titles; its rows are filled in by a later cell.
Hero_attributes = pd.DataFrame(columns = titles)

In [12]:
# Collect every row (<tr> element) of the table.
# Note: despite its name, `column_data` holds table rows, not columns.
column_data = table.find_all('tr')

In [13]:
# Collect the cleaned cell text of every table row first, then build the
# DataFrame in a single step. Appending with `.loc` inside the loop is slow
# (the frame is grown once per row) and made this cell append duplicate rows
# whenever it was re-run; rebuilding the frame makes the cell idempotent.
# `column_data[1:]` skips the first <tr> (presumably the header row).
rows = []
for row in column_data[1:]:
    # Text of each <td> cell in this row, with surrounding whitespace removed.
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Skip rows without any <td> cells; they carry no hero data.
    if cells:
        rows.append(cells)

# Reuse the column titles of the existing frame so the header is unchanged.
Hero_attributes = pd.DataFrame(rows, columns=Hero_attributes.columns)


In [14]:
Hero_attributes

Unnamed: 0,Hero,Attribute,Complexity,Strength,Strength Gain,Strength Max,Agility,Agility Gain,Agility Max,Intelligence,...,Attack Speed,Base Attack Time,Attack Point,Attack Backswing,Vision Day,Vision Night,Turn Rate,Collision Size,HP/s,MP/s
0,Abaddon,Universal,♦,22,2.2,85.8,23,1.3,60.7,19,...,95,1.5,0.56,0.41,1800,800,0.6,27,3.2,1.2
1,Alchemist,Strength,♦,23,2.7,101.3,22,1.5,65.5,25,...,100,1.7,0.35,0.65,1800,800,0.6,27,2.55,1.25
2,Ancient Apparition,Intelligence,♦♦,20,1.9,75.1,20,2.2,83.8,23,...,100,1.7,0.45,0.3,1800,800,0.6,27,2.25,1.15
3,Anti-Mage,Agility,♦,19,1.6,65.4,24,2.8,105.2,12,...,100,1.4,0.3,0.3,1800,800,0.6,27,2.65,0.6
4,Arc Warden,Agility,♦♦♦,22,2.6,97.4,20,3,107,24,...,100,1.7,0.3,0.7,1800,800,0.7,27,2.45,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Windranger,Universal,♦♦,18,2,76,19,1.4,59.6,21,...,90,1.5,0.4,0.3,1800,800,0.8,27,2.05,1.3
120,Winter Wyvern,Universal,♦♦,20,2.2,83.8,16,1.5,59.5,26,...,90,1.7,0.25,0.8,1800,800,0.6,27,2.25,1.55
121,Witch Doctor,Intelligence,♦,20,2.1,80.9,13,1.4,53.6,22,...,100,1.7,0.4,0.5,1800,800,0.6,27,2.25,1.1
122,Wraith King,Strength,♦,22,2.8,103.2,16,1.7,65.3,18,...,100,1.7,0.56,0.44,1800,800,0.6,27,2.7,0.9


# Merging the Heroes JSON File

In [15]:
# Relative locations of the data folders: raw inputs and processed outputs
# both live under the shared data directory.
data_dir = "../data"
raw_data_dir, processed_data_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ("raw", "processed")
)


In [16]:
# Read Heroes.json from the raw data folder into a DataFrame.
json_path = os.path.join(raw_data_dir, "Heroes.json")
Jsonfile = pd.read_json(json_path)

In [17]:
Jsonfile

Unnamed: 0,id,name,localized_name,primary_attr,attack_type,roles,legs
0,1,npc_dota_hero_antimage,Anti-Mage,agi,Melee,"[Carry, Escape, Nuker]",2
1,2,npc_dota_hero_axe,Axe,str,Melee,"[Initiator, Durable, Disabler, Carry]",2
2,3,npc_dota_hero_bane,Bane,all,Ranged,"[Support, Disabler, Nuker, Durable]",4
3,4,npc_dota_hero_bloodseeker,Bloodseeker,agi,Melee,"[Carry, Disabler, Nuker, Initiator]",2
4,5,npc_dota_hero_crystal_maiden,Crystal Maiden,int,Ranged,"[Support, Disabler, Nuker]",2
...,...,...,...,...,...,...,...
119,129,npc_dota_hero_mars,Mars,str,Melee,"[Carry, Initiator, Disabler, Durable]",2
120,135,npc_dota_hero_dawnbreaker,Dawnbreaker,str,Melee,"[Carry, Durable]",2
121,136,npc_dota_hero_marci,Marci,all,Melee,"[Support, Carry, Initiator, Disabler, Escape]",2
122,137,npc_dota_hero_primal_beast,Primal Beast,str,Melee,"[Initiator, Durable, Disabler]",2


## Column Renaming

In [18]:
# Name the hero column 'Hero', matching the column of the scraped table,
# so both frames share the same key.
Jsonfile = Jsonfile.rename(columns={'localized_name': 'Hero'})

## Column Dropping

In [19]:
# Drop `name` and `primary_attr`, which are redundant alongside the other columns.
# `id` is intentionally NOT dropped here: it is kept and ends up in the merged data.
# Reassigning (instead of `inplace=True`) together with errors='ignore' lets this
# cell be re-run without a KeyError for columns that were already removed.
Jsonfile = Jsonfile.drop(columns=['name', 'primary_attr'], errors='ignore')


## Merging 

In [20]:
# Merge the scraped attributes with the JSON metadata on the hero name.
# This is an inner join, so a hero whose name is spelled differently in the two
# sources is silently dropped. Report such names so the loss is visible.
unmatched_heroes = set(Hero_attributes["Hero"]) ^ set(Jsonfile["Hero"])
if unmatched_heroes:
    print(f"Heroes present in only one source (dropped by the merge): {sorted(unmatched_heroes)}")

Hero_merged = pd.merge(Hero_attributes, Jsonfile, on="Hero")

## Standardizing Headers 

In [21]:
# Standardize the headers: underscores become spaces and each word is title-cased.
# Note that str.title() also re-cases text around non-letters, e.g. 'HP/s' -> 'Hp/S'.
Hero_merged.columns = Hero_merged.columns.map(
    lambda name: name.replace('_', ' ').title()
)

## Set Index

In [22]:
# Use the hero name as the row index.
Hero_merged = Hero_merged.set_index('Hero')

# Mapping Complexity Column 

In [23]:
Hero_merged['Complexity'].unique() 

array(['♦', '♦♦', '♦♦♦'], dtype=object)

In [24]:
# Map each diamond string to the number of diamonds it contains (1 to 3).
complexity_mapping = {'♦' * level: level for level in range(1, 4)}

In [25]:
# Convert the 'Complexity' diamonds to numbers using the mapping dict.
# `.map` turns every value missing from the dict into NaN, so check the result
# before overwriting the column. Without this check an unexpected symbol, or
# simply re-running this cell on already-mapped numbers, would silently
# replace the column with NaN.
mapped_complexity = Hero_merged['Complexity'].map(complexity_mapping)
unmapped_values = Hero_merged.loc[mapped_complexity.isna(), 'Complexity'].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        f"Complexity values missing from complexity_mapping: {list(unmapped_values)}"
    )
Hero_merged['Complexity'] = mapped_complexity

In [26]:
Hero_merged.head()

Unnamed: 0_level_0,Attribute,Complexity,Strength,Strength Gain,Strength Max,Agility,Agility Gain,Agility Max,Intelligence,Intelligence Gain,...,Vision Day,Vision Night,Turn Rate,Collision Size,Hp/S,Mp/S,Id,Attack Type,Roles,Legs
Hero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abaddon,Universal,1,22,2.2,85.8,23,1.3,60.7,19,1.6,...,1800,800,0.6,27,3.2,1.2,102,Melee,"[Support, Carry, Durable]",2
Alchemist,Strength,1,23,2.7,101.3,22,1.5,65.5,25,1.8,...,1800,800,0.6,27,2.55,1.25,73,Melee,"[Carry, Support, Durable, Disabler, Initiator,...",2
Ancient Apparition,Intelligence,2,20,1.9,75.1,20,2.2,83.8,23,3.1,...,1800,800,0.6,27,2.25,1.15,68,Ranged,"[Support, Disabler, Nuker]",0
Anti-Mage,Agility,1,19,1.6,65.4,24,2.8,105.2,12,1.8,...,1800,800,0.6,27,2.65,0.6,1,Melee,"[Carry, Escape, Nuker]",2
Arc Warden,Agility,3,22,2.6,97.4,20,3.0,107.0,24,2.6,...,1800,800,0.7,27,2.45,1.2,113,Ranged,"[Carry, Escape, Nuker]",2


In [27]:
# Turn the 'Hero' index back into a regular column before saving.
Hero_merged = Hero_merged.reset_index()

In [28]:
# Save the cleaned data as a CSV file in the processed folder.
# Create the folder first: `to_csv` does not create missing directories.
os.makedirs(processed_data_dir, exist_ok=True)
cleaned_csv_path = os.path.join(processed_data_dir, "Hero_merged.csv")
# index=False: the row index is just a running number and is not written out.
Hero_merged.to_csv(cleaned_csv_path, index=False)

# Ethical Implications

In conclusion, several ethical considerations related to this project need to be addressed. Firstly, it's important to note that the data from the website (https://www.unitstatistics.com/dota2/) was extracted without obtaining explicit permission from the site's owners. This action raises significant ethical concerns, especially since the website is protected by copyright laws. Furthermore, the website itself disclaims any guarantee regarding the accuracy of its content, which adds another layer of complexity when using this data for analysis.
When it comes to data processing, the methods used for cleaning and transforming the data aim to preserve its integrity and reliability. However, these processes are subject to certain assumptions that might affect the data's authenticity and lead to potential biases in interpretation. This aspect is particularly sensitive because the accuracy of data analysis is paramount, and any misinterpretation could mislead users or researchers.
Another critical point to consider is the dynamism of the data source. Specifically, the dataset containing Hero attributes may not reflect the most current information due to the game's frequent updates and patches. Such changes can significantly alter Hero attributes, making the dataset potentially obsolete and misleading for anyone relying on it for up-to-date information.
