## Project Milestone 3 DSC 420 Cleaning/Formatting Web Site Data
created by: David Hatchett  
created: 2024-01-29  

In [17]:
import pandas as pd
import numpy as np
import sqlite3
import requests
import bs4

from datetime import datetime

In [18]:
def _cell_text(cell) -> str:
    '''Return the stripped text of a single table cell (``th`` or ``td``) tag.'''
    # Preference order mirrors the page layout: plain text directly in the cell,
    # then the text of a link (<a>), then the text of a bold tag (<b>).
    for node in (cell, cell.a, cell.b):
        if node is not None and node.string is not None:
            return node.string.strip()
    # Mixed or nested content has no single ``.string``; fall back to all the
    # text in the cell instead of failing on ``None.strip()``.
    return cell.get_text(' ', strip=True)


def parse_gdq_data(soup) -> tuple[list[str], list[list[str]]]:
    '''
    Extract the header names and row values from a GDQ tracker "runs" page.

    This is also transform 1 (string manipulation): header labels are
    stripped, lower-cased, and have spaces replaced with underscores.

    Parameters
    ----------
    soup : parsed page object (e.g. ``bs4.BeautifulSoup``) whose first
        ``<table>`` holds the runs, with a ``<thead>`` of ``<th>`` cells and
        data rows marked ``<tr class="small">``.

    Returns
    -------
    (hdrs, data_list) : the cleaned header names, and one list of stripped
        cell strings per data row.

    Raises
    ------
    ValueError
        If the page has no table with a header section.
    '''
    table = soup.table
    if table is None or table.thead is None:
        raise ValueError('no table with a <thead> section found in the page')

    ## this is where the first transformation occurs
    hdrs = [
        _cell_text(th).replace(' ', '_').lower()
        for th in table.thead.find_all('th')
    ]

    data_list = [
        [_cell_text(td) for td in tr.find_all('td')]
        for tr in table.find_all('tr', class_='small')
    ]

    return hdrs, data_list


In [19]:
## access the page and parse it
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
page = requests.get('https://tracker.gamesdonequick.com/tracker/runs/AGDQ2022', timeout=30)
page.raise_for_status()
soup = bs4.BeautifulSoup(page.content, "html.parser")

In [20]:
## run the parsed page through the parsing function to get the header names and the row data
hdr, data = parse_gdq_data(soup)

In [21]:
## create a dataframe from the parsed rows, using the parsed header names as the column labels
run_df = pd.DataFrame(data, columns=hdr)

In [22]:
## preview the first rows of the freshly built dataframe
run_df.head()

Unnamed: 0,name,runners,hosts,commentators,description,start_time,run_time,bid_wars
0,Pre-Show Get Hype,"spikevegeta, feasel, Keizaron, JHobz, Kungfufr...",PurpleGhostKasper,,,2022-01-09T11:30:00-05:00,0:30:00,No
1,Nioh 2 Any% Featherless,AxelSanGo,PurpleGhostKasper,,,2022-01-09T12:07:00-05:00,1:27:15,No
2,Metroid Prime Hunters All Items,Mr_Shasta,PurpleGhostKasper,,,2022-01-09T13:53:00-05:00,1:20:12,Yes
3,Donkey Kong Country 2: Diddy's Kong Quest Any%,"Eazinn, V0oid, SBDWolf, Tonkotsu",Skybilz,,,2022-01-09T15:50:00-05:00,0:41:00,No
4,DEATHLOOP Any%,CreeperHntr,Skybilz,,,2022-01-09T16:57:00-05:00,0:26:46,No


In [23]:
## Transform 2: store the text columns using the pandas category data type.
update_list = ('name','runners','hosts','commentators','description')

# One astype call with a column -> dtype mapping converts every listed column.
run_df = run_df.astype({text_col: 'category' for text_col in update_list})

In [24]:
## Transform 3: convert the start_time strings into datetime objects
run_df['start_time'] = pd.to_datetime(run_df['start_time'])

In [25]:
## Transform 4: add a timedelta column parsed from the run_time strings;
## the original run_time column is left in place
run_df['run_time_delta'] = pd.to_timedelta(run_df['run_time'])

In [26]:
### Transform 5: encode the bid_wars column as 1 ('Yes') or 0 (anything else).
# Matching both 'Yes' and 1 keeps this cell idempotent: re-running it on the
# already-encoded column leaves the values unchanged instead of zeroing them.
run_df['bid_wars'] = run_df['bid_wars'].isin(['Yes', 1]).astype('int64')

In [30]:
## display the dataframe to check the result of the transforms
run_df

Unnamed: 0,name,runners,hosts,commentators,description,start_time,run_time,bid_wars,run_time_delta
0,Pre-Show Get Hype,"spikevegeta, feasel, Keizaron, JHobz, Kungfufr...",PurpleGhostKasper,,,2022-01-09 11:30:00-05:00,0:30:00,0,0 days 00:30:00
1,Nioh 2 Any% Featherless,AxelSanGo,PurpleGhostKasper,,,2022-01-09 12:07:00-05:00,1:27:15,0,0 days 01:27:15
2,Metroid Prime Hunters All Items,Mr_Shasta,PurpleGhostKasper,,,2022-01-09 13:53:00-05:00,1:20:12,1,0 days 01:20:12
3,Donkey Kong Country 2: Diddy's Kong Quest Any%,"Eazinn, V0oid, SBDWolf, Tonkotsu",Skybilz,,,2022-01-09 15:50:00-05:00,0:41:00,0,0 days 00:41:00
4,DEATHLOOP Any%,CreeperHntr,Skybilz,,,2022-01-09 16:57:00-05:00,0:26:46,0,0 days 00:26:46
...,...,...,...,...,...,...,...,...,...
144,Bonus Game 7 - Deltarune Chapter 2 Main Route,Shayy,ateatree,,,2022-01-15 22:38:00-05:00,0:30:37,1,0 days 00:30:37
145,Event Recap Recap 100%,"Darkman78, Sent, JHobz, sumichu",ateatree,,,2022-01-15 23:17:37-05:00,0:22:00,0,0 days 00:22:00
146,3 Million Milestone - Deltarune Chapter 2 Sno...,Shayy,,,,2022-01-15 23:44:37-05:00,0:32:26,0,0 days 00:32:26
147,"Metal Gear Solid Console, All Bosses, Extreme",dlimes13,ateatree,,,2022-01-16 00:27:03-05:00,1:21:20,1,0 days 01:21:20


In [29]:
### load data to the database
# sqlite3's connection context manager only commits or rolls back the
# transaction; it does not close the connection, so close it explicitly to
# release the handle on the database file.
con = sqlite3.connect("game_data.db")
try:
    with con:
        run_df.to_sql('gdq_games_runs', con, if_exists='replace')
finally:
    con.close()

  run_df.to_sql('gdq_games_runs',con,if_exists = 'replace')


# Steps Completed and Ethical Implications

Overall, I didn't have to make many changes to the data; it came reasonably clean from the source site. The first change I made was to clean up the header names by lowercasing them and replacing spaces with underscores. I also decided to convert all the text fields to the categorical data type, as it should be more efficient. Then, I ensured the date fields were actual datetime objects, which helps when they are saved to the database. Ethically, there isn't much to talk about: I am not misrepresenting the data in any way, and it can't really be used to harm a population. I may need to do extra work for part four of the project to match the names up with other games, and I'm planning to use fuzzy matching to accomplish this.