In [None]:
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
import time; time.sleep(5)
from io import StringIO


In [None]:
import math
import numpy as np
from scipy.stats import shapiro 
from scipy.stats import lognorm


# Importing Data Tables From Goonhammer

In [26]:
# Result name -> Goonhammer 40k stats page to scrape. The three pages share a
# host and differ only in the URL fragment.
url_dict = dict([
    ("winPCT_faction", "https://40kstats.goonhammer.com/#GbF"),
    ("scoring_faction", "https://40kstats.goonhammer.com/#SbF"),
    ("mission_scoring", "https://40kstats.goonhammer.com/#Ps"),
])

In [25]:
def fetch_tables(url_dict, wait_time=5):
    """Scrape the first HTML table from each page in ``url_dict``.

    For every ``name -> url`` entry a Chrome WebDriver session is started, the
    page is loaded, and after ``wait_time`` seconds the rendered page source
    is parsed for tables.

    Parameters
    ----------
    url_dict : dict
        Mapping of result name to the URL to load.
    wait_time : int or float, default 5
        Seconds to sleep after ``driver.get`` so JavaScript-rendered content
        has time to appear before the page source is read.

    Returns
    -------
    dict
        Mapping of each name in ``url_dict`` to the first table returned by
        ``pandas.read_html`` for that page.

    Notes
    -----
    ``pandas.read_html`` raises when the page contains no tables (a
    ``ValueError`` per the pandas documentation); that error propagates to
    the caller.
    """
    results = {}

    for name, url in url_dict.items():
        print(f"Fetching: {name} from {url}")

        # One browser session per URL. The try/finally guarantees the browser
        # process is shut down even if loading the page fails; otherwise a
        # failed fetch would leave an orphaned Chrome instance running.
        driver = webdriver.Chrome()
        try:
            driver.get(url)

            # Wait for JavaScript content to load
            time.sleep(wait_time)

            page_source = driver.page_source
        finally:
            driver.quit()

        # Parse HTML (normalised through BeautifulSoup before pandas reads it)
        soup = BeautifulSoup(page_source, "html.parser")
        html_io = StringIO(str(soup))
        tables = pd.read_html(html_io)

        # Store first table in dictionary with the given name
        results[name] = tables[0]

    return results

In [27]:
# Scrape every page listed in url_dict. fetch_tables opens a Chrome session
# and sleeps for each URL, so this cell takes a while to run.
dataframes = fetch_tables(url_dict)

Fetching: winPCT_faction from https://40kstats.goonhammer.com/#GbF
Fetching: scoring_faction from https://40kstats.goonhammer.com/#SbF
Fetching: mission_scoring from https://40kstats.goonhammer.com/#Ps


In [28]:
# Unpack the scraped tables into one named DataFrame per page, in the same
# order as the keys listed below.
winPCT_faction_df, scoring_faction_df, mission_scoring_df = (
    dataframes[key]
    for key in ("winPCT_faction", "scoring_faction", "mission_scoring")
)

# Checking Data Quality

In [8]:
def quality(df):
    """Print a quick data-quality report for a DataFrame.

    The report lists, in order: the frame's shape, per-column dtypes,
    per-column null counts, the number of fully duplicated rows, and
    ``describe(include='all')`` summary statistics. Nothing is returned.
    """
    # (label, extractor) pairs. Each extractor runs only when its line is
    # printed, so earlier sections still appear if a later one fails.
    sections = (
        ("Shape:", lambda frame: frame.shape),
        ("\nColumn Types:\n", lambda frame: frame.dtypes),
        ("\nMissing Values:\n", lambda frame: frame.isnull().sum()),
        ("\nDuplicate Rows:", lambda frame: frame.duplicated().sum()),
        ("\nBasic Stats:\n", lambda frame: frame.describe(include='all')),
    )
    for label, extract in sections:
        print(label, extract(df))

In [33]:
# Print the data-quality report for the table scraped under "winPCT_faction".
quality(winPCT_faction_df)

Shape: (28, 9)

Column Types:
 Faction        object
Games           int64
VP            float64
Opp VP        float64
Win %         float64
Wins            int64
Losses          int64
Draws           int64
Real Win %    float64
dtype: object

Missing Values:
 Faction       0
Games         0
VP            0
Opp VP        0
Win %         0
Wins          0
Losses        0
Draws         0
Real Win %    0
dtype: int64

Duplicate Rows: 0

Basic Stats:
                  Faction         Games         VP     Opp VP      Win %  \
count                 28     28.000000  28.000000  28.000000  28.000000   
unique                28           NaN        NaN        NaN        NaN   
top     Adepta Sororitas           NaN        NaN        NaN        NaN   
freq                   1           NaN        NaN        NaN        NaN   
mean                 NaN   7397.214286  59.820714  60.404286  48.402857   
std                  NaN   3943.105923   2.628175   1.815300   4.180407   
min                  Na

In [31]:
# Print the data-quality report for the table scraped under "scoring_faction".
quality(scoring_faction_df)

Shape: (28, 7)

Column Types:
 Faction               object
Player Primary       float64
Player Tactical      float64
Player Fixed         float64
Opponent Primary     float64
Opponent Tactical    float64
Opponent Fixed       float64
dtype: object

Missing Values:
 Faction              0
Player Primary       0
Player Tactical      0
Player Fixed         0
Opponent Primary     0
Opponent Tactical    0
Opponent Fixed       0
dtype: int64

Duplicate Rows: 0

Basic Stats:
                  Faction  Player Primary  Player Tactical  Player Fixed  \
count                 28       28.000000        28.000000     28.000000   
unique                28             NaN              NaN           NaN   
top     Adepta Sororitas             NaN              NaN           NaN   
freq                   1             NaN              NaN           NaN   
mean                 NaN       30.737857        25.238929     22.092857   
std                  NaN        1.445630         1.135499      2.416095   
m

In [19]:
# Print the data-quality report for the table scraped under "mission_scoring".
quality(mission_scoring_df)

Shape: (63, 9)

Column Types:
 Mission         object
Deployment      object
Games            int64
Avg, Winner    float64
Avg, Loser     float64
Margin         float64
Avg, Go 1st    float64
Avg, Go 2nd    float64
FTA            float64
dtype: object

Missing Values:
 Mission        0
Deployment     0
Games          0
Avg, Winner    0
Avg, Loser     0
Margin         0
Avg, Go 1st    0
Avg, Go 2nd    0
FTA            0
dtype: int64

Duplicate Rows: 0

Basic Stats:
                 Mission Deployment         Games  Avg, Winner  Avg, Loser  \
count                63         63     63.000000    63.000000   63.000000   
unique                9          7           NaN          NaN         NaN   
top     Burden Of Trust  All Games           NaN          NaN         NaN   
freq                  7          9           NaN          NaN         NaN   
mean                NaN        NaN   3265.714286    34.799841   19.334762   
std                 NaN        NaN   4717.813003     4.643414    5.4

# Cleanup

In [None]:
# Normalise "Win %" to float64. The scraped column may already be numeric
# (the quality report in this notebook shows float64), in which case the
# `.str` accessor would raise AttributeError. Casting to str first makes the
# '%' stripping safe for both string and numeric input and keeps the cell
# re-runnable.
winPCT_faction_df["Win %"] = pd.to_numeric(
    winPCT_faction_df["Win %"].astype(str).str.replace('%', '', regex=False)
).astype('float64')

In [None]:
# Cast every value in the Faction column to str.
scoring_faction_df['Faction'] = scoring_faction_df['Faction'].astype(str)

In [None]:
# Keep only columns whose name does not start with "Unnamed"; na=False treats
# non-string column labels as non-matching, so they are kept.
scoring_faction_df = scoring_faction_df.loc[:, ~scoring_faction_df.columns.str.contains('^Unnamed', na=False)]

In [18]:
# Cast both categorical label columns to str in a single assignment.
mission_scoring_df[['Mission', 'Deployment']] = (
    mission_scoring_df[['Mission', 'Deployment']].astype(str)
)

# Normality Testing

In [34]:
def check_normality(df, alpha=0.05):
    """Run a Shapiro-Wilk test on every numeric column and print the verdict.

    Nulls are dropped from each column before testing. A column with fewer
    than three remaining values is reported as having too little data and is
    skipped. Otherwise the column is reported as "Normally distributed" when
    the p-value is greater than ``alpha`` and as "Not normal" when it is not.
    Nothing is returned.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    for col in numeric_columns:
        sample = df[col].dropna()

        if len(sample) >= 3:
            _, p = shapiro(sample)
            if p > alpha:
                print(f"{col}:  Normally distributed (p = {p:.4f})")
            else:
                print(f"{col}:  Not normal (p = {p:.4f})")
        else:
            print(f"{col}: Not enough data for Shapiro test")

In [35]:
# Shapiro-Wilk check (default alpha=0.05) on each numeric column of the
# "winPCT_faction" table.
check_normality(winPCT_faction_df)

Games:  Normally distributed (p = 0.3520)
VP:  Normally distributed (p = 0.5298)
Opp VP:  Not normal (p = 0.0394)
Win %:  Normally distributed (p = 0.1924)
Wins:  Normally distributed (p = 0.2318)
Losses:  Normally distributed (p = 0.2747)
Draws:  Normally distributed (p = 0.1679)
Real Win %:  Normally distributed (p = 0.2283)


In [36]:
# Shapiro-Wilk check (default alpha=0.05) on each numeric column of the
# "scoring_faction" table.
check_normality(scoring_faction_df)

Player Primary:  Normally distributed (p = 0.3538)
Player Tactical:  Normally distributed (p = 0.2485)
Player Fixed:  Normally distributed (p = 0.0619)
Opponent Primary:  Normally distributed (p = 0.4756)
Opponent Tactical:  Not normal (p = 0.0005)
Opponent Fixed:  Normally distributed (p = 0.2501)


In [37]:
# Shapiro-Wilk check (default alpha=0.05) on each numeric column of the
# "mission_scoring" table.
check_normality(mission_scoring_df)

Games:  Not normal (p = 0.0000)
Avg, Winner:  Normally distributed (p = 0.0810)
Avg, Loser:  Not normal (p = 0.0423)
Margin:  Not normal (p = 0.0029)
Avg, Go 1st:  Not normal (p = 0.0208)
Avg, Go 2nd:  Normally distributed (p = 0.1599)
FTA:  Not normal (p = 0.0001)
