In [5]:
import polars as pl

In [6]:
def clean_data(company: str) -> pl.DataFrame:
    df = pl.read_csv(f"./companies/org_chart_data_{company}.csv").with_columns(
        pl.col("chart_status").cast(pl.Int64)
    )

    initial_shape = df.shape

    df = df.unique(subset=["name", "title", "reports_to"]).with_columns(
        pl.col("image_src").replace("no_match", None),
        pl.lit(company).alias("company"),
    )

    print(
        f"== {company.upper()} ==\nDuplicates Removed: {initial_shape[0] - df.shape[0]}\nTotal Records: {df.shape[0]}\n"
    )
    return df

In [13]:
def transform_data(df: pl.DataFrame) -> pl.DataFrame:
    name_to_manager = dict(zip(df["name"], df["reports_to"]))

    def get_hierarchy_level(name):
        level = 1
        while name in name_to_manager and name_to_manager[name] != name:
            name = name_to_manager[name]
            level += 1
        return level

    df = df.with_columns(
        pl.col("title")
        .str.replace_all("&amp;", "&")
        .str.replace_all('"', "")
        .alias("title"),
    ).with_columns(
        pl.col(
            ["name", "title", "reports_to"]
        ).str.strip_chars(),  # .str.to_lowercase(),
    )
    return df.with_columns(
        pl.col("name")
        .map_elements(get_hierarchy_level, return_dtype=pl.Int64)
        .alias("org_status"),
    ).select(["company", "name", "title", "reports_to", "org_status", "image_src"])

In [14]:
df = clean_data("nike")

== NIKE ==
Duplicates Removed: 51
Total Records: 573



In [15]:
df.head()

name,title,image_src,reports_to,chart_status,company
str,str,str,str,i64,str
"""Dina Stults""","""Public Policy & Social Impact …",,"""Rob Leinwand""",3,"""nike"""
"""Shelby Cabral Smither""","""Human Resources Director""",,"""Bernard Bedon""",8,"""nike"""
"""Liz Weldon""","""VP Global Womens Brand Managem…","""https://cdn.theorg.com/67c22be…","""Rami Jabaji""",6,"""nike"""
"""Frank 🦄 Huffener""","""Senior Engineer""","""https://cdn.theorg.com/318eefe…","""Jingzi Tan""",6,"""nike"""
"""Gabby Cool""","""Storytelling Art Director, Glo…","""https://cdn.theorg.com/86d2b9d…","""Jesse P. Stollak""",4,"""nike"""


In [16]:
name_to_manager = dict(zip(df["name"], df["reports_to"]))


def get_hierarchy_level(name):
    level = 1
    while name in name_to_manager and name_to_manager[name] != name:
        name = name_to_manager[name]
        level += 1
    return level

In [17]:
df.with_columns(
    pl.col("name")
    .map_elements(get_hierarchy_level, return_dtype=pl.Int64)
    .alias("chart_status_2")
).with_columns(
    (pl.col("chart_status") == pl.col("chart_status_2")).alias("diff")
).filter(
    pl.col("diff") == False
)

name,title,image_src,reports_to,chart_status,company,chart_status_2,diff
str,str,str,str,i64,str,i64,bool
"""Frank 🦄 Huffener""","""Senior Engineer""","""https://cdn.theorg.com/318eefe…","""Jingzi Tan""",6,"""nike""",7,false
"""Prithvi Narina""","""Sr. Principal Product Manager""",,"""Aaron Cain""",5,"""nike""",4,false
"""Monika Folske""","""Senior Materials Developer""","""https://cdn.theorg.com/ac55bd8…","""Gopi Parasurama""",6,"""nike""",7,false
"""Anna Schoborg""","""VP Global Nike Air Manufacturi…","""https://cdn.theorg.com/9a5e99d…","""Dr. Muge Erdirik Dogan""",7,"""nike""",6,false
"""Elliot Hill""",""" ""President & CEO""""","""https://cdn.theorg.com/127901e…",,1,"""nike""",2,false
…,…,…,…,…,…,…,…
"""Mimi Hunter""","""VP, Corporate Secretary & Corp…",,"""Aaron Cain""",5,"""nike""",4,false
"""Ananda Shenoy""","""Vice President Of Technology""","""https://cdn.theorg.com/82257ed…","""Dr. Muge Erdirik Dogan""",7,"""nike""",6,false
"""Seungin Han""","""Senior Product Engineer""",,"""Jingzi Tan""",6,"""nike""",7,false
"""Attilio D'Onofrio""","""Senior Devops Engineer""","""https://cdn.theorg.com/468fda9…","""Jingzi Tan""",6,"""nike""",7,false


In [18]:
def concat_datasets(company_list: list[str]):
    dfs = [clean_data(company).pipe(transform_data) for company in company_list]
    return pl.concat(dfs)

In [19]:
df = concat_datasets(
    [
        "apple",
        "disney",
        "google",
        "microsoft",
        "nike",
        "amazon",
        "meta",
        "netflix",
        "tesla",
        "stripe",
    ]
)
df.describe()

== APPLE ==
Duplicates Removed: 8
Total Records: 756

== DISNEY ==
Duplicates Removed: 27
Total Records: 768

== GOOGLE ==
Duplicates Removed: 705
Total Records: 1647

== MICROSOFT ==
Duplicates Removed: 21
Total Records: 408

== NIKE ==
Duplicates Removed: 51
Total Records: 573

== AMAZON ==
Duplicates Removed: 0
Total Records: 1567

== META ==
Duplicates Removed: 1
Total Records: 970

== NETFLIX ==
Duplicates Removed: 15
Total Records: 623

== TESLA ==
Duplicates Removed: 0
Total Records: 306

== STRIPE ==
Duplicates Removed: 0
Total Records: 619



statistic,company,name,title,reports_to,org_status,image_src
str,str,str,str,str,f64,str
"""count""","""8237""","""8236""","""8236""","""8233""",8236.0,"""6673"""
"""null_count""","""0""","""1""","""1""","""4""",1.0,"""1564"""
"""mean""",,,,,4.732516,
"""std""",,,,,1.206123,
"""min""","""amazon""","""A Charles Thomas""","""(cmo) Head Of Marketing Latam …","""1""",1.0,""" https://cdn.theorg.com/751821…"
"""25%""",,,,,4.0,
"""50%""",,,,,5.0,
"""75%""",,,,,5.0,
"""max""","""tesla""","""Álvaro Maruenda Rodrigo""","""sdr""","""Zak Burka""",10.0,"""https://theorg.com/org/amazon/…"


In [20]:
df.head()

company,name,title,reports_to,org_status,image_src
str,str,str,str,i64,str
"""apple""","""Koussalya Balasubramanian""","""Senior Engineering Manager, Ip…","""Madiha Chan""",5,"""https://cdn.theorg.com/65f0bfe…"
"""apple""","""Saul Jackman""","""Senior Data Science Manager""","""Daphne Luong""",4,
"""apple""","""Maggie Wang""","""Global Supply Manager""","""Christine Defilippo""",4,"""https://cdn.theorg.com/f2e1d3b…"
"""apple""","""Tiewen Han""","""Hardware Engineering Manager""","""Paul Meade""",4,
"""apple""","""Ben Borders""","""Director, Revenue Accounting""","""Donal Conroy""",4,


In [22]:
df.unique(
    subset=["company", "name", "title", "org_status", "image_src", "reports_to"]
).write_csv("./companies/org_chart_data.csv")