In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the keyword-count table. Later cells expect a 'Filename' column plus one
# column per keyword stem ("risk", "safe", "bias", ...).
# FIXME(review): read_csv() is called without a path; pandas requires
# `filepath_or_buffer`, so this cell raises TypeError until the CSV path is supplied.
word_counts_results = pd.read_csv()

In [None]:
# Load the document-length table. Later cells merge it on 'Filename' and divide
# counts by its 'Length' column.
# FIXME(review): read_csv() is called without a path; pandas requires
# `filepath_or_buffer`, so this cell raises TypeError until the CSV path is supplied.
length_results = pd.read_csv()

In [None]:
# Quick look at the first five rows of the length table.
length_results.head(5)

In [None]:
# Derive 'Framework' and 'Date' from the filename, which is split on '_':
# the first piece is used as the framework name and the second as the date.
# Split once and reuse the result instead of re-splitting for every derived column.
filename_parts = word_counts_results['Filename'].str.split('_')
word_counts_results['Framework'] = filename_parts.str[0]
word_counts_results['Date'] = filename_parts.str[1]

# Put the identifying columns first, followed by the keyword-stem count columns.
new_order = [
    'Filename', 'Framework', 'Date',
    "risk", "safe", "bias", "security", "ethic",
    "accountab", "transparen", "explainab", "policy",
    "compliance", "governance", "protect", "sustainab",
    "fair", "catastroph", "responsib", "prepare",
]
word_counts_results = word_counts_results[new_order]

# Attach the columns of `length_results` (notably 'Length') to each file.
# validate="many_to_one" makes pandas raise MergeError if `length_results` has
# several rows for one Filename; without it such duplicates would silently
# duplicate rows here and inflate every downstream total.
# NOTE: this is an inner join, so files missing from `length_results` are dropped.
word_counts_results = word_counts_results.merge(
    length_results, on='Filename', validate='many_to_one'
)
word_counts_results.head()

In [None]:
# Framework names (as parsed from 'Filename') for the two comparison groups.
# group_0 lists the frameworks flagged 1 in `frontier_mapping` further down the
# notebook, group_1 the ones flagged 0.
# NOTE(review): 'NVDIA' must match the filename prefix exactly - confirm the
# spelling against the data before "correcting" it.
group_0_frameworks = ['Anthropic', 'Cohere', 'Deepmind', 'OpenAI', 'Naver', 'META', 'xAI', 'Microsoft']
group_1_frameworks = ['Deloitte', 'G42', 'Grammarly', 'IBM', 'KPMG', 'Magic', 'NVDIA', 'PaloAlto', 'PwC', 'Amazon']

# Columns that identify or describe a document rather than count a keyword.
_NON_WORD_COLUMNS = ['Filename', 'Framework', 'Date', 'Length']


def _word_totals(results):
    """Sum every keyword column of `results` into a two-column ('Word', 'Total') frame.

    Non-keyword columns are removed by name before summing, so the result does
    not depend on column position and 'Length' is never summed.
    """
    totals = (
        results
        .drop(columns=_NON_WORD_COLUMNS, errors='ignore')
        .sum(axis=0)
        .reset_index()
    )
    totals.columns = ['Word', 'Total']
    return totals


group_0_results = word_counts_results[word_counts_results['Framework'].isin(group_0_frameworks)]
group_1_results = word_counts_results[word_counts_results['Framework'].isin(group_1_frameworks)]

# Per-group keyword totals, plus a copy ordered from most to least frequent.
word_totals_group_0 = _word_totals(group_0_results)
word_totals_group_0_sorted = word_totals_group_0.sort_values(by='Total', ascending=False)

word_totals_group_1 = _word_totals(group_1_results)
word_totals_group_1_sorted = word_totals_group_1.sort_values(by='Total', ascending=False)

In [None]:
# Maps each keyword stem (a count column name) to the human-readable label
# shown in tables and plots.
word_mapping = {
    'risk': 'Risk',
    'security': 'Security',
    'safe': 'Safe/Safety',
    'responsib': 'Responsible/Responsibility',
    'ethic': 'Ethics/Ethical',
    'transparen': 'Transparent/Transparency',  # was misspelled 'Tranaparent'
    'governance': 'Governance',
    'policy': 'Policy/Policymaker',
    'bias': 'Bias',
    'protect': 'Protect/Protection',
    'compliance': 'Compliance',
    'catastroph': 'Catastrophe/Catastrophic',
    'fair': 'Fair/Fairness',
    'prepare': 'Prepare/Preparedness',
    'accountab': 'Accountable/Accountability',
    'explainab': 'Explainable/Explainability',
    'sustainab': 'Sustainable/Sustainability',
}


# Swap the keyword stems in both sorted totals tables for their display labels.
for totals_frame in (word_totals_group_0_sorted, word_totals_group_1_sorted):
    totals_frame['Word'] = totals_frame['Word'].replace(word_mapping)

In [None]:
# Bar chart of keyword totals for group 0 only. These are the frameworks
# flagged 1 in `frontier_mapping`, so the title names that group rather than
# claiming to cover all frameworks.
fig, ax = plt.subplots(figsize=(10, 6), dpi=500)
ax.bar(word_totals_group_0_sorted['Word'], word_totals_group_0_sorted['Total'], color='skyblue')
# Slant the long labels and right-align them so each one ends under its bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Word')
ax.set_ylabel('Total Count')
ax.set_title('Total Word Counts Across Frontier Frameworks')
plt.show()

In [None]:
# Bar chart of keyword totals for group 1 only. These are the frameworks
# flagged 0 in `frontier_mapping`, so the title names that group rather than
# claiming to cover all frameworks.
fig, ax = plt.subplots(figsize=(10, 6), dpi=500)
ax.bar(word_totals_group_1_sorted['Word'], word_totals_group_1_sorted['Total'], color='skyblue')
# Slant the long labels and right-align them so each one ends under its bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Word')
ax.set_ylabel('Total Count')
ax.set_title('Total Word Counts Across Non-Frontier Frameworks')
plt.show()

In [None]:
# Reshape to long format: one row per (file, keyword) pair. Every column that is
# not an identifier below is treated as a keyword column.
identifier_columns = ["Filename", "Framework", "Date", 'Length']

melted_df = (
    word_counts_results
    .melt(id_vars=identifier_columns, var_name="Word", value_name="Count")
    .assign(
        # Replace keyword stems with their display labels.
        Word=lambda long_df: long_df['Word'].replace(word_mapping),
        # Keyword count divided by the file's 'Length' value.
        Normalised_Count=lambda long_df: long_df['Count'] / long_df['Length'],
    )
)

melted_df.head()

In [None]:
# Flag per framework name: 1 marks a frontier company, 0 a non-frontier one
# (terminology taken from the heatmap title later in the notebook).
frontier_mapping = {
    'Amazon': 0,
    'Anthropic': 1,
    'Cohere': 1,
    'Deepmind': 1,
    'Deloitte': 0,
    'G42': 0,
    'Grammarly': 0,
    'IBM': 0,
    'KPMG': 0,
    'Magic': 0,
    'META': 1,
    'Microsoft': 1,
    'Naver': 1,
    'NVDIA': 0,
    'OpenAI': 1,
    'PaloAlto': 0,
    'PwC': 0,
    'xAI': 1,
}

# Tag every row with its frontier flag. Frameworks missing from
# `frontier_mapping` get NaN here and are dropped by the groupby below.
melted_df['Frontier'] = melted_df['Framework'].map(frontier_mapping)

# Mean normalised count per (frontier flag, word).
grouped_words = (
    melted_df
    .groupby(['Frontier', 'Word'])['Normalised_Count']
    .mean()
    .reset_index()
)
grouped_words['Frontier'] = grouped_words['Frontier'].astype(int)

# Eight highest-scoring words in each frontier group.
# Sorting and then taking groupby().head() replaces
# groupby().apply(lambda x: x.nlargest(...)): that form operates on the grouping
# column, which is deprecated in pandas 2.2 and excluded in later versions,
# where 'Frontier' would vanish from the result.
top_words = (
    grouped_words
    .sort_values(['Frontier', 'Normalised_Count'], ascending=[True, False])
    .groupby('Frontier')
    .head(8)
)

top_words

In [None]:
# Word x Frontier matrix of mean normalised counts; combinations with no data become 0.
heatmap_data = grouped_words.pivot(index='Word', columns='Frontier', values='Normalised_Count').fillna(0)

# Readable column labels for the 0/1 frontier flag; unknown values are shown as-is.
frontier_labels = {0: 'Non-Frontier', 1: 'Frontier'}
column_labels = [frontier_labels.get(flag, flag) for flag in heatmap_data.columns]

fig, ax = plt.subplots(figsize=(10, 8), dpi=500)
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".4f",
    cmap="Reds",
    # The plotted values are raw Count / Length ratios (never multiplied by 100),
    # so the colourbar must not be labelled as a percentage.
    cbar_kws={'label': 'Normalized Count (Count / Length)'},
    vmin=0,
    # The colour scale tops out 1.5x above the largest value, so no cell is drawn
    # in the darkest shade.
    vmax=heatmap_data.values.max() * 1.5,
    xticklabels=column_labels,
    ax=ax,
)
ax.set_title('Word Usage Heatmap: Frontier vs Non-Frontier Companies')
ax.set_ylabel('Word')
ax.set_xlabel('Company Group')
fig.tight_layout()
plt.show()

In [None]:
# Five rows with the highest raw 'Count' for each framework.
# NOTE: this rebinds `top_words`, which an earlier cell used for the per-frontier table.
# Sorting and then taking groupby().head() replaces
# groupby().apply(lambda x: x.nlargest(...)): that form operates on the grouping
# column, which is deprecated in pandas 2.2 and excluded in later versions,
# where 'Framework' would vanish from the result.
top_words = (
    melted_df
    .sort_values(["Framework", "Count"], ascending=[True, False])
    .groupby("Framework")
    .head(5)
)

top_words.head(5)