# Aggregating some cluster attributes
For each cluster, we want to obtain a name, its top 10 websites, and its top 10 speakers.
#### Useful imports

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

### Load data

In [2]:
# Speaker metadata and the raw quote dump
# (QUOTES_PATH is not referenced by the cells shown in this notebook)
SPEAKERS_PATH = '../../data/speaker_attributes.parquet'
QUOTES_PATH = '../../data/quotes-2020.json.bz2'

# Tables consumed below: cleaned quotes and journal -> cluster assignments
CLUSTERS = '../../data/clusters.csv.bz2'
CLEAN_QUOTES = '../../data/clean_quotes.csv.bz2'

#### Quotes

In [3]:
# Load the cleaned quotes and drop exact duplicate rows. Duplicates are detected
# on all columns of the file, i.e. before the column selection below.
clean_quotes = pd.read_csv(CLEAN_QUOTES).drop_duplicates()
# Keep only the columns used in this notebook (the date column is not needed).
# .copy() makes the result an independent frame, so the column assignments done
# in later cells do not write into a slice of the loaded frame
# (avoids pandas' SettingWithCopyWarning).
clean_quotes = clean_quotes[['quoteID', 'qids', 'journal']].copy()
clean_quotes.head(2)

Unnamed: 0,quoteID,qids,journal
0,2020-01-24-000168,Q20684375,people.com
3,2020-01-21-031706,Q20684375,people.com


In [4]:
# Distinct values in the speaker column (a missing value, if any, counts as one)
n_speakers = clean_quotes['qids'].nunique(dropna=False)
print(f'Number of speakers: {n_speakers}')

Number of speakers: 40753


#### Speaker attributes

In [5]:
# Read the speaker attribute table and preview its first two rows
speakers = pd.read_parquet(path=SPEAKERS_PATH)
speakers.head(n=2)

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,


In [6]:
# Only the identifier and the display name are needed to map ID -> name
lookup_columns = ['id', 'label']
speakers = speakers[lookup_columns]

In [7]:
# Build an 'id' (Q????) -> 'label' (name) dictionary for fast lookups
qid_to_speaker = dict(zip(speakers['id'], speakers['label']))
qid_to_speaker['Q23']

'George Washington'

#### Map QIDs to speaker names

In [8]:
def _speaker_name(qid):
    """Return the speaker name stored for `qid`, or `qid` itself if there is none."""
    name = qid_to_speaker.get(qid)
    # Fall back to the raw QID both when the ID is unknown and when the ID is
    # known but its label is missing; a plain `.get(qid, qid)` would return the
    # missing label (None) in the second case.
    return name if isinstance(name, str) else qid


# Map QID to name
clean_quotes['qids'] = clean_quotes['qids'].map(_speaker_name)
clean_quotes.sample(5)

Unnamed: 0,quoteID,qids,journal
5099578,2020-01-30-086957,Jenna Ortega,jhunewsletter.com
2641593,2020-03-24-039168,Jack Ziebell,theage.com.au
4632005,2020-03-12-008707,Leo Varadkar,wandsworthguardian.co.uk
1127977,2020-01-28-005396,Ayaan Hirsi Ali,timescall.com
1544514,2020-01-29-022345,Madeline Singas,news12.com


#### Cluster assignments

In [9]:
# Table indexed by its first column (the journal name); keep the cluster label column
clusters_table = pd.read_csv(CLUSTERS, index_col=0)
cluster_assignments = clusters_table['cluster_id']
# dictionary {journal_name -> cluster_id}
cluster_dict = dict(cluster_assignments)
cluster_assignments.head(2)

journal
1011now.com      -1.0
1070thefan.com   -1.0
Name: cluster_id, dtype: float64

In [10]:
# Number of real clusters = number of distinct non-negative cluster ids.
# The noise label (-1) and missing labels (NaN) both fail the `>= 0` test, so the
# count is correct whether or not a noise cluster is actually present.
n_clusters = int(cluster_assignments[cluster_assignments >= 0].nunique())

In [11]:
# Look up every quote's journal in the journal -> cluster dictionary
# (a journal missing from the dictionary raises KeyError)
quote_clusters = clean_quotes['journal'].map(cluster_dict.__getitem__)
clean_quotes['cluster'] = quote_clusters
clean_quotes.head(1)

Unnamed: 0,quoteID,qids,journal,cluster
0,2020-01-24-000168,Meghan King Edmonds,people.com,6.0


### Creating cluster attributes
#### Cluster names
Below we choose our cluster names based on our LDA, semantic analysis, and feature classification.

In [12]:
# Cluster names listed in cluster-id order: position in the list == cluster id
cluster_names = dict(enumerate([
    'Video games',                              # 0
    'Religion',                                 # 1
    'Ireland',                                  # 2
    'UK Sport - Football',                      # 3
    'US Sport - Baseball, Hockey, Motorsport',  # 4
    'US Sport - American Football, Golf',       # 5
    'Celebrities',                              # 6
    'Australia',                                # 7
    'UK local journals',                        # 8
    'Science, health & technology',             # 9
    'US West Coast',                            # 10
    'India',                                    # 11
    'Canada',                                   # 12
    'New York',                                 # 13
    'US Politics',                              # 14
]))

#### Most common speakers
We now find the 10 most common speakers of each cluster, along with each speaker's share of the cluster's quotes.

In [13]:
# For every cluster, rank speakers by number of quotes and keep the ten most
# frequent ones together with their share of the cluster's quotes.
cluster_quotes = clean_quotes.groupby('cluster')
top_speakers = []
for cluster_id in range(n_clusters):
    cluster_speakers = cluster_quotes.get_group(cluster_id)['qids']
    total_quotes = len(cluster_speakers)  # Number of quotes in this cluster
    ranked = Counter(cluster_speakers).most_common(10)
    top_speakers.append([(name, count / total_quotes) for name, count in ranked])

In [14]:
# Print only the names of each cluster's top speakers.
# The name is read by position: entries are (name, freq) at first and become
# (name, occupation, freq) once occupations are attached further down, so
# two-value unpacking would fail if this cell is run again afterwards.
for cluster_top10 in top_speakers:
    print([entry[0] for entry in cluster_top10])

['Hideki Kamiya', 'Mark Cerny', 'Atsushi Inaba', 'Marty Stratton', 'Shuntaro Furukawa', 'Yoshinori Kitase', 'Casey Hudson', 'Neil Druckmann', 'Geoff Keighley', 'Jeff Kaplan']
['Francis', 'Donald Trump', 'Robert Sarah', 'Hans Zollner', 'Benedict XVI', 'Nelson J. Perez', 'Robert W. McElroy', 'Joe Biden', 'George Pell', 'Mike Pence']
['Leo Varadkar', 'Micheál Martin', 'Mary Lou McDonald', 'Donald Trump', 'Simon Coveney', 'Paschal Donohoe', 'Boris Johnson', 'José Mourinho', 'Jürgen Klopp', 'Pep Guardiola']
['Jürgen Klopp', 'José Mourinho', 'Mikel Arteta', 'Pep Guardiola', 'Carlo Ancelotti', 'Zinedine Zidane', 'Quique Setién', 'Steve Bruce', 'Maurizio Sarri', 'Gary Neville']
['Steve Kerr', 'Jim Boylen', 'Kyle Shanahan', 'Doc Rivers', 'Zach LaVine', 'Kyle Busch', 'Brandon Hyde', 'Kemba Walker', 'Jayson Tatum', 'Gabe Kapler']
['Tiger Woods', 'Rory McIlroy', 'Brooks Koepka', 'Phil Mickelson', 'Kevin Colbert', 'Roger Goodell', 'Art Rooney II', 'Jay Monahan', 'Jack Nicklaus', 'Mike Tomlin']
['Ki

In [15]:
# Assign the occupation/description of each speaker
# Hand-written labels, one inner list per cluster: speaker_occupations[i][j] is the
# description attached to top_speakers[i][j] (the loop after this list pairs them
# by position, so the order here must follow the order of top_speakers).
# NOTE(review): '?' presumably marks a speaker whose occupation was not determined — confirm.
speaker_occupations = [
    ['Video Game Designer', 'Video Game Designer', 'Video Game Producer', 'Producer/Developer', 'Nintendo president', 'Game director', 'Game developer', 'Writer, Game designer & executive', 'Video Game journalist', 'Video Game Designer'],
    ['Pope', 'Former US President', 'Catholic prelate', 'Priest, Theologian, Philosopher', 'Former Pope', 'Catholic prelate', 'Catholic prelate', 'US President', 'Australian cardinal', 'Former US Vice-President'],
    ['Tánaiste of Ireland', 'Ireland Taoiseach', 'Teachta Dála of Ireland', 'Former US President', 'Minister for Foreign Affairs of Ireland', 'Teachta Dála of Ireland', 'UK Prime Minister', 'Football manager', 'Football manager', 'Football manager'],
    ['Football Manager', 'Football Manager', 'Football Manager', 'Football Manager', 'Football Manager', 'Football Manager', 'Football Manager', 'Football Manager', 'Football Manager', 'Football Manager'],
    ['Basketball coach', 'Basketball coach', 'American football coach', 'Basketball coach', 'Basketball player', 'Race car driver', 'Baseball manager', 'Basketball player', 'Basketball player', 'Baseball manager'],
    ['Golfer', 'Golfer', 'Golfer', 'Golfer', 'American football player', 'NFL Commissioner', 'American Football team owner', 'Golf PGA Tour Commissioner', 'Golfer', 'American football coach'],
    ['Reality star', 'Singer', 'Singer', 'Singer', 'Singer', 'Actress & Former member of UK Royalty ', 'Singer', 'Singer', 'Reality star', 'Basketball player'],
    ['Former Premier of New South Wales and leader of the New South Wales Liberal Party', 'Premier of Queensland', 'Australian Minister for Health and Aged Care', 'Treasurer of Australia', 'Leader of the Opposition of Australia', 'Premier of Tasmania', 'Deputy Premier of Queensland', 'Minister for Health and Medical Research of New South Wales', 'Commissioner of the New South Wales Police Force', 'Former US President'],
    ['UK Prime Minister', 'UK Member of Parliament & Former Secretary of State for Health and Social Care', 'Former US President', 'Football Manager', 'Football Manager', 'British politician, UK Deputy Prime Minister, Secretary of State for Justice & Lord Chancellor', 'First Minister of Scotland', 'Tánaiste of Ireland', 'British politician & Chancellor of the Exchequer', 'Football Manager'],
    ['Director-General of the World Health Organization', 'Former US President', 'Nigerian economist & President of the African Development Bank', 'Prime Minister of Singapore', 'Secretary-General of the United Nations', '?', 'President of Ghana', 'Chief Medical Advisor to the President of United States', 'South African Rugby player', 'Former Prime Minister of Malaysia'],
    ['Governor of California', 'Former US President', 'Los Angeles Mayor', 'Basketball player', 'Basketball Coach', 'Baseball Manager', 'Los Angeles Sheriff', 'Los Angeles politician', 'US Senator', 'Basketball player'],
    ['India Prime Minister', 'Former US President', 'Chief Minister of Delhi', 'Indian cricketer', 'Chief Minister of West Bengal', 'Indian Politician', 'Director-General of the World Health Organization', 'Chief Minister of Maharashtra', 'Indian diplomat', 'Indian monk & politician'],
    ['Canada Prime Minister', 'Former US President', 'Premier of Alberta', 'Chief Public Health Officer of Canada', 'Member of the House of Commons of Canada', 'Director-General of the World Health Organization', 'Member of the Legislative Assembly of British Columbia', 'Minister of Finance of Canada', 'Premier of Quebec', 'Member of the House of Commons of Canada'],
    ['Former Governor of New York', 'Mayor of New York City', 'Former US President', 'Attorney General of New York', 'County Executive of Suffolk County, New York', 'American basketball coach', 'Basketball coach', 'US Senate Majority Leader', 'Former Mayor of New York City & Businessman', 'Chief Medical Advisor to the President of United States'],
    ['Former US President', 'US President', 'US Senator', 'Chief Medical Advisor to the President of United States', 'Former Governor of New York', 'US Senator', 'Speaker of the United States House of Representatives', 'US Politician', 'Director-General of the World Health Organization', 'US Senate Majority Leader'],
]

# Extend every top-speaker entry to (name, occupation, freq).
# The name and the frequency are read by position (first / last element) rather
# than by two-value unpacking, so running this cell again on entries that
# already carry an occupation rewrites them unchanged instead of raising
# "too many values to unpack".
for i, cluster_top10 in enumerate(top_speakers):
    for j, entry in enumerate(cluster_top10):
        name, freq = entry[0], entry[-1]
        cluster_top10[j] = (name, speaker_occupations[i][j], freq)

#### Top websites/journals

In [16]:
# Number of quotes published by each journal
quotes_by_journal = clean_quotes.groupby(by='journal')
journal_counts = quotes_by_journal.size()
journal_counts

journal
1011now.com          289
1070thefan.com        60
107jamz.com          193
10news.com          1132
1130thetiger.com      60
                    ... 
zalebs.com            32
zawya.com           3116
zdnet.com            625
zeibiz.com            13
zerotackle.com        33
Length: 5734, dtype: int64

In [17]:
# Group the journal names (the index of cluster_assignments) by their cluster id
cluster_journals = cluster_assignments.groupby(cluster_assignments)
top_journals = []
for cluster_id in range(n_clusters):
    members = cluster_journals.groups[cluster_id]
    # Keep only the counts of journals that belong to this cluster
    in_cluster = journal_counts.index.isin(members)
    ranked = journal_counts[in_cluster].sort_values(ascending=False)
    # The ten journals with the most quotes, as an array of names
    top_journals.append(ranked.head(10).index.values)

#### Create attributes dataframe

In [18]:
# One entry per cluster id: [name, top journals, top speakers]
cluster_top = {
    cluster_id: [
        cluster_names[cluster_id],
        list(top_journals[cluster_id]),
        top_speakers[cluster_id],
    ]
    for cluster_id in range(n_clusters)
}

# Dictionary keys become the index, which is labelled 'cluster_id'
df = pd.DataFrame.from_dict(
    cluster_top, orient='index', columns=['name', 'journals', 'speakers']
).rename_axis('cluster_id')
df

Unnamed: 0_level_0,name,journals,speakers
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Video games,"[gamesindustry.biz, usgamer.net, gamingbolt.co...","[(Hideki Kamiya, Video Game Designer, 0.040353..."
1,Religion,"[ncregister.com, catholicnewsagency.com, cruxn...","[(Francis, Pope, 0.3792224763149298), (Donald ..."
2,Ireland,"[breakingnews.ie, irishexaminer.com, independe...","[(Leo Varadkar, Tánaiste of Ireland, 0.0429631..."
3,UK Sport - Football,"[skysports.com, ghanasoccernet.com, sportsmole...","[(Jürgen Klopp, Football Manager, 0.0369725558..."
4,"US Sport - Baseball, Hockey, Motorsport","[nbcsports.com, nesn.com, mlb.com, racer.com, ...","[(Steve Kerr, Basketball coach, 0.019252128841..."
5,"US Sport - American Football, Golf","[cbssports.com, swxrightnow.com, usatoday.com,...","[(Tiger Woods, Golfer, 0.02111017997525839), (..."
6,Celebrities,"[people.com, cheatsheet.com, etonline.com, fem...","[(Kim Kardashian, Reality star, 0.011869738275..."
7,Australia,"[smh.com.au, brisbanetimes.com.au, sunshinecoa...","[(Gladys Berejiklian, Former Premier of New So..."
8,UK local journals,"[bucksfreepress.co.uk, expressandstar.com, eve...","[(Boris Johnson, UK Prime Minister, 0.03251130..."
9,"Science, health & technology","[asiaone.com, sciencecodex.com, africanews.com...","[(Tedros Adhanom Ghebreyesus, Director-General..."


In [19]:
# Save as JSON object to use in datastory website
# reset_index() turns the cluster_id index into a regular column, so it is
# written as a field of every record.
cluster_records = df.reset_index()
cluster_records.to_json('../../datasets/cluster-info.json', orient='records')