In [3]:
import numpy as np
import pandas as pd
import json
import csv
import regex as re
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Setup

#### Establishing Dataframes

In [5]:
# Reading a fixed version GNI88.csv (replaced one double quote with a single quote in line 369292) to a Pandas df
qtes = pd.read_csv("/content/drive/MyDrive/data/GNI88_fixed.csv")

In [6]:
# Reading gni88.json to a Pandas df
arts = pd.read_json("/content/drive/MyDrive/data/gni88.json", lines=True)

In [7]:
df = qtes

#### Speaker name cleaning

In [8]:
# credit: Lana
def remove_prefix(text):
    for prefix in prefixes:
      if text.lower().startswith(prefix):
        slicer = len(prefix)
        return text[slicer:]
    return text

def remove_suffix(text):
    for suffix in suffixes:
      if text.endswith(suffix):
        slicer = len(suffix)
        return text[:-slicer]
    return text


def regex_trim(rx_list, column, df, replace_value=""):
    '''Takes a list of regex patterns, and joins the patterns with an OR (|) separator. 
    Searches the specified column/df for the pattern and replaces it with value specified (default value-nothing)'''
    df[column] = df[column].replace(to_replace="|".join(rx_list), value=replace_value, regex=True)
    return df

def remove_accents(txt):
    """Certain outlets (CTV News) do not use accented characters in person names.
       Others (CBC News and Global news), always use accented characters in names.
       To help normalize these names and get accurate counts of sources, we replace 
       accented characters with their regular English equivalents.
       Example names that are normalized across different outlets using this method:
        * François Legault <-> Francois Legault
        * Valérie Plante <-> Valerie Plante
        * Jean Chrétien <-> Jean Chretien 
    """
    txt = re.sub("[àáâãäå]", 'a', txt)
    txt = re.sub("[èéêë]", 'e', txt)
    txt = re.sub("[ìíîïı]", 'i', txt)
    txt = re.sub("[òóôõö]", 'o', txt)
    txt = re.sub("[ùúûü]", 'u', txt)
    txt = re.sub("[ýÿ]", 'y', txt)
    txt = re.sub("ç", 'c', txt)
    txt = re.sub("ğ", 'g', txt)
    txt = re.sub("ñ", 'n', txt)
    txt = re.sub("ş", 's', txt)

    # Capitals
    txt = re.sub("[ÀÁÂÃÄÅ]", 'A', txt)
    txt = re.sub("[ÈÉÊË]", 'E', txt)
    txt = re.sub("[ÌÍÎÏİ]", 'I', txt)
    txt = re.sub("[ÒÓÔÕÖ]", 'O', txt)
    txt = re.sub("[ÙÚÛÜ]", 'U', txt)
    txt = re.sub("[ÝŸ]", 'Y', txt)
    txt = re.sub("Ç", 'C', txt)
    txt = re.sub("Ğ", 'G', txt)
    txt = re.sub("Ñ", 'N', txt)
    txt = re.sub("Ş", 'S', txt)
    return txt

def remove_titles(txt):
    """Method to clean special titles that appear as prefixes or suffixes to
       people's names (common especially in articles from British/European sources).
       The words that are marked as titles are chosen such that they can never appear
       in any form as a person's name (e.g., "Mr", "MBE" or "Headteacher").
    """
    honorifics = ["Dr", "Sir", "Dame", "Professor", "Prof", "Rev"]
    titles = ["QC", "CBE", "MBE", "BM", "MD", "DM", "BHB", "CBC", "Rep", "Rep.",
              "Reverend", "Recorder", "Headteacher", "Councillor", "Cllr", "Father", "Fr",
              "Mother", "Grandmother", "Grandfather", "Creator", "U.S. Rep", "Senator", "Sen", "Rabbi", "Imam"] # could add "Judge" but that could also be someone's name
    extras = ["et al", "www", "href", "http", "https", "Ref"]
    banned_words = r'|'.join(honorifics + titles + extras)
    # Ensure only whole words are replaced (\b is word boundary)
    pattern = re.compile(r'\b({})\b'.format(banned_words)) 
    txt = pattern.sub('', txt)
    txt = re.sub("^\.","",txt)
    return txt.strip()

def lnfn_parse(txt):
    """Converts names with "Last, First" pattern to "First Last" pattern.
       Works with multiple "Last, First" names, returns "First Last, First Last, ..."
    """
    lnfn_split = txt.split(", ")
    fnln_split = lnfn_split[::-1]
    fnln = ", ".join([" ".join(x) for x in zip(fnln_split[0::2], fnln_split[1::2])])
    return fnln

In [9]:
# credit: Lana
#looks for phone number and optional leading spaces/punctuation
phonenum_regex = '((?: |, |; |\. |\| )?\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}(?: |, |; |\. |\| )?)'
#looks for email address and optional leading spaces/punctuation
email_regex = "((?: |, |; |\. |\| )?[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+(?: |, |; |\. |\| )?)"
#looks for title words (case insensitive) and optional leading spaces/punctuation
title_regex = '((?: |, |; |\. |\| | - )?(?i)(?:Staff Writers?|Editor\-in\-Chief|Managing Editor|Political Editor|Editor\-at\-large|Columnist|Correspondent|Opinion contributors?|special.*|Capital Bureau)(?: |, |; |\. |\| )?)'
#capture -, anything after | 
symbol_regex = ' -|\|.*$'
#capture firstname.lastinitial pattern at end of AJC bylines, "; .. is . ." pattern with bios 
specialpatterns_regex = "(?: \w{4,}\.\w$)|(?i); .*(?:\.$| is.*)"
#capture non-name entries including anything after 'from,' and anything containing 'editorial', 'readers', or 'editors' 
non_name_regex = ".*(?:staff$|staff ).*|Letters to the Editor|from.*|(?i).*editorial.*|(?i).*editors.*|No by-line,|(?i).*readers.*"
#look for news outlets, case insensitive, including optional leading 'the'/connectors/punctuation
#For CNN captures anything that comes after
outlet_regex = '(?i)(?:, |; | and | for | ?The )?(?i)(?:CNN.*$|Associated Press|New York Times|Washington Times|USA Today|AJC|Green Bay Press-Gazette|Daily Beast|Nation|Houston Chronicle|Sarasota Herald-Tribune|Augusta Chronicle|Arizona Republic|Texas Tribune|Chicago Tribune)'
#capture non-comma connectors ('and', ';and', ';', '\n', '&')
connector_regex = '((?i)(?: ;and | and |; *|\\n * | & *))'
#capture double comma patterns
dbl_comma_regex = ', *,+ *'
#capture last name, first name pattern
# edited to capture names with punctuation (ie. hyphenated names, or names with middle initial)
lnfn_regex = "(^(?:[\w\.\'-])*, (?:[\w\.\'-])*|(?:[\w\.\'-] [\w\.])*$)"
#looks for "The" preceded and followed by a space, with optional leading comma
the_regex = ',? The .*'
#looks for "The" preceded and followed by a space, with optional leading comma
start_the_regex = '^The .*'

In [10]:
# credit: Lana
rx_patterns = [phonenum_regex,
               email_regex,
               title_regex, 
               symbol_regex, 
               specialpatterns_regex, 
               outlet_regex, 
               non_name_regex,
               the_regex,
               start_the_regex]

In [11]:
# credit: Lana
state_list = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", "District ", "of Columbia", 
              "Delaware", "Florida", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", 
              "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", 
              "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", 
              "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virgin Islands", "Vermont", "Wisconsin", 
              "West Virginia", "Wyoming"]

#Patterns of non-name Source Name entries
notname_regex = r"unnamed|editorial|\bthe\b|\bof\b|opponents|election|vote|liberal |conservatives?| for |documents?|expert|citizens|research|voting|financial|journal|reuters|cnn|bulletin| and |newswire| memo|\bpoll\b|spokesperson|[0-9]|\busa\b"
org_regex= "statement|committee|institute|report|groups?|association|university|college|center|coalition|advocate|national|league|associated|american|daily"
govt_regex = "^gop |federal|u\.s\.|supreme court|officials?|administration|department|office|congress|campaign|census|white house|democrat|republican|senate |registrar|secretary|commission|agency|us police|government"
court_regex = "appeals|circuit|lawyer|attorney|records|\bcourts?\b|lawsuit"
long_regex = "\S+\s\S+\s\S+\s\S+\s\S+"

#Look for strings that do not contain this
short_regex = "\S+\s+\S+"

# regex for one-letter first names
one_letter_regex = "^(\w\.)\W(\w+)"

notname_regex_list = [notname_regex, org_regex, govt_regex, court_regex, long_regex]

In [13]:
# credit: Lana
test_strings = ['Mark', 'By Mark', 'No by-line', 'Opinion by Mark', 'Analysis by Mark']

#for test in df.head()['author']:
#  print(author_cleaning(test))

prefixes = ['letter to the editor by ', 'by ', 'opinion by ', 'analysis by ', 'compiled by ', 'por ']

suffixes = [';Editor', ' Florida Times-Union', ' Jacksonville Florida Times-Union', ' Milwaukee Journal Sentinel',
            ' Capitol Media Services', ' -- Times Staff Writer', 'Appleton Post-Crescent',  
            '; Richmond Times-Dispatch', ' SUN STAFF WRITER', ' News Service Of Florida',
            ', Palm Beach Post', '; Editor', '; WPR NEWS', 
            ' Richmond Times-Dispatch', ' -- Times/Herald Tallahassee Bureau', ', RealClearWire', 
            '  -- Times Political Editor', '; Austin Bureau', ' Tribune News Service', ' Guest Columnist', 
            '; LA CROSSE TRIBUNE', ', Omaha World-Herald', ' USA TODAY NETWORK',  
            ' InsideSources.com', ' Yuma Sun Editor', ', Capitol Beat News Service', ' South Florida Sun Sentinel',
            ' Orlando Sentinel', '; Murphy teaches writing at Virginia Tech', " Washington Bureau", '; Contributing Writer', '  -- Times/Herald',  
            ' Capitol Beat News Service', ' -- PolitiFact', '; Now News Group', ' Tribune Content Agency', 
            '; WISCONSIN STATE JOURNAL', '; Washington Bureau Chief', ' The Heritage Foundation',
            ', Associated Press; The New York Times contributed.', ', Los Angeles Times', ' Atlanta Journal-Constitution', 
            ' of Capital News Service', 'Por']

In [14]:
# credit: Lana
# Drop "OLD" labels from name strings
df["Source Name"] = df["Source Name"].str.split(" OLD", expand =True)[0]
df["Source Name"] = df["Source Name"].str.split(r" \(OLD\)", expand =True)[0]

#Remove any names labelled "Organization"
df['cleaned_name'] = np.where(df['Source Gender'] == "Organization", "not_name", df['Source Name']) 

#Fill empty cells
df["cleaned_name"] = df["Source Name"].replace(np.nan, "none").apply(remove_prefix).apply(remove_suffix).str.title()

# replace " , " with ", " to fix some HuffPost author formats
df["cleaned_name"] = df["cleaned_name"].replace(" , ", ", ")

# removing stray "Por" prefixes
df["cleaned_name"] = df["cleaned_name"].replace("Por ", "")

#Remove rx pattern matches
df = regex_trim(rx_patterns, column="cleaned_name", df=df)

#find non-comma connectors and convert to comma
df = regex_trim([connector_regex], "cleaned_name", df=df, replace_value=", ")

#after comma conversion, check for multiple commas together and convert to single comma
df = regex_trim([dbl_comma_regex], "cleaned_name", df=df, replace_value=", ")

#strip trailing commas, and leading and trailing whitespace, then check for trailing commas again
df['cleaned_name'] = df['cleaned_name'].str.rstrip(",").str.strip().str.rstrip(",")

#Format names with last name, first name pattern
df['cleaned_name'] = np.where(df['cleaned_name'].str.match(lnfn_regex), 
                                  df['cleaned_name'].apply(lnfn_parse),
                                  df['cleaned_name'])

#Re-run searches to strip out names starting with 'The'
df = regex_trim([the_regex, start_the_regex], "cleaned_name", df=df)

#Remove accents and titles
df['cleaned_name'] = df['cleaned_name'].apply(remove_accents).apply(remove_titles)

#Remove non-name regex matches
df['cleaned_name'] = np.where(df['cleaned_name'].str.lower().str.contains(
    "|".join(notname_regex_list), regex=True), 
    "not_name", df['cleaned_name'])

#Remove state matches
df['cleaned_name'] = np.where(df['cleaned_name'].str.contains("|".join(state_list), regex=True), 
                                  "not_name", df['cleaned_name']) 

#Remove one word names
df['cleaned_name'] = np.where(df['cleaned_name'].str.contains(short_regex, regex=True), 
                                  df['cleaned_name'], "not_name")

# remove one letter (abbreviated) first names
df['cleaned_name'] = np.where(df['cleaned_name'].str.contains(one_letter_regex, regex=True),
                                  "not_name", df['cleaned_name'])



#### Group Labeling

In [15]:
# lookup dictionary which can convert a speaker type to its classification group in constant time
source_to_group = {'Foreign Gov/Mil Official': "Foreign Government",
                   'Media/Journalist': "External Commentator",
                   'Analyst/Commentator': "External Commentator",
                   'Citizen': "External Commentator",
                   'Blogger': "External Commentator",
                   'Public Polling': "External Commentator",
                   'Partisans/Fmr. Politicians': "External Commentator",
                   'Nuke Organization': "Organization",
                   'International Orgs': "International",
                   'Non-Profit/NGO': "Organization",
                   'Think Tanks': "Organization",
                   'Nuke Organization - Other': "Organization",
                   'US Rep. & Staff': "US Congress",
                   'US Senate & Staff': "US Congress",
                   'Federal Official': "US Federal Officials",
                   'State/Local Official': "US Federal Officials",
                   'Judicial Official': "US Federal Officials", 
                   'Former Admin. Officials': "US Federal Officials", 
                   'Regulator': "International",
                   'US Military': "US Defense",
                   'Defense Forces': "US Defense",
                   'Defense': "US Defense",
                   'US Police': "US Defense",
                   'Deputy': "US Defense",
                   'Academic': "Academic",
                   'Nuke Organization - Academic': "Organization",
                   'Nuclear Scientist': "Academic",
                   'Other': "Other",
                   'Chairman': "Other",
                   'Terrorist/Extremist': "Other",
                   'Corporate Official': "Other",
                   'Information minister': "Other",
                   'Religious/Clerical': "Other",
                   'Attorney': "Other", 
                   'Ambassador': "Other", 
                   'Nuclear Official': "Other"
                  }

In [16]:
# credit: Daniel
def assign_source_to_group(source_type):
  """Inputs:
     - source_type: str, source type from quote datafame
     Outputs:
     - str of the bigger speaker category to which source_type belongs"""
  if type(source_type) != str:
    return "Other"
  else:
    return source_to_group[source_type]

In [17]:
# credit: Tiffany
df["speaker_group"] = df.apply(lambda row: assign_source_to_group(row["Source Type"]),axis=1)

In [19]:
# Narrowing the quotes dataframe down to just records where the source was an 
# actual name, not a non_name
df = df[df["cleaned_name"] != "not_name"]

# Context Extraction Functions

In [18]:
# credit: Daniel
# helper function to extract full article text, useful for debugging cases where the context extraction function fails
def content_of(art_id):
  """Inputs:
     - art_id: id of the article to get the full text of
     Outputs:
     - str: the full article content as a string"""
  just_id = arts[arts["Article ID"] == art_id]["Content"]
  if len(just_id) > 0:
    return arts[arts["Article ID"] == art_id]["Content"].iloc[0]
  else: 
    return ""
  #return arts[arts["Article ID"] == art_id]["Content"].iloc[0].replace("\"","'")

In [31]:
# credit: Tiffany
def find_sentence(df, article_id, name, sentence_num=0):
    try:
        text = df[df['Article ID'] == article_id]['Content'].iloc[0]
    except:
        #print("Article ID not found in dataset.")
        return ""
    text = text.lower()
    name = name.lower()
    # convert " to ' and ´ to ' to account for fancy names
    text = text.replace("\"","'")
    text = text.replace("-"," ")
    name = name.replace("´","'")
    name = name.replace("-", " ")
    try:
        sentence = [s+ '.' for s in text.split('.') if name in s][sentence_num]
        return sentence
    except:
        #print("Sentence not found for " + name)
        return ""

def extract_comma_addendum_context(df, article_id, name, sentence_num=0):
    if type(name) != str:
      name = ""
    name = name.lower()
    content = content_of(article_id)
    content = content.lower()
    #only get info from <name>, <speaker info>, ...
    try:
        search_obj = re.search(name+',(\W+(?:\w+\W+){0,12})', content) #re.search(r',(?<=,)[^,]+(?=,)')
        return search_obj.group(1)#.group() #turns object into string
    except:
        return ""

In [23]:
# credit: Daniel Chung
# Helper function to find the context surrounding the first time a given
# speaker was mentioned in an article, with the purpose being to isolate
# the text most critical to classifying the speaker type
def find_first_context(art_ID, name, range=12):
  """Inputs:
     - art_ID: int, the ID of the article to be searched for context
     - name: str, the name of the quote speaker
     - range: int, how many words both before and after the first mention of the 
              speaker we wish to see
     Outputs:
     - str: <range> words before the first mention of the speaker, and <range> 
            words after the first mention of the speaker. Conceptually we are
            returning the context surrounding the first time <name> was
            mentioned in article <art_ID>"""
  # get the full text of the article
  text = ""
  just_id = arts[arts['Article ID'] == art_ID]
  if len(just_id) > 0:
    text = just_id['Content'].iloc[0]
  # convert to lower caps
  text = text.lower()
  name = name.lower()
  # convert " to ' and ´ to ' to account for fancy names
  text = text.replace("\"","'")
  text = text.replace("-"," ")
  name = name.replace("´","'")
  name = name.replace("-", " ")
  # split the full text into a list of paragraphs
  paragraphs =  re.split(r"\n\n", text)
  # find the paragraph where the name is first mentioned
  first_mention = ""
  #last_name = name.split()[-1]
  for paragraph in paragraphs:
    if name in paragraph:
      first_mention = paragraph
      break
  # return the context (surrounding words before and after) using regex
  context_pattern = '(\W+(?:\w+\W+){0,'+str(range)+'}' + name + '\W+(?:\w+\W+){0,'+str(range)+'})'
  matches = re.search(context_pattern, first_mention)
  if matches:
    context = re.search(context_pattern, first_mention)[0]
  else:
    context = ""
  return context

In [24]:
# credit: Tiffany
# added benefit of directly classifying self-explanatory names (no need to pre-screen)
# also should account for common abbrevations like 'sen.','rep.'
def classifyFromPrefix(name, text): 
    """Classifies name if the three words preceding it matches a known label like 'President','General', 'Professor', etc."""    
    try: 
        prefix = re.search(r'((?:\S+\s+){0,3}\b)' + name.lower(), text.lower()).group(0)
    except:
        return ""
    else:
        if any(word in prefix for word in ['former','foreign','media','news','chinese','iran','south korea']):
            return "External Commentator"
        if any(word in prefix for word in ['justice','governor','president']):
            return "US Federal Officials"
        if any(word in prefix for word in ['senator','representative','sen.','rep.','house','democrat','republican']):
            return "US Congress"
        us_defense = ['admiral','adm.','general','gen.','major','maj.','captain','capt.','lieutenant','lm.',
                      'colonel','col.','military','commander','cmdr.','air force','marine']
        if any(word in prefix for word in us_defense):
            return "US Defense"
        if any(word in prefix for word in ['professor','scholar','university']):
            return "Academic"
        if any(word in prefix for word in ['pastor','rev.','reverend','minister','bishop','pope']):
            return "Other" #religious
        return ""

#### Latest Context Extraction Function: TIffany's pipeline method

First search for pattern "name is ___", if no match then search for "name who is _____ ", if no match then search for "name, ______ ,", and if nothing matches then take the words preceding and following the first mention of the name.

In [45]:
# credit: Tiffany
def getContext(df, article_id, name, noPattern=False):
    #sentence = find_sentence(df, article_id, name)
    #if sentence == "": #can't find sentence
    #    return sentence
    content = content_of(article_id)
    content = content.lower()
    
    name = name.lower()
    name = name.replace("´","'")
    name = name.replace("-", " ")
    
    # pattern match <name> is ______ (,.:;\-!?"()
    match = re.search(name + r'is [^,.:;\-!?"(]+', content) 
    if match is None:
        # pattern match <name> who is _____(,.)
        match = re.search(name + r'who is [^,.:;\-!?"(]+', content)
    if match is None:
        # pattern match <name>, _____ , ... MAKE SURE NO QUOTES
        match = re.search(name + r',((?<=,)[^,"]+(?=,))', content)
    if match is None:
        # pattern match <name>, _____ . 
        match = re.search(name + r',(\W+(?:\w+\W+)).', content)   #(\W+(?:\w+\W+) any text before name
    if match is None and noPattern == True:
        #simple get the words around name in text
        return find_first_context(name, article_id, range=12)
    try:
        return match.group(1)
    except:
        return ""

In [46]:
# Taking a random sample of quotes to keep computations managable my memory standards
df_sample = df.sample(n=20000)

In [47]:
# Creating a column of context for each name for the quote in that record
df_sample['Context'] = df_sample.apply(lambda x: getContext(df_sample, x['Article ID'], x['Source Name']), axis=1)

# Feature brainstorming

#### Using TF-IDF scores to get feature ideas. Will be comparing the context extracted from speakers in each speaker category.

In [48]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer()
sparse_tfidf = tfidfvec.fit_transform(df_sample['Context'])

In [49]:
# Credit: https://github.com/dlab-berkeley/Python-Text-Analysis-Fundamentals/blob/main/day-2/02-unsupervised-solutions.ipynb
tfidf = pd.DataFrame(sparse_tfidf.toarray(), columns=tfidfvec.get_feature_names())
tfidf['speaker_group'] = df_sample["speaker_group"]
tfidf.head()



Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,zealand,zecurion,zero,zheng,zinke,zionist,zone,zuckerman,zug,speaker_group
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Foreign Government
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [60]:
# Supposed best features within quote context that distinguish foreign government sources
foreign_gov = tfidf[tfidf['speaker_group']=="Foreign Government"]
tfi_ranked = foreign_gov.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

the          1.000000
in           1.000000
north        1.000000
testified    1.000000
said         1.000000
left         1.000000
calif        1.000000
according    1.000000
mo           1.000000
iran         1.000000
stoddard     1.000000
director     1.000000
an           1.000000
illinois     0.820028
monica       0.784528
acosta       0.742304
lashed       0.728848
henry        0.725427
mossad       0.718491
gloria       0.712842
blitzer      0.708914
karadsheh    0.707107
jomana       0.707107
wolf         0.705295
borger       0.701325
book         0.697112
even         0.695537
ed           0.688299
nato         0.687388
supreme      0.677096
dtype: float64

In [61]:
# Supposed best features within quote context that distinguish external commentator sources
external_commentator = tfidf[tfidf['speaker_group']=="External Commentator"]
tfi_ranked = external_commentator.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

trump        1.000000
72           1.000000
insisted     1.000000
said         1.000000
president    1.000000
the          1.000000
wash         1.000000
east         1.000000
came         1.000000
ohio         1.000000
fox          1.000000
example      0.915210
bolton       0.786207
frum         0.785599
fudan        0.780660
cia          0.749074
cabinet      0.720577
bash         0.712247
reza         0.710256
jill         0.707107
dougherty    0.707107
kyung        0.707107
kate         0.707107
lah          0.707107
bolduan      0.707107
sayah        0.703944
dana         0.701929
iaea         0.623445
david        0.618736
john         0.617963
dtype: float64

In [62]:
# Supposed best features within quote context that distinguish organization sources
organization = tfidf[tfidf['speaker_group']=="Organization"]
tfi_ranked = organization.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

the               1.000000
fox               1.000000
washington        1.000000
was               1.000000
calif             1.000000
this              1.000000
now               1.000000
belgium           0.934991
king              0.774024
kurtz             0.746106
palin             0.707107
sarah             0.707107
functioning       0.669931
howard            0.665827
john              0.633157
superintendent    0.552270
keynote           0.544787
carrier           0.542017
country           0.534437
russian           0.529527
integrated        0.512213
raytheon          0.507343
systems           0.478164
minister          0.467742
later             0.458913
deputy            0.457597
speaker           0.444915
added             0.444915
charge            0.437144
strike            0.431876
dtype: float64

In [63]:
# Supposed best features within quote context that distinguish US congress sources
us_congress = tfidf[tfidf['speaker_group']=="US Congress"]
tfi_ranked = us_congress.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

senior            1.000000
minn              1.000000
accused           1.000000
former            1.000000
though            1.000000
meanwhile         1.000000
kan               1.000000
said              1.000000
however           1.000000
legal             1.000000
israel            1.000000
weapons           0.791250
blitzer           0.708914
starr             0.708747
barbara           0.705462
wolf              0.705295
thanks            0.645914
ambassador        0.642264
the               0.630978
iaea              0.623445
nuclear           0.611492
twice             0.576314
economics         0.553268
administration    0.549258
phone             0.542240
unification       0.539544
via               0.506169
usa               0.505767
strong            0.474918
met               0.467820
dtype: float64

In [64]:
# Supposed best features within quote context that distinguish US federal official sources
us_fed_officials = tfidf[tfidf['speaker_group']=="US Federal Officials"]
tfi_ranked = us_fed_officials.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

director     1.000000
said         1.000000
ala          1.000000
told         1.000000
author       1.000000
the          1.000000
texas        1.000000
fox          1.000000
former       1.000000
who          1.000000
74           1.000000
fla          1.000000
ph           1.000000
56           1.000000
co           1.000000
european     1.000000
calif        1.000000
arkansas     0.824032
mattingly    0.785599
cardenas     0.760928
baer         0.752146
turkey       0.734195
candy        0.729244
henry        0.725427
anderson     0.719979
gloria       0.712842
reza         0.710256
starr        0.708747
brian        0.708747
dougherty    0.707107
dtype: float64

In [65]:
# Supposed best features within quote context that distinguish US defense sources
us_defense = tfidf[tfidf['speaker_group']=="US Defense"]
tfi_ranked = us_defense.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

43              1.000000
mcconnell       0.738287
elam            0.707107
stephanie       0.707107
spokesperson    0.674487
publisher       0.534350
concerned       0.531514
appearances     0.523369
white           0.500238
staff           0.500238
scientists      0.497631
series          0.487839
magazine        0.479414
friday          0.472168
union           0.467852
office          0.467439
throughout      0.467056
of              0.458522
house           0.456159
38              0.449838
nation          0.441818
stimson         0.436159
chief           0.422060
pentagon        0.421528
day             0.410743
editor          0.408024
times           0.374420
his             0.366757
fellow          0.340687
washington      0.306564
dtype: float64

In [66]:
# Supposed best features within quote context that distinguish academic sources
academic = tfidf[tfidf['speaker_group']=="Academic"]
tfi_ranked = academic.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

who           1.000000
an            1.000000
said          1.000000
large         0.727603
editor        0.593099
nations       0.550154
rachel        0.544085
engel         0.528122
integrated    0.512213
raytheon      0.507343
ukraine       0.499142
systems       0.478164
ambassador    0.467275
maddow        0.467159
richard       0.454776
visit         0.448662
united        0.441601
back          0.440264
just          0.424868
that          0.424056
former        0.362016
at            0.344713
honeymoon     0.341104
trial         0.341104
from          0.338472
defense       0.332534
murder        0.327558
contain       0.327558
judge         0.317947
to            0.316596
dtype: float64

In [67]:
# Supposed best features within quote context that distinguish other sources
other = tfidf[tfidf['speaker_group']=="Other"]
tfi_ranked = other.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

ashton        0.457302
syria         0.448090
ben           0.427338
wedeman       0.427338
sesay         0.408322
isha          0.408322
scholar       0.403268
drew          0.394830
griffin       0.381338
middle        0.360059
ambassador    0.335831
east          0.334385
institute     0.297764
attacks       0.290056
austrian      0.287127
included      0.275725
former        0.260182
since         0.254482
capital       0.248143
catherine     0.232406
hour          0.230466
to            0.227539
european      0.227046
six           0.217249
and           0.216809
wednesday     0.209159
meeting       0.202189
union         0.198349
the           0.197043
between       0.196052
dtype: float64

In [69]:
# Supposed best features within quote context that distinguish international sources
international = tfidf[tfidf['speaker_group']=="International"]
tfi_ranked = international.max(numeric_only=True).sort_values(ascending=False)
tfi_ranked.head(30)

vice          1.000000
illinois      0.780742
republican    0.544904
watching      0.538237
cnfi          0.501157
richard       0.463487
going         0.434598
clinton       0.393311
aide          0.390657
what          0.387987
100th         0.381500
white         0.335880
has           0.320680
spokesman     0.317120
house         0.306283
of            0.305814
is            0.294635
on            0.263177
former        0.235537
for           0.200862
and           0.196273
brigade       0.190750
crews         0.190750
keeping       0.190750
biggest       0.183175
ready         0.177801
colorado      0.177801
overseas      0.177801
jump          0.177801
healthy       0.177801
dtype: float64