# Data Collecting Part2:

In [78]:
import pandas as pd
import numpy as np
import re
import csv
import wikipedia
import requests
from bs4 import BeautifulSoup
import spacy
nlp = spacy.load("en_core_web_sm")

## files

In [5]:
huac_df = pd.read_csv('data-tolera/494matches.csv')
huac_df.tail(8)

Unnamed: 0,pageid,title,full_name,first_name,last_name2,comments
486,73214047,Eda Lou Walton,Eda Lou Walton,Eda Lou,Walton,see also Edna Lou Walton
487,74516481,Joe Dallet,Joe Dallet,Joe,Dallet,0
488,74616421,Jerry Tyler,Jerry Tyler,Jerry,Tyler,0
489,75456258,Annette Rubinstein,Annette Rubinstein,Annette,Rubinstein,0
490,76402919,Milton Herndon,Milton Herndon,Milton,Herndon,0
491,77527111,Guy Emery Shipler,Guy Emery Shipler,Guy Emery,Shipler,0
492,78698426,Joseph North (writer),Joseph North,Joseph,North,0
493,78698728,Joseph Clark (journalist),Joseph Clark,Joseph,Clark,0


## create `wiki-info.csv`

### f1: `get_live_years()`

In [6]:
def get_live_years(name):
    try:
        person_page = wikipedia.page(name, auto_suggest=False)
        categories = person_page.categories

        birth_year = None
        death_year = None

        birth_pattern = re.compile(r"(\d{4}) births")
        death_pattern = re.compile(r"(\d{4}) deaths")

        for cat in categories:
            if not birth_year:
                birth_match = birth_pattern.search(cat)
                if birth_match:
                    birth_year = int(birth_match.group(1))
            if not death_year:
                death_match = death_pattern.search(cat)
                if death_match:
                    death_year = int(death_match.group(1))

            if birth_year and death_year:
                break

        return {"name": name, "birth_year": birth_year, "death_year": death_year}
    except wikipedia.exceptions.PageError:
        return {"error": f"Page for '{name}' not found."}
    except wikipedia.exceptions.DisambiguationError as e:
        return {"error": f"Disambiguation error: {e}"}
    except Exception as e:
        return {"error": str(e)}


In [7]:
get_live_years('Aaron_Copland')

{'name': 'Aaron_Copland', 'birth_year': 1900, 'death_year': 1990}

## function 1: `dates_info()`

In [38]:
def dates_info(page):
    categories = page.categories

    birth_year = None
    death_year = None

    birth_pattern = re.compile(r"(\d{4}) births")
    death_pattern = re.compile(r"(\d{4}) deaths")

    for cat in categories:
        if not birth_year:
            birth_match = birth_pattern.search(cat)
            if birth_match:
                birth_year = int(birth_match.group(1))
        if not death_year:
            death_match = death_pattern.search(cat)
            if death_match:
                death_year = int(death_match.group(1))

        if birth_year and death_year:
            break

    return {"birth_year": birth_year, "death_year": death_year}

### test with copland_page

In [39]:
copland_page = wikipedia.page('Aaron_Copland', auto_suggest=False)
dates_info(copland_page)

{'birth_year': 1900, 'death_year': 1990}

## function 2: `sexuality_info()`

In [12]:
def sexuality_info(page):
    categories = page.categories
    bio_sex = "NOT_FOUND"
    sexuality = "NOTA"

    male_keywords = ["male"]
    female_keywords = ["female", 'women', 'actresses']
    lgbtq_keywords = ["lgbtq", "gay", "lesbian", "bisexual", "transgender"]

    br0 = 0
    br1 = 0
    for cat in categories:

        for kw in female_keywords:
            if kw in cat.lower():
                bio_sex = 'FEMALE'
                br0 = 1
                break
        else:
            if br0 == 0:
                bio_sex = 'MALE'

        if any(keyword in cat.lower() for keyword in lgbtq_keywords):
            sexuality = "LGBTQ"

    return {"bio_sex": bio_sex, "sexuality": sexuality}

In [15]:
sexuality_info(copland_page)

{'bio_sex': 'MALE', 'sexuality': 'LGBTQ'}

## function 3: `race_info()

In [25]:
def race_info(page):
    categories = page.categories
    race = "non-black"

    black_keywords = ["african-american", "black"]
    
    for cat in categories:
        category_lower = cat.lower()
        if any(keyword in category_lower for keyword in black_keywords):
            race = "black"
            break

    return {"race": race}

In [26]:
hoskins_page = wikipedia.page('Allen_"Farina"_Hoskins', auto_suggest = False)
race_info(hoskins_page)

{'race': 'black'}

## function 4: `occupation_info()`

In [96]:
def occupation_info(page):
    summary = page.summary

    occupation = None
    occupation_match = re.search(
        r"(was|is) (an?|the) ([A-Za-z-]+) ([A-Za-z]+(?: [A-Za-z]+)?)",
        summary
        )
    if occupation_match:
        occupation = occupation_match.group(4).strip()
    
    remove_list = ['and', 'who', 'known', 'American']
    if occupation != None:
        for w in remove_list:
            if w in occupation:
                occupation = occupation.replace(w, '')
                occupation = occupation.strip()

    return {"occupation": occupation}


In [97]:
may_page = wikipedia.page('Kenneth_O._May', auto_suggest=False)
occupation_info(may_page)

{'occupation': 'mathematician'}

## extra function: `regex_match()`

In [40]:
def regex_match(re_patterns, lines0, group_i):
    re_objects = []
    for i in re_patterns:
        object0 = re.compile(i, re.IGNORECASE)
        re_objects.append(object0)
    for j in range(len(re_objects)):
        match0 = re_objects[j].search(lines0)
        if match0:
            match_str = match0.group(group_i[j]).strip()
            break
        else:
            match_str = None
    return match_str

## function 5: `descent_info()`

In [45]:
def descent_info(page):
    categories = page.categories
    descent_list = []

    descent_patterns = [
        r"of ([A-Za-z-]+(?: [A-Za-z-]+)?) descent",
        r"([A-Za-z-]+) emigrants",
        r"Emigrants from the ([A-Za-z-]+) Empire",  # russian, german, aus
        r"([A-Za-z-]+) American military personnel"
    ]
    descent_groups = [1, 1, 1, 1]

    for cat in categories:
        matched_str = regex_match(descent_patterns, cat, descent_groups)
        # print(matched_str)
        if matched_str != None:
            descent_list.append(matched_str)

    return {"descent": descent_list}


In [47]:
oppenheimer_page = wikipedia.page('Robert Oppenheimer')
chevalier_page = wikipedia.page('Haakon_Chevalier')
print(descent_info(oppenheimer_page))
descent_info(chevalier_page)

{'descent': ['German-Jewish']}


{'descent': ['French', 'Norwegian']}

## function 6: `education_info()`

In [50]:
def education_info(page):
    categories = page.categories

    edu_list = []
    edu_patterns = [
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) alumni", # uni Göttingen
        r"Alumni of ([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+)",
    ]
    edu_groups = [1, 1]

    for cat in categories:
        matched_str = regex_match(edu_patterns, cat, edu_groups)
        # print(matched_str)
        if matched_str != None:
            edu_list.append(matched_str)

    return {"education": edu_list}

In [53]:
rand_page = wikipedia.page('Ayn_Rand', auto_suggest=False)
print(education_info(rand_page))
education_info(oppenheimer_page)

{'education': ['Saint Petersburg State University']}


{'education': ["Christ's College, Cambridge",
  'Ethical Culture Fieldston School',
  'Harvard College',
  'University of Göttingen']}

## function 7: `conformity_info()`

In [58]:
conformity_keywords = [
        'atheist', 'anti-communist', 'communist', 'socialist', 'humanist',
        'anti-fascist', 'fascist', 'zionist', 'roman catholic', 
        'marxist','critics of marxism', 'critics of christianity', 
        'critics of religions'
                        ]

In [59]:
def conformity_info(page):
    categories = page.categories
    # need 
    conform_list = []
    for cat in categories:
        for keyword in conformity_keywords:
            if keyword.lower() in cat.lower():
                conform_list.append(keyword)
                break       # so that does not anti- does not account for communist
    
    conform_list = list(set(conform_list))

    return {'conformity': conform_list}


In [61]:
kennedy_page = wikipedia.page('John_F._Kennedy', auto_suggest=False)
hathaway_page = wikipedia.page('Clarence_Hathaway', auto_suggest=False)
print(conformity_info(rand_page))
print(conformity_info(hathaway_page))
print(conformity_info(kennedy_page))
print(conformity_info(rand_page))

{'conformity': ['zionist', 'anti-communist', 'atheist', 'critics of christianity', 'critics of religions', 'anti-fascist']}
{'conformity': ['communist', 'marxist', 'socialist']}
{'conformity': ['roman catholic', 'anti-communist']}
{'conformity': ['zionist', 'anti-communist', 'atheist', 'critics of christianity', 'critics of religions', 'anti-fascist']}


## function 8: `party_info()`

In [65]:
# rand_page.url

'https://en.wikipedia.org/wiki/Ayn_Rand'

In [100]:
from bs4 import BeautifulSoup
import requests
import re

def party_info(page):
    url = page.url
    response = requests.get(url)
    bs = BeautifulSoup(response.text, 'html.parser')

    infobox = bs.find('table', {'class': 'infobox'})
    party_list = []

    if infobox:
        party_row = infobox.find('th', string='Political party')
        if party_row:
            party_data = party_row.find_next('td')
            if party_data:
                party_links = party_data.find_all('a')
                for link in party_links:
                    party_name = link.get_text().strip()
                    party_list.append(party_name)

        # 'Other political affiliations' not working
        other_affiliations_row = infobox.find('th', string=re.compile(r'Other political'))
        if other_affiliations_row:
            affiliations_data = other_affiliations_row.find_next('td')
            if affiliations_data:

                affiliations_links = affiliations_data.find_all('a')
                for link in affiliations_links:
                    affiliation_name = link.get_text().strip()
                    party_list.append(affiliation_name)

    party_list = list(set(party_list))
    if not party_list:
        party_list.append("DON'T KNOW")

    return {"party": party_list}


In [101]:
eisenhower_page = wikipedia.page('Dwight Eisenhower', auto_suggest=False)
benson_page = wikipedia.page('Elmer_A._Benson', auto_suggest=False)
mccarthy_page = wikipedia.page('Joseph Mccarthy', auto_suggest=False)
print(party_info(mccarthy_page))
print(party_info(eisenhower_page))
party_info(benson_page)

{'party': ['Republican', 'Democratic']}
{'party': ['Republican']}


{'party': ['Democratic-Farmer-Labor']}

## function 9: `birthplace_info()`

In [74]:
import geonamescache

gc = geonamescache.GeonamesCache()
gc_countries = gc.get_countries()
gc_us_states = gc.get_us_states()
gc_cities = gc.get_cities()

country_names = {data['name'] for data in gc_countries.values()}
state_names = {data['name'] for data in gc_us_states.values()}
city_names = {data['name'] for data in gc_cities.values()}

In [75]:
def lower_placenames(place_names):
    lower_list = []
    input_list = list(place_names)
    for i in input_list:
        i0 = i.lower()
        lower_list.append(i0)
    return lower_list
country_names0 = lower_placenames(country_names)
state_names0 = lower_placenames(state_names)
city_names0 = lower_placenames(city_names)

In [76]:
def birthplace_info(page):
    url = page.url
    response = requests.get(url)
    bs = BeautifulSoup(response.text, 'html.parser')

    # infobox-label
    infobox = bs.find('table', {'class': 'infobox'})
    if not infobox:
        return {"birthplace": None, "birthcity": None, "birthstate": None, "birthcountry": None}

    birthplace = None
    birthcity = None
    birthstate = None
    birthcountry = None

    born_row = infobox.find('th', string="Born")
    if born_row:
        data_cell = born_row.find_next_sibling('td')
        if data_cell:
            birthplace_div = data_cell.find('div', {'class': 'birthplace'})
            birth_info_text = data_cell.get_text(separator=" ", strip=True)
            # deal with both links and text content in birthplace division
            if birthplace_div:
                location_parts = []
                # print('birth-div')
                for item in birthplace_div.contents:
                    if item.name == 'a':
                        for parts in item.text.split(','):
                            location_parts.append(parts.strip().lower())
                    elif isinstance(item, str):
                        cleaned_parts = []
                        for part in item.split(','):
                            cleaned_parts.append(part.strip().lower())
                        location_parts.extend(cleaned_parts)
                # if location_parts[0]:
                #     for state in state_names:
                #         if state in location_parts[0]:
                #             birthstate = state
                #             birthcountry = 'USA'
                br = 0
                br2 = 0
                br3 = 0
                if isinstance(location_parts, list):
                    for place in location_parts:
                        # birthplace = location_parts[0]
                        if place not in state_names0 and place not in country_names0 and place not in city_names0 and place != 'u.s.' and place != '':
                            if br == 0:
                                birthplace = place
                                # print(place)
                                br = 1
                        if place in city_names0:
                            if br3 == 0:
                                birthcity = place
                                br3 += 1
                                # print(place)
                        if place in state_names0:
                            birthstate = place
                            birthcountry = 'united states'
                            br2 = 1
                        if br2 == 0 and place in country_names0:
                            birthcountry = place
            # if class birthplace not found, use spacy GPE
            else:
                doc = nlp(birth_info_text)
                gpe_set = set()
                for ent in doc.ents:
                    if ent.label_ == 'GPE':
                        gpe_str = ent.text.lower()
                        gpe_set.add(gpe_str)
                # print(gpe_set)
                gpe_list = list(gpe_set)
                b = 0
                if isinstance(gpe_list, list):
                    for place in gpe_list:
                        if place not in state_names0 and place not in country_names0 and place not in city_names0 and place != 'u.s.':
                            birthplace = place
                        if place in city_names0:
                            birthcity = place
                        if place in state_names0:
                            birthstate = place
                            birthcountry = 'united states'
                            b = 1
                        if b == 0 and place in country_names0:
                            birthcountry = place
    return {
        "birthplace": birthplace,
        "birthcity": birthcity,
        "birthstate": birthstate,
        "country": birthcountry
    }

In [79]:
hellman_page = wikipedia.page('Lillian Hellman', auto_suggest=False)
truman_page = wikipedia.page('Harry Truman', auto_suggest=False)
print(birthplace_info(chevalier_page))
print(birthplace_info(rand_page))
print(birthplace_info(hellman_page))
print(birthplace_info(may_page))
birthplace_info(truman_page)

{'birthplace': 'lakewood township', 'birthcity': None, 'birthstate': 'new jersey', 'country': 'united states'}
{'birthplace': 'russian empire', 'birthcity': 'saint petersburg', 'birthstate': None, 'country': None}
{'birthplace': None, 'birthcity': 'new orleans', 'birthstate': 'louisiana', 'country': 'united states'}
{'birthplace': None, 'birthcity': 'portland', 'birthstate': 'oregon', 'country': 'united states'}


{'birthplace': 'lamar',
 'birthcity': None,
 'birthstate': 'missouri',
 'country': 'united states'}

## function 10: `spatial_info()`

In [80]:
def spatial_info(page):
    content = page.content
    doc = nlp(content)
    countries = set()
    states = set()
    cities = set()
    
    for ent in doc.ents:
        if ent.label_ == "GPE":
            if ent.text in state_names:
                states.add(ent.text)
            elif ent.text in country_names:
                countries.add(ent.text)
            elif ent.text in city_names:
                cities.add(ent.text)
    
    dict0 = {
        "list_countries": list(countries),
        "list_states": list(states),
        "list_cities": list(cities)
        }

    return dict0

In [82]:
hiss_page = wikipedia.page('Alger Hiss', auto_suggest=False)
print(spatial_info(oppenheimer_page))
print(spatial_info(hoskins_page))
spatial_info(hiss_page)

{'list_countries': ['United States', 'France', 'Greece', 'Netherlands', 'Germany', 'Japan', 'Mexico'], 'list_states': ['Washington', 'New Mexico', 'Oregon', 'New Hampshire', 'New Jersey', 'California', 'Nevada', 'Colorado', 'New York', 'Massachusetts'], 'list_cities': ['Hiroshima', 'Reno', 'Berlin', 'Manhattan', 'Berkeley', 'Dachau', 'Coventry', 'Boston', 'New York City', 'Tokyo', 'Leiden', 'Princeton', 'Seattle', 'Warsaw', 'Pasadena', 'Saint John', 'Los Angeles', 'London', 'Hamburg', 'Alamogordo', 'Exeter', 'Munich', 'Cambridge', 'Nagasaki', 'Dresden']}
{'list_countries': ['Canada', 'Australia'], 'list_states': ['Texas', 'Ohio', 'California', 'New York', 'Missouri'], 'list_cities': ['Monterey', 'Culver City', 'Lima', 'Odessa', 'Santa Rosa', 'Boston', 'Ontario', 'Hayward', 'Alameda', 'Los Angeles', 'Jefferson City', 'Oakland', 'Hollywood', 'Toronto', 'Mango']}


{'list_countries': ['France',
  'Austria',
  'Switzerland',
  'Germany',
  'Hungary',
  'Poland',
  'Canada'],
 'list_states': ['Washington',
  'Arkansas',
  'North Carolina',
  'New Hampshire',
  'West Virginia',
  'California',
  'Vermont',
  'New York',
  'Maryland'],
 'list_cities': ['New Haven',
  'Prague',
  'Boston',
  'Moore',
  'New York City',
  'Baltimore',
  'Ottawa',
  'Moscow',
  'San Francisco',
  'Bern',
  'Mexico City',
  'Vienna',
  'Geneva',
  'Yalta',
  'London',
  'Wheeling',
  'Harvey']}

## function 11: `institutions_info()`

In [85]:
def institutions_info(page):
    categories = page.categories
    inst_list = []

    inst_patterns = [
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) faculty",
        r"Academics of ([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+)",
        r"Members of ([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+)",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) members",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) politicians",
        r"Fellows of ([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) ",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) fellows",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) officials",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) informants",
        r"People of the([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+)",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) people",
        r"([A-Za-zÀ-ÖØ-öø-ÿ-\s&,']+) personnel"
    ]

    group_index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    for cat in categories:
        matched_str = regex_match(inst_patterns, cat, group_index)
        # print(matched_str)
        if matched_str != None and matched_str != 'American':
            inst_list.append(regex_match(inst_patterns, cat, group_index))
    
    return {"institutions": inst_list}


In [86]:
hammarsmark_page = wikipedia.page('Samuel_Hammersmark', auto_suggest=False)
herstein_page = wikipedia.page('Lillian_Herstein', auto_suggest=False)
print(institutions_info(hammarsmark_page))
print(institutions_info(hiss_page))
print(institutions_info(hellman_page))
print(institutions_info(herstein_page))

{'institutions': ['Chicago Federation of Labor', 'Industrial Workers of the World', 'Labor Party of the United States', 'the Communist Party USA']}
{'institutions': ['United States Department of Agriculture', 'United States Department of State']}
{'institutions': ['the American Academy of Arts and', 'the American Academy of Arts and Letters']}
{'institutions': ['American Federation of Teachers', 'Chicago Federation of Labor']}


In [87]:
fuchs_page = wikipedia.page('Klaus_Fuchs', auto_suggest=False)
institutions_info(fuchs_page)

{'institutions': ['the University of Birmingham',
  'British',
  'Communist Party of Germany',
  'Manhattan Project',
  'the German Academy of Sciences at Berlin',
  'Nuclear weapons program of the Soviet Union',
  'Reichsbanner Schwarz-Rot-Gold',
  'Socialist Unity Party of Germany']}

## function 12: `norp_info()`

In [88]:
def norp_info(page):
    page_links = page.links
    links_text = ','.join(page_links)
    doc = nlp(links_text)
    norp_set = set()
    for ent in doc.ents:
        if ent.label_ == "NORP":
            norp_set.add(ent.text)
    
    dict0 = {
        "norp": list(norp_set)
        }

    return dict0

In [90]:
print(norp_info(hellman_page))


{'norp': ['Jews', 'Spanish', 'Stalinism', 'Communist', 'Maltese', 'Attic']}


## general functions:

In [102]:
def wiki_info(name):
    d0 = {'name': name}
    page0 = wikipedia.page(name, auto_suggest=False)
    categories = page0.categories
    d1 = dates_info(page0)
    d2 = sexuality_info(page0)
    d3 = race_info(page0)
    d4 = occupation_info(page0)
    d5 = descent_info(page0)
    d6 = education_info(page0)
    d7 = conformity_info(page0)
    d8 = party_info(page0)
    d9 = birthplace_info(page0)
    d10 = spatial_info(page0)
    d11 = institutions_info(page0)
    d12 = norp_info(page0)
    out_dict = d0 | d1 | d2 | d3 | d4 | d5 | d6 | d7 | d8 | d9 | d10 | d11 | d12
    print(f'{name} wiki dictionary created')
    
    return out_dict

In [93]:
rand_dict = wiki_info('Ayn Rand')
print(type(rand_dict))
rand_dict

<class 'dict'>


{'name': 'Ayn Rand',
 'birth_year': 1905,
 'death_year': 1982,
 'bio_sex': 'FEMALE',
 'sexuality': 'NOTA',
 'race': 'non-black',
 'occupation': 'author',
 'descent': ['Russian-Jewish', 'Soviet'],
 'education': ['Saint Petersburg State University'],
 'conformity': ['zionist',
  'anti-communist',
  'atheist',
  'critics of christianity',
  'critics of religions',
  'anti-fascist'],
 'party': ["DON'T KNOW"],
 'birthplace': 'russian empire',
 'birthcity': 'saint petersburg',
 'birthstate': None,
 'country': None,
 'list_countries': ['Russia', 'United States', 'Israel', 'Norway', 'Canada'],
 'list_states': ['California', 'New York'],
 'list_cities': ['Saint Petersburg',
  'Chicago',
  'Los Angeles',
  'New York City',
  'Hollywood'],
 'institutions': [],
 'norp': ['Authoritarian',
  'Stateless',
  'novella),Anti-Communist',
  'Restorative',
  'Sovereignty',
  'European',
  'Collectivist',
  'American',
  'Marxian',
  'Modernism',
  'Russian Symbolists',
  'Soviet',
  'Libertarian',
  'Capit

In [None]:
# years_list = []
# for i in huac_df['title']:
#     years_dict = get_life_years(i)
#     years_list.append(years_dict)

# years_df = pd.DataFrame(years_list)
# years_df.to_csv('data/494years.csv')

In [None]:
# list494 = []

# for i in huac_df['title']:
#     wiki_dict = wiki_info(i)
#     list494.append(wiki_dict)

# df494 = pd.DataFrame(list494)
# df494.to_csv('data-tolera/494wiki.csv')