In [1]:
import pandas as pd

In [3]:
import gzip

# Build an in-memory set of article titles from the gzipped dump so that
# membership tests are cheap. Each line is trimmed, underscores become spaces,
# and the result is trimmed once more.
with gzip.open("enwiki-latest-all-titles-in-ns0.gz", "rt", encoding="utf-8") as titles_file:
    wiki_titles = {
        raw_line.strip().replace("_", " ").strip()
        for raw_line in titles_file
    }

# Quick sanity check of the lookup with a name expected to be present
person_name = "Bill Clinton"
print(
    f"{person_name} has a Wikipedia article!"
    if person_name in wiki_titles
    else f"{person_name} does not have a Wikipedia article."
)

Bill Clinton has a Wikipedia article!


In [5]:
import wikipedia

# set the language for the Wikipedia API
# (the `wikipedia` package is used further down for search suggestions)
wikipedia.set_lang("en")

In [6]:
filename = 'data_2015_to_2021_with_topics.csv'
df = pd.read_csv(filename)
# df = pd.read_json(filename)

# Interactive labelling pass: for every row, show the post title and keep
# prompting until the user types a string that is a known Wikipedia title,
# or 'q' to give up on that row. Whatever was typed last (including 'q')
# is stored in the 'wiki' column.
for index, row in df.iterrows():
    title = row['title']
    print(title)

    # ask for user input
    while True:
        # The lookup set holds whitespace-stripped titles, so strip the input
        # too; otherwise a stray trailing space causes a false miss.
        usertext = input("Enter a person's name: ").strip()
        # check if the title is in the set
        if usertext in wiki_titles:
            print(f"{usertext} has a Wikipedia article!")
            break
        elif usertext == 'q':
            break
        else:
            # The search goes over the network; a failure here must not abort
            # the session and throw away every label entered so far.
            try:
                titles = wikipedia.search(usertext)
            except Exception as exc:
                print(f"Wikipedia search for {usertext!r} failed ({exc!r}); please try again.")
                continue
            print(f"{usertext} does not have a Wikipedia article. Did you mean: {titles}?")

    df.at[index, 'wiki'] = usertext


Scott Aukerman, host of "Comedy Bang! Bang!," producer/director "Between Two Ferns"
Scott Aukerman has a Wikipedia article!
Fred Armisen &amp; Carrie Brownstein! Awards for anyone up on the west coast! Ask us anything!
Portlandia has a Wikipedia article!
We’re Working on Overturning the Citizens United Supreme Court Decision – Ask Us Anything!
Citizens United Supreme Court Decision does not have a Wikipedia article. Did you mean: ['Citizens United v. FEC', 'List of landmark court decisions in the United States', 'Supreme Court of the United States', 'List of overruled United States Supreme Court decisions', 'Roe v. Wade', 'Korematsu v. United States', 'Obergefell v. Hodges', 'Plessy v. Ferguson', 'Citizens United (organization)', 'Supreme Court of the United Kingdom']?
Citizens United v. FEC has a Wikipedia article!
IamA 14yr old Ebola survivor in remote Liberia
Ebola survivor does not have a Wikipedia article. Did you mean: ['Western African Ebola virus epidemic', 'Ebola virus cases i

In [7]:
# Checkpoint the labels. index=False: `df` has the default positional index
# from read_csv, and writing it would add an extra 'Unnamed: 0' column when
# the file is loaded again.
df[['title', 'wiki']].to_csv('data_2015_to_2021_wiki.csv', index=False)

In [21]:
# Reload the title/label checkpoint from disk; later cells work on `df_wiki`.
df_wiki = pd.read_csv('data_2015_to_2021_wiki.csv')

In [2]:
import numpy as np

In [12]:
import wikipediaapi

# Initialize the Wikipedia API
# NOTE(review): 'en' is passed as the first positional argument, presumably the
# language code — confirm against the constructor signature of the installed
# wikipedia-api version (some releases may expect a user agent here).
wiki_api = wikipediaapi.Wikipedia('en')

In [26]:
# grab the first paragraph from the summary of the Wikipedia article
#
# Labels that are not real article references get no summary:
#   - 'Request' / 'Story' are category labels,
#   - 'q' is what the labelling loop stores when the user gives up on a row,
#   - missing labels (NaN after the CSV round trip).
skip_labels = {'Request', 'Story', 'q'}

summaries = []
for wiki in df_wiki['wiki']:
    if pd.isna(wiki) or wiki in skip_labels:
        summaries.append(np.nan)
        continue
    try:
        summary = wiki_api.page(wiki).summary.split('\n')[0]
        # print(f"{wiki} summary: {summary}")
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still interrupts,
        # and record an explicit NaN so a failed row never keeps a stale value.
        print(f"Error: {wiki} does not have a summary.")
        print(f"  cause: {exc!r}")
        summary = np.nan
    summaries.append(summary)

# Assign once: yields an object column and avoids cell-by-cell dtype changes.
df_wiki['summary'] = summaries

In [27]:
# Checkpoint the summaries. index=False: `df_wiki` has the default positional
# index from read_csv, and writing it would add another 'Unnamed: 0*' column
# on every save/reload round trip.
df_wiki.to_csv('data_2015_to_2021_wiki_summary.csv', index=False)

In [101]:
# Reload the summary checkpoint from disk, replacing the in-memory `df_wiki`.
df_wiki = pd.read_csv('data_2015_to_2021_wiki_summary.csv')

In [35]:
import re
import nltk

# Fetch the 'punkt' resource through the NLTK downloader
# (presumably required by the NLTK tokenizers used later — confirm).
nltk.download('punkt') 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ehsanulkabir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [102]:
# replace nan summaries with empty strings
# (so the string-based cleaning steps that follow always receive str values)
df_wiki['summary'] = df_wiki['summary'].fillna('')

# Trim a summary to the shortest leading run of sentences exceeding 50 tokens
def get_first_sentence(summary):
    """Return the shortest sentence prefix of `summary` with more than 50 tokens.

    The text is split into sentences; prefixes of 0, 1, ... sentences (never
    the full list) are joined with single spaces and word-tokenized. The first
    joined prefix whose token count exceeds 50 is returned. If no such prefix
    exists, `summary` is returned unchanged.
    """
    sentences = nltk.sent_tokenize(summary)
    for count in range(len(sentences)):
        prefix = ' '.join(sentences[:count])
        if len(nltk.word_tokenize(prefix)) > 50:
            return prefix
    # No proper prefix was long enough: keep the whole text as given.
    return summary

# Shorten each summary, then normalise its characters in two regex passes.
summaries = df_wiki['summary'].map(get_first_sentence)

# drop every run of non-ASCII characters
summaries = summaries.map(lambda text: re.sub(r'[^\x00-\x7F]+', '', text))

# squeeze any run of whitespace down to a single space
summaries = summaries.map(lambda text: re.sub(r'\s+', ' ', text))

df_wiki['summary'] = summaries


In [104]:
# Checkpoint the cleaned summaries. index=False: `df_wiki` has the default
# positional index from read_csv, and writing it would add another
# 'Unnamed: 0*' column on every save/reload round trip.
df_wiki.to_csv('data_2015_to_2021_wiki_summary_clean.csv', index=False)

In [105]:
# Reload the cleaned-summary checkpoint from disk, replacing the in-memory `df_wiki`.
df_wiki = pd.read_csv('data_2015_to_2021_wiki_summary_clean.csv')

In [106]:
# preprocess title
def preprocess_title(title):
    """Normalise Reddit AMA boilerplate in a post title.

    Steps, in order:
      1. 'iama ' (any capitalisation, at the start of a word) -> 'I am a '
      2. the standalone word ' ama' (any capitalisation) is removed
      3. the standalone word ' aua' (any capitalisation) is removed
      4. the HTML entity '&amp;' becomes '&'

    The word boundaries are needed so that ' ama' / ' aua' do not match
    inside ordinary words: without them ' amazing' is reduced to 'zing'
    and ' Amanda' to 'nda'.
    """
    title = re.sub(r'\biama ', 'I am a ', title, flags=re.IGNORECASE)
    title = re.sub(r' ama\b', '', title, flags=re.IGNORECASE)
    title = re.sub(r' aua\b', '', title, flags=re.IGNORECASE)
    return title.replace('&amp;', '&')

df_wiki['title'] = df_wiki['title'].apply(preprocess_title)

In [107]:
# characters that are allowed to stay in a title even though they are not alphanumeric
safe_characters = [' ', '&', '-', "'", '.', ',', '!', '?', ':', ';', '(', ')', '[', ']', '/', '"', '%', '$', '@', '#', '+', '=']

# gather every distinct character in any title that is not a letter, digit or space
seen_characters = set()
for raw_title in df_wiki['title']:
    seen_characters.update(re.findall(r'[^a-zA-Z0-9 ]', raw_title))

# what remains after discarding the safe ones is the list of characters to strip
non_alphanumeric = list(seen_characters - set(safe_characters))

# strip the unwanted characters from a title
def remove_non_alphanumeric(title):
    """Return `title` with every character listed in `non_alphanumeric` deleted.

    `non_alphanumeric` is the module-level list of single characters built
    from the titles; it is read at call time.
    """
    deletion_table = str.maketrans('', '', ''.join(non_alphanumeric))
    return title.translate(deletion_table)

df_wiki['title'] = df_wiki['title'].apply(remove_non_alphanumeric)

In [108]:
# function to truncate the title at the word 'Ask'
def truncate_title(title):
    """Cut `title` just before the first standalone word 'ask' followed by a space.

    Matching is case-insensitive. The title is returned unchanged when the
    word does not occur.

    The search runs on the original string with a leading word boundary:
      * a plain substring search for 'ask ' also hits words such as 'task '
        or 'mask ' and truncates the title in the middle of a word;
      * an index found in `title.lower()` can be misaligned with `title`,
        because lowercasing may change the length of some characters.
    """
    match = re.search(r'\bask ', title, flags=re.IGNORECASE)
    if match is None:
        return title
    return title[:match.start()]
    
def _tidy_title(title):
    """Truncate at 'Ask', trim surrounding whitespace, then collapse punctuation runs.

    In a run of consecutive characters from . , ? ! : ; only the last one is kept.
    """
    trimmed = truncate_title(title).strip()
    return re.sub(r'[.,?!:;]+(?=[.,?!:;])', '', trimmed)

df_wiki['title'] = df_wiki['title'].apply(_tidy_title)

In [109]:
# Checkpoint the cleaned titles. index=False: `df_wiki` has the default
# positional index from read_csv, and writing it would add another
# 'Unnamed: 0*' column on every save/reload round trip.
df_wiki.to_csv('data_2015_to_2021_wiki_summary_title_clean.csv', index=False)

In [6]:
# Reload the cleaned title/summary checkpoint from disk, replacing the in-memory `df_wiki`.
df_wiki = pd.read_csv('data_2015_to_2021_wiki_summary_title_clean.csv')

In [7]:
# replace nan summary and title with empty strings
# (so both columns hold str values before they are concatenated below)
df_wiki['summary'] = df_wiki['summary'].fillna('')
df_wiki['title'] = df_wiki['title'].fillna('')

In [8]:
# combine the title and summary into one column: context
# Vectorised string concatenation instead of a row-wise apply(axis=1):
# same "<title> <summary>" result per row, without a Python call per row.
df_wiki['context'] = df_wiki['title'] + ' ' + df_wiki['summary']

In [12]:
# make context empty string if there is no summary
# Series.where keeps the existing context where the condition holds and
# substitutes '' elsewhere, replacing the row-wise apply(axis=1).
df_wiki['context'] = df_wiki['context'].where(df_wiki['summary'] != '', '')

In [14]:
# Export the final context/title pairs. No `index` argument is given, so the
# DataFrame index is written as the first (unnamed) CSV column.
df_wiki[['context', 'title']].to_csv('data_2015_to_2021_with_context.csv')

In [13]:
# Display the context column as a NumPy array for a quick visual check.
df_wiki['context'].to_numpy()

array(['Scott Aukerman, host of "Comedy Bang! Bang," producer/director "Between Two Ferns" Scott David Aukerman (born July 2, 1970) is an American writer, actor, comedian, television personality, director, producer, and podcast host. Starting as a writer and performer in the later seasons of the sketch series Mr. Show, Aukerman is best-known as the host of the weekly comedy podcast Comedy Bang!',
       "Fred Armisen & Carrie Brownstein! Awards for anyone up on the west coast! Portlandia is an American sketch comedy television series starring Fred Armisen and Carrie Brownstein, set in and around Portland, Oregon, and spoofing the city's reputation as a haven for eccentric hipsters. The show was produced by Broadway Video Television and IFC Original Productions. It was created by Armisen and Brownstein, along with Jonathan Krisel, who directs it.",
       'Were Working on Overturning the Citizens United Supreme Court Decision Citizens United v. Federal Election Commission, 558 U.S. 310 