# Week 8 - Assignment

In [1]:
import random

import pandas as pd
import wikipedia
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed article titles whose Wikipedia text is used to fit the topic model.
titles = ['The Titanic', 'The Carpathia']

# Dedicated seeded generator: if a title turns out to be ambiguous, the
# fallback below picks the same candidate page on every run.
rng = random.Random(42)

# Collect one [title, page text] row per requested title.
content = []
for title in titles:
    try:
        text = wikipedia.page(title).content
    except wikipedia.exceptions.DisambiguationError as e:
        # The title matches several pages; fetch one of the candidates listed
        # on the exception instead. The row keeps the originally requested title.
        text = wikipedia.page(rng.choice(e.options)).content
    content.append([title, text])

df = pd.DataFrame(content, columns=['title', 'content'])
print(df.head())

# TF-IDF weights over the page texts, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(df['content'].values.astype('U'))

# Single-topic LDA; random_state is fixed so the fit is repeatable.
model = LatentDirichletAllocation(n_components=1, random_state=42)
model.fit(vectors)

# Look the vocabulary up once rather than once per printed word.
feature_names = vectorizer.get_feature_names_out()
for index, topic in enumerate(model.components_):
    # argsort is ascending, so the last five indices are the five largest weights.
    print(f'Topic {index} top words: {[feature_names[i] for i in topic.argsort()[-5:]]}')

           title                                            content
0    The Titanic  RMS Titanic was a British passenger liner, ope...
1  The Carpathia  RMS Carpathia was a Cunard Line transatlantic ...
Topic 0 top words: ['passengers', 'class', 'ship', 'carpathia', 'titanic']


In [2]:
# Take the weights of the model's first topic explicitly, instead of relying on
# a `topic` variable left over from a loop in an earlier cell.
topic = model.components_[0]

# Join the five highest-weighted terms (argsort is ascending, so they are the
# last five indices) into a single search query.
feature_names = vectorizer.get_feature_names_out()
keywords = ' '.join([feature_names[i] for i in topic.argsort()[-5:]])

# Ask Wikipedia for up to five article titles matching those keywords.
titles = wikipedia.search(keywords, results=5)
print(titles)

['Titanic', 'Passengers of the Titanic', 'RMS Carpathia', 'Olympic-class ocean liner', 'Sinking of the Titanic']


In [3]:
# Overwrite the first search hit with a hard-coded title.
# NOTE(review): presumably this reuses the seed title from the first cell
# because the raw hit does not resolve to the intended page — confirm.
titles[0] = 'The Titanic'

# Dedicated seeded generator: if a title turns out to be ambiguous, the
# fallback below picks the same candidate page on every run.
rng = random.Random(42)

# Collect one [title, page text] row per title.
content = []
for title in titles:
    try:
        text = wikipedia.page(title).content
    except wikipedia.exceptions.DisambiguationError as e:
        # The title matches several pages; fetch one of the candidates listed
        # on the exception instead. The row keeps the originally requested title.
        text = wikipedia.page(rng.choice(e.options)).content
    content.append([title, text])

df = pd.DataFrame(content, columns=['title', 'content'])
df.head()

Unnamed: 0,title,content
0,The Titanic,"RMS Titanic was a British passenger liner, ope..."
1,Passengers of the Titanic,"A total of 2,208 people sailed on the maiden v..."
2,RMS Carpathia,RMS Carpathia was a Cunard Line transatlantic ...
3,Olympic-class ocean liner,The Olympic-class ocean liners were a trio of ...
4,Sinking of the Titanic,The RMS Titanic sank in the early morning hour...


In [4]:
from kgextension.linking_sklearn import DbpediaLookupLinker

# Run the DBpedia lookup linker over the 'title' column of the first rows of
# the page table; the displayed result carries an extra `new_link` column.
linker = DbpediaLookupLinker(column='title')
first_rows = df.head()
df_enhanced = linker.fit_transform(first_rows)
df_enhanced.head()

DBpedia Lookup Linker: Querying DLL:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,title,content,new_link
0,The Titanic,"RMS Titanic was a British passenger liner, ope...",http://dbpedia.org/resource/RMS_Titanic
1,Passengers of the Titanic,"A total of 2,208 people sailed on the maiden v...",http://dbpedia.org/resource/Passengers_of_the_...
2,RMS Carpathia,RMS Carpathia was a Cunard Line transatlantic ...,http://dbpedia.org/resource/RMS_Carpathia
3,Olympic-class ocean liner,The Olympic-class ocean liners were a trio of ...,http://dbpedia.org/resource/Olympic-class_ocea...
4,Sinking of the Titanic,The RMS Titanic sank in the early morning hour...,http://dbpedia.org/resource/Sinking_of_the_RMS...
