# Clean AI Wiki Links

In [1]:
import pandas as pd
import numpy as np
import json
import glob
import time


# web scrapping
import requests as r
from bs4 import BeautifulSoup
import re
import bs4 as bs
import urllib

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
from nltk.corpus import stopwords

#visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
#warnings.simplefilter('always')
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Read links data

In [58]:
links = pd.read_csv("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/wiki_ai_links.csv") #import csv
links = links[links['title'].notnull()] #exclude rows that are na for title
links.title= links.title.str.lower() #lowercase title

links = links.rename(columns={'title': 'text'})
len(links)

1576

In [59]:
links.head()

Unnamed: 0,url,text
1,/wiki/AI_(disambiguation),ai (disambiguation)
2,/wiki/Artificial_intelligence_(disambiguation),artificial intelligence (disambiguation)
3,/wiki/Outline_of_artificial_intelligence,artificial intelligence
4,/wiki/Artificial_intelligence#Goals,major goals
5,/wiki/Artificial_general_intelligence,artificial general intelligence


### De-duplicate Links

TODO: There are some same text but different links ones.

In [60]:
# no entry that have dupilicates on both url and text
links_dedup = links.drop_duplicates()
len(links_dedup)

1576

## Read named entity data

In [61]:
named_entity = pd.read_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text_entity.csv') 
named_entity = named_entity[["text", "type"]]
named_entity.text= named_entity.text.str.lower() #lowercase title
named_entity = named_entity.rename(columns={'type': 'ner'})

In [62]:
named_entity

Unnamed: 0,text,ner
0,tesler,PERSON
1,1955,DATE
2,alpha,PERSON
3,2015,DATE
4,agi,ORG
...,...,...
390,karel čapek's r.u.r.,WORK_OF_ART
391,a.i. artificial intelligence and ex machina,WORK_OF_ART
392,well as the novel do androids dream of electri...,WORK_OF_ART
393,philip k. dick,PERSON


# Validate links 

## Approach I.If not exact match --> valid link

In [63]:
#Join two dataframes
links_ne = pd.merge(links,named_entity,on='text',how='left')
#links_ne.to_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/wiki_ai_links_ner.csv', index = False)  
result_links_valid = links_ne[links_ne['ner'].isna()]
result_links_valid

Unnamed: 0,url,text,ner
0,/wiki/AI_(disambiguation),ai (disambiguation),
1,/wiki/Artificial_intelligence_(disambiguation),artificial intelligence (disambiguation),
2,/wiki/Outline_of_artificial_intelligence,artificial intelligence,
3,/wiki/Artificial_intelligence#Goals,major goals,
4,/wiki/Artificial_general_intelligence,artificial general intelligence,
...,...,...,...
1593,/wiki/Wikipedia:File_Upload_Wizard,upload file,
1594,/wiki/Special:WhatLinksHere/Artificial_intelli...,what links here,
1595,/wiki/Special:RecentChangesLinked/Artificial_i...,related changes,
1596,/wiki/Special:SpecialPages,special pages,


### Examine NER in the links

In [14]:
links_matched = links_ne[links_ne['ner'].notnull()] #exclude rows that are na for title
ls_links_matched = links_matched.text.to_list()

def find_unique_in_list(ls):
    ls_unique = []
    for i in ls:
        if i not in ls_unique:
            ls_unique.append(i)
    return ls_unique
        
ls_links_matched_unique = find_unique_in_list(ls_links_matched)
ls_links_matched_unique

['ethics',
 'karel čapek',
 'mccullouch',
 'dartmouth college',
 'john mccarthy',
 'norbert wiener',
 'allen newell',
 'cmu',
 'herbert simon',
 'mit',
 'marvin minsky',
 'arthur samuel',
 'ibm',
 'garry kasparov',
 'watson',
 'brad rutter',
 'ken jennings',
 'kinect',
 'xbox 360',
 'lee sedol',
 'future of go summit',
 'ke jie',
 'murray campbell',
 'alphazero',
 'muzero',
 'china',
 'denver',
 'san francisco',
 'svm',
 'cyc',
 'hans moravec',
 'deepmind',
 'atari',
 'princeton university',
 'stanford',
 'john haugeland',
 'gofai',
 'seymour papert',
 'roger schank',
 'rodney brooks',
 'david rumelhart',
 'google search',
 'siri',
 'deepfakes',
 'ray kurzweil',
 'joseph weizenbaum',
 'computer power and human reason',
 'charles t. rubin',
 'david chalmers',
 'jerry fodor',
 'hilary putnam',
 'john searle',
 'plug & pray',
 'vernor vinge',
 'kevin warwick',
 'aldous huxley',
 'robert ettinger',
 'george dyson',
 'carl benedikt frey',
 'martin ford',
 'compas',
 'propublica',
 'stephen 

## Approach II.If NER does not contain topics(links) --> valid

In [51]:
ne_ls = named_entity["text"]
links_ls = links["text"]

links_valid_ls =[]
links_ner_ls = []
for link in links_ls:
    for ne in ne_ls:
        if link not in ne:
            links_valid_ls.append(link)
        elif link in ne:
            links_ner_ls.append(link)
links_valid_ls =  find_unique_in_list(links_valid_ls)
links_ner_ls=  find_unique_in_list(links_ner_ls)
print(len(links_valid_ls))
print(len(links_ner_ls))

1513
134


In [55]:
#Links that are NER (invalid topics) 
links_ner_ls[0:50]

['artificial intelligence',
 'robotics',
 'ethics',
 'cca',
 'ica',
 'gan',
 'intelligence',
 'machines',
 'go',
 'logic',
 'computer power',
 'data',
 'mary shelley',
 'frankenstein',
 'karel čapek',
 'r.u.r.',
 'alan turing',
 'mccullouch',
 'turing-complete',
 'dartmouth college',
 'john mccarthy',
 'norbert wiener',
 'allen newell',
 'cmu',
 'herbert simon',
 'mit',
 'marvin minsky',
 'arthur samuel',
 'ibm',
 'department of defense',
 'fifth generation computer',
 'deep blue',
 'garry kasparov',
 'watson',
 'brad rutter',
 'ken jennings',
 'kinect',
 'xbox 360',
 'xbox one',
 'lee sedol',
 'future of go summit',
 'ke jie',
 'murray campbell',
 'alphazero',
 'muzero',
 'google',
 'china',
 'denver',
 'san francisco',
 'svm']

In [54]:
links_valid_ls[1:50]

['artificial intelligence (disambiguation)',
 'artificial intelligence',
 'major goals',
 'artificial general intelligence',
 'planning',
 'computer vision',
 'general game playing',
 'knowledge reasoning',
 'machine learning',
 'natural language processing',
 'robotics',
 'symbolic',
 'deep learning',
 'bayesian networks',
 'evolutionary algorithms',
 'philosophy',
 'ethics',
 'existential risk',
 'turing test',
 'chinese room',
 'control problem',
 'friendly ai',
 'history',
 'timeline',
 'progress',
 'ai winter',
 'applications',
 'projects',
 'programming languages',
 'glossary',
 'data mining',
 'classification',
 'clustering',
 'regression',
 'anomaly detection',
 'automl',
 'association rules',
 'reinforcement learning',
 'structured prediction',
 'feature engineering',
 'feature learning',
 'online learning',
 'semi-supervised learning',
 'unsupervised learning',
 'learning to rank',
 'grammar induction',
 'supervised learning',
 'decision trees',
 'ensembles']

In [46]:
links_valid = pd.DataFrame(links_valid_ls, columns =['text'])
links_valid = links_valid.assign(valid = True)

links_ner = pd.DataFrame(links_ner_ls, columns =['text'])
links_ner= links_ner.assign(valid = False)

valid_vs_ner = pd.concat([links_valid, links_ner])
    
result = pd.merge(links,valid_vs_ner,on='text',how='left')

In [57]:
result_links_valid = result[result['valid']==True]
result_links_valid

Unnamed: 0,url,text,valid
0,/wiki/AI_(disambiguation),ai (disambiguation),True
1,/wiki/Artificial_intelligence_(disambiguation),artificial intelligence (disambiguation),True
2,/wiki/Outline_of_artificial_intelligence,artificial intelligence,True
4,/wiki/Artificial_intelligence#Goals,major goals,True
5,/wiki/Artificial_general_intelligence,artificial general intelligence,True
...,...,...,...
1714,/wiki/Wikipedia:File_Upload_Wizard,upload file,True
1715,/wiki/Special:WhatLinksHere/Artificial_intelli...,what links here,True
1716,/wiki/Special:RecentChangesLinked/Artificial_i...,related changes,True
1717,/wiki/Special:SpecialPages,special pages,True


# Export valid links

In [64]:
result_links_valid.to_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/wiki_ai_links_valid.csv', index = False)   