# Wikipedia Network Analysis
## Full Project by Forrest Hangen

In [38]:
import pandas as pd
import numpy as np
import scipy as sp
import nltk

from bs4 import BeautifulSoup as bs
import requests
import time
import random

import re

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import networkx as nx

# Step 1. Training Model to Identify People in Wikipedia Article Summary
(This will use the data and model I developed in Text Classification - Identifying People.ipynb. I did not label all the data I gathered (60000+ article summaries, but if anyone wants to continue labeling, be my guest!))

In [2]:
df_label = pd.read_csv('wiki_desc_dataset_no_duplicates.csv')

In [3]:
df_label['LABEL'].value_counts().loc['PERSON'] /sum(df_label['LABEL'].value_counts())

0.32715462610899876

In [4]:
df_labled = pd.read_csv('wiki_desc_dataset_no_duplicates.csv')
df_labled_only = (df_labled.dropna()
                            .drop(columns= 'Unnamed: 0'))
df_labled_only['LABEL'] = df_labled_only['LABEL'].apply(lambda x: 1 if x == 'PERSON' else 0)
df_labled_only.head()

Unnamed: 0,LABEL,Titles,Text
0,1,Charles H. Percy,"Charles Harting Percy (September 27, 1919 – Se..."
1,0,Head of state,A head of state (or chief of state) is the pub...
2,1,Henry Winkler,"Henry Franklin Winkler (born October 30, 1945)..."
3,0,La Violencia,La Violencia (Spanish pronunciation: [la βjoˈl...
4,0,London Buses route 176,London Buses route 176 is a Transport for Lond...


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_labled_only['Text'], 
                                                    df_labled_only['LABEL'], 
                                                    random_state=42, train_size= .97)
len(X_train), len(X_test)



(6122, 190)

In [6]:
count_final = CountVectorizer(lowercase=False, min_df = 30, ngram_range=(1,3)).fit(X_train)

X_train_matrix = count_final.transform(X_train)
X_test_matrix = count_final.transform(X_test)

log_final = LogisticRegression(solver = 'newton-cg', C=.1).fit(X_train_matrix, y_train)

In [7]:
y_train_finalpred = log_final.predict(X_train_matrix)
y_test_finalpred = log_final.predict(X_test_matrix)

print('TRAIN')
print('Log Reg:\nAccuracy: {0}\nROC AUC: {1}'.format(accuracy_score(y_train, y_train_finalpred),
                                                     roc_auc_score(y_train, y_train_finalpred)))
print(confusion_matrix(y_train, y_train_finalpred))
print()
print('TEST')
print('Log Reg:\nAccuracy: {0}\nROC AUC: {1}'.format(accuracy_score(y_test, y_test_finalpred),
                                                     roc_auc_score(y_test, y_test_finalpred)))
print(confusion_matrix(y_test, y_test_finalpred))

TRAIN
Log Reg:
Accuracy: 0.9942829140803658
ROC AUC: 0.9925539222953105
[[4106   10]
 [  25 1981]]

TEST
Log Reg:
Accuracy: 0.9789473684210527
ROC AUC: 0.9847328244274809
[[127   4]
 [  0  59]]


This level of accuracy is perfect for what I'm trying to do here.

# Step 2. Gathering the Data!

In [None]:
all_info = parse_page('Benjamin Franklin', all_info)

In [18]:
%%time

all_info = gather_all_data()

ON APP_BF
ON NEXT LEVELbf: 93/93                                                                                                    
 


Wall time: 51min 12s 1265/1265                                                                                                    


In [17]:
def gather_all_data():
    approved = []
    person_links_db = {}
    all_checked = []
    got_links = []
    api_call = "http://en.wikipedia.org/w/api.php?action=parse&format=json&page="

    all_info = {'app': approved, 'pldb':person_links_db, 'all':all_checked, 'api':api_call, 'got':got_links}
    
    all_info = parse_page('Benjamin Franklin', all_info)
    
    app_bf = all_info['app'].copy()
    print('ON APP_BF')
    for i, person in enumerate(app_bf):
        if person not in all_info['got']:
            all_info = parse_page(person, all_info)
        print('Finished app_bf: {0}/{1}{2}'.format(i+1, len(app_bf), ' '*100), end="\r")


    next_level = all_info['app'].copy()
    print('ON NEXT LEVEL')
    print(' ', end ='\n')
    print('\n')
    for i, person in enumerate(next_level):
        if person not in all_info['got']:
            all_info = parse_page(person, all_info)
    print('Finished next level: {0}/{1}{2}'.format(i+1, len(next_level), ' '*100), end="\r")
    return all_info
        

In [16]:
approved = []
rejected = []
links_db = {}
person_links_db = {}
all_checked = []
api_call = "http://en.wikipedia.org/w/api.php?action=parse&format=json&page="

all_info = {'app': approved, 'rej':rejected, 'ldb':links_db, 'pldb':person_links_db, 'all':all_checked, 'api':api_call}


def parse_page(page_title, all_info):
        # Get page and parse
        page = requests.get(all_info['api'] + page_title)
        source = bs(page.json()['parse']['text']['*'], 'html.parser')

        # Get all links
        all_links = []
        to_check = []
        p = source.find_all('p')

        for para in p:
            p_source = bs(str(para), 'html.parser')
            links = p_source.find_all('a', href = True)
            for link in links:
                title = link.get('title')
                # Don't include None
                if title is not None:
                    all_links.append(title)
                    # Check title
                    if title not in all_info['all']:
                        to_check.append(title)
                        all_info['all'].append(title)

        if len(to_check) != 0:
            df_all_checked = collect_text(to_check)
            if df_all_checked is not None:
                try:
                    X = df_all_checked['Text']
                    X_matrix = count_final.transform(X)
                    predictions = log_final.predict(X_matrix)

                    df_all_checked['Pred'] = predictions
    
                    good_titles = df_all_checked[df_all_checked['Pred'] == 1]['Titles']
                    good_titles = list(good_titles)
                    for i in good_titles:
                        all_info['app'].append(i)
                except:
                    test = 'blank'
                    
        good_links = []
        
        for link in all_links:
            if link in all_info['app']:
                good_links.append(link)
                
        all_info['pldb'][page_title] = good_links
        all_info['got'].append(page_title)

        return all_info
    
def split_into_20_names(links):
    
    links = list(set(links))
    
    all_links_lists = []
    
    for i, v in enumerate(links):
        if i == 0:
            list1 = []
            list1.append(v)
        elif i % 20 == 0:
            all_links_lists.append(list1)
            list1 = []
            list1.append(v)
        elif (i + 1) == len(links):
            list1.append(v)
            all_links_lists.append(list1)
        else:
            list1.append(v)
            
    return all_links_lists

def get_text(names_20):
    
    titles = '|'.join(names_20)
    
    response = requests.get(
         'https://en.wikipedia.org/w/api.php',
        params={
             'action': 'query',
             'format': 'json',
             'titles': titles,
             'prop': 'extracts',
             'exintro': True,
             'explaintext': True,
         }
    ).json()
    
    try:
        pages = response['query']['pages']

        page_ids = list(pages.keys())

        texts = [pages[x]['extract'] for x in page_ids]
        names_all = [pages[x]['title'] for x in page_ids]
    
        df = pd.DataFrame({'Titles':names_all, 'Text':texts})
        df = df.replace('', np.nan).dropna()
        return df
    except:
        return None

def collect_text(all_links):
    
    splits = split_into_20_names(all_links)
    
    try:
        
        df_all = get_text(splits[0])
        if df_all is not None:

            for i, v in enumerate(splits[1:]):
                df = get_text(v)
                df_all = df_all.append(df)
                time.sleep(random.choice([.2,.1,.25,.5,.5,1, .75]))

            return df_all
        else:
            return None
    except:
        return None

# Step 3: Making the Graphs

In [39]:
all_edges = []
for person in all_info['pldb']:
    for connection in all_info['pldb'][person]:
        
        all_edges.append((person.split('(')[0].strip(), connection.split('(')[0].strip()))
len(all_edges)

23001

In [43]:
with open('edges.txt', 'w', encoding='utf8') as file:
    for edge in all_edges:
        file.write(str(edge) + '\n')

In [53]:
G = nx.MultiDiGraph()
G.add_edges_from(all_edges)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 1,
 0,
 0,
 4,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [54]:
len(G.nodes())

9656

In [61]:
bad_nodes = list(G.nodes()).copy()

for node in bad_nodes:
    if 'USS' in node:
        G.remove_node(node)
len(G.nodes())

9635

In [63]:
len(G.edges())

22971

In [64]:
G_un = G.to_undirected()

In [65]:
nx.center(G_un)

['Henry Steele Commager',
 'George Washington',
 'John Adams',
 'John Sullivan',
 'Richard Howe, 1st Earl Howe',
 'Charles Gravier, comte de Vergennes',
 'Benjamin Rush',
 'John André',
 'Sir Charles Asgill, 2nd Baronet',
 'Friedrich Wilhelm von Steuben',
 'Anthony Wayne']

In [66]:
nx.diameter(G_un)

8