In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so results can be written under
# /content/drive/MyDrive/... later in the notebook.
drive.mount("/content/drive")

In [None]:
# Output folder (on the mounted Drive) that the JSON results log is written to.
# It is combined with the generated file name via os.path.join() when saving.
DRIVE_FOLDER_OUT = "/content/drive/MyDrive/Colab Files/wiki_entity_logs/"

In [None]:
# Clone the Capstone project repository; later cells expect it at
# /content/Capstone (requirements file and the wikipedia_entity_analysis code).
!git clone https://github.com/daniel-furman/Capstone.git

In [None]:
# Install the Python dependencies listed in the cloned repo's requirements file.
!pip install -r /content/Capstone/requirements.txt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
from json import JSONDecodeError
import urllib.parse
import urllib.request

from argparse import Namespace
import os

from datetime import datetime
import time
import tqdm

import re
from ftfy import fix_text
from string import punctuation

import spacy

In [None]:
# Switch into the analysis package directory of the cloned repo before importing
# from it; the relative data path used when loading the entity CSV also resolves
# against this directory.
os.chdir("/content/Capstone/src/wikipedia_entity_analysis/")
from wiki_analysis import (
    CODE_TO_LANG_DICT,
    CODE_TO_SPACY_MODEL_DICT,
    CODE_TO_WIKI_CLEANUP_DICT,
    count_entities_in_article,
    get_article_info,
    get_mulitlingual_lookup,
    get_wikipedia_pages,
    load_spacy_models,
)

In [None]:
# Run configuration for grabbing entities.
#   language          - Wikipedia language code; also used as the key into the
#                       spaCy-model and cleanup-string lookup tables
#   iterations        - number of page batches fetched by the scrape loop
#   articles_per_iter - value passed to get_wikipedia_pages() for each batch
#   debug             - flag forwarded to the wiki_analysis helpers
args = Namespace(
    language="ru",
    iterations=1,
    articles_per_iter=20,
    debug=False,
)
# Derive the cleanup string from args.language instead of repeating the "ru"
# literal, so changing the language above can never leave a mismatched
# cleanup string behind.
args.cleanup_str = CODE_TO_WIKI_CLEANUP_DICT[args.language]

In [None]:
# Load the per-entity analysis table shipped with the cloned repo. The path is
# relative, so it depends on the earlier os.chdir() into
# /content/Capstone/src/wikipedia_entity_analysis/.
entity_analysis_df = pd.read_csv(
    "../../data/error_analysis/entity_analysis_language_and_accuracy_by_entity.csv"
)

In [None]:
# Build the entity <-> multilingual translation lookup from the entity analysis
# table and the language-code map. The result is later handed to
# count_entities_in_article() for every article.
target_entities_multiling = get_mulitlingual_lookup(entity_analysis_df, CODE_TO_LANG_DICT)

In [None]:
# Look up the spaCy model name for the configured language and download it via
# spaCy's CLI. `{model_name}` is interpolated by IPython into the shell command.
model_name = CODE_TO_SPACY_MODEL_DICT[args.language]
!python -m spacy download {model_name}

In [None]:
# Load the spaCy pipeline registered for the configured language.
spacy_model = spacy.load(CODE_TO_SPACY_MODEL_DICT[args.language])

# Print a sample progress line every N pages of a batch, and pause this many
# seconds after each article as a break for the API.
PROGRESS_PRINT_EVERY = 50
API_PAUSE_SECONDS = 0.1


def _entity_totals(entity_counts):
    """Return ``(num_unique, total_count)`` for an entity -> count mapping.

    An empty or ``None`` mapping yields ``(0, 0)``.
    """
    if not entity_counts:
        return 0, 0
    return len(entity_counts), sum(entity_counts.values())


# Per-article accumulators: position k of every list describes the same article.
# They are re-created on each run of this cell, so re-running it does not
# append duplicate rows.
article_titles = []
article_ids = []
article_word_counts = []
article_full_entities_schedule = []
article_full_entities_uniques = []
article_full_entities_counts = []
article_target_entities_schedule = []
article_target_entities_uniques = []
article_target_entities_counts = []

for i in tqdm.tqdm(range(args.iterations)):

    # get pages to parse (iterated below as article id -> article title)
    pages_to_parse = get_wikipedia_pages(args.language, args.articles_per_iter, args.debug)

    # per-batch counter: restarts at 0 for every iteration
    parsed_pages = 0
    for article_id, article_title in tqdm.tqdm(pages_to_parse.items(), position=0, leave=True):
        # get info
        article_info = get_article_info(
            article_title, article_id, args.language, args.cleanup_str, args.debug
        )

        # get stats
        article_word_count, article_full_entities, article_target_entities = count_entities_in_article(
            target_entities_multiling, article_info, spacy_model, args.language, args.debug
        )
        full_uniques, full_total = _entity_totals(article_full_entities)
        target_uniques, target_total = _entity_totals(article_target_entities)

        # commit stats
        article_titles.append(article_title)
        article_ids.append(str(article_id))
        article_word_counts.append(article_word_count)

        if parsed_pages % PROGRESS_PRINT_EVERY == 0:
            print("\n====random progress prints====")
            # use the guarded counts so an empty/None result cannot crash the print
            print(f"retrieved data for {article_title}. {full_uniques} unique entities and {target_uniques} unique target entities.")

        # track all entities (vals, total, num unique)
        article_full_entities_schedule.append(article_full_entities)
        article_full_entities_uniques.append(full_uniques)
        article_full_entities_counts.append(full_total)

        # track target entities (vals, total, num unique)
        article_target_entities_schedule.append(article_target_entities)
        article_target_entities_uniques.append(target_uniques)
        article_target_entities_counts.append(target_total)

        # break for api
        time.sleep(API_PAUSE_SECONDS)

        parsed_pages += 1

In [None]:
# Assemble one row per article from the per-article lists built by the scrape
# loop. The id column must use the list `article_ids`; the loop variable
# `article_id` is a single scalar that pandas would broadcast to every row,
# stamping all articles with the last article's id.
log = pd.DataFrame(
    {
        "article_title": article_titles,
        "article_id": article_ids,
        "article_word_count": article_word_counts,
        "article_full_entities": article_full_entities_schedule,
        "article_full_entities_counts": article_full_entities_counts,
        "article_full_entities_uniques": article_full_entities_uniques,
        "article_target_entities": article_target_entities_schedule,
        "article_target_entities_counts": article_target_entities_counts,
        "article_target_entities_uniques": article_target_entities_uniques,
    }
)

# File name: <language>-<iterations * articles_per_iter>-wiki-entity-counts-<timestamp>.json
log_timestamp = datetime.now().strftime("%Y-%m-%d-%Hh-%Mm-%Ss")
log_requested_articles = args.iterations * args.articles_per_iter
log_name = f"{args.language}-{log_requested_articles}-wiki-entity-counts-{log_timestamp}.json"
log_new_path = os.path.join(DRIVE_FOLDER_OUT, log_name)

# Write to the Drive output folder, keyed by row index.
log.to_json(log_new_path, orient="index")

In [None]:
# Preview the first rows of the assembled log as a sanity check.
log.head()

In [None]:
# Mean of the article_word_count column over all logged articles.
log["article_word_count"].mean()

In [None]:
# Mean of the summed entity counts (all entities) per logged article.
log["article_full_entities_counts"].mean()

In [None]:
# Mean number of unique entities (all entities) per logged article.
log["article_full_entities_uniques"].mean()

In [None]:
# Mean of the summed target-entity counts per logged article.
log["article_target_entities_counts"].mean()

In [None]:
# Mean number of unique target entities per logged article.
log["article_target_entities_uniques"].mean()