# Setup


In [1]:
%load_ext autoreload 
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from gensim.models import KeyedVectors
from modules.wiki_parser import FileProcessor, WikiTextExtractor
from modules.nlp import NLP

DATA_DIR = Path().cwd() / "data"

plt.style.use('ggplot')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Article parsing


In [3]:
ARTICLE_DIR = DATA_DIR / "articles"

file_processor = FileProcessor()
wiki_text_extractor = WikiTextExtractor()

# Fetch all article file paths
file_paths = file_processor.fetch_file_paths(ARTICLE_DIR)

# Extract text from all articles
article_texts = [wiki_text_extractor.process_file(file_path) for file_path in tqdm(file_paths)]
articles_df = pd.concat(article_texts)

 34%|███▍      | 687/2001 [00:10<00:18, 72.05it/s]

Error processing /Users/felixwallis/Desktop/Oxford MSc/Oxford Social Data Science Course/Fundamentals for Social Data Science in Python/sds-week-2-wikipedia-presentation/data/articles/Vladimir_Putin/2023/05/1155008451.xml: too many values to unpack (expected 2)
Error processing /Users/felixwallis/Desktop/Oxford MSc/Oxford Social Data Science Course/Fundamentals for Social Data Science in Python/sds-week-2-wikipedia-presentation/data/articles/Vladimir_Putin/2023/05/1155008376.xml: too many values to unpack (expected 2)


 51%|█████     | 1019/2001 [00:15<00:12, 77.78it/s]

Error processing /Users/felixwallis/Desktop/Oxford MSc/Oxford Social Data Science Course/Fundamentals for Social Data Science in Python/sds-week-2-wikipedia-presentation/data/articles/Xi_Jinping/2022/11/1124670506.xml: too many values to unpack (expected 2)


 78%|███████▊  | 1568/2001 [00:22<00:04, 89.48it/s]

Error processing /Users/felixwallis/Desktop/Oxford MSc/Oxford Social Data Science Course/Fundamentals for Social Data Science in Python/sds-week-2-wikipedia-presentation/data/articles/Xi_Jinping/2024/08/1242366887.xml: too many values to unpack (expected 2)


 90%|█████████ | 1803/2001 [00:25<00:02, 77.33it/s]

Error processing /Users/felixwallis/Desktop/Oxford MSc/Oxford Social Data Science Course/Fundamentals for Social Data Science in Python/sds-week-2-wikipedia-presentation/data/articles/Xi_Jinping/2023/.DS_Store: 'utf-8' codec can't decode byte 0xb8 in position 95: invalid start byte


100%|██████████| 2001/2001 [00:27<00:00, 72.07it/s]


# Embeddings


In [2]:
word_vectors = KeyedVectors.load_word2vec_format(
    DATA_DIR / "glove.840B.300d.txt", binary=False, no_header=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Text preprocessing


In [4]:
nlp = NLP()

# Tokenize text and remove stop words
tqdm.pandas(desc="Tokenizing article text")
articles_df['tokens'] = articles_df['section_text'].progress_apply(nlp.tokenize_text)

Tokenizing article text: 100%|██████████| 98832/98832 [00:51<00:00, 1932.76it/s]


In [17]:
articles_df['weighted_embeddings'] = nlp.generate_tfidf_weighted_embeddings(articles_df['tokens'], word_vectors)

Generating weighted embeddings: 100%|██████████| 98832/98832 [02:13<00:00, 742.34it/s]


# Counting the number of revisions over time
