In [2]:
!pip install spacy
!pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

In [1]:
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the "en_core_web_sm" spaCy pipeline once, at module level, so that
# clean_and_tokenize() can reuse the shared `nlp` object on every call
# instead of reloading the model per article.
nlp = spacy.load("en_core_web_sm")

# Normalize a raw text into a space-separated string of content words
def clean_and_tokenize(text):
    """Lowercase ``text``, tokenize it with the shared ``nlp`` pipeline and drop noise tokens.

    A token is kept only when it is purely alphabetic (``is_alpha``) and is
    not flagged as a stop word (``is_stop``).

    Args:
        text: The raw string to clean.

    Returns:
        str: The surviving token texts joined by single spaces; an empty
        string when no token survives.
    """
    parsed = nlp(text.lower())

    kept_words = []
    for tok in parsed:
        # Skip numbers, punctuation, mixed tokens and stop words.
        if not tok.is_alpha or tok.is_stop:
            continue
        kept_words.append(tok.text)

    return " ".join(kept_words)

# Function to extract keywords using TF-IDF
def extract_keywords(texts, n_keywords=5):
    """Return up to ``n_keywords`` vocabulary terms selected by a TfidfVectorizer.

    Args:
        texts: Iterable of text documents (strings) the vectorizer is fitted on.
        n_keywords: Maximum number of terms to keep; forwarded to the
            vectorizer as ``max_features``.

    Returns:
        list: The fitted vocabulary as reported by ``get_feature_names_out()``.
        An empty list when the documents yield no usable term (e.g. empty
        text or text made only of stop words) instead of raising.

    Raises:
        ValueError: Any vectorizer error other than an empty vocabulary is
            re-raised unchanged.
    """
    # NOTE(review): per the scikit-learn docs, max_features keeps the terms with
    # the highest term frequency across the corpus, so the result is not ranked
    # by TF-IDF score - confirm this is the intended notion of "keyword".
    vectorizer = TfidfVectorizer(stop_words='english', max_features=n_keywords)  # Limit number of keywords

    try:
        # Only the learned vocabulary is needed, so fit() is enough; the
        # transformed matrix was never used.
        vectorizer.fit(texts)
    except ValueError as exc:
        # The vectorizer refuses to build an empty vocabulary; treat that as
        # "no keywords" so one empty document does not abort a whole batch.
        if "empty vocabulary" not in str(exc):
            raise
        return []

    return list(vectorizer.get_feature_names_out())  # Convert to list

# Function to preprocess the entire JSON data
def preprocess_data(input_filename, output_filename):
    """Clean every article summary, attach keywords, and write the result as JSON.

    The input file must contain a JSON object mapping each topic name to a
    list of article objects. From every article the keys ``title``,
    ``revision_id``, ``summary`` (an object holding a ``text_en`` string),
    ``url`` and ``topic`` are read.

    The output file maps each topic to
    ``{"articles": [...], "combined_keywords": [...]}`` where every article
    carries the original fields plus ``preprocessed_summary`` and
    ``keywords``. A top-level ``combined_keywords`` entry holds the union of
    all topics' keywords. Keyword unions are written in sorted order so that
    repeated runs on the same input produce identical files.

    Args:
        input_filename: Path of the UTF-8 JSON file to read.
        output_filename: Path of the UTF-8 JSON file to write (overwritten).

    Returns:
        None. The result is only written to ``output_filename``.

    Raises:
        KeyError: If an article lacks one of the keys listed above.
    """
    # Use utf-8 encoding to open the file
    with open(input_filename, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    processed_data = {}

    # Union of keywords across all topics
    combined_keywords = set()

    for topic, articles in json_data.items():
        processed_articles = []
        topic_keywords = set()  # Union of keywords for this particular topic

        for article in articles:
            title = article['title']
            cleaned_summary = clean_and_tokenize(article['summary']['text_en'])

            # An empty cleaned summary has no terms to vectorize; skip the
            # TF-IDF step rather than letting it fail on an empty vocabulary.
            if cleaned_summary:
                keywords = extract_keywords([cleaned_summary])
            else:
                keywords = []

            topic_keywords.update(keywords)

            # Original article fields plus the derived summary and keywords
            processed_articles.append({
                'title': title,
                'revision_id': article['revision_id'],
                'summary': article['summary'],
                'url': article['url'],
                'topic': article['topic'],
                'preprocessed_summary': cleaned_summary,
                'keywords': keywords
            })

        processed_data[topic] = {
            'articles': processed_articles,
            # sorted() instead of list(): set iteration order for strings
            # varies between interpreter runs.
            'combined_keywords': sorted(topic_keywords)
        }

        combined_keywords.update(topic_keywords)

    # NOTE: this key lives next to the topic keys, so a topic literally named
    # 'combined_keywords' would be overwritten here.
    processed_data['combined_keywords'] = sorted(combined_keywords)

    # Save processed data to output file
    with open(output_filename, 'w', encoding='utf-8') as out_file:
        json.dump(processed_data, out_file, ensure_ascii=False, indent=4)

# Entry point: run the preprocessing pipeline on the scraped Wikipedia dump
if __name__ == "__main__":
    # Source file with the scraped articles, and destination for the enriched copy.
    input_filename, output_filename = 'wikipedia_scraped_data.json', 'preprocessed_data.json'

    preprocess_data(input_filename, output_filename)

    # Confirm completion and report where the result was written.
    print(f"Preprocessing complete. Data saved to {output_filename}")


Preprocessing complete. Data saved to preprocessed_data.json
