In [15]:
#Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import json
import os

# NLTK resources: the tokenizer data and the stop-word lists used below.
# These calls run every time the cell/module is executed; NLTK reports when a
# package is already up to date.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource for
# word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# Paths to the two input documents (relative to the working directory)
input_path1 = "../data/demystifying.txt"
input_path2 = "../data/phineas.txt"

# Precompile regexes and load stop words once so every call reuses them.
# regex_punctuation: any single character that is neither a word character
# nor whitespace.
regex_punctuation = re.compile(r'[^\w\s]')
# regex_numbers: a run of digits delimited by word boundaries on both sides.
regex_numbers = re.compile(r'\b\d+\b')
# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

#Preprocessing with mapping to original words in text (not stemmed)
def preprocess_text_with_mapping(text):
    """Clean and stem *text*, remembering which tokens produced each stem.

    The text is lowercased, characters matched by ``regex_punctuation`` are
    removed, standalone digit runs matched by ``regex_numbers`` are removed,
    and the result is tokenized with ``word_tokenize``. Tokens found in
    ``stop_words`` are dropped and the rest are Porter-stemmed.

    Returns:
        A ``(stemmed_tokens, original_to_stemmed)`` tuple:
        ``stemmed_tokens`` is the list of stems in text order;
        ``original_to_stemmed`` maps each stem to the list of unstemmed
        tokens that produced it (one entry per occurrence, in text order).
    """
    cleaned = regex_numbers.sub('', regex_punctuation.sub('', text.lower()))
    kept = [tok for tok in word_tokenize(cleaned) if tok not in stop_words]

    stemmer = PorterStemmer()
    stemmed_tokens = []
    original_to_stemmed = {}
    for tok in kept:
        stem = stemmer.stem(tok)
        stemmed_tokens.append(stem)
        # Despite the name, the mapping is keyed by stem, not by original token.
        original_to_stemmed.setdefault(stem, []).append(tok)

    return stemmed_tokens, original_to_stemmed

# Reads file 
def read_file(file_path, preprocess=True):
    """Load a UTF-8 text file as one line, optionally preprocessed.

    Every newline in the file is replaced by a single space.

    Args:
        file_path: Path of the file to read.
        preprocess: When true, the flattened text is passed through
            ``preprocess_text_with_mapping`` and that result is returned;
            otherwise the flattened text itself is returned.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    with open(file_path, 'r', encoding='utf-8') as handle:
        flattened = ' '.join(handle.read().split('\n'))

    if not preprocess:
        return flattened
    return preprocess_text_with_mapping(flattened)

# Calculates tfidf 
def calculate_tfidf(data, top_n=10):
    """Compute TF-IDF weights and the highest-weighted terms per document.

    Args:
        data: DataFrame with a ``'Document'`` column holding one text per row.
        top_n: Maximum number of top terms to report per document
            (defaults to 10, the previously hard-coded value).

    Returns:
        A ``(tfidf_df, feature_names, top_words)`` tuple:
        ``tfidf_df`` has one row per document and one column per term;
        ``feature_names`` is the vectorizer's term array;
        ``top_words`` is a Series with, for each document, the list of up to
        ``top_n`` terms ordered by decreasing weight.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data['Document'])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    def _top_terms(row):
        # Skip zero-weight terms: they contribute nothing to this document,
        # so a short document must not be padded with terms that only score
        # in the other documents.
        return row[row > 0].nlargest(top_n).index.tolist()

    top_words = tfidf_df.apply(_top_terms, axis=1)
    return tfidf_df, feature_names, top_words

# Get co-occurrences
def get_context_words(text, top_words, original_to_stemmed, window=5):
    """Collect the words that appear near each top word in *text*.

    Args:
        text: Text to scan; it is tokenized with ``word_tokenize``.
        top_words: Iterable of stems to look for.
        original_to_stemmed: Dict mapping a stem to the unstemmed tokens that
            produced it (as built by ``preprocess_text_with_mapping``).
        window: Number of tokens inspected on each side of an occurrence
            (defaults to 5, the previously hard-coded value).

    Returns:
        Dict mapping each stem in ``top_words`` to the list of neighbouring
        tokens (with repeats) found within ``window`` positions of any of its
        occurrences, excluding stop words and tokens that start with a
        standalone number.
    """
    text_tokens = word_tokenize(text)
    n_tokens = len(text_tokens)
    word_contexts = {word: [] for word in top_words}

    for stemmed_word in top_words:
        # A stem absent from the mapping simply has no surface forms; do not
        # fail with KeyError.
        variations = set(original_to_stemmed.get(stemmed_word, ()))
        # Also match the stem itself: when *text* is already stemmed, a stem
        # that differs from all of its surface forms would otherwise never
        # be found and would end up with an empty context.
        variations.add(stemmed_word)

        for index, token in enumerate(text_tokens):
            if token not in variations:
                continue
            start = max(0, index - window)
            stop = min(n_tokens, index + window + 1)
            word_contexts[stemmed_word].extend(
                neighbour
                for pos, neighbour in enumerate(text_tokens[start:stop], start)
                if pos != index
                and neighbour not in stop_words
                and not regex_numbers.match(neighbour)
            )
    return word_contexts

# Count co-occurrences
def count_associations(word_contexts):
    """Tally how often each context word appears for every key word.

    Args:
        word_contexts: Dict mapping a word to a list of context words
            (repeats allowed).

    Returns:
        Dict mapping each word to a ``{context_word: count}`` dict, with
        context words in order of first appearance.
    """
    word_associations = {}
    for word, contexts in word_contexts.items():
        tally = {}
        for neighbour in contexts:
            tally[neighbour] = tally.get(neighbour, 0) + 1
        word_associations[word] = tally
    return word_associations

# Creates json 
def create_json_files(top_words, word_associations, filename):
    """Write the associations of each top word to ``<filename>.json``.

    Args:
        top_words: Iterable of words to export; each must be a key of
            ``word_associations``.
        word_associations: Dict mapping a word to a
            ``{associated_word: count}`` dict.
        filename: Output path without the ``.json`` extension.

    The file holds one JSON object keyed by top word; each value is a list of
    ``{"word": ..., "connections": ...}`` entries.
    """
    output_data = {
        word: [
            {"word": neighbour, "connections": n_hits}
            for neighbour, n_hits in word_associations[word].items()
        ]
        for word in top_words
    }
    with open(f"{filename}.json", 'w', encoding='utf-8') as handle:
        json.dump(output_data, handle, indent=4)

# Main pipeline

# Load and preprocess both documents (read_file preprocesses by default)
stemmed_text1, mapping1 = read_file(input_path1)
stemmed_text2, mapping2 = read_file(input_path2)

# Space-join the stems once per document; the joined strings feed both the
# TF-IDF step and the context extraction
joined_text1 = ' '.join(stemmed_text1)
joined_text2 = ' '.join(stemmed_text2)

# One row per document
data = pd.DataFrame({'Document': [joined_text1, joined_text2]})

# TF-IDF weights plus the top-weighted terms of each document
tfidf_df, features, top_words = calculate_tfidf(data)
top_words1 = top_words.iloc[0]
top_words2 = top_words.iloc[1]

# Words found around each top term, looked up through the stem mappings
context_words1 = get_context_words(joined_text1, top_words1, mapping1)
context_words2 = get_context_words(joined_text2, top_words2, mapping2)

# Turn the raw context lists into per-term counts
associations1 = count_associations(context_words1)
associations2 = count_associations(context_words2)

# Save one JSON file per document
create_json_files(top_words1, associations1, "output1")
create_json_files(top_words2, associations2, "output2")



[nltk_data] Downloading package punkt to C:\Users\Raquel
[nltk_data]     Coelho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Raquel
[nltk_data]     Coelho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                            Document
0  c demystifi adolesc brain laurenc steinberg ad...
1  phinea gage gruesom true stori brain scienc jo...
