In [1]:
import pandas as pd
import nltk
import openpyxl 
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import os
from collections import Counter
import textstat

# Ensure the NLTK tokenizer data used by word_tokenize / sent_tokenize is available.
# Older NLTK releases load the Punkt models from the 'punkt' package, while newer
# releases look for 'punkt_tab' instead, so both are fetched to work on either.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load stop words (one entry per line), normalised to lowercase.
# Blank lines are skipped so the empty string never ends up in the set.
stop_words = set()
for file in ["stop_words"]:
    with open(file, "r", encoding="utf-8") as f:
        stop_words.update(line.strip().lower() for line in f if line.strip())

# Load positive and negative words (one entry per line), normalised to lowercase.
# Each entry is compared against the stop words in its lowercased form, because
# that is how the stop words themselves are stored above.
# NOTE(review): these two files are opened with the platform default encoding,
# as before -- confirm the lexicon files' actual encoding before pinning one.
positive_words = set()
negative_words = set()
for lexicon_path, lexicon in (("positive-words", positive_words),
                              ("negative-words", negative_words)):
    with open(lexicon_path, "r") as f:
        for line in f:
            entry = line.strip().lower()
            if entry and entry not in stop_words:
                lexicon.add(entry)


In [3]:
def clean_text(text):
    """Tokenize an article into cleaned words and sentences.

    Args:
        text: Raw article text.

    Returns:
        A tuple ``(cleaned_words, sentences)`` where ``cleaned_words`` is the
        list of lowercase, punctuation-free tokens that are not in the
        module-level ``stop_words`` set, and ``sentences`` is the list of
        sentences found in the raw text.
    """
    # Split into sentences first: sentence detection depends on the terminal
    # punctuation that is stripped below, so running it on the stripped text
    # would leave no boundaries to split on.
    sentences = sent_tokenize(text)

    # Remove punctuation and convert to lowercase before word tokenization
    normalized = re.sub(r'[^\w\s]', '', text).lower()
    words = word_tokenize(normalized)

    # Remove stop words
    cleaned_words = [word for word in words if word not in stop_words]
    return cleaned_words, sentences

In [4]:
def calculate_sentiment_scores(words):
    """Count the tokens that appear in the positive and negative lexicons.

    Args:
        words: Sequence of tokens to score.

    Returns:
        A tuple ``(positive_score, negative_score)``: the number of tokens
        found in the module-level ``positive_words`` set and in the
        module-level ``negative_words`` set respectively. Repeated tokens
        are counted every time they occur.
    """
    def hits(lexicon):
        # One pass over the tokens, counting membership in the given lexicon
        count = 0
        for token in words:
            if token in lexicon:
                count += 1
        return count

    return hits(positive_words), hits(negative_words)

In [5]:
def calculate_polarity_subjectivity(positive_score, negative_score, total_words):
    """Derive polarity and subjectivity from the sentiment counts.

    Args:
        positive_score: Number of positive tokens.
        negative_score: Number of negative tokens.
        total_words: Number of tokens the counts were taken from.

    Returns:
        A tuple ``(polarity, subjectivity)``. Polarity is the difference of
        the two scores divided by their sum; subjectivity is their sum
        divided by ``total_words``. A small constant is added to each
        denominator so neither division can be by zero.
    """
    epsilon = 0.000001  # guards both divisions against a zero denominator
    balance = positive_score - negative_score
    matched = positive_score + negative_score
    return balance / (matched + epsilon), matched / (total_words + epsilon)

In [6]:
def calculate_readability_metrics(words, sentences):
    """Compute sentence-length and complex-word readability metrics.

    Args:
        words: List of word tokens.
        sentences: List of sentences; only its length is used.

    Returns:
        A tuple ``(avg_sentence_length, avg_words_per_sentence,
        percentage_complex_words, fog_index, complex_words)``.

        * ``avg_sentence_length`` and ``avg_words_per_sentence`` are both
          ``len(words) / len(sentences)`` (0 when there are no sentences).
        * ``complex_words`` is the number of words for which
          ``syllable_count`` reports more than two syllables.
        * ``percentage_complex_words`` is ``complex_words / len(words)``,
          i.e. a fraction between 0 and 1 (0 when there are no words).
        * ``fog_index`` is ``0.4 * (avg_sentence_length +
          percentage_complex_words)``.
          NOTE(review): the fraction, not a 0-100 percentage, feeds this
          formula -- confirm that is the intended definition.
    """
    total_words = len(words)
    total_sentences = len(sentences)

    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
    # Same quantity, reported under a second name
    avg_words_per_sentence = avg_sentence_length

    # Use the shared syllable helper so the "es"/"ed" ending adjustment is
    # applied here as well, instead of a second, raw vowel count.
    complex_words = sum(1 for word in words if syllable_count(word) > 2)
    percentage_complex_words = complex_words / total_words if total_words > 0 else 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, avg_words_per_sentence, percentage_complex_words, fog_index, complex_words

In [7]:
def calculate_word_stats(words):
    """Return the token count and the mean token length.

    Args:
        words: List of word tokens.

    Returns:
        A tuple ``(total_words, avg_word_length)``. ``avg_word_length`` is
        the total number of characters divided by the number of tokens, or
        0 when the list is empty.
    """
    total_words = len(words)
    if total_words > 0:
        char_total = sum(map(len, words))
        avg_word_length = char_total / total_words
    else:
        avg_word_length = 0
    return total_words, avg_word_length

In [8]:
def syllable_count(word):
    """Estimate the number of syllables in a word by counting vowel letters.

    Every occurrence of ``a``, ``e``, ``i``, ``o``, ``u`` or ``y`` in the
    lowercased word counts as one syllable. For words ending in "es" or
    "ed" one is subtracted from the count, but never below 1.

    Args:
        word: The word to examine.

    Returns:
        The estimated syllable count as an integer.
    """
    lowered = word.lower()
    vowel_total = sum(1 for ch in lowered if ch in 'aeiouy')
    if lowered.endswith(('es', 'ed')):
        # Discount the ending's vowel, keeping at least one syllable
        return max(1, vowel_total - 1)
    return vowel_total

In [9]:
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The
    all-uppercase token "US" is skipped, since in running text it is the
    country abbreviation rather than the pronoun.

    Args:
        text: Raw (case-preserved) text to scan.

    Returns:
        The number of pronoun occurrences as an integer.
    """
    matches = re.finditer(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE)
    # Drop the country abbreviation, which the case-insensitive pattern also hits
    return sum(1 for match in matches if match.group(0) != 'US')

In [70]:
def analyze_texts(input_file, articles_folder, output_file):
    """Run the text analysis for every row of the input workbook.

    For each row, the article stored at ``<articles_folder>/<URL_ID>.txt``
    is read, cleaned and scored, and the resulting metrics are collected.
    All collected rows are then written to ``output_file`` as an Excel
    workbook.

    A row whose article cannot be read or processed is reported with a
    printed message and left out of the output; the remaining rows are
    still processed.

    Args:
        input_file: Path of the Excel workbook; must have a ``URL_ID`` column
            (a ``URL`` column is copied to the output when present).
        articles_folder: Folder containing one ``<URL_ID>.txt`` file per row.
        output_file: Path of the Excel workbook to write.
    """
    # Load Input File
    df = pd.read_excel(input_file, engine='openpyxl')

    # Prepare a list to hold results
    results = []

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        article_path = os.path.join(articles_folder, f"{url_id}.txt")
        try:
            # Reading happens inside the try so that a missing or undecodable
            # article only skips this row instead of aborting the whole run.
            with open(article_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Perform Cleaning
            cleaned_tokens, sentences = clean_text(text)

            # Calculate Sentiment Scores
            positive_score, negative_score = calculate_sentiment_scores(cleaned_tokens)
            polarity_score, subjectivity_score = calculate_polarity_subjectivity(
                positive_score, negative_score, len(cleaned_tokens)
            )

            # Readability and Complexity Analysis
            (avg_sentence_length, avg_words_per_sentence, percentage_complex_words,
             fog_index, complex_word_count) = calculate_readability_metrics(cleaned_tokens, sentences)

            # Word Statistics
            total_words, avg_word_length = calculate_word_stats(cleaned_tokens)

            # Syllables per word: mean of the per-token syllable estimates
            # (previously this column was filled with the average word length).
            syllables_per_word = (
                sum(syllable_count(token) for token in cleaned_tokens) / total_words
                if total_words > 0 else 0
            )

            # Personal Pronouns Count (on the raw text, so letter case is preserved)
            personal_pronouns = count_personal_pronouns(text)

            # Save Results
            results.append({
                'URL_ID': row.get('URL_ID', ''),
                'URL': row.get('URL', ''),
                "POSITIVE_SCORE": positive_score,
                'NEGATIVE_SCORE': negative_score,
                'POLARITY_SCORE': polarity_score,
                'SUBJECTIVITY_SCORE': subjectivity_score,
                'AVG_SENTENCE_LENGTH': avg_sentence_length,
                'PERCENTAGE_OF_COMPLEX_WORDS': percentage_complex_words,
                'FOG_INDEX': fog_index,
                'AVG_NUMBER_OF_WORDS_PER_SENTENCE': avg_words_per_sentence,
                'COMPLEX_WORD_COUNT': complex_word_count,
                'WORD_COUNT': total_words,
                'SYLLABLE_PER_WORD': syllables_per_word,
                'PERSONAL_PRONOUNS': personal_pronouns,
                'AVG_WORD_LENGTH': avg_word_length,
            })

        except Exception as e:
            print(f"Error processing row {index} with URL_ID {row.get('URL_ID', '')}: {e}")

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # Save to Output Excel
    results_df.to_excel(output_file, index=False)

# Run the analysis: read the URL list from Input.xlsx, the article texts from
# the Articles folder, and write the metrics to Output_Data_Structure.xlsx
analyze_texts(
    input_file="Input.xlsx",
    articles_folder="Articles",
    output_file="Output_Data_Structure.xlsx",
)
 
        

In [72]:
import pandas as pd

# Re-read the generated workbook and export the same table as a UTF-8 CSV
XLSX_REPORT = "Output_Data_Structure.xlsx"
CSV_REPORT = "Output_Data_Structure.csv"

df = pd.read_excel(XLSX_REPORT)
df.to_csv(CSV_REPORT, encoding='utf-8', index=False)
