# In [76]:
from bs4 import BeautifulSoup
import requests
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd

# In [93]:
# Function to read words from a file and return as a set
def read_words(file_path, encoding=None):
    """Load a one-word-per-line file into a set of lowercase words.

    Each line is stripped of surrounding whitespace and lowercased. Blank
    lines are skipped so the result never contains the empty string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A set of the distinct, lowercased, non-empty lines of the file.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        cleaned = (line.strip().lower() for line in file)
        return {word for word in cleaned if word}

# Function to count syllables in a word using a simple heuristic
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    The heuristic counts the starts of vowel groups (``a e i o u``), then
    subtracts one for a trailing ``e`` and adds one back for a trailing
    consonant + ``le``. Any non-empty word is reported as having at least
    one syllable.

    Args:
        word: The word to measure; case and surrounding whitespace are
            ignored.

    Returns:
        The estimated syllable count, or 0 when ``word`` is empty or only
        whitespace.
    """
    vowels = "aeiou"
    word = word.lower().strip()
    if not word:
        # Nothing to index into; an empty word has no syllables.
        return 0

    count = 1 if word[0] in vowels else 0
    # A vowel that follows a non-vowel starts a new vowel group.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        count += 1
    return max(count, 1)

# Function to perform text analysis
def analyze_text(article_content, pos_words, neg_words, stop_words, pronoun_list):
    """Compute sentiment and readability metrics for one article.

    The text is tokenized with NLTK. Tokens that are stop words or contain no
    alphanumeric character (punctuation such as ``.`` or ``''``) are dropped;
    every remaining occurrence is counted, so repeated words contribute once
    per occurrence.

    Args:
        article_content: The article text to analyze.
        pos_words: Collection of lowercase positive words.
        neg_words: Collection of lowercase negative words.
        stop_words: Collection of lowercase words to exclude.
        pronoun_list: Lowercase pronouns to count among all tokens.

    Returns:
        A dict keyed by metric name. ``'NEGATIVE SCORE'`` is the non-negative
        number of negative-word occurrences. Ratio metrics are 0 when the
        text has no sentences or no countable words.
    """
    tokens = nltk.word_tokenize(article_content.lower())
    sentence_count = len(nltk.sent_tokenize(article_content))

    # A list (not a set) so that repeated words are counted every time.
    analysis_words = [
        token for token in tokens
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]
    word_count = len(analysis_words)
    char_count = sum(len(word) for word in analysis_words)

    pos_score = sum(1 for word in analysis_words if word in pos_words)
    # Kept as a positive magnitude: the polarity and subjectivity formulas
    # below are only meaningful when both scores are non-negative.
    neg_score = sum(1 for word in analysis_words if word in neg_words)

    # The small constant keeps the denominators non-zero.
    pol_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    sub_score = (pos_score + neg_score) / (word_count + 0.000001)

    pronouns = set(pronoun_list)
    pron_count = sum(1 for token in tokens if token in pronouns)

    syllable_counts = [count_syllables(word) for word in analysis_words]
    complex_words = sum(1 for syllables in syllable_counts if syllables >= 3)

    # Guard every plain division so an empty article yields zeros instead of
    # raising ZeroDivisionError.
    avg_sen_len = round(word_count / sentence_count) if sentence_count else 0
    avg_num_words_sentence = avg_sen_len
    if word_count:
        avg_word_len = round(char_count / word_count)
        percentage_complex_words = complex_words / word_count
        syllable_per_word = round(sum(syllable_counts) / word_count)
    else:
        avg_word_len = 0
        percentage_complex_words = 0.0
        syllable_per_word = 0

    fog_index = 0.4 * (avg_sen_len + percentage_complex_words)

    return {
        'POSITIVE SCORE': pos_score,
        'NEGATIVE SCORE': neg_score,
        'POLARITY SCORE': pol_score,
        'SUBJECTIVITY SCORE': sub_score,
        'AVG SENTENCE LENGTH': avg_sen_len,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_num_words_sentence,
        'COMPLEX WORD COUNT': complex_words,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': pron_count,
        'AVG WORD LENGTH': avg_word_len
    }

# Main processing function
def process_articles(input_file, output_file, timeout=30):
    """Download each listed article, analyze it, and write the results.

    Reads the ``URL`` and ``URL_ID`` columns from ``input_file``, fetches
    every URL, extracts the text of the ``div.td-post-content`` element,
    saves title plus text to ``<URL_ID>.txt``, and runs ``analyze_text`` on
    the text. A URL that fails for any reason is reported on stdout and
    skipped, so one bad page does not abort the run.

    Args:
        input_file: Excel file with ``URL`` and ``URL_ID`` columns.
        output_file: Excel file to write, one row per processed article.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    input_df = pd.read_excel(input_file)

    pos_words = read_words('positive-words.txt')
    neg_words = read_words('negative-words.txt')
    stop_words = read_words('StopWords.txt')
    pronoun_list = ['i', 'we', 'my', 'ours', 'us']

    results = []

    for url, url_id in zip(input_df['URL'], input_df['URL_ID']):
        try:
            # Without a timeout a stalled server would hang the whole run.
            page = requests.get(url, timeout=timeout)
            # Treat HTTP error responses as failures rather than parsing
            # the error page as if it were an article.
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'html.parser')

            article = soup.find('div', class_='td-post-content')
            if article is None:
                print(f"Content Not Found for URL ID {url_id}")
                continue

            article_content = article.get_text(separator='\n', strip=True)
            # A page without <title> should still be analyzed.
            title_tag = soup.find('title')
            article_title = title_tag.get_text() if title_tag is not None else ''

            # Save article to a text file
            with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
                file.write(article_title + '\n' + article_content)

            # Perform text analysis
            analysis = analyze_text(article_content, pos_words, neg_words, stop_words, pronoun_list)

            # Add results to the list
            result = {'URL_ID': url_id, 'URL': url}
            result.update(analysis)
            results.append(result)

        except Exception as e:
            # Per-URL boundary: report the failure and continue with the rest.
            print(f"Failed to process URL {url}: {e}")

    # Create a DataFrame with the results
    results_df = pd.DataFrame(results)

    # Save the DataFrame to an Excel file
    results_df.to_excel(output_file, index=False)

# Run the main processing function only when executed directly (this also
# holds inside a notebook cell), so importing the file has no side effects.
if __name__ == "__main__":
    process_articles('Input.xlsx', 'Output.xlsx')

# Output:
# Content Not Found for URL ID blackassign0036
# Content Not Found for URL ID blackassign0049
