# In[5]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import os

# Function to extract text from the article URL
def extract_article_text(url, timeout=30):
    """Download an article page and return its title and paragraph text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the stripped text of
        the page's ``<title>`` element and ``article_text`` is the text of
        every ``<p>`` element, each followed by a newline. On any failure
        (network error, HTTP error status, missing title) the error is
        printed and ``(None, None)`` is returned.
    """
    try:
        # Fetch the HTML content of the article URL
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of saving an error page
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract article title
        title_tag = soup.find('title')
        if title_tag is None:
            raise ValueError("page has no <title> element")
        title = title_tag.get_text().strip()

        # Extract article text: one line per paragraph
        article_text = ''.join(
            paragraph.get_text() + '\n' for paragraph in soup.find_all('p')
        )

        return title, article_text
    except Exception as e:
        # Per-URL boundary: report and let the caller skip this article
        print(f"Error extracting text from {url}: {e}")
        return None, None

# Read the Excel file; it must provide the columns 'URL_ID' and 'URL'
input_file = 'Input.xlsx'
df = pd.read_excel(input_file)

# Directory that receives one text file per successfully extracted article
output_dir = 'article_texts'

# Iterate over each row in the dataframe
for index, row in df.iterrows():
    # Extract URL_ID and Article_URL
    url_id = row['URL_ID']
    article_url = row['URL']

    # Extract article text
    article_title, article_text = extract_article_text(article_url)

    # Skip rows where either the title or the body came back empty/None
    if not (article_title and article_text):
        print(f"Failed to extract text from {article_url}")
        continue

    # exist_ok=True replaces the racy exists()-then-makedirs() check and is
    # a no-op once the directory is there
    os.makedirs(output_dir, exist_ok=True)

    # Write the extracted text to a text file: title, blank line, body
    with open(os.path.join(output_dir, f'{url_id}.txt'), 'w', encoding='utf-8') as f:
        f.write(article_title + '\n\n')
        f.write(article_text)

    print(f"Text extracted from {article_url} and saved as {url_id}.txt")

print("Extraction completed.")




# In[ ]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string

# Fetch the tokenizer models and the stop-word corpus used in this cell.
# Recent NLTK releases (3.9+) load 'punkt_tab' rather than 'punkt' for
# word_tokenize/sent_tokenize, so both are requested to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Loading custom stopwords file
def load_stopwords(stopwords_file):
    """Read a stop-word list from a text file.

    Args:
        stopwords_file: Path of a text file with one entry per line.

    Returns:
        A list with one item per line of the file, in file order, each with
        leading and trailing whitespace removed (blank lines become '').
    """
    with open(stopwords_file, 'r') as handle:
        return [line.strip() for line in handle]

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Each run of consecutive vowels (a, e, i, o, u, y) counts as one syllable.
    One is subtracted for words ending in "es" or "ed", and the result is
    never less than 1. Both the vowel scan and the suffix check ignore case,
    so "JUMPED" and "jumped" give the same answer.

    Args:
        word: The word to examine.

    Returns:
        The estimated syllable count, always at least 1.
    """
    vowels = 'aeiouy'
    # Lower-case once so the vowel scan and the suffix check agree on case
    lowered = word.lower()

    count = 0
    prev_char_was_vowel = False
    for char in lowered:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable
        if is_vowel and not prev_char_was_vowel:
            count += 1
        prev_char_was_vowel = is_vowel

    # Adjust count for certain endings
    if lowered.endswith(('es', 'ed')):
        count -= 1

    # Ensure at least one syllable
    return max(1, count)


# Function to compute variables from the article text
def compute_variables(article_text):
    """Compute sentiment and readability metrics for one article.

    The text is lower-cased and tokenized with NLTK; English stop words and
    punctuation-only tokens are dropped, and every word-level metric is
    computed over the remaining tokens. Sentences are counted on the
    original text. All ratios fall back to 0 when the text has no words or
    no sentences.

    Relies on three module-level collections that this function does not
    define: ``positive_words``, ``negative_words`` and ``complex_words``.
    They must exist (any container supporting ``in``) before it is called.

    Args:
        article_text: Raw article text.

    Returns:
        A 13-tuple, in this order: positive_score, negative_score,
        polarity_score, unique_word_count, subjectivity_score,
        average_sentence_length, percentage_of_complex_words, fog_index,
        avg_words_per_sentence, complex_word (count), average_syllables_per_word,
        word_count, average_word_length.
    """
    # Tokenize the text
    tokens = word_tokenize(article_text.lower())

    # Remove stopwords and punctuation. Tokens are tested character by
    # character because the tokenizer emits multi-character punctuation
    # tokens (e.g. `` and '' for quotes, ...) that a substring test against
    # string.punctuation lets through.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokens = [
        word for word in tokens
        if word not in stop_words
        and not all(char in punctuation for char in word)
    ]

    # Compute word count
    word_count = len(tokens)

    # Count positive and negative words
    positive_score = sum(1 for token in tokens if token in positive_words)
    negative_score = sum(1 for token in tokens if token in negative_words)

    # Polarity Score; the small constant keeps the denominator non-zero
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    # Compute unique word count
    unique_word_count = len(set(tokens))

    # Subjectivity Score
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Tokenize text into sentences
    sentences = nltk.sent_tokenize(article_text)
    total_sentences = len(sentences)

    # Calculate average sentence length
    average_sentence_length = word_count / total_sentences if total_sentences else 0

    # Percentage of complex words
    # NOTE(review): complex_words is not defined in the visible source;
    # confirm where it is loaded from (or derive it from count_syllables).
    complex_word = sum(1 for token in tokens if token in complex_words)
    percentage_of_complex_words = (complex_word / word_count) * 100 if word_count else 0

    # Fog index
    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    # Average Number of Words per Sentence (same ratio as the average
    # sentence length; kept as a separate output to preserve the return shape)
    avg_words_per_sentence = average_sentence_length

    # Calculate average syllables per word
    total_syllables = sum(count_syllables(word) for word in tokens)
    average_syllables_per_word = total_syllables / word_count if word_count else 0

    # Calculate average word length
    total_characters = sum(len(word) for word in tokens)
    average_word_length = total_characters / word_count if word_count else 0

    # NOTE(review): the earlier personal-pronoun count was never returned and
    # ran on an undefined name; it is omitted here to keep the 13-value
    # return unchanged. If it is needed, count pronouns on the raw text
    # (they are all stop words, so they never survive the filtering above).

    return positive_score, negative_score, polarity_score, unique_word_count, subjectivity_score, average_sentence_length, percentage_of_complex_words, fog_index, avg_words_per_sentence, complex_word, average_syllables_per_word, word_count, average_word_length