In [4]:
import pandas as pd
# Load the input workbook; later cells read its 'URL_ID' and 'URL' columns.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine until it is edited.
df = pd.read_excel(r"C:\Users\Rudra\Downloads\Output Data Structure.xlsx")

In [5]:
# Preview the first rows to check that the workbook loaded with the expected columns.
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,,,,,,,,,,,,,
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,,,,,,,,,,,,,
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,,,,,,,,,,,,,


In [10]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import nltk
from nltk.corpus import stopwords, opinion_lexicon
from nltk.tokenize import word_tokenize, sent_tokenize
# Fetch the NLTK resources used below: the opinion lexicon (positive/negative
# word lists), the Punkt sentence tokenizer models and the stop-word lists.
nltk.download('opinion_lexicon')
nltk.download('punkt')
# Newer NLTK releases (3.8.2+) load the Punkt models from 'punkt_tab' instead
# of the pickled 'punkt' package; without it sent_tokenize/word_tokenize raise
# LookupError on a fresh environment. Downloading both keeps old and new
# NLTK versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')



# Materialise both halves of the NLTK opinion lexicon as sets so that the
# per-word membership tests in analyze_text are constant-time lookups.
positive_words, negative_words = map(
    set,
    (opinion_lexicon.positive(), opinion_lexicon.negative()),
)


def extract_article_text(url):
    """Download ``url`` and return its title and article body as one string.

    The page is parsed with BeautifulSoup. The title comes from the
    ``<title>`` tag and the body from the first ``<article>`` tag, falling
    back to the first ``<div class="content">``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str
        ``"<title>\\n\\n<article text>"`` on success. ``'No Title'`` or
        ``'No Article Text Found'`` stand in for a missing title/body.
        On a non-200 response the string ``"Failed to retrieve the article"``
        is returned; on any exception the same sentinel followed by the
        error message is returned. The function never raises, so callers
        detect failures by searching for the sentinel text.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole crawl.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            return "Failed to retrieve the article"

        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'No Title'

        article_tag = soup.find('article') or soup.find('div', class_='content')
        if article_tag:
            # Join text nodes with a space. With the default empty separator,
            # text from adjacent elements is fused ("end.</p><p>Next" becomes
            # "end.Next"), which corrupts word and sentence tokenisation.
            article_text = article_tag.get_text(separator=' ', strip=True)
        else:
            article_text = 'No Article Text Found'

        return f"{title}\n\n{article_text}"
    except Exception as e:
        # Prefix the error with the failure sentinel: returning the bare
        # message would look like article text to callers that only check
        # for the sentinel strings.
        return f"Failed to retrieve the article: {e}"


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _vowel_count(word):
    """Count the vowel letters (a, e, i, o, u) in a lower-case word."""
    return sum(1 for char in word if char in 'aeiou')


def analyze_text(text):
    """Compute sentiment and readability metrics for ``text``.

    The text is split into sentences and word tokens with NLTK. Tokens are
    kept only if purely alphabetic, lower-cased, and filtered against the
    English stop-word list; the resulting "cleaned words" drive most metrics.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    dict
        Keys and how each value is computed:

        * ``positive_score`` / ``negative_score`` - cleaned words found in
          ``positive_words`` / ``negative_words``.
        * ``polarity_score`` - (pos - neg) / (pos + neg + 1e-6).
        * ``subjectivity_score`` - (pos + neg) / (word count + 1e-6).
        * ``avg_sentence_length`` - mean number of raw tokens (punctuation
          included) per sentence.
        * ``percentage_complex_words`` - complex words / cleaned words, as a
          fraction.
        * ``fog_index`` - 0.4 * (avg_sentence_length + percentage_complex_words).
        * ``avg_words_per_sentence`` - cleaned words / sentences.
        * ``complex_word_count`` - cleaned words with more than two vowel
          letters.
        * ``word_count`` - number of cleaned words.
        * ``syllable_per_word`` - mean vowel-letter count per cleaned word
          (vowel letters are used as a syllable approximation).
        * ``personal_pronouns`` - occurrences of i / we / my / ours / us.
        * ``avg_word_length`` - mean character length of cleaned words.

        Every ratio is 0.0 when its denominator is zero (e.g. empty text),
        so the function does not raise ``ZeroDivisionError``.
    """
    sentences = sent_tokenize(text)

    # Alphabetic tokens in their original case; the case is needed below to
    # tell the pronoun "us" apart from the abbreviation "US".
    alpha_tokens = [token for token in word_tokenize(text) if token.isalpha()]

    stop_words = set(stopwords.words('english'))
    words = [
        lowered
        for lowered in (token.lower() for token in alpha_tokens)
        if lowered not in stop_words
    ]
    word_count = len(words)
    sentence_count = len(sentences)

    # Sentiment scores from the opinion lexicon.
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    # The small constant keeps these two ratios defined when nothing matched.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Readability metrics.
    total_tokens = sum(len(word_tokenize(sentence)) for sentence in sentences)
    avg_sentence_length = _ratio(total_tokens, sentence_count)

    vowel_counts = [_vowel_count(word) for word in words]
    complex_words = [word for word, vowels in zip(words, vowel_counts) if vowels > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = _ratio(complex_word_count, word_count)

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = _ratio(word_count, sentence_count)
    syllable_per_word = _ratio(sum(vowel_counts), word_count)

    # Count pronouns on the tokens *before* stop-word removal: the stop-word
    # filter drops pronouns such as 'i', 'we', 'my' and 'ours', so counting on
    # the cleaned words would miss nearly all of them. The upper-case token
    # "US" is skipped so the country abbreviation is not counted as "us".
    pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronouns = sum(
        1 for token in alpha_tokens
        if token != 'US' and token.lower() in pronouns
    )

    avg_word_length = _ratio(sum(len(word) for word in words), word_count)

    return {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'avg_words_per_sentence': avg_words_per_sentence,
        'complex_word_count': complex_word_count,
        'word_count': word_count,
        'syllable_per_word': syllable_per_word,
        'personal_pronouns': personal_pronouns,
        'avg_word_length': avg_word_length
    }


# Destination folder: one "<URL_ID>.txt" file per scraped article, plus the
# workbook holding the metrics for every article that was analysed.
output_dir = "Desktop/articles"
os.makedirs(output_dir, exist_ok=True)

# One metrics dict per successfully scraped article.
results = []

for url_id, url in zip(df['URL_ID'], df['URL']):
    article_text = extract_article_text(url)

    # Skip pages whose text carries either sentinel string.
    if "Failed to retrieve the article" in article_text or "No Article Text Found" in article_text:
        continue

    analysis = analyze_text(article_text)
    analysis['URL_ID'] = url_id
    results.append(analysis)

    # Keep the raw text on disk next to the metrics.
    with open(os.path.join(output_dir, f"{url_id}.txt"), 'w', encoding='utf-8') as file:
        file.write(article_text)

# Build the table in one go from the collected records and save it.
results_df = pd.DataFrame(results)
results_df.to_excel(os.path.join(output_dir, 'text_analysis_results.xlsx'), index=False)

   

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\Rudra\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rudra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rudra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
