# Text Analytics & Sentiment Scoring for Blog Articles
This notebook performs:
1. Web scraping to extract articles from given URLs.
2. Text analysis including sentiment, readability, and linguistic features.


In [1]:
# Run this once if not already installed
!pip install pandas requests beautifulsoup4 openpyxl




In [29]:
#importing the libraries 
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import nltk
import string
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Pranshu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Load the list of URLs and URL_IDs
input_df = pd.read_excel("../Data/Input.xlsx")
input_df.head()


Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...


In [8]:
# Create a folder to save the article text files
output_folder = "extracted_articles"
os.makedirs(output_folder, exist_ok=True)


In [11]:
def extract_article(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else ""

        #extract only the main content area
        main_content = (
            soup.find("div", class_="td-post-content") or 
            soup.find("article") or
            soup.find("div", class_="post-content") or
            soup.find("div", class_="blog-content") or
            soup  # fallback 
        )

        # Extract all paragraph tags from that section
        article_tags = main_content.find_all('p')
        article_text = ' '.join([tag.get_text(strip=True) for tag in article_tags])

        return title + "\n\n" + article_text
    except Exception as e:
        print(f" Failed to extract from URL: {url} | Error: {e}")
        return ""


In [14]:
# Loop over all URLs and save content in .txt files
for index, row in input_df.iterrows():
    url_id = str(row['URL_ID'])
    url = row['URL']
    
    content = extract_article(url)
    
    if content:
        file_path = os.path.join(output_folder, f"{url_id}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f" Saved: {url_id}.txt")
    else:
        print(f" Skipped: {url_id}")


 Saved: Netclan20241017.txt
 Saved: Netclan20241018.txt
 Saved: Netclan20241019.txt
 Saved: Netclan20241020.txt
 Saved: Netclan20241021.txt
 Saved: Netclan20241022.txt
 Saved: Netclan20241023.txt
 Saved: Netclan20241024.txt
 Saved: Netclan20241025.txt
 Saved: Netclan20241026.txt
 Saved: Netclan20241027.txt
 Saved: Netclan20241028.txt
 Saved: Netclan20241029.txt
 Saved: Netclan20241030.txt
 Saved: Netclan20241031.txt
 Saved: Netclan20241032.txt
 Saved: Netclan20241033.txt
 Saved: Netclan20241034.txt
 Saved: Netclan20241035.txt
 Saved: Netclan20241036.txt
 Saved: Netclan20241037.txt
 Saved: Netclan20241038.txt
 Saved: Netclan20241039.txt
 Saved: Netclan20241040.txt
 Saved: Netclan20241041.txt
 Saved: Netclan20241042.txt
 Saved: Netclan20241043.txt
 Saved: Netclan20241044.txt
 Saved: Netclan20241045.txt
 Saved: Netclan20241046.txt
 Saved: Netclan20241047.txt
 Saved: Netclan20241048.txt
 Saved: Netclan20241049.txt
 Saved: Netclan20241050.txt
 Saved: Netclan20241051.txt
 Saved: Netclan20241

In [23]:
# Load stopwords from your single file
with open('./StopWords/stopwords.txt', 'r') as f:
    stop_words = set([line.strip().lower() for line in f if line.strip()])

# Load positive words
with open('./MasterDictionary/positive-words.txt', 'r') as f:
    positive_words = set([line.strip().lower() for line in f if line.strip()])

# Load negative words
with open('./MasterDictionary/negative-words.txt', 'r') as f:
    negative_words = set([line.strip().lower() for line in f if line.strip()])


In [24]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    meaningful_words = [word for word in tokens if word not in stop_words and word.isalpha()]
    
    return meaningful_words


In [25]:
def sentiment_scores(tokens):
    pos_score = sum(1 for word in tokens if word in positive_words)
    neg_score = sum(1 for word in tokens if word in negative_words)
    
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity_score = (pos_score + neg_score) / (len(tokens) + 0.000001)
    
    return pos_score, neg_score, polarity_score, subjectivity_score


In [26]:
results = []

for filename in os.listdir('extracted_articles'):
    if filename.endswith('.txt'):
        url_id = filename.split('.')[0]
        
        with open(os.path.join('extracted_articles', filename), 'r', encoding='utf-8') as file:
            full_text = file.read()
            
            # Clean and tokenize
            tokens = clean_text(full_text)
            
            # Calculate scores
            pos_score, neg_score, polarity, subjectivity = sentiment_scores(tokens)
            
            results.append({
                'URL_ID': url_id,
                'POSITIVE SCORE': pos_score,
                'NEGATIVE SCORE': neg_score,
                'POLARITY SCORE': polarity,
                'SUBJECTIVITY SCORE': subjectivity
            })

# Convert to DataFrame
scores_df = pd.DataFrame(results)
scores_df.head()


Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE
0,Netclan20241024,9,2,0.636364,0.073333
1,Netclan20241030,13,5,0.444444,0.076271
2,Netclan20241018,13,7,0.3,0.051282
3,Netclan20241150,2,2,0.0,0.044444
4,Netclan20241144,10,3,0.538461,0.117117


In [27]:
# Count syllables in a word
def count_syllables(word):
    word = word.lower()
    vowels = 'aeiou'
    count = 0
    if word[0] in vowels:
        count += 1
    for i in range(1, len(word)):
        if word[i] in vowels and word[i - 1] not in vowels:
            count += 1
    if word.endswith("es") or word.endswith("ed"):
        count -= 1
    return max(count, 1)

# Check if a word is complex (more than 2 syllables)
def is_complex_word(word):
    return count_syllables(word) > 2

# Count personal pronouns using regex
def count_personal_pronouns(text):
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, flags=re.I)
    return len(pronouns)


In [30]:
def analyze_text_metrics(text, tokens):
    sentences = sent_tokenize(text)
    
    # Sentence stats
    num_sentences = len(sentences)
    num_words = len(tokens)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    
    # Complex words
    complex_words = [word for word in tokens if is_complex_word(word)]
    complex_word_count = len(complex_words)
    percent_complex_words = complex_word_count / num_words if num_words > 0 else 0
    
    # Fog index
    fog_index = 0.4 * (avg_sentence_length + percent_complex_words)
    
    # Syllables per word
    syllable_count = sum(count_syllables(word) for word in tokens)
    syllable_per_word = syllable_count / num_words if num_words > 0 else 0
    
    # Average word length
    total_chars = sum(len(word) for word in tokens)
    avg_word_length = total_chars / num_words if num_words > 0 else 0
    
    # Personal pronouns
    personal_pronouns = count_personal_pronouns(text)
    
    return {
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percent_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,  # same value
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': num_words,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }


In [31]:
final_results = []

for filename in os.listdir('extracted_articles'):
    if filename.endswith('.txt'):
        url_id = filename.split('.')[0]
        
        with open(os.path.join('extracted_articles', filename), 'r', encoding='utf-8') as file:
            full_text = file.read()
            
            # Clean text and tokenize
            tokens = clean_text(full_text)
            
            # Sentiment scores
            pos_score, neg_score, polarity, subjectivity = sentiment_scores(tokens)
            
            # Readability and other metrics
            extra_metrics = analyze_text_metrics(full_text, tokens)
            
            # Combine all results
            result = {
                'URL_ID': url_id,
                'POSITIVE SCORE': pos_score,
                'NEGATIVE SCORE': neg_score,
                'POLARITY SCORE': polarity,
                'SUBJECTIVITY SCORE': subjectivity
            }
            result.update(extra_metrics)
            
            final_results.append(result)

# Create final DataFrame
final_df = pd.DataFrame(final_results)
final_df = final_df.sort_values('URL_ID')
final_df.head()


Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
137,Netclan20241017,5,0,1.0,0.04065,20.5,0.398374,8.35935,20.5,49,123,2.292683,1,7.569106
2,Netclan20241018,13,7,0.3,0.051282,10.833333,0.330769,4.465641,10.833333,129,390,2.271795,7,7.125641
7,Netclan20241019,9,2,0.636364,0.073333,18.75,0.313333,7.625333,18.75,47,150,2.293333,1,7.346667
33,Netclan20241020,23,11,0.352941,0.091398,13.777778,0.543011,5.728315,13.777778,202,372,2.612903,4,8.096774
26,Netclan20241021,2,0,1.0,0.018018,12.333333,0.396396,5.091892,12.333333,44,111,2.45045,1,7.468468


In [33]:
# Merge URL_ID and URL from already loaded input_df
final_df = pd.merge(input_df[['URL_ID', 'URL']], final_df, on='URL_ID', how='left')

# Save the final file
final_df.to_excel("Output Data Structure.xlsx", index=False)
print("Final output with URL saved to 'Output Data Structure.xlsx'")

Final output with URL saved to 'Output Data Structure.xlsx'
