<a href="https://colab.research.google.com/github/gaikwada16/anil-portfolio/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install textstat


Collecting textstat
  Downloading textstat-0.7.12-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.12-py3-none-any.whl (176 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.6/176.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.12


In [39]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import textstat

# =============================
# PATHS
# =============================
# Workbooks read by the notebook (absolute paths under /content).
INPUT_FILE = "/content/Input.xlsx"
OUTPUT_STRUCTURE_FILE = "/content/Output Data Structure.xlsx"

# Sentiment lexicon files, relative to the working directory.
POSITIVE_WORDS_FILE = "positive-words.txt"
NEGATIVE_WORDS_FILE = "negative-words.txt"

# Result workbook written at the end of the run.
FINAL_OUTPUT_FILE = "Final_Output.xlsx"

# Directory holding one "<URL_ID>.txt" file per article; created if absent.
ARTICLES_DIR = "1extracted_articles"
os.makedirs(ARTICLES_DIR, exist_ok=True)

# Browser-like request headers; the User-Agent is assembled from its three
# space-separated product tokens.
_USER_AGENT = " ".join(
    [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    ]
)
HEADERS = {"User-Agent": _USER_AGENT, "Accept-Language": "en-US,en;q=0.9"}

In [40]:
# =============================
# LOAD INPUT
# =============================
# input_df supplies the URL_ID / URL rows that the analysis loop iterates over.
input_df = pd.read_excel(io=INPUT_FILE)
# output_structure is used only for its column headers when the results
# are turned into a DataFrame.
output_structure = pd.read_excel(io=OUTPUT_STRUCTURE_FILE)

In [41]:
import os
import requests

# Local file names for the two sentiment lexicons. These repeat the values
# already assigned to the same names in the configuration cell.
POSITIVE_WORDS_FILE = "positive-words.txt"
NEGATIVE_WORDS_FILE = "negative-words.txt"

def download_if_missing(url, filename, timeout=30):
    """Download ``url`` to ``filename`` unless that file already exists.

    Args:
        url: Address to fetch with an HTTP GET.
        filename: Destination path; its existence is the only cache check.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(filename):
        return

    print(f"Downloading {filename}...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Write to a sibling temp file and rename it into place: because the
    # existence of `filename` is the cache key, a write that fails part-way
    # must never leave a truncated file under the final name.
    partial_path = f"{filename}.part"
    with open(partial_path, "wb") as f:
        f.write(response.content)
    os.replace(partial_path, filename)

# (source URL, local file name) for each sentiment lexicon; each one is
# fetched only if the local file is not already present.
_LEXICON_SOURCES = (
    (
        "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt",
        POSITIVE_WORDS_FILE,
    ),
    (
        "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt",
        NEGATIVE_WORDS_FILE,
    ),
)

for _source_url, _target_file in _LEXICON_SOURCES:
    download_if_missing(_source_url, _target_file)


In [42]:
# =============================
# LOAD SENTIMENT LEXICONS
# =============================
def load_words(file_path):
    """Read a one-word-per-line lexicon file into a set of lower-case words.

    Blank lines and comment lines (first non-blank character ``;``) are
    skipped. Each line is stripped *before* the comment test, so an indented
    comment is not mistaken for a word; this matches how the stopword files
    are parsed elsewhere in the notebook.

    Args:
        file_path: Path of the lexicon file, decoded as ISO-8859-1.

    Returns:
        set[str]: The distinct words, stripped and lower-cased.
    """
    words = set()
    with open(file_path, "r", encoding="ISO-8859-1") as f:
        for line in f:
            word = line.strip().lower()
            if word and not word.startswith(";"):
                words.add(word)
    return words

# Lower-cased word sets used for the positive / negative score counts.
positive_words, negative_words = (
    load_words(POSITIVE_WORDS_FILE),
    load_words(NEGATIVE_WORDS_FILE),
)


In [43]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK "punkt" and "punkt_tab" resources.
# NOTE(review): presumably these back sent_tokenize / word_tokenize — confirm
# which of the two the installed NLTK version actually needs.
nltk.download('punkt')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [44]:
# =============================
# LOAD STOPWORDS
# =============================
# Stopword list files, looked up relative to the working directory.
stopword_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]

# Union of every usable entry from the files above. A file that cannot be
# found is reported and skipped, so this set may end up empty.
stopwords = set()

for stopword_path in stopword_files:
    try:
        with open(stopword_path, "r", encoding="ISO-8859-1") as handle:
            normalised = (raw_line.strip().lower() for raw_line in handle)
            # Keep non-empty entries; lines starting with ";" are skipped.
            stopwords.update(
                entry for entry in normalised
                if entry and not entry.startswith(";")
            )
    except FileNotFoundError:
        print(f"Missing stopword file: {stopword_path}")


Missing stopword file: StopWords_Auditor.txt
Missing stopword file: StopWords_Currencies.txt
Missing stopword file: StopWords_DatesandNumbers.txt
Missing stopword file: StopWords_Generic.txt
Missing stopword file: StopWords_GenericLong.txt
Missing stopword file: StopWords_Geographic.txt
Missing stopword file: StopWords_Names.txt


In [45]:
# =============================
# TEXT ANALYSIS
# =============================
# One row of metrics per analysed article; reset here so re-running the cell
# does not append duplicates.
results = []

# Compiled once. Applied to the ORIGINAL-case article text so that the
# upper-case abbreviation "US" can be told apart from the pronoun "us".
PERSONAL_PRONOUN_RE = re.compile(r"\b(I|we|my|ours|us)\b", re.I)


def count_syllables(word):
    """Return the syllable count that textstat reports for ``word``."""
    return textstat.syllable_count(word)


def count_personal_pronouns(raw_text):
    """Count occurrences of I / we / my / ours / us in ``raw_text``.

    Matching is case-insensitive, except that the exact all-caps token "US"
    is skipped because it is far more likely the country abbreviation than
    the pronoun. ``raw_text`` must therefore NOT be lower-cased beforehand.
    (Trade-off: an all-caps pronoun, e.g. in a "CONTACT US" heading, is also
    skipped.)
    """
    return sum(
        1 for match in PERSONAL_PRONOUN_RE.findall(raw_text) if match != "US"
    )


for _, row in input_df.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]

    file_path = os.path.join(ARTICLES_DIR, f"{url_id}.txt")
    if not os.path.exists(file_path):
        # No extracted article for this URL_ID: it gets no row in the output.
        continue

    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Lower-case copy for tokenisation and lexicon lookups (the lexicons and
    # stopwords are stored lower-case); raw_text is kept for pronoun counting.
    text = raw_text.lower()

    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Keep purely alphabetic tokens that are not stopwords. `stopwords` is
    # whatever the stopword-loading cell collected; if no list file was
    # found it is empty and only the isalpha() filter has an effect.
    words_clean = [
        w for w in words
        if w.isalpha() and w not in stopwords
    ]

    word_count = len(words_clean)
    sentence_count = len(sentences)

    # =============================
    # SENTIMENT
    # =============================
    pos_score = sum(1 for w in words_clean if w in positive_words)
    neg_score = sum(1 for w in words_clean if w in negative_words)

    # The 0.000001 term keeps both ratios defined when the denominator is 0.
    polarity = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity = (pos_score + neg_score) / (word_count + 0.000001)

    # =============================
    # READABILITY
    # =============================
    avg_sentence_length = word_count / sentence_count if sentence_count else 0

    # Syllables are counted once per word and reused for both the
    # complex-word count and the syllables-per-word average.
    syllable_counts = [count_syllables(w) for w in words_clean]

    # A word is "complex" when it has more than two syllables.
    complex_word_count = sum(1 for n in syllable_counts if n > 2)

    # Stored as a fraction of cleaned words (0..1), not multiplied by 100.
    percentage_complex_words = complex_word_count / word_count if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_words_per_sentence = avg_sentence_length

    syllables_per_word = sum(syllable_counts) / word_count if word_count else 0

    # =============================
    # PERSONAL PRONOUNS
    # =============================
    personal_pronouns = count_personal_pronouns(raw_text)

    # =============================
    # AVG WORD LENGTH
    # =============================
    avg_word_length = (
        sum(len(w) for w in words_clean) / word_count if word_count else 0
    )

    # Positional row: the order here must match the column order of the
    # output-structure workbook, which supplies the headers when saving.
    results.append([
        url_id, url, pos_score, neg_score, polarity, subjectivity,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count, word_count,
        syllables_per_word, personal_pronouns, avg_word_length
    ])


In [46]:
# Number of rows collected by the analysis loop, i.e. input rows whose
# article text file was found in ARTICLES_DIR.
len(results)

146

In [47]:
# =============================
# SAVE FINAL OUTPUT
# =============================
# Column headers come from the output-structure workbook; each result row is
# positional, so its order must already match those headers.
final_df = pd.DataFrame(data=results, columns=output_structure.columns)
final_df.to_excel(FINAL_OUTPUT_FILE, index=False)

for status_line in (
    "✅ Extraction + Analysis completed",
    f"✅ Output saved as {FINAL_OUTPUT_FILE}",
):
    print(status_line)

✅ Extraction + Analysis completed
✅ Output saved as Final_Output.xlsx


In [48]:
import pandas as pd

# Re-read the workbook that was actually written. The check must compare
# FINAL_OUTPUT_FILE against the template; loading the template for both
# sides would compare the template with itself and never inspect the results.
final_df = pd.read_excel(FINAL_OUTPUT_FILE)
structure_df = pd.read_excel(OUTPUT_STRUCTURE_FILE)

# 1. Column order check
assert list(final_df.columns) == list(structure_df.columns), "❌ Column order mismatch"

# 2. Row count check
assert len(final_df) > 0, "❌ No rows generated"

# 3. Null check
assert final_df.isnull().sum().sum() == 0, "❌ Null values found"

# 4. URL_ID uniqueness
assert final_df["URL_ID"].is_unique, "❌ Duplicate URL_IDs"

print(f"✅ Validation passed: {FINAL_OUTPUT_FILE} is correct")


✅ Validation passed: Final_Output.xlsx is correct
