## Data Extraction

In [2]:
import pandas as pd

# Load the input file
input_file = "Input.xlsx"
df = pd.read_excel(input_file)
urls = df['URL']  # Assuming column name for URLs is 'URL'
url_ids = df['URL_ID']  # Assuming column name for URL_IDs is 'URL_ID'


In [3]:
import requests
from bs4 import BeautifulSoup
import os

def extract_article(url):
    try:
        # Send an HTTP request
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract title and article text
        title = soup.find('h1').get_text(strip=True)  # Assuming title is in <h1>
        article_text = " ".join([p.get_text(strip=True) for p in soup.find_all('p')])

        return title, article_text

    except Exception as e:
        print(f"Error extracting article from {url}: {e}")
        return None, None


In [4]:
output_dir = "Articles"  # Directory to save articles
os.makedirs(output_dir, exist_ok=True)

for url, url_id in zip(urls, url_ids):
    title, article_text = extract_article(url)
    if title and article_text:
        # Save to a text file
        with open(os.path.join(output_dir, f"{url_id}.txt"), "w", encoding="utf-8") as file:
            file.write(f"{title}\n\n{article_text}")
    else:
        print(f"Skipping URL_ID {url_id} due to extraction error.")


## Data Analysis or Text Analysis

In [77]:
import pandas as pd
import nltk
import openpyxl 
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import os
from collections import Counter
import textstat

# Ensure NLTK dependencies are available
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [79]:
# Load stop words
stop_words = set()
for file in ["Complete_StopWords"]:
    with open(file, "r", encoding="utf-8") as f:
        stop_words.update(word.strip().lower() for word in f)

# Load positive and negative words
positive_words = set()
negative_words = set()
with open("positive-words", "r") as f:
    positive_words.update(word.strip().lower() for word in f if word.strip() not in stop_words)
with open("negative-words", "r") as f:
    negative_words.update(word.strip().lower() for word in f if word.strip() not in stop_words)


In [81]:
def clean_text(text):
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Tokenize words and sentences
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    # Remove stop words
    cleaned_words = [word for word in words if word not in stop_words]
    return cleaned_words, sentences

In [83]:
def calculate_sentiment_scores(words):
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    return positive_score, negative_score

In [85]:
def calculate_polarity_subjectivity(positive_score, negative_score, total_words):
    polarity = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity = (positive_score + negative_score) / (total_words + 0.000001)
    return polarity, subjectivity

In [87]:
def calculate_readability_metrics(words, sentences):
    total_words = len(words)
    total_sentences = len(sentences)
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
    avg_words_per_sentence = total_words / total_sentences if total_sentences > 0 else 0
    complex_words = sum(1 for word in words if len(re.findall(r'[aeiouy]', word)) > 2)
    percentage_complex_words = complex_words / total_words if total_words > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, avg_words_per_sentence, percentage_complex_words, fog_index, complex_words

In [89]:
def calculate_word_stats(words):
    total_words = len(words)
    avg_word_length = sum(len(word) for word in words) / total_words if total_words > 0 else 0
    return total_words, avg_word_length

In [91]:
def syllable_count(word):
    word = word.lower()
    syllables = len(re.findall(r'[aeiouy]', word))
    if word.endswith(('es', 'ed')):
        syllables = max(1, syllables - 1)
    return syllables

In [93]:
def count_personal_pronouns(text):
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE)
    return len(pronouns)

In [98]:
def analyze_texts(input_file, articles_folder, output_file):
    # Load Input File
    df = pd.read_excel(input_file, engine='openpyxl')

    results = []
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        try:
            with open(f"{articles_folder}/{url_id}.txt", 'r', encoding='utf-8') as file:
                text = file.read()

            # Perform Cleaning
            cleaned_tokens = clean_text(text)
            print(f"Cleaned Tokens for {url_id}: {cleaned_tokens}")  # Debugging

            # Calculate Sentiment Scores
            positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(cleaned_tokens)
            print(f"Scores for {url_id}: {positive_score}, {negative_score}, {polarity_score}, {subjectivity_score}")  # Debugging

            # Readability and Complexity Analysis
            avg_sentence_length, percentage_complex_words, fog_index, num_sentences, num_words = readability_analysis(text)
            avg_words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

            # Word Statistics
            total_words, syllables_per_word, avg_word_length = calculate_word_stats(cleaned_tokens)
            complex_word_count = count_complex_words(cleaned_tokens)

            # Personal Pronouns Count
            personal_pronouns = count_personal_pronouns(text)

            # Save Results
            result_entry = {
                'URL_ID': url_id,
                'URL': row.get('URL', ''),  
                "POSITIVE_SCORE": positive_score,
                'NEGATIVE_SCORE': negative_score,
                'POLARITY_SCORE': polarity_score,
                'SUBJECTIVITY_SCORE': subjectivity_score,
                'AVG_SENTENCE_LENGTH': avg_sentence_length,
                'PERCENTAGE_OF_COMPLEX_WORDS': percentage_complex_words,
                'FOG_INDEX': fog_index,
                'AVG_NUMBER_OF_WORDS_PER_SENTENCE': avg_words_per_sentence,
                'COMPLEX_WORD_COUNT': complex_word_count,
                'WORD_COUNT': total_words,
                'SYLLABLE_PER_WORD': syllables_per_word,
                'PERSONAL_PRONOUNS': personal_pronouns,
                'AVG_WORD_LENGTH': avg_word_length,
            }

            print(f"Result Entry for {url_id}: {result_entry}")  # Debugging
            results.append(result_entry)

        except Exception as e:
            print(f"Error processing {url_id}: {e}")

    # Save to Output Excel
    output_df = pd.DataFrame(results)
    output_df.to_csv(output_file, index=False)  

# Example function calls (make sure these functions are defined)
analyze_texts("Input.xlsx", "Articles", "Output Data Structure.csv")


Cleaned Tokens for Netclan20241017: (['mlbased', 'youtube', 'analytics', 'content', 'creation', 'tool', 'optimizing', 'subscriber', 'engagement', 'content', 'strategy', 'integrating', 'machine', 'learning', 'code', 'kubeflow', 'pipeline', 'kuberflow', 'mlops', 'kubernetes', 'facial', 'recognition', 'attendance', 'system', 'face', 'recognition', 'deepface', 'mlbased', 'youtube', 'analytics', 'content', 'creation', 'tool', 'optimizing', 'subscriber', 'engagement', 'content', 'strategy', 'audio', 'text', 'conversational', 'bot', 'livekit', 'receptionist', 'voice', 'center', 'representative', 'representative', 'coach', 'face', 'recognition', 'deepfills', 'framework', 'deepface', 'development', 'ea', 'robot', 'automated', 'trading', 'rising', 'cities', 'impact', 'economy', 'environment', 'infrastructure', 'life', '2040', 'rising', 'cities', 'impact', 'economy', 'environment', 'infrastructure', 'life', 'future', 'internet', 'demands', 'evolution', 'communication', 'impact', '2035s', 'alterna

In [75]:
import pandas as pd

# Load your Excel file
df = pd.read_excel("Output Data Structure.xlsx")

# Save it as a CSV with UTF-8 encoding
df.to_csv("Output_Data_Structure_UTF8.csv", index=False, encoding='utf-8')
