1.DATA EXTRACTION

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

2. Function to extract the article title and text from a URL

In [5]:


# Function to extract the title and text from a URL

def extract_article(url):
    """Download a web page and return its title and main article text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A tuple ``(title, text)``. ``title`` is the stripped content of the
        page's ``<title>`` tag (empty string if the tag is absent) and
        ``text`` is the text of the ``div`` with class
        ``td-post-content tagdiv-type`` (empty string if no such div exists).
        ``(None, None)`` is returned when the request or parsing fails.
    """
    try:
        # A timeout keeps a single unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=30)
        # Treat HTTP error responses (404, 500, ...) as failures instead of
        # parsing the error page as if it were an article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove page chrome before extracting anything.
        for element in soup(["header", "footer"]):
            element.decompose()

        # A page without a <title> tag yields an empty title instead of an
        # AttributeError on None.
        title_tag = soup.find('title')
        ar_title = title_tag.text.strip() if title_tag else ""

        # The article body is expected inside this specific <div>.
        ar_div = soup.find('div', class_='td-post-content tagdiv-type')
        ar_text = ar_div.get_text() if ar_div else ""
        return ar_title, ar_text

    except Exception as error:
        # Report the caught error itself, not the Exception class object.
        print(f"Error while extracting article from {url}: {error}")
        return None, None

# Function to save the title and text to a text file
    
def save_article_to_file(url_id, ar_title, ar_text):
    """Write one article to ``articles/<url_id>.txt`` as UTF-8 text.

    The file starts with a ``Title: ...`` line, followed by a blank line and
    then the article text. An existing file with the same name is overwritten.

    Args:
        url_id: Identifier used as the file name (without extension).
        ar_title: Article title.
        ar_text: Article body text.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("articles", exist_ok=True)

    with open(f"articles/{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(f"Title: {ar_title}\n\n")
        file.write(ar_text)

def main():
    """Scrape every article listed in ``Input.xlsx`` and save it to disk.

    Each row's ``URL`` is fetched with ``extract_article``; when both a title
    and a body come back non-empty the article is stored under its ``URL_ID``
    via ``save_article_to_file``. A status line is printed for every row.
    """
    url_table = pd.read_excel("Input.xlsx")

    for _, record in url_table.iterrows():
        url_id, url = record["URL_ID"], record["URL"]

        title, body = extract_article(url)

        # A missing or empty title/body means the scrape produced nothing usable.
        if not (title and body):
            print(f"Failed to extract article {url_id}.")
            continue

        save_article_to_file(url_id, title, body)
        print(f"Article {url_id} extracted and saved successfully.")


if __name__ == "__main__":
    main()


Article blackassign0001 extracted and saved successfully.
Article blackassign0002 extracted and saved successfully.
Article blackassign0003 extracted and saved successfully.
Article blackassign0004 extracted and saved successfully.
Article blackassign0005 extracted and saved successfully.
Article blackassign0006 extracted and saved successfully.
Article blackassign0007 extracted and saved successfully.
Article blackassign0008 extracted and saved successfully.
Article blackassign0009 extracted and saved successfully.
Article blackassign0010 extracted and saved successfully.
Article blackassign0011 extracted and saved successfully.
Article blackassign0012 extracted and saved successfully.
Article blackassign0013 extracted and saved successfully.
Failed to extract article blackassign0014.


KeyboardInterrupt: 

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Download the NLTK data packages requested by this notebook: 'punkt',
# 'vader_lexicon' and 'stopwords' (presumably backing the tokenizers,
# SentimentIntensityAnalyzer and stopwords corpus imported above).
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')

3. Extracting Derived Variables

In [None]:
#function to load positive and negative dictionaries
def _read_word_set(path):
    """Return the set of non-empty, lower-cased words listed one per line in ``path``."""
    try:
        with open(path, "r", encoding="utf-8") as file:
            lines = file.read().splitlines()
    except UnicodeDecodeError:
        # Fall back to Latin-1 (which can decode any byte sequence) so the
        # result no longer depends on the platform's default encoding.
        with open(path, "r", encoding="latin-1") as file:
            lines = file.read().splitlines()

    # Strip stray whitespace, drop blank lines and lower-case every entry so
    # that lookups with lower-cased tokens can match.
    return {line.strip().lower() for line in lines if line.strip()}


def load_both_dictionaries(positive_file, negative_file):
    """Load the positive and negative word lists.

    Args:
        positive_file: Path to a text file with one positive word per line.
        negative_file: Path to a text file with one negative word per line.

    Returns:
        A tuple ``(positive_words, negative_words)`` of sets of lower-cased
        words.
    """
    positive_words = _read_word_set(positive_file)
    negative_words = _read_word_set(negative_file)

    return positive_words, negative_words

#function to calculate the sentiment analysis metrics
def sentiment_scores(text,positive_words, negative_words):
    """Compute dictionary-based sentiment metrics for a text.

    Tokens are lower-cased, and only purely alphabetic tokens are looked up
    in the two word sets.

    Args:
        text: The text to analyse.
        positive_words: Collection of positive words (lower-case).
        negative_words: Collection of negative words (lower-case).

    Returns:
        A tuple ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where
        ``polarity = (pos - neg) / (pos + neg + 1e-6)`` and
        ``subjectivity = (pos + neg) / (number_of_tokens + 1e-6)``.
    """
    # Small constant that keeps both ratios defined when a denominator is zero.
    epsilon = 0.000001

    # The original built an unused SentimentIntensityAnalyzer here on every
    # call; it played no part in the result, so it has been removed.
    token = word_tokenize(text)

    # Lower-case first, then keep alphabetic tokens only (drops punctuation
    # and numbers from the dictionary lookup).
    candidates = [word.lower() for word in token]
    candidates = [word for word in candidates if word.isalpha()]

    positive_score = sum(1 for word in candidates if word in positive_words)
    negative_score = sum(1 for word in candidates if word in negative_words)

    #calculating metrics
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + epsilon)
    # NOTE(review): the denominator counts every token returned by
    # word_tokenize, punctuation included - confirm this is the intended
    # "total words" definition.
    subjectivity_score = (positive_score + negative_score) / (len(token) + epsilon)

    return positive_score,negative_score,polarity_score,subjectivity_score

def main():
    """Score every saved article and write ``sentiment_analysis.xlsx``.

    Rows are read from ``Output Data Structure.xlsx``; for each row whose
    article file exists under ``articles/`` the sentiment metrics are
    computed and collected. Rows without a saved article are skipped.
    """
    positive_words, negative_words = load_both_dictionaries(
        "positive-words.txt", "negative-words.txt"
    )

    # Read the sheet that lists the articles to score
    sheet = pd.read_excel("Output Data Structure.xlsx")

    records = []
    for _, entry in sheet.iterrows():
        url_id, url = entry["URL_ID"], entry["URL"]
        article_path = os.path.join("articles", f"{url_id}.txt")

        # Nothing to score when the article was never saved.
        if not os.path.exists(article_path):
            continue

        with open(article_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()

        pos, neg, polarity, subjectivity = sentiment_scores(
            contents, positive_words, negative_words
        )

        records.append({
            "URL_ID": url_id,
            "URL": url,
            "Positive_Score": pos,
            "Negative_Score": neg,
            "Polarity_Score": polarity,
            "Subjectivity_Score": subjectivity
        })

    # One DataFrame built from all records, then persisted to Excel
    pd.DataFrame(records).to_excel("sentiment_analysis.xlsx", index=False)


if __name__ == "__main__":
    main()
    
    

    
    
  

        

In [None]:
# Load "sentiment_analysis.xlsx" into a DataFrame for inspection and the later merge.
sentiment_analysis = pd.read_excel("sentiment_analysis.xlsx")

In [None]:
# Display the sentiment results table.
sentiment_analysis

In [None]:
import re
from nltk.tokenize import sent_tokenize

4.Calculating metrics

In [None]:
# Function to calculate average sentence length
def cal_avg_sentence_length(sentences):
    """Return the mean number of tokens per sentence.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        Total token count (per ``word_tokenize``) divided by the number of
        sentences, or ``0.0`` when ``sentences`` is empty.
    """
    total_sentences = len(sentences)
    # An empty article has no sentences; avoid a ZeroDivisionError.
    if total_sentences == 0:
        return 0.0
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_words / total_sentences

# Function to calculate percentage of complex words
def cal_percentage_complex_words(text):
    """Return the share of tokens that are complex words.

    A complex word is one with more than two syllables, as counted by
    ``count_syllables``. Despite the name, the result is a fraction between
    0 and 1, not a value scaled to 100.

    Args:
        text: The text to analyse.

    Returns:
        ``complex_tokens / total_tokens``, or ``0.0`` for text with no tokens.
    """
    words = word_tokenize(text)
    # Empty text has no tokens; avoid a ZeroDivisionError.
    if not words:
        return 0.0
    # The original tested len(word) > 2, i.e. characters rather than
    # syllables, which marked almost every word as complex.
    complex_words = [word for word in words if count_syllables(word) > 2]
    return len(complex_words) / len(words)

# Function to calculate fog index
def cal_fog_index(avg_sentence_length, percentage_complex_words):
    """Return the fog index: 0.4 times the sum of the two inputs.

    Args:
        avg_sentence_length: Average sentence length of the text.
        percentage_complex_words: Complex-word ratio of the text.
    """
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

# Function to calculate average number of words per sentence
def cal_avg_words_per_sentence(words, sentences):
    """Return the average number of words per sentence.

    Args:
        words: Sequence of word tokens.
        sentences: Sequence of sentences.

    Returns:
        ``len(words) / len(sentences)``, or ``0.0`` when there are no
        sentences.
    """
    # An empty article has no sentences; avoid a ZeroDivisionError.
    if not len(sentences):
        return 0.0
    return len(words) / len(sentences)

# Function to calculate complex word count
def cal_complex_word_count(text):
    """Return the number of complex words in ``text``.

    A complex word is a token with more than two syllables, as counted by
    ``count_syllables``.

    Args:
        text: The text to analyse.
    """
    words = word_tokenize(text)
    # The original tested len(word) > 2, i.e. characters rather than
    # syllables, which counted almost every word as complex.
    return sum(1 for word in words if count_syllables(word) > 2)

# Function to calculate word count
def cal_word_count(text):
    """Return the number of alphabetic, non-stop-word tokens in ``text``.

    Args:
        text: The text to analyse.

    Returns:
        Count of tokens that are purely alphabetic and whose lower-cased
        form is not in NLTK's English stop-word list.
    """
    words = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    # Compare in lower case so capitalised stop words ("The", "And") are
    # filtered too. The debug print of the full word list has been removed.
    cleaned_words = [
        word for word in words
        if word.isalpha() and word.lower() not in stop_words
    ]
    return len(cleaned_words)

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    The estimate is the number of groups of consecutive vowels
    (a, e, i, o, u) after dropping a trailing "e" that is not part of a
    trailing "le". Matching is case-insensitive.

    Args:
        word: The word (or any token) to inspect.

    Returns:
        The estimated syllable count; ``0`` for an empty string or a token
        without vowels.
    """
    # Guard against empty input, which would otherwise fail on word[-1].
    if not word:
        return 0

    # Normalise case once so the "le" exception also covers mixed-case
    # endings such as "Le" or "lE".
    lowered = word.lower()
    if lowered.endswith('e') and not lowered.endswith('le'):
        lowered = lowered[:-1]

    vowels = "aeiou"
    count = 0
    previous_was_vowel = False
    for letter in lowered:
        is_vowel = letter in vowels
        # Count only the first vowel of each run of consecutive vowels.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel
    return count

# Function to calculate syllable count per word
def cal_syllable_count_per_word(text):
    """Return the average syllable count per token of ``text``.

    Args:
        text: The text to analyse.

    Returns:
        Sum of ``count_syllables`` over all tokens divided by the number of
        tokens, or ``0.0`` for text with no tokens.
    """
    words = word_tokenize(text)
    # Empty text has no tokens; avoid a ZeroDivisionError.
    if not words:
        return 0.0
    syllable_count = sum(count_syllables(word) for word in words)
    return syllable_count / len(words)

# Function to calculate personal pronoun count
def cal_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Each pronoun is matched as a whole word in its given, capitalised and
    upper-case spelling, so sentence-initial forms such as "We" or "My" are
    counted. The all-caps "US" is excluded, and lower-case "i" is not
    matched.

    Args:
        text: The text to search.

    Returns:
        Number of pronoun occurrences found.
    """
    pronouns = ["I", "we", "my", "ours", "us"]

    # The original matched only the exact spellings above, so pronouns at
    # the start of a sentence were missed.
    variants = set()
    for pronoun in pronouns:
        variants.update((pronoun, pronoun.capitalize(), pronoun.upper()))
    # NOTE(review): "US" is dropped on the assumption that it is usually the
    # country abbreviation rather than the pronoun - confirm.
    variants.discard("US")

    pattern = r'\b(?:' + '|'.join(sorted(variants)) + r')\b'
    return len(re.findall(pattern, text))

# Function to calculate average word length
def cal_avg_word_length(text):
    """Return the average number of characters per token of ``text``.

    Args:
        text: The text to analyse.

    Returns:
        Total characters across all tokens divided by the number of tokens,
        or ``0.0`` for text with no tokens.
    """
    words = word_tokenize(text)
    # Empty text has no tokens; avoid a ZeroDivisionError.
    if not words:
        return 0.0
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

def main():
    """Compute readability metrics per saved article and write ``Analysis_results.xlsx``.

    Rows are read from ``Output Data Structure.xlsx``; for each row whose
    article file exists under ``articles/`` the text metrics are computed
    and collected. Rows without a saved article are skipped.
    """
    sheet = pd.read_excel("Output Data Structure.xlsx")

    records = []
    for _, entry in sheet.iterrows():
        url_id = entry["URL_ID"]
        article_path = os.path.join("articles", f"{url_id}.txt")

        # Skip rows whose article was never saved.
        if not os.path.exists(article_path):
            continue

        with open(article_path, 'r', encoding='utf-8') as handle:
            body = handle.read()

        # Sentence and word tokens shared by the per-sentence metrics
        sentence_list = sent_tokenize(body)
        token_list = word_tokenize(body)

        # Build the record one metric at a time, in output column order
        metrics = {"URL_ID": url_id}
        metrics["Avg_Sentence_Length"] = cal_avg_sentence_length(sentence_list)
        metrics["Percentage_Complex_Words"] = cal_percentage_complex_words(body)
        metrics["Fog_Index"] = cal_fog_index(
            metrics["Avg_Sentence_Length"], metrics["Percentage_Complex_Words"]
        )
        metrics["Avg_Words_Per_Sentence"] = cal_avg_words_per_sentence(token_list, sentence_list)
        metrics["Complex_Word_Count"] = cal_complex_word_count(body)
        metrics["Word_Count"] = cal_word_count(body)
        metrics["Syllable_Count_Per_Word"] = cal_syllable_count_per_word(body)
        metrics["Personal_Pronoun_Count"] = cal_personal_pronouns(body)
        metrics["Avg_Word_Length"] = cal_avg_word_length(body)

        records.append(metrics)

    # One DataFrame built from all records, then persisted to Excel
    pd.DataFrame(records).to_excel("Analysis_results.xlsx", index=False)


if __name__ == "__main__":
    main()


In [None]:
# Load "Analysis_results.xlsx" into a DataFrame and display it.
text_analysis = pd.read_excel("Analysis_results.xlsx")
text_analysis

#merging both sentiment_analysis df and text_analysis df

In [None]:
# Join the sentiment scores with the text metrics on their shared URL_ID column
merged_df = sentiment_analysis.merge(text_analysis, on='URL_ID')
# show the combined table
merged_df

#OUTPUT

In [None]:
# Write the combined results. index=False keeps the DataFrame's positional
# index out of the sheet, matching the other to_excel calls in this notebook.
merged_df.to_excel("OutputDataStructure.xlsx", index=False)