### Importing modules

In [1]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd
import numpy as np
import matplotlib as mp
import pprint
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sent_tokenize, word_tokenize
import zipfile
import syllables
import string
import re

In [3]:
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\disha\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\disha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Importing the dataset

In [2]:
df = pd.read_csv(r"C:\Users\disha\Downloads\Input.xlsx - Sheet1.csv")
df.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


### Function to scrape the website

In [27]:
def scrape_url(url, output_file):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')
    # print(soup.prettify())
    title = soup.find_all('h1', attrs={'class': "entry-title"})
    if title:
        title_text = title[0].get_text()
    else:
        title_text = "Not Found"
#     print(title_text)
    
    content = soup.find_all('div', attrs={'class': 'td-post-content tagdiv-type'})

    if content:
        text = ""
        for paragraph in content[0].find_all('p'):
            text += paragraph.get_text() + "\n"
#         print(text)
    else:
        text = "Content not found"

        
#     Append the title and text to the output file
    with open(output_file, 'a', encoding='utf-8') as file:
        file.write(f"Title: {title_text}\n")
        file.write(text)

### Creating output files

In [None]:
output_directory = "output_text_files/"  # Creating an output directory
os.makedirs(output_directory, exist_ok=True)
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    output_file = os.path.join(output_directory, f"{url_id}.txt")  # Create the output file path

    scrape_url(url, output_file)

### Data Analysis

#### Storing stop words as a set

In [4]:
stop_words_zip = r"C:\Users\disha\Downloads\StopWords-20230919T180021Z-001.zip"

In [5]:
# Extract stop words from the ZIP file
stop_words = set()

with zipfile.ZipFile(stop_words_zip, 'r') as zip_file:
    for file_name in zip_file.namelist():
        with zip_file.open(file_name) as word_file:
            word_list = [line.decode('latin-1').strip().lower() for line in word_file]
            stop_words.update(word_list)

#### Reading positive and negative words

In [6]:
positive_words_file = r"C:\Users\disha\Downloads\positive-words.txt"
with open(positive_words_file, 'r', encoding='utf-8') as pos_file:
    positive_words = set([line.strip() for line in pos_file])
    
negative_words_file = r"C:\Users\disha\OneDrive\Documents\negative-words.txt"
with open(negative_words_file, 'r', encoding='utf-8') as neg_file:
    negative_words = set([line.strip() for line in neg_file])

### 1. Sentiment Analysis

#### Function to clean using stop words

In [7]:
# Define a function to remove stop words from a text
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words and word not in string.punctuation]
    return ' '.join(filtered_words)

#### Function to calculate positive score

In [8]:
# Define a function to calculate the positive score using the positive word dictionary
def calculate_positive_score(text, positive_words):
    words = text.split()
    positive_score = sum(1 if word.lower() in positive_words else 0 for word in words)
    return positive_score

#### Function to calculate negative score

In [9]:
# Define a function to calculate the negative score using the negative word dictionary
def calculate_negative_score(text, negative_words):
    words = text.split()
    negative_score = sum(-1 if word.lower() in negative_words else 0 for word in words)
    return negative_score

#### Function to calculate polarity score

In [10]:
# Define a function to calculate the polarity score
def calculate_polarity_score(positive_score, negative_score):
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    return polarity_score

#### Function to calculate subjectivity score

In [11]:
# Define a function to calculate the subjectivity score
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
    return subjectivity_score

### 2. Analysis of readibility

#### Function to calculate complex wrod count

In [12]:
# Define a function to calculate the count of complex words
def calculate_complex_word_count(text):
    words = text.split()
    complex_word_count = sum(1 for word in words if syllables.estimate(word) > 2)
    return complex_word_count

#### Function to count the number of syllables

In [13]:
def count_syllables(word):
    # Convert the word to lowercase for consistent counting
    word = word.lower()
    
    # Remove common suffixes that do not contribute to syllable count
    exceptions = ["es", "ed"]
    for exception in exceptions:
        if word.endswith(exception):
            word = word[:-len(exception)]
    
    # Count the number of vowels (a, e, i, o, u) in the word
    vowels = "aeiou"
    syllable_count = sum(1 for letter in word if letter in vowels)
    
    # Handle words with no vowels
    if syllable_count == 0:
        syllable_count = 1
    
    return syllable_count

#### Function to count personal pronouns

In [14]:
def count_personal_pronouns(text):
    # Define the regular expression pattern for personal pronouns
    pronoun_pattern = r'\b(?:[Ii]|we|my|ours|us)\b'
    
    # Use the regex pattern to find and count personal pronouns in the text
    personal_pronoun_matches = re.findall(pronoun_pattern, text)
    
    # Exclude instances where "US" refers to the country name
    filtered_pronouns = [pronoun for pronoun in personal_pronoun_matches if pronoun.lower() != 'us']
    
    return len(filtered_pronouns)

#### Function to calculate average word length

In [15]:
# Define a function to calculate the average word length
def calculate_average_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    
    # Avoid division by zero
    if total_words > 0:
        avg_word_length = total_characters / total_words
    else:
        avg_word_length = 0
    
    return avg_word_length

#### Function to analyze the entire text

In [16]:
sia = SentimentIntensityAnalyzer()

# Define a function to analyze a text file and calculate the specified factors
def analyze_text_file(file_path,sia):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

        # Remove stop words
        cleaned_text = remove_stopwords(text)

        # Calculate sentiment scores
        sentiment_scores = sia.polarity_scores(cleaned_text)
        positive_score = calculate_positive_score(cleaned_text,positive_words)
        negative_score = calculate_negative_score(cleaned_text,negative_words)
        
        polarity_score = calculate_polarity_score(positive_score, negative_score)
        
        # Tokenize the cleaned text into sentences and words
        sentences = sent_tokenize(cleaned_text)
        words = word_tokenize(cleaned_text)
        
        subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, len(words))
        
        # Calculate Average Sentence Length
        avg_sentence_length = len(words) / len(sentences) if len(sentences) > 0 else 0
        
        #Calculate complex word count
        complex_word_count = calculate_complex_word_count(cleaned_text)
        
        # Calculate Percentage of Complex Words
        percentage_complex_words = complex_word_count / len(words) if len(words) > 0 else 0
        
        # Calculate Fog Index
        fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
        
        # Calculate Average Number of Words Per Sentence
        avg_words_per_sentence = len(words) / len(sentences) if len(sentences) > 0 else 0
        
        #Calculate the word count
        word_count = len(words)
        
        # Calculate Syllables Per Word
        syllables_per_word = sum(count_syllables(word) for word in words) / len(words) if len(words) > 0 else 0
        
        # Calculate Personal Pronoun Count
        personal_pronoun_count = count_personal_pronouns(cleaned_text)
        
        # Calculate Average Word Length
        average_word_length = calculate_average_word_length(cleaned_text)
        
        factors = {
            'POSITIVE SCORE': positive_score,
            'NEGATIVE SCORE': negative_score,
            'POLARITY SCORE': polarity_score,
            'SUBJECTIVITY SCORE': subjectivity_score,
            'AVG SENTENCE LENGTH': avg_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
            'COMPLEX WORD COUNT': complex_word_count,
            'WORD COUNT': word_count,
            'SYLLABLES PER WORD': syllables_per_word,
            'PERSONAL PRONOUNS': personal_pronoun_count,
            'AVERAGE WORD LENGTH': average_word_length  
        }
        
        return factors

### Generating the output

In [29]:
# Define the directory containing the text files
text_files_directory = "output_text_files/"

# Initialize a list to store the data
data = []

# Iterate through text files and analyze each one
for filename in os.listdir(text_files_directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(text_files_directory, filename)
        url_id = filename.split('.txt')[0]
#         print(url_id)

        # Find the URL associated with the URL_ID by iterating through the DataFrame
        url = None
        for index, row in df.iterrows():
            if row['URL_ID'] == float(url_id):
                url = row['URL']
                break  # Found the URL, so exit the loop
#         print(url)

#         Create a dictionary with the URL_ID, URL, and factors
        if url is not None:
            factors = analyze_text_file(file_path, sia)
            row_data = {'URL_ID': url_id, 'URL': url, **factors}
            data.append(row_data)

# Create a DataFrame from the list of data
output = pd.DataFrame(data)
output.head()


Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLES PER WORD,PERSONAL PRONOUNS,AVERAGE WORD LENGTH
0,10282.6,https://insights.blackcoffer.com/will-ai-repla...,55,-23,2.4375,0.040712,112.285714,0.438931,45.089858,112.285714,345,786,2.530534,0,6.969466
1,10744.4,https://insights.blackcoffer.com/man-and-machi...,42,-22,3.2,0.03663,182.0,0.428571,72.971429,182.0,234,546,2.521978,0,6.96337
2,11206.2,https://insights.blackcoffer.com/in-future-or-...,22,-11,3.0,0.033233,165.5,0.486405,66.394562,165.5,161,331,2.501511,0,7.07855
3,11668.0,https://insights.blackcoffer.com/how-neural-ne...,0,0,0.0,0.0,5.0,0.0,2.0,5.0,0,5,2.0,0,5.4
4,12129.8,https://insights.blackcoffer.com/how-machine-l...,33,-13,2.3,0.062893,159.0,0.389937,63.755975,159.0,124,318,2.361635,0,6.58805


#### Converting into excel format

In [31]:
# Define the output Excel file path
output_excel_file = "output_analysis.xlsx"
# Save the DataFrame to an Excel file
output.to_excel(output_excel_file, index=True)