In [1]:
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# The file
excel_file_path = "./output.xlsx"
sheet_name = 'Sheet1'
url_column = 'URL'
filename_column = 'URL_ID'

In [3]:
# Functions to analyze the text
def analyze_text_with_textblob(df, paragraph, index):
    blob = TextBlob(paragraph)
    
    # Sentiment analysis
    positive_score = blob.sentiment.polarity
    negative_score = 1 - positive_score  # TextBlob returns a value in the range of [-1, 1], where 1 is the most positive
    
    # Additional text metrics
    sentences = sent_tokenize(paragraph)
    words_in_text = word_tokenize(paragraph)
    
    subjectivity_score = blob.sentiment.subjectivity
    
    avg_sentence_length = len(words_in_text) / (len(sentences)+0.00001)
    
    complex_words = [word for word, tag in blob.tags if tag in ['JJ', 'VB', 'RB', 'NN']]  # Adjectives, Verbs, Adverbs, Nouns
    percentage_complex_words = (len(complex_words) / (len(words_in_text)+0.00001)) * 100
    
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    avg_words_per_sentence = len(words_in_text) / (len(sentences)+0.00001)
    
    complex_word_count = len(complex_words)
    
    word_count = len(words_in_text)
    
    def syllable_count(word):
        # Simple syllable count based on vowel occurrences
        vowels = "aeiouy"
        count = sum(1 for char in word if char.lower() in vowels)
        return max(count, 1)
    
    syllables_per_word = sum(syllable_count(word) for word in words_in_text) / (len(words_in_text)+0.00001)
    
    personal_pronouns = sum(1 for word, tag in blob.tags if tag == 'PRP')
    
    avg_word_length = sum(len(word) for word in words_in_text) / (len(words_in_text)+0.00001)

    df.at[index, 'POSITIVE SCORE'] = positive_score
    df.at[index, 'NEGATIVE SCORE'] = negative_score
    df.at[index, 'POLARITY SCORE'] = blob.sentiment.polarity
    df.at[index, 'SUBJECTIVITY SCORE'] = subjectivity_score
    df.at[index, 'AVG SENTENCE LENGTH'] = avg_sentence_length
    df.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words
    df.at[index, 'FOG INDEX'] = fog_index
    df.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = avg_words_per_sentence
    df.at[index, 'COMPLEX WORD COUNT'] = complex_word_count
    df.at[index, 'WORD COUNT'] = word_count
    df.at[index, 'SYLLABLE PER WORD'] = syllables_per_word
    df.at[index, 'PERSONAL PRONOUNS'] = personal_pronouns
    df.at[index, 'AVG WORD LENGTH'] = avg_word_length
    print(df)
    return df

In [4]:
# Function to save the .txt file
def save_file(filename_value, title, paragraphs):
    # Use the filename value from the specified column
    filename = f"{filename_value}.txt"

    # Save the extracted data to a text file with the modified title as the filename
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(f"{title}\n{paragraphs}")
    print(f"Data successfully extracted and saved to '{filename}'")

In [5]:
# Function to write in the excel file
def save_results_to_excel(excel_file_path, sheet_name, paragraphs, df, index): 
    df = analyze_text_with_textblob(df, paragraphs, index)
    df.to_excel(excel_file_path, sheet_name=sheet_name, index=False)
    print ('data saved')

In [6]:
# Function to access the excel file
def access_urls(excel_file_path, sheet_name, url_column, filename_column):
    try:
        # Read the Excel file using pandas
        df = pd.read_excel(excel_file_path, sheet_name=sheet_name)

        # Access URLs and corresponding filename values from the specified columns
        for index, row in df.iterrows():
            url = row[url_column]
            filename_value = row[filename_column]

            try:
                # Send a GET request to the URL
                response = requests.get(url, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'})
                webpage = response.text

                # Check if the request was successful (status code 200)
                if response.status_code == 200:
                    # Parse the HTML content using BeautifulSoup
                    soup = BeautifulSoup(webpage, 'html.parser')
                    
                    # To remove the footer
                    if soup.find('pre'):
                        soup.pre.decompose()
                    
                    #To remove additional tags
                    
                    if soup.find("ul",class_="tdb-tags"):
             
                        soup.find("ul",class_="tdb-tags").decompose()
                    
                    # Extract the data you need from the soup object
                    content_div1 = soup.find_all('div', class_="td-post-content tagdiv-type")
                    content_div2 = soup.find_all('div', class_="tdb-block-inner td-fix-index")
                    title1 = soup.find('h1', class_='entry-title')
                    title2 = soup.find('h1', class_='tdb-title-text')
                    
                    if content_div1:
                        p_tags=[]

                        for item in content_div1:
                            
                            p_tags+=item.find_all(["p","li","h2","h3","h5"])
                        
                        paragraphs=""
                        
                        for content in p_tags:
                                
                                paragraphs+=f"{content.get_text(separator=' ', strip=True)}\n"
                        
                        # paragraphs = content_div1.get_text(separator='\n', strip=True)
                        title = title1.text.strip()
                        # To save the file
                        save_file(filename_value, title, paragraphs)
                        save_results_to_excel(excel_file_path, sheet_name, paragraphs, df, index)

                    elif content_div2:  # If the HTML structure is different
                        p_tags=[]

                        for item in content_div2:
                            
                            p_tags+=item.find_all(["p","li","h2","h3","h5"])
                        
                        paragraphs=""
                        
                        for content in p_tags:
                                
                                paragraphs+=f"{content.get_text(separator=' ', strip=True)}\n"
                        
                        # paragraphs = content_div2.get_text(separator='\n', strip=True)
                        title = title2.text.strip()
                        # To save the file
                        save_file(filename_value, title, paragraphs)
                        save_results_to_excel(excel_file_path, sheet_name, paragraphs, df, index)
                    else:
                        print(f"No content div found for URL: {url}")

                else:
                    print(f"Failed to retrieve the web page. Status code: {response.status_code}")
            except Exception as e:
                print(f"Error accessing URL {url}: {e}")

    except Exception as ex:
        print(f"Error reading Excel file: {ex}")
        

In [7]:
# To access the code we need to call this function
access_urls(excel_file_path, sheet_name, url_column, filename_column)

Data successfully extracted and saved to 'blackassign0001.txt'
             URL_ID                                                URL  \
0   blackassign0001  https://insights.blackcoffer.com/rising-it-cit...   
1   blackassign0002  https://insights.blackcoffer.com/rising-it-cit...   
2   blackassign0003  https://insights.blackcoffer.com/internet-dema...   
3   blackassign0004  https://insights.blackcoffer.com/rise-of-cyber...   
4   blackassign0005  https://insights.blackcoffer.com/ott-platform-...   
..              ...                                                ...   
95  blackassign0096  https://insights.blackcoffer.com/what-is-the-r...   
96  blackassign0097  https://insights.blackcoffer.com/impact-of-cov...   
97  blackassign0098  https://insights.blackcoffer.com/contribution-...   
98  blackassign0099  https://insights.blackcoffer.com/how-covid-19-...   
99  blackassign0100  https://insights.blackcoffer.com/how-will-covi...   

    POSITIVE SCORE  NEGATIVE SCORE  POLARITY SCO