In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import nltk
import string
import warnings
import re
warnings.filterwarnings('ignore')

In [3]:
# Read the input sheet once and keep the article URLs alongside their identifiers;
# the two lists stay index-aligned because both come from the same frame.
df = pd.read_excel('Input.xlsx')
urls, url_ids = df['URL'].tolist(), df['URL_ID'].tolist()

In [4]:
# Download every article, extract its title and body text, and store them in "<URL_ID>.txt".
# Articles that cannot be fetched or parsed are reported and skipped so that one bad page
# does not abort the whole run.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled server would block the loop
TITLE_CLASSES = ("tdb-title-text", "entry-title")  # <h1> classes, tried in this order

for url, url_id in zip(urls, url_ids):
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f'Request failed for URL_ID {url_id}: {exc}')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look each candidate heading up once and keep the first one that exists.
    title_element = None
    for css_class in TITLE_CLASSES:
        title_element = soup.find('h1', attrs={'class': css_class})
        if title_element is not None:
            break
    if title_element is None:
        print(f'No title found for URL_ID {url_id}')
        continue

    # find() returns None when the class is absent; check before touching .text.
    content_element = soup.find(attrs={'class': 'td-post-content'})
    if content_element is None:
        print(f'No article body found for URL_ID {url_id}')
        continue

    title = title_element.text.replace('\n', '  ').replace('/', '')
    content = content_element.text.replace('\xa0', '  ').replace('\n', '  ')
    text = title + '.' + content
    # Save the text to a file
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(f'Article text for URL_ID {url_id} has been saved in {url_id}.txt')

In [5]:
def generate_stopwords(parent_path="StopWords/StopWords_",
                       child_paths=('Auditor', 'Currencies', 'DatesandNumbers', 'Generic',
                                    'GenericLong', 'Geographic', 'Names')):
    """Collect stop-word tokens from a family of text files.

    Each file is opened at ``f"{parent_path}{child}.txt"`` (UTF-8, undecodable bytes
    ignored) and every line is split on whitespace. All resulting tokens are kept,
    in file order and with duplicates preserved, except a token that is exactly ``|``.

    Parameters
    ----------
    parent_path : str
        Common path prefix of the stop-word files.
    child_paths : iterable of str
        File-name suffixes (without ``.txt``) appended to ``parent_path``.

    Returns
    -------
    list of str
        The collected tokens.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    """
    # NOTE(review): tokens that follow a "|" on the same line are kept as stop words
    # too; confirm that is intended for the file format in use.
    words = []
    for child in child_paths:
        with open(f"{parent_path}{child}.txt", 'r', encoding="utf-8", errors="ignore") as f:
            # str.split() never yields whitespace-only tokens, so "|" is the only one to drop.
            words.extend(token for line in f for token in line.split() if token != '|')
    return words

In [6]:
def generate_MasterDict(stopwords, parent_path="MasterDictionary/",
                        child_paths=('positive-words', 'negative-words')):
    """Build the sentiment word lists, leaving out any stop words.

    Each file at ``f"{parent_path}{child}.txt"`` (UTF-8, undecodable bytes ignored) is
    split on whitespace. Tokens that appear in ``stopwords`` or that are exactly ``|``
    are dropped; the comparison is case-sensitive.

    Parameters
    ----------
    stopwords : iterable of str
        Tokens to exclude from every word list.
    parent_path : str
        Path prefix of the dictionary files.
    child_paths : iterable of str
        File names (without ``.txt``); one output list is produced per entry.

    Returns
    -------
    list of list of str
        One token list per entry of ``child_paths``, in the same order. With the
        defaults that is ``[positive_words, negative_words]``.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    """
    # Build the exclusion set once; rebuilding and scanning a list for every token
    # made the filter cost grow with len(stopwords) * number of tokens.
    excluded = set(stopwords) | {'|'}
    words = []
    for child in child_paths:
        with open(f"{parent_path}{child}.txt", 'r', encoding="utf-8", errors="ignore") as f:
            words.append([token for line in f for token in line.split() if token not in excluded])
    return words

In [7]:
def generate_content(url_id):
    """Load the saved article for ``url_id`` and split it into sentences and words.

    The file ``"<url_id>.txt"`` is read as UTF-8 (undecodable bytes ignored). Sentences
    are taken from the raw text; words are taken from the text after every character in
    ``string.punctuation`` has been removed, and tokens found in the module-level
    ``stopwords`` collection are dropped (case-sensitive comparison).

    Parameters
    ----------
    url_id : str
        Identifier used as the file stem.

    Returns
    -------
    tuple
        ``(sent, words)``: the list of sentence strings and the list of remaining
        word tokens.

    Raises
    ------
    OSError
        If the article file cannot be opened.
    """
    with open(f"{url_id}.txt", 'r', encoding="utf-8", errors="ignore") as f:
        text = f.read()
    # Sentence splitting runs on the raw text, before the punctuation is stripped.
    sent = nltk.sent_tokenize(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # One set per call keeps each membership test constant-time instead of scanning
    # the whole stop-word list for every token.
    stop_lookup = set(stopwords)
    words = [w for w in nltk.word_tokenize(text) if w not in stop_lookup]
    return sent, words

In [8]:
def AvgSyllable(words):
    """Estimate syllable and complexity statistics for a list of words.

    A word's syllable count is approximated by the number of vowel characters
    (``aeiou`` in either case) it contains. A word with more than two of them is
    counted as complex.

    Parameters
    ----------
    words : list of str
        Word tokens to analyse.

    Returns
    -------
    tuple
        ``(syllable_per_word, complex_words, complex_percent, fog_index)`` where
        ``syllable_per_word`` is total vowels / ``len(words)``, ``complex_words`` is a
        count, ``complex_percent`` is ``complex_words / len(words)`` (a fraction, not
        scaled to 100) and ``fog_index`` is ``0.4 * (syllable_per_word + complex_percent)``.
        For an empty ``words`` every value is zero.
    """
    # NOTE(review): words ending in "es"/"ed" are skipped entirely (they add no
    # syllables and can never be complex) yet still count in the denominators below;
    # confirm this is the intended handling of those endings.
    # NOTE(review): fog_index is built from syllables per word here, whereas the usual
    # fog formula uses average sentence length; confirm against the metric definition.
    vowels = "aeiouAEIOU"
    word_count = len(words)
    if word_count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0, 0, 0.0, 0.0

    total_syllables = 0
    complex_words = 0
    for w in words:
        if w.endswith(('es', 'ed')):
            continue
        syllables = sum(1 for c in w if c in vowels)
        if syllables > 2:
            complex_words += 1
        total_syllables += syllables

    syllable_per_word = total_syllables / word_count
    complex_percent = complex_words / word_count
    fog_index = 0.4 * (syllable_per_word + complex_percent)
    return syllable_per_word, complex_words, complex_percent, fog_index

In [9]:
def Calculate_pronouns(sent):
    """Count personal-pronoun occurrences across a collection of sentences.

    "I", "we", "my" and "ours" are matched as whole words regardless of case. "us" is
    matched only in lower case, because the inline ``(?-i:...)`` group turns
    case-insensitivity off for it, so an upper-case "US" is not counted.

    Parameters
    ----------
    sent : iterable of str
        Sentences to scan; falsy entries (such as empty strings) are skipped.

    Returns
    -------
    int
        Total number of matches over all sentences.
    """
    pattern = re.compile(r'\b(I|we|my|ours|(?-i:us))\b', re.I)
    return sum(len(pattern.findall(sentence)) for sentence in sent if sentence)

In [10]:
# Build the lexicons once: stop words first, then the two sentiment lists filtered by them
# (generate_MasterDict returns the positive list followed by the negative list).
stopwords = generate_stopwords()
MasterWords = generate_MasterDict(stopwords)
Pos_words, Neg_words = MasterWords[0], MasterWords[1]

# Load the output template and key its rows by URL_ID so metrics can be written per article.
df = pd.read_excel("Output Data Structure.xlsx")
df.index = df["URL_ID"]
df = df.drop(columns=["URL_ID"])

In [11]:
# Preview the first rows of the output frame after re-indexing by URL_ID.
df.head()

Unnamed: 0_level_0,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
blackassign0003,https://insights.blackcoffer.com/internet-dema...,,,,,,,,,,,,,
blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,,,,,,,,,,,,,
blackassign0005,https://insights.blackcoffer.com/ott-platform-...,,,,,,,,,,,,,


In [12]:
# List the column labels; the metric assignments further down must use these exact names.
df.columns

Index(['URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [13]:
# Compute the text metrics for every article and write them into the matching row of `df`.

# Identifiers that are excluded from the analysis.
# NOTE(review): presumably these articles could not be scraped; confirm.
SKIPPED_URL_IDS = ["blackassign0036", "blackassign0049"]

# Sets give constant-time membership tests; scanning the word lists for every token
# of every article is needlessly slow.
positive_lookup = set(Pos_words)
negative_lookup = set(Neg_words)

for url_id in df.index:
    if url_id in SKIPPED_URL_IDS:
        continue

    try:
        sent, words = generate_content(url_id)
    except FileNotFoundError:
        # No text file was saved for this article; leave its row empty.
        print(f'No saved article text for URL_ID {url_id}; skipping')
        continue
    if not sent or not words:
        # Every average below divides by one of these lengths.
        print(f'Article text for URL_ID {url_id} is empty; skipping')
        continue

    pos_score = sum(1 for w in words if w in positive_lookup)
    neg_score = sum(1 for w in words if w in negative_lookup)
    Syllable_per_word, complex_count, complex_percentage, fog_index = AvgSyllable(words)
    pronouns_count = Calculate_pronouns(sent)
    words_per_sentence = len(words) / len(sent)

    # The small constant in the two score denominators avoids division by zero
    # when no sentiment words were found.
    metrics = {
        "POSITIVE SCORE": pos_score,
        "NEGATIVE SCORE": neg_score,
        "POLARITY SCORE": (pos_score - neg_score)/(pos_score + neg_score + 0.000001),
        "SUBJECTIVITY SCORE": (pos_score + neg_score)/(len(words)+0.000001),
        "AVG SENTENCE LENGTH": words_per_sentence,
        "PERCENTAGE OF COMPLEX WORDS": complex_percentage,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": words_per_sentence,
        "COMPLEX WORD COUNT": complex_count,
        "WORD COUNT": len(words),
        "SYLLABLE PER WORD": Syllable_per_word,
        "PERSONAL PRONOUNS": pronouns_count,
        "AVG WORD LENGTH": sum(len(w) for w in words)/len(words),
    }

    # Write through .loc: chained indexing (df[col][row] = value) may assign to a
    # temporary copy, in which case the frame is left unchanged.
    for column, value in metrics.items():
        df.loc[url_id, column] = value

In [14]:
# Inspect the first 20 rows of the frame after the metrics have been filled in.
df[:20]

Unnamed: 0_level_0,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,33.0,5,0.736842,0.061489,7.923077,0.312298,0.91521,7.923077,193.0,618.0,1.975728,12.0,6.252427
blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,56.0,31,0.287356,0.104819,10.246914,0.438554,1.083855,10.246914,364.0,830.0,2.271084,4.0,7.173494
blackassign0003,https://insights.blackcoffer.com/internet-dema...,37.0,23,0.233333,0.087977,11.758621,0.489736,1.185337,11.758621,334.0,682.0,2.473607,13.0,7.882698
blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,35.0,71,-0.339623,0.163328,12.245283,0.514638,1.221572,12.245283,334.0,649.0,2.539291,5.0,7.801233
blackassign0005,https://insights.blackcoffer.com/ott-platform-...,21.0,8,0.448276,0.07672,9.45,0.410053,1.091005,9.45,155.0,378.0,2.31746,6.0,7.492063
blackassign0006,https://insights.blackcoffer.com/the-rise-of-t...,83.0,23,0.566038,0.093146,12.931818,0.462214,1.137083,12.931818,526.0,1138.0,2.380492,6.0,7.734622
blackassign0007,https://insights.blackcoffer.com/rise-of-cyber...,30.0,60,-0.333333,0.11811,9.896104,0.406824,1.059318,9.896104,310.0,762.0,2.24147,1.0,6.866142
blackassign0008,https://insights.blackcoffer.com/rise-of-inter...,26.0,9,0.485714,0.068359,10.44898,0.505859,1.214063,10.44898,259.0,512.0,2.529297,3.0,7.910156
blackassign0009,https://insights.blackcoffer.com/rise-of-cyber...,35.0,49,-0.166667,0.130435,11.298246,0.509317,1.196273,11.298246,328.0,644.0,2.481366,3.0,7.795031
blackassign0010,https://insights.blackcoffer.com/rise-of-cyber...,135.0,173,-0.123377,0.156903,8.247899,0.430464,1.081814,8.247899,845.0,1963.0,2.27407,13.0,7.426898


In [15]:
# Optional export: uncomment the line below to save the DataFrame `df` to an Excel file named "OUTPUT.xlsx".
# df.to_excel("OUTPUT.xlsx")