In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import string

In [None]:
# Load the list of articles to fetch; the scraping loop reads the URL_ID and URL columns.
df = pd.read_excel('/content/Input.xlsx')

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [None]:
# Download every article listed in df and save it as "<URL_ID>.txt".
# Each file holds the page <title>, a blank line, then the text of every kept
# <p> element, one per line.
REQUEST_TIMEOUT_SECONDS = 30  # requests never times out on its own; do not hang forever

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # send a GET request to the URL and get its HTML content
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # One unreachable URL must not abort the crawl of the remaining ones.
        print(f'[skip] URL_ID {url_id}: request failed ({exc})')
        continue
    if not response.ok:
        # The body is still saved (so a file exists for every URL that answered),
        # but an error page is not a real article, so make that visible.
        print(f'[warn] URL_ID {url_id}: HTTP {response.status_code} for {url}')

    # create a BeautifulSoup object and parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the article title; pages without a <title> tag get an empty title
    title_tag = soup.find('title')
    title = title_tag.get_text().strip() if title_tag is not None else ''

    # collect the text of all paragraphs, skipping those whose class list
    # contains 'header' or 'footer' (site chrome rather than article text)
    paragraphs = []
    for p in soup.find_all('p'):
        css_classes = p.attrs.get('class') or []
        if 'header' in css_classes or 'footer' in css_classes:
            continue
        paragraphs.append(p.get_text().strip())
    article_text = ''.join(paragraph + '\n' for paragraph in paragraphs)

    # save the article text in a text file with URL_ID as the file name
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(title + '\n\n' + article_text)

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# set up NLTK data
# - punkt / punkt_tab: sentence-tokenizer models behind word_tokenize and
#   sent_tokenize. Newer NLTK releases look up "punkt_tab" instead of the
#   pickled "punkt" package, so both are fetched to work on either version.
# - stopwords, wordnet: corpora backing the nltk.corpus / nltk.stem imports.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the output template; the analysis loop iterates its URL_ID column and
# writes the computed metrics back into this frame.
output_df = pd.read_excel('/content/Output Data Structure.xlsx')

In [None]:
# create lists to store the computed variables
# NOTE(review): none of these lists is appended to or read in the visible cells
# (the analysis loop writes straight into output_df) — confirm they are still needed.
word_count_list = []
unique_word_count_list = []
sentence_count_list = []
avg_sentence_length_list = []
stopword_count_list = []
lemmatized_text_list = []
tfidf_top_3_list = []

In [None]:
# set up paths to the stop words and master dictionary files
stop_words_path = '/content/sample_data/StopWords'
master_dictionary_path = '/content/sample_data/MasterDictionary'


def _read_word_list(path):
    """Return the set of non-empty, whitespace-stripped lines of a latin-1 text file."""
    with open(path, 'r', encoding='latin-1') as f:
        stripped_lines = (line.strip() for line in f.read().splitlines())
        # Blank lines would otherwise put '' into the set.
        return {line for line in stripped_lines if line}


# create a set of stop words from every file in the stop words folder
stop_words = set()
for file_name in sorted(os.listdir(stop_words_path)):
    file_path = os.path.join(stop_words_path, file_name)
    # os.listdir also returns sub-directories, which open() cannot read
    if not os.path.isfile(file_path):
        continue
    stop_words.update(_read_word_list(file_path))

# create the sets of positive and negative words from the master dictionary
positive_words = _read_word_list(os.path.join(master_dictionary_path, 'positive-words.txt'))
negative_words = _read_word_list(os.path.join(master_dictionary_path, 'negative-words.txt'))


In [None]:
from nltk.corpus import cmudict
nltk.download('cmudict')
# functions to count the number of syllables in a word
_CMU_PRONUNCIATIONS = None  # filled on first use with the result of cmudict.dict()


def _cmu_pronunciations():
    """Return the CMU pronouncing dictionary, building it only once.

    cmudict.dict() is expensive, and count_syllables is called for every
    token of every article, so the result is cached at module level.
    """
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        _CMU_PRONUNCIATIONS = cmudict.dict()
    return _CMU_PRONUNCIATIONS


def _estimate_syllables(word):
    """Estimate syllables as the number of runs of consecutive vowels (aeiouy), minimum 1."""
    vowels = "aeiouy"
    count = 0
    prev_char_vowel = False
    for char in word.lower():
        char_is_vowel = char in vowels
        # only the first vowel of a run starts a new syllable
        if char_is_vowel and not prev_char_vowel:
            count += 1
        prev_char_vowel = char_is_vowel
    # every word should have at least one syllable
    return max(count, 1)


def count_syllables(word):
    """Return the number of syllables in word.

    The lower-cased word is looked up in the CMU pronouncing dictionary; the
    syllable count is the number of phonemes in its first pronunciation that
    end in a digit. Words missing from the dictionary fall back to a
    vowel-run estimate.

    Args:
        word: The token to measure.

    Returns:
        The syllable count as an int, always at least 1 for words that are
        not in the dictionary.
    """
    pronunciations = _cmu_pronunciations().get(word.lower())
    if pronunciations:
        first_pronunciation = pronunciations[0]
        return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
    return _estimate_syllables(word)
def count_syllables_per_word(word_tokens):
    """Return the total number of syllables summed over all tokens in word_tokens."""
    total_syllables = 0
    for token in word_tokens:
        total_syllables += count_syllables(token)
    return total_syllables

# function to check if a word is complex
def is_complex(word):
    """Return True when count_syllables reports more than two syllables for word."""
    syllables = count_syllables(word)
    return syllables > 2
# function to calculate number of complex words in text
def count_complex_words(word_tokens):
    """Return how many tokens in word_tokens are complex according to is_complex."""
    return sum(1 for token in word_tokens if is_complex(token))

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [None]:
import re
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us in text.

    Matching is case-insensitive and restricted to whole words. The all-caps
    spelling "US" is treated as the country name and is not counted.

    Args:
        text: The raw text to scan.

    Returns:
        The number of pronoun occurrences as an int.
    """
    # regex pattern to match personal pronouns
    pattern = r'\b(i|we|my|ours|us)\b'
    matches = re.finditer(pattern, text, flags=re.IGNORECASE)
    # Only the exact upper-case spelling is the country; "us" / "Us" are pronouns.
    # (Subtracting a case-insensitive count of "us" cancelled every occurrence,
    # so the pronoun "us" was never counted.)
    return sum(1 for match in matches if match.group(0) != 'US')

In [None]:
# Define function that returns the cleaned words
def cleaned_words(word_tokens, stop_word_set=None):
    """Strip punctuation from each token, then drop empty tokens and stop words.

    Args:
        word_tokens: Iterable of token strings.
        stop_word_set: Optional collection of stop words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    # Translation table that deletes every ASCII punctuation character
    translator = str.maketrans('', '', string.punctuation)
    stripped = (word.translate(translator) for word in word_tokens)
    # Filter the *stripped* tokens (filtering the raw tokens threw the
    # punctuation removal away); pure-punctuation tokens become '' and are dropped.
    # NOTE(review): the stop-word test is case-sensitive — confirm the casing
    # of the stop-word files matches the tokens.
    return [word for word in stripped if word and word not in stop_word_set]

In [None]:
def avg_word_length(word_tokens):
    """Return the mean number of characters per token.

    Args:
        word_tokens: Sequence of token strings.

    Returns:
        Total characters divided by the number of tokens, or 0.0 when
        word_tokens is empty (instead of raising ZeroDivisionError).
    """
    total_words = len(word_tokens)
    if total_words == 0:
        return 0.0
    total_chars = sum(len(word) for word in word_tokens)
    return total_chars / total_words

In [None]:
# loop through each article text file and write its metrics into output_df
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


for index, row in output_df.iterrows():

    url_id = row['URL_ID']
    # read the article text file; a row whose file is missing is left unfilled
    try:
        with open(f'{url_id}.txt', 'r', encoding='utf-8') as f:
            article_text = f.read()
    except FileNotFoundError:
        print(f'[skip] URL_ID {url_id}: {url_id}.txt not found')
        continue

    # tokenize the article text into sentences and cleaned words
    sent_tokens = sent_tokenize(article_text)
    word_tokens = cleaned_words(word_tokenize(article_text))

    word_count = len(word_tokens)
    sentence_count = len(sent_tokens)

    # positive and negative score: one point per token occurrence found in the
    # dictionary. (A set intersection counted each distinct word only once,
    # which is inconsistent with dividing by the total word count below.)
    # NOTE(review): membership is case-sensitive — confirm dictionary casing.
    pos_score = sum(1 for word in word_tokens if word in positive_words)
    neg_score = sum(1 for word in word_tokens if word in negative_words)

    # the small constant keeps both scores defined when the denominator is 0
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity_score = (pos_score + neg_score) / (word_count + 0.000001)

    # readability: empty articles / zero sentences yield 0.0 instead of crashing
    average_sentence_length = _ratio(word_count, sentence_count)
    complex_words = count_complex_words(word_tokens)
    percentage_of_complex_words = _ratio(complex_words, word_count) * 100
    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    syllable_count = count_syllables_per_word(word_tokens)
    # pronouns are counted on the raw text, before stop-word removal
    personal_pronouns = count_personal_pronouns(article_text)
    average_word_length = avg_word_length(word_tokens) if word_tokens else 0.0

    # keys are the output column names; insertion order fixes the column order
    # for any column that does not exist in output_df yet
    metrics = {
        'POSITIVE SCORE': pos_score,
        'NEGATIVE SCORE': neg_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': average_sentence_length,
        'COMPLEX WORD COUNT': complex_words,
        'PERCENTAGE OF COMPLEX WORDS': percentage_of_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': average_sentence_length,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': _ratio(syllable_count, word_count),
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': average_word_length,
    }
    for column, value in metrics.items():
        output_df.loc[index, column] = value


In [None]:
# Write the results without the DataFrame's positional index, so the CSV holds
# only the columns of output_df (no extra unnamed leading column).
output_df.to_csv("output_data.csv", index=False)