# **BLACKOFFER TEST ASSIGNMENT** #

## ***Data Extraction and NLP*** ##

The objective of this assignment is to extract textual data articles from the given URL and perform text analysis to compute variables. 

In [1]:
# Importing the required libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from urllib.error import HTTPError
import nltk
from nltk.corpus import stopwords
from nltk import punkt
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import os

### **Importing data from file Input.xlsx** ###

In [2]:
# Reading the data from the file 'Input.xlsx'
input_orig_data = pd.read_excel("Input.xlsx")
input_orig_data

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...
...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...
97,blackassign0098,https://insights.blackcoffer.com/contribution-...
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...


In [3]:
input_data = input_orig_data.copy()

In [4]:
# Function to iterate the rows to get the content and add it to text file and also extract title.

import os

from requests.exceptions import ConnectionError
for index, row in input_data.iterrows():
  url = row['URL']
  url_id = row['URL_ID']

  # THIS BLOCK IS TO CREATE THE SOUP
  # Handling error cases when creating the soup for each URL using BeautifulSoup
  try:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
  except ConnectionError as ce:
    print(f'Error occured when opening the URL for URL ID: {url_id} - {ce}')
    break


  # THIS BLOCK IS TO EXTRACT THE ARTICLE TITLE
  # Handling error cases when extracted the Article Title for each URL
  try:
    # Extracting the Title from the soup
    article_title = soup.find('h1').get_text()
  except HTTPError as hp:
    print(f'Error occured when extracting the Article Title for URL ID: {url_id} - {hp}')
    continue
  except Exception as err:
    print(f'Error occured when extracting the Article Title for URL ID: {url_id} - {err}')
    continue


  # THIS BLOCK IS TO EXTRACT THE ARTICLE TEXT
  # Handling error cases when extracted the Article text for each URL
  try:
    article_text = "" # Initializing the article text
    for p in soup.find_all('p'):
      article_text = article_text + p.get_text() + " "
  except HTTPError as hp: # Printing the error case and proceeding further
    print(f'Error occured when extracting the Article Text for URL ID: {url_id} - {hp}')
    continue

  # THIS BLOCK IS TO PRINT THE ARTICLE TITLE AND TEXT TO THE OUTPUT FILE HAVING NAME AS URL_ID
  # Handling error cases when saving content to the file
  # Printing the required text information to a file
  try:
    output_fname = str(url_id) + ".txt"
    output_dir = 'Article_Text_Output'

    # Checking if the Article Text Output Directory path exists
    # if it doesn’t exist we create one   
    if os.path.isdir(output_dir):
      output_file_path = f'{output_dir}/{output_fname}'
    else:
      os.makedirs(output_dir)
      print(f'The directory {output_dir} was not found. Created it for further processing')
      output_file_path = f'{output_dir}/{output_fname}'
    
  except Exception as err:
    print(f'Unexpected error opening {output_file_path} is {repr(err)}')
  else:
    with open(output_file_path, 'w', encoding="utf-8") as file:
      file.write(f'{article_title}\n{article_text}')


The directory Article_Text_Output was not found. Created it for further processing
Error occured when extracting the Article Title for URL ID: blackassign0036 - 'NoneType' object has no attribute 'get_text'
Error occured when extracting the Article Title for URL ID: blackassign0049 - 'NoneType' object has no attribute 'get_text'


### **1.1 Creating the Stopwords list from the StopWords folders** ###

In [5]:
import os
import io
import re
# import pathlib
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Directories
article_output_path = "Article_Text_Output"
stopwords_path = "StopWords"
sentments_path = "MasterDictionary"

# Creating the list of stopwords from the default library stopwords.words('english')
pkg_eng_stopwords = set(stopwords.words('english'))

main_stopwords = set()
stopwords_files = os.listdir(stopwords_path)
for stop_fl in stopwords_files:
    # Use this to read file content as a stream:
    stop_fl_path = f'{stopwords_path}/{stop_fl}'

    # Opening file
    file_handle = open(stop_fl_path, 'r')
    
    # Using for loop
    for line in file_handle:
        filter_text = re.match("(\w+)\s+\|(.*)",line)
        if filter_text:
            main_stopwords.add(filter_text[1].lower())
        else:
            main_stopwords.add(line.strip().lower())

# The total stopwords extracted from the files in the StopWords directory are - 12749
final_stopwords = main_stopwords
print(f'The stopwords extracted from the StopWords Directory are - {len(final_stopwords)}')


# Adding the stop words obtained from the stopwords english package
final_stopwords.update(pkg_eng_stopwords) # If we were to consider adding stopwords from english package was well, we would've merged both the outputs, however, the requirement now is different.
print(f'The stopwords extracted from the StopWords Directory are - {len(final_stopwords)}')



The stopwords extracted from the StopWords Directory are - 12749
The stopwords extracted from the StopWords Directory are - 12773


### **1.2 Creating a dictionary of Positive and Negative words (minus stopwords as per the requirement)** ###

In [6]:
# Master Dictionary Directory
master_dictionary_path = "MasterDictionary"

Master_Dictionary = dict()
master_dict_files = os.listdir(master_dictionary_path) # This dictionary has 2 files i.e., the 'positive-words.txt' file and 'negative-words.txt' file
for mfile in master_dict_files:
   if (mfile == "positive-words.txt"): # Reading the positive words from the positive-words.txt file
      # Use this to read file content as a stream:
      pve_wds_fl_path = f'{master_dictionary_path}/{mfile}'

      # Opening file
      pve_wds_fl_handle = open(pve_wds_fl_path, 'r', encoding='ISO-8859-1' )
      
      # Reading each line in the postive words file using for loop
      for pline in pve_wds_fl_handle:
         poWords = word_tokenize(pline)
         for poWd in poWords:
            if not poWd in final_stopwords:
               if 'Positive' not in Master_Dictionary.keys():
                  Master_Dictionary['Positive'] = [poWd.lower()] # In order to store a list of Positive words, we need to store the first postive word element as a list 
               else:
                  Master_Dictionary['Positive'].append(poWd.lower()) 
   
   elif (mfile == "negative-words.txt"): # Reading the negative words from the negative-words.txt file
      # Use this to read file content as a stream:
      nve_wds_fl_path = f'{master_dictionary_path}/{mfile}'

      # Opening file
      nve_wds_fl_handle = open(nve_wds_fl_path, 'r', encoding='ISO-8859-1' )
      
      # Reading each line in the postive words file using for loop
      for nline in nve_wds_fl_handle:
         neWords = word_tokenize(nline)
         for neWd in neWords:
            if not neWd in final_stopwords:
               if 'Negative' not in Master_Dictionary.keys():
                  Master_Dictionary['Negative'] = [neWd.lower()] # In order to store a list of Negative words, we need to store the first negative word element as a list 
               else:
                  Master_Dictionary['Negative'].append(neWd.lower())

print(f"The Master Dictionary created with the Positive and Negative words are -\n{Master_Dictionary}")    

The Master Dictionary created with the Positive and Negative words are -


### **1.3	Extracting Derived variables** ###

In [7]:
from nltk.tokenize import sent_tokenize
def get_sentences(file):
   
   # Reading the file content as a stream:
   article_fl_path = f'{article_output_path}/{file}'

   # Opening the file and reading it
   art_fl_handle = open(article_fl_path, 'r', encoding='ISO-8859-1' )
   ar_text = art_fl_handle.read()

   ar_text = re.sub(r'[^\w\s.]','',ar_text) # Removing all characters except words and spaces
   sentences_list = sent_tokenize(ar_text)
   return sentences_list 

In [8]:
# Readymade package used for identifying the count of syllables
from hyphen import Hyphenator
def sylcou(word):
    return word, Hyphenator('en_US').syllables(word), len(Hyphenator('en_US').syllables(word))

print(sylcou('beautiful'))
print(sylcou('Loaded'))
print(sylcou('Considered'))
print(sylcou('Travelled'))
print(sylcou('Unchecked'))
print(sylcou('bases'))
print(sylcou('fades'))
print(sylcou('looses'))

('beautiful', ['beau', 'ti', 'ful'], 3)
('Loaded', ['Loaded'], 1)
('Considered', ['Con', 'sid', 'ered'], 3)
('Travelled', ['Trav', 'elled'], 2)
('Unchecked', ['Unchecked'], 1)
('bases', ['bases'], 1)
('fades', ['fades'], 1)
('looses', ['looses'], 1)


### **6 Syllable Count Per Word** ###

#### ***Function to get the Syllable Count Per Word*** ####

In [9]:
# A function used from internet that an give the number of syllables in a word
# Since the function is so elaborate, I didn't want to modify it or 
# Disclaimer: I have used this function get_syllable_count provided in the webpage https://stackoverflow.com/questions/46759492/syllable-count-in-python
import re
def get_syllable_count(word) :
    word = word.lower()

    # exception_add are words that need extra syllables
    # exception_del are words that need less syllables

    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']

    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']

    pre_one = ['preach']

    syls = 0 #added syllable number
    disc = 0 #discarded syllable number

    #1) if letters < 3 : return 1
    if len(word) <= 3 :
        syls = 1
        return syls

    #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
    # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)

    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1

    #3) discard trailing "e", except where ending is "le"  

    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']

    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass

        else :
            disc+=1

    #4) check if consecutive vowels exists, triplets or pairs, count them as one.

    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple

    #5) count remaining vowels in word.
    numVowels = len(re.findall(r'[eaoui]',word))

    #6) add one if starts with "mc"
    if word[:2] == "mc" :
        syls+=1

    #7) add one if ends with "y" but is not surrouned by vowel
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1

    #8) add one if "y" is surrounded by non-vowels and is not in the last word.

    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1

    #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.

    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1

    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1

    #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"

    if word[-3:] == "ian" : 
    #and (word[-4:] != "cian" or word[-4:] != "tian") :
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1

    #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:2] == "co" and word[2] in 'eaoui' :

        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1

    #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1

    #13) check for "-n't" and cross match with dictionary to add syllable.

    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]

    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass   

    #14) Handling the exceptional words.

    if word in exception_del :
        disc+=1

    if word in exception_add :
        syls+=1     

    # calculate the output
    return numVowels - disc + syls

### **7 Personal Pronouns Count** ###

#### ***Function to get the Personal Pronouns Count*** ####

In [10]:
# Function to get Personal Pronouns Count
# Finding the count of the words - “I,” “we,” “my,” “ours,” and “us”.
# Special care is taken so that the country name US is not included in the list. Also 'I' should be in Caps only.
# The remaining can be case insensitive.

import re
def get_personal_pronouns(file):
    # Reading the file content as a stream:
    article_fl_path = f'{article_output_path}/{file}'

    # Opening the file and reading it
    art_fl_handle = open(article_fl_path, 'r', encoding='ISO-8859-1' )
    ar_text = art_fl_handle.read()
    pronounRegex = re.compile(r'\b((?-i:I)|we|my|ours|(?-i:us))\b',re.I)
    personal_pronoun_cnt = len(pronounRegex.findall(ar_text))
    return personal_pronoun_cnt

In [11]:
Article_Dictionary = {}

article_text_files = os.listdir(article_output_path)

for index, row in input_data.iterrows():
   url = row['URL']
   article_fl_name = row['URL_ID']
   article_fl = str(article_fl_name) + ".txt"
   
   all_words = []
   positive_words = []
   negative_words = []
   syllable_words = []
   complex_words = []
   syllable_count = 0
   complex_word_count = 0
   character_count = 0

   
   
   # Reading the file content as a stream:
   article_fl_path = f'{article_output_path}/{article_fl}'

   Article_Dictionary[article_fl_name] = {}

   # Adding the URL to the 'Article_Dictionary' Dictionary
   url = input_orig_data.loc[input_orig_data['URL_ID'] == article_fl_name, 'URL'].iloc[0]
   

   try:
      # Opening file in the path 'article_fl_path'
      art_fl_handle = open(article_fl_path, 'r', encoding='ISO-8859-1' )

      # 3. Getting the sentences and count of sentences using the 'get_sentences' function
      sentences = get_sentences(article_fl)

      # 7. Getting the count of Personal Pronouns using the 'get_personal_pronouns' function
      personal_pronouns = get_personal_pronouns(article_fl)
      
      # Reading each line in the article text word file using for loop
      for line in art_fl_handle:
         line = re.sub(r'[^\w\s]','',line) # Not considering any other characters other than alpha numeric to avoid punctuation, comma, etc
         words = word_tokenize(line)
         for wd in words:
            if not wd.lower() in final_stopwords:
               wd = re.sub(r'[^A-Za-z0-9]','',wd) # Not considering any other characters other than alpha numeric to avoid punctuation, comma, etc
               if not wd =="":
                  all_words.append(wd)
                  character_count += len(wd)
                  if wd.lower() in Master_Dictionary['Positive']:
                     positive_words.append(wd)
                  if wd.lower() in Master_Dictionary['Negative']:
                     negative_words.append(wd)
                  
               
               # Extracting Syllable information
               # Reading each line in the article text word file using for loop
               word_syllable_cnt = get_syllable_count(wd)
               if word_syllable_cnt >= 1:
                  syllable_words.append(wd)
                  syllable_wc = len(syllable_words)
                  syllable_count += word_syllable_cnt

               # Extracting the Complex Word Count
               if word_syllable_cnt > 2:
                  complex_words.append(wd)
               
      
      # 1.3 Extracting Derived variables
      all_words_score = len(all_words) # Count of all words
      positive_score = len(positive_words) # Count of the positive words
      negative_score = len(negative_words) # Count of the negative words
      all_sentences_score = len(sentences) # Count of sentences
      complex_word_count = len(complex_words) # Complex word count
      polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001) # Polarity Score based on the Text Analysis Document
      subjectivity_score = (positive_score + negative_score)/ ((all_words_score) + 0.000001) # Subjectivity Score based on the Text Analysis Document
      syllable_cnt_per_word = syllable_count / len(syllable_words) # Syllable Count for the Text Analysis Document

      # 2 Analysis of Readability
      average_sentence_length = all_words_score / all_sentences_score # Average Sentence Length
      percentage_complex_words = complex_word_count / all_words_score # Percentage of Complex words
      fog_index = 0.4 * (average_sentence_length + percentage_complex_words) # Fog Index

      # 3 Average Number of Words Per Sentence
      avg_words_per_sentence = all_words_score / all_sentences_score

      # 8 Average Word Length
      average_word_length = character_count/all_words_score

      # Loading the values to the dictionary 'Article_Dictionary'
      Article_Dictionary[article_fl_name]['URL'] = url
      Article_Dictionary[article_fl_name]['All_Words'] = all_words
      Article_Dictionary[article_fl_name]['Positive_Words'] = positive_words
      Article_Dictionary[article_fl_name]['Negative_Words'] = negative_words
      Article_Dictionary[article_fl_name]['All_Sentences'] = sentences
      Article_Dictionary[article_fl_name]['All_Words_Score'] = all_words_score
      Article_Dictionary[article_fl_name]['Positive_Score'] = positive_score
      Article_Dictionary[article_fl_name]['Negative_Score'] = negative_score
      Article_Dictionary[article_fl_name]['Complex_Word_Count'] = complex_word_count
      Article_Dictionary[article_fl_name]['Polarity_Score'] = np.round(polarity_score,3)
      Article_Dictionary[article_fl_name]['Subjectivity_Score'] = np.round(subjectivity_score,3)
      Article_Dictionary[article_fl_name]['Sentences_Score'] = np.round(all_sentences_score,3)
      Article_Dictionary[article_fl_name]['Syllable_Count_Per_Word'] = np.round(syllable_cnt_per_word,3)
      Article_Dictionary[article_fl_name]['Average_Sentence_Length'] = np.round(average_sentence_length,3) # Average Sentence Length
      Article_Dictionary[article_fl_name]['Percentage_Complex_Words'] = np.round(percentage_complex_words,3) # Percentage of Complex words
      Article_Dictionary[article_fl_name]['Fog_Index'] = np.round(fog_index,3) # Fog Index
      Article_Dictionary[article_fl_name]['Avg_Words_Per_Sentence'] = np.round(avg_words_per_sentence,3) # Average Number of Words Per Sentence
      Article_Dictionary[article_fl_name]['Personal_Pronouns'] = personal_pronouns # Count of Personal Pronouns in the Text Analysis Document
      Article_Dictionary[article_fl_name]['Average_Word_Length'] = np.round(average_word_length,3) # Average Word Length

   except Exception as err:
      print(f'Unexpected error opening {article_fl_path} is {repr(err)}')
      Article_Dictionary[article_fl_name]['URL'] = url
      keys, values = ['URL','Positive_Score','Negative_Score','Polarity_Score','Subjectivity_Score','Average_Sentence_Length','Percentage_Complex_Words','Fog_Index','Avg_Words_Per_Sentence','Complex_Word_Count','All_Words_Score','Syllable_Count_Per_Word','Personal_Pronouns','Average_Word_Length'], [url,0,0,0,0,0,0,0,0,0,0,0,0,0]
      Article_Dictionary[article_fl_name] = {keys[i]: values[i] for i in range(len(values))}
      continue

Unexpected error opening Article_Text_Output/blackassign0036.txt is FileNotFoundError(2, 'No such file or directory')
Unexpected error opening Article_Text_Output/blackassign0049.txt is FileNotFoundError(2, 'No such file or directory')


In [12]:
# Creating a Dataframe 'article_dictionary_df' using the Dictionary 'Article_Dictionary'
article_dictionary_df = pd.DataFrame.from_dict(Article_Dictionary,orient='index',columns = ['URL', 'Positive_Score','Negative_Score','Polarity_Score','Subjectivity_Score','Average_Sentence_Length','Percentage_Complex_Words','Fog_Index','Avg_Words_Per_Sentence','Complex_Word_Count','All_Words_Score','Syllable_Count_Per_Word','Personal_Pronouns','Average_Word_Length'])
article_dictionary_df.rename(columns = {'Positive_Score':'POSITIVE SCORE','Negative_Score':'NEGATIVE SCORE','Polarity_Score':'POLARITY SCORE','Subjectivity_Score':'SUBJECTIVITY SCORE','Average_Sentence_Length':'AVG SENTENCE LENGTH','Percentage_Complex_Words':'PERCENTAGE OF COMPLEX WORDS','Fog_Index':'FOG INDEX','Avg_Words_Per_Sentence':'AVG NUMBER OF WORDS PER SENTENCE','Complex_Word_Count':'COMPLEX WORD COUNT','All_Words_Score':'WORD COUNT','Syllable_Count_Per_Word':'SYLLABLE PER WORD','Personal_Pronouns':'PERSONAL PRONOUNS','Average_Word_Length':'AVG WORD LENGTH'}, inplace = True)
article_dictionary_df.index.names = ['URL_ID']
article_dictionary_df.head()

Unnamed: 0_level_0,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,7,2,0.556,0.033,9.75,0.333,4.033,9.75,91,273,2.3,5,6.861
blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,53,32,0.247,0.106,9.988,0.421,4.163,9.988,336,799,2.526,6,7.625
blackassign0003,https://insights.blackcoffer.com/internet-dema...,41,24,0.262,0.088,12.267,0.481,5.099,12.267,354,736,2.695,15,8.126
blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,41,75,-0.293,0.161,13.073,0.476,5.419,13.073,342,719,2.642,7,7.968
blackassign0005,https://insights.blackcoffer.com/ott-platform-...,23,9,0.437,0.069,10.767,0.365,4.453,10.767,169,463,2.396,8,7.67


In [13]:
# Saving the dataframe as the file name 'Output Data Structure.xlsx'
output_df = 'Output Data Structure.xlsx'
article_dictionary_df.to_excel(output_df)