In [1]:
import pandas as pd
import nltk 
import pyphen # for syllables

## Text Extraction
> ### web scraping (using bs4)

In [2]:
df = pd.read_csv("Input.csv")
df.head()

Unnamed: 0,URL_ID,URL
0,1,https://insights.blackcoffer.com/is-telehealth...
1,2,https://insights.blackcoffer.com/how-telehealt...
2,3,https://insights.blackcoffer.com/is-telemedici...
3,4,https://insights.blackcoffer.com/is-telehealth...
4,5,https://insights.blackcoffer.com/how-people-di...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  150 non-null    int64 
 1   URL     150 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.5+ KB


In [4]:
import requests
from bs4 import BeautifulSoup

HTML tags to be extracted
- title : `h1.entry-title`
- content : `div.td-post-content`

In [5]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
for index in df.index:
    
    # extracting HTML
    url = df['URL'][index]
    htmlContent = requests.get(url, headers=headers).content
    soup = BeautifulSoup(htmlContent, 'html.parser')

    # extracting content
    title = soup.find("h1", class_="entry-title").text
    content = soup.find("div", class_="td-post-content").text
    
    # writing in file
    file_name = str(df['URL_ID'][index])+'.txt'
    with open(file_name, 'w') as f:
        # encode into utf-8 to remove an error while scraping the text
        title = title.encode(encoding = 'utf-8')
        content = content.encode(encoding = 'utf-8')
        f.write(f'{title}\n{content}')

    # to remove this b' from byte string 
    with open(file_name,'r') as f:
        text = f.readlines()

    l1 = text[0][:-2].replace("b'","")
    l2 = text[1][:-1].replace("b'","")
    with open(file_name, 'w') as f:
        f.write(f"{l1}\n{l2}")

## Text Analysis

###  Made list of positive and negative words from given file

In [6]:
with open('negative-words.txt','r') as f:
    words = f.read()
    neg = words.split("\n")

with open('positive-words.txt','r') as f:
    words = f.read()
    pos = words.split("\n")

dictionary = {  "positive": pos,
                "negative": neg } 

### Reading StopWords files, and making `list` of stop words

In [7]:
import re
stop_words_files = ['StopWords_Generic.txt',
                    'StopWords_Names.txt',
                    'StopWords_DatesandNumbers.txt',
                    'StopWords_Auditor.txt',
                    'StopWords_GenericLong.txt',
                    'StopWords_Currencies.txt',
                    'StopWords_Geographic.txt']

with open("stop_words.txt", "w") as s:
    for file in stop_words_files:
        with open(file, 'r') as f:
            contents = f.read()
            s.write(contents)
# extracting raw text of stop words
s =  open("stop_words.txt", "r")
stop_words_txt = s.read()
s.close()

raw_stop_list = re.split(r'[| \n]\s*', stop_words_txt.lower())
stop_list = [word for word in raw_stop_list if word.isalpha()]
stop_list

['smith',
 'surnames',
 'from',
 'census',
 'johnson',
 'williams',
 'jones',
 'brown',
 'davis',
 'miller',
 'wilson',
 'moore',
 'taylor',
 'anderson',
 'thomas',
 'jackson',
 'white',
 'harris',
 'martin',
 'thompson',
 'garcia',
 'martinez',
 'robinson',
 'clark',
 'rodriguez',
 'lewis',
 'lee',
 'walker',
 'hall',
 'allen',
 'young',
 'hernandez',
 'king',
 'wright',
 'lopez',
 'hill',
 'scott',
 'green',
 'adams',
 'baker',
 'gonzalez',
 'nelson',
 'carter',
 'mitchell',
 'perez',
 'roberts',
 'turner',
 'phillips',
 'campbell',
 'parker',
 'evans',
 'edwards',
 'collins',
 'stewart',
 'sanchez',
 'morris',
 'rogers',
 'reed',
 'cook',
 'morgan',
 'bell',
 'murphy',
 'bailey',
 'rivera',
 'cooper',
 'richardson',
 'cox',
 'howard',
 'ward',
 'torres',
 'peterson',
 'gray',
 'ramirez',
 'james',
 'watson',
 'brooks',
 'kelly',
 'sanders',
 'price',
 'bennett',
 'wood',
 'barnes',
 'ross',
 'henderson',
 'coleman',
 'jenkins',
 'perry',
 'powell',
 'long',
 'patterson',
 'hughes',


In [8]:
# code https://monkeylearn.com/blog/text-cleaning/

### Cleaning the text in ( `file_to_list()` ) function 

In [9]:
# function returns cleans text and 
# returns words containig only alphabets in file in a list
# args: file_name => str
# return list
def file_to_list(file_name):
    with open(file_name,'r') as f:
        text = f.read()
        corp = re.sub('[^a-zA-Z]+',' ', text).strip()
        corp = str(corp).lower()
        tokens = nltk.word_tokenize(corp)     
        return (tokens)

# function that returns number of sentences in a file
# args: file_name => str
# returns: len(sentences) => int
def no_of_sentences(file_name):
    with open(file_name,'r') as f1:
        para = f1.read()
    sentences = nltk.sent_tokenize(para)
    return len(sentences)

# args: word => str, 
# returns : lnumbe of syllables => int
def syllables(word):
    pyp = pyphen.Pyphen(lang='en')
    syll = pyp.inserted(word)
    # print(syll)
    return( len(syll.split("-")) )

### Excluding the `stop_list` words and
### Calculaing scores and other variables for each txt file

In [10]:
# function to calculate scores
# args: 
#   file_name => str,
#   stop => list (containing stop words), 
#   dictionary => dict (available dictionary of positive and negative words)
# returns: scores => dict
def scores(file_name, dictionary, stop_list):

    # extract list of words from the txt file
    file = file_to_list(file_name)

    # exclude words in stop file and store in stop_excluded list
    stop_excluded = set(file).difference(set(stop_list))

    # get list of elements common between file and positive
    pos_in_file = list( set.intersection(stop_excluded , 
                                        set(dictionary["positive"])) )
    # get list of elements common between file and negative
    neg_in_file = list( set.intersection(stop_excluded , 
                                        set(dictionary["negative"])))

    # We count the total cleaned words present in the text by 
    # 1. removing the stop words (using stopwords class of nltk package).
    # 2. removing any punctuations like ? ! , . from the word before counting.
    file = list(stop_excluded)
    no_of_words = len(file)

    # scores
    pos_score = len(pos_in_file)
    neg_score = len(neg_in_file)
    # Polarity Score = (Positive Score – Negative Score)
    #                                       / 
    #                        ((Positive Score + Negative Score) + 0.000001)
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)

    # Subjectivity Score = (Positive Score + Negative Score)
    #                                   / 
    #                       ((Total Words after cleaning) + 0.000001)
    subjectivity_score = (pos_score + neg_score) / (no_of_words + 0.000001)

    # Average Number of Words Per Sentence (average sentence length)
    words_per_sentence = no_of_words / no_of_sentences(file_name)

    # Analysis of Readability (Gunning Fog index)
    # Average Sentence Length = the number of words / the number of sentences
    # Percentage of Complex words = the number of complex words / the number of words 
    # Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

    # Complex Word Count
    complex_words = [w for w in file if syllables(w) > 2 ]
    percent_complex_words = len(complex_words) / no_of_words
    fog_index = 0.4 * (words_per_sentence + percent_complex_words)

    # Syllable Count Per Word
    syllables_per_word = sum([syllables(w) for w in file]) / no_of_words

    # Personal Pronouns
    personal_pronouns = []
    for i in range(len(file)):
        if file[i] in ['i', 'we', 'my', 'ours']:
            personal_pronouns.append(file[i])

        # to exclude the US (country)
        elif file[i] == 'us' and file[i-1] != 'the':
            personal_pronouns.append(file[i])

    # Average Word Length
    # Average Word Length is calculated by the formula:
    # Sum of the total number of characters in each word/Total number of words
    sum_char = 0
    for word in ["I", "am","good","boy"]:
        sum_char += len(word)

    avg_word_len = sum_char / no_of_words

    # dict of scores
    scores = {
        "POSITIVE SCORE": pos_score,
        "NEGATIVE SCORE": neg_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": words_per_sentence,
        "PERCENTAGE OF COMPLEX WORDS": percent_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": words_per_sentence,
        "COMPLEX WORD COUNT": len(complex_words),
        "WORD COUNT": no_of_words,
        "SYLLABLE PER WORD": syllables_per_word,
        "PERSONAL PRONOUNS": len(personal_pronouns),
        "AVG WORD LENGTH": avg_word_len 
    }

    return (scores)

### Creating the output.csv file

In [11]:
data = []
for i in range(1,151):
    txt_file = str(i) + '.txt'
    data.append( scores(txt_file, dictionary, stop_list) )

score_df = pd.DataFrame( data )
score_df

Unnamed: 0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,20,5,0.600000,0.117371,14.200000,0.314554,5.805822,14.200000,67,213,2.150235,0,0.046948
1,19,9,0.357143,0.138614,4.697674,0.252475,1.980060,4.697674,51,202,1.980198,0,0.049505
2,48,22,0.371429,0.128676,7.253333,0.336397,3.035892,7.253333,183,544,2.181985,0,0.018382
3,36,14,0.440000,0.098814,10.120000,0.328063,4.179225,10.120000,166,506,2.162055,0,0.019763
4,44,26,0.257143,0.131086,8.754098,0.344569,3.639467,8.754098,184,534,2.176030,0,0.018727
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,20,27,-0.148936,0.135057,9.405405,0.293103,3.879404,9.405405,102,348,2.034483,0,0.028736
146,21,11,0.312500,0.071749,9.102041,0.271300,3.749337,9.102041,121,446,2.008969,0,0.022422
147,18,28,-0.217391,0.141104,5.719298,0.331288,2.420235,5.719298,108,326,2.082822,0,0.030675
148,20,7,0.481481,0.100746,19.142857,0.388060,7.812367,19.142857,104,268,2.302239,0,0.037313


In [12]:
output = pd.concat([df, score_df], axis=1)
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1,https://insights.blackcoffer.com/is-telehealth...,20,5,0.6,0.117371,14.2,0.314554,5.805822,14.2,67,213,2.150235,0,0.046948
1,2,https://insights.blackcoffer.com/how-telehealt...,19,9,0.357143,0.138614,4.697674,0.252475,1.98006,4.697674,51,202,1.980198,0,0.049505
2,3,https://insights.blackcoffer.com/is-telemedici...,48,22,0.371429,0.128676,7.253333,0.336397,3.035892,7.253333,183,544,2.181985,0,0.018382
3,4,https://insights.blackcoffer.com/is-telehealth...,36,14,0.44,0.098814,10.12,0.328063,4.179225,10.12,166,506,2.162055,0,0.019763
4,5,https://insights.blackcoffer.com/how-people-di...,44,26,0.257143,0.131086,8.754098,0.344569,3.639467,8.754098,184,534,2.17603,0,0.018727


In [13]:
with open('output.csv', 'w') as output_file:
    output_file.write(output.to_csv(index=False, line_terminator='\n'))