## Import common modules

In [1]:
import pandas as pd
import numpy as np
import nltk
import os
import re
import warnings
warnings.filterwarnings('ignore')

## Load web scraped data

In [2]:
data = pd.read_excel("web_scraping.xlsx")
data.head()

Unnamed: 0,URL_ID,URL,article
0,37,https://insights.blackcoffer.com/ai-in-healthc...,\nAI in healthcare to Improve Patient Outcomes...
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creato...
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us? “Machine i...


## Import Stopwords

In [3]:
stop_word_dir=r"StopWords/"
stop_words=[]
for i in os.listdir(stop_word_dir):
    stop_file=open(stop_word_dir+i,"r")
    temp_list=(stop_file.read()).split('\n')
    for i in temp_list:
        j=i.split(" |")
        stop_words.append(j[0])
    stop_file.close

## Imported positive and negative words

In [4]:
pos_file=open(r"MasterDictionary/positive-words.txt","r")
pos_list=(pos_file.read()).split('\n')
pos_file.close
neg_file=open(r"MasterDictionary/negative-words.txt","r")
neg_list=((neg_file.read()).split('\n'))
neg_file.close
print(len(pos_list),len(neg_list))

2007 4784


## Added new columns for Text Analysis

In [5]:
column_list=["words","sentences","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE", "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH",
             "PERCENTAGE OF COMPLEX WORDS","FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
             "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"]
for i in column_list:
    data[i]=np.nan

## Tokenize article and remove punctuations

In [6]:
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer(r'\w+')
for i in range(len(data)):
    if (str(data["article"][i])!="nan"):
        data["words"][i]=tokenizer.tokenize(str(data["article"][i]).lower())

## Tokenize article in sentences tokens for future refrences

In [7]:
from nltk.tokenize import sent_tokenize
for i in range(len(data)):
    if (str(data["article"][i])!="nan"):
        data["sentences"][i]=sent_tokenize(str(data["article"][i]))

## Performed word count by removing NLTK stopwords

In [8]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
for i in range(len(data)):
    if (str(data["article"][i])!="nan"):
        data["WORD COUNT"][i] = len([words for words in data["words"][i] if not words in stopwords])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gautam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Performed multipe operations

### Extracting Derived variables
- Positive score
- Negative score
- Polarity score
- Subjective score

### Analysis of Readability
- Average sentence length
- Complex word count
- Percentage of complex words
- Fog index

### Average Number of Words Per Sentence
- Avg number of words per sentence

### Syllable Count Per Word
- Syllable per word

### Average Word Length
- Avg word length

In [9]:
pattern_1="a|e|i|o|u"

for i in range(len(data)):
    if str(data["article"][i])=="nan":
        continue
    pos=0
    neg=0
    complex_words=0
    words_len=0
    total_words=0
    total_syllable=0
    for j in data["words"][i]:
        words_len+=len(j)
        syllable_count = len(re.findall(pattern_1,j))
        if re.search("es$|ed$",j):
            syllable_count-=1
        total_syllable+=syllable_count
        if syllable_count > 2:
            complex_words += 1
        if j not in stop_words:
            total_words+=1
            if j in pos_list:
                pos+=1
            if j in neg_list:
                neg+=1
    
    data["POSITIVE SCORE"][i]=pos
    data["NEGATIVE SCORE"][i]=neg
    data["POLARITY SCORE"][i]=(pos-neg)/(pos+neg+0.000001)
    data["SUBJECTIVITY SCORE"][i]=(pos+neg)/(total_words+0.000001)
    
    data["AVG SENTENCE LENGTH"][i]=len(data.words[i])/len(data.sentences[i])
    data["COMPLEX WORD COUNT"][i]=complex_words
    data["PERCENTAGE OF COMPLEX WORDS"][i]=complex_words*100/len(data.words[i])
    data["FOG INDEX"][i]=0.4*((data["AVG SENTENCE LENGTH"][i])+(data["PERCENTAGE OF COMPLEX WORDS"][i]))
    
    data["AVG NUMBER OF WORDS PER SENTENCE"][i]=len(data.words[i])/len(data.sentences[i])
    
    data["SYLLABLE PER WORD"][i]=total_syllable/len(data.words[i])
        
    data["AVG WORD LENGTH"][i]=words_len/len(data.words[i])

## Found the number of Personal Pronouns using regex module

In [10]:
for i in range(len(data)):
    if str(data["article"][i])!="nan":
        pattern=" I | i | we | We | my | My | ours | Ours | us | Us"
        data["PERSONAL PRONOUNS"][i]=len(re.findall(pattern, str(data["article"][i])))

## Viewing our data after all operations

In [11]:
data.head(10)

Unnamed: 0,URL_ID,URL,article,words,sentences,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,\nAI in healthcare to Improve Patient Outcomes...,"[ai, in, healthcare, to, improve, patient, out...",[\nAI in healthcare to Improve Patient Outcome...,71.0,36.0,0.327103,0.102786,24.2,32.011019,22.484408,24.2,581.0,1163.0,2.065014,1.0,5.552617
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creato...,"[what, if, the, creation, is, taking, over, th...",[What if the Creation is Taking Over the Creat...,61.0,37.0,0.244898,0.158833,18.05,19.598338,15.059335,18.05,283.0,758.0,1.756925,6.0,4.713989
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,"[what, jobs, will, robots, take, from, humans,...",[What Jobs Will Robots Take From Humans in The...,65.0,36.0,0.287129,0.116226,20.105882,29.783499,19.955753,20.105882,509.0,995.0,2.002926,3.0,5.346987
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,"[will, machine, replace, the, human, in, the, ...",[Will Machine Replace The Human in the Future ...,68.0,27.0,0.431579,0.133615,17.2,21.29743,15.398972,17.2,348.0,898.0,1.828641,17.0,4.766218
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us? “Machine i...,"[will, ai, replace, us, or, work, with, us, ma...","[Will AI Replace Us or Work With Us?, “Machine...",62.0,25.0,0.425287,0.104946,22.649351,22.133028,17.912951,22.649351,386.0,979.0,1.809633,15.0,4.931766
5,42,https://insights.blackcoffer.com/man-and-machi...,Will machine replace the human in the future o...,"[will, machine, replace, the, human, in, the, ...",[Will machine replace the human in the future ...,48.0,26.0,0.297297,0.127807,21.245902,21.527778,17.109472,21.245902,279.0,684.0,1.81713,16.0,4.909722
6,43,https://insights.blackcoffer.com/in-future-or-...,How humans and machines are evolving to work t...,"[how, humans, and, machines, are, evolving, to...",[How humans and machines are evolving to work ...,27.0,12.0,0.384615,0.109551,16.555556,19.194631,14.300075,16.555556,143.0,417.0,1.805369,7.0,4.98255
7,44,https://insights.blackcoffer.com/how-neural-ne...,,,,,,,,,,,,,,,,
8,45,https://insights.blackcoffer.com/how-machine-l...,How machine learning will affect your business...,"[how, machine, learning, will, affect, your, b...",[How machine learning will affect your busines...,38.0,14.0,0.461538,0.153392,20.472222,20.217096,16.275727,20.472222,149.0,399.0,1.731343,0.0,4.691995
9,46,https://insights.blackcoffer.com/deep-learning...,Deep learning impact on areas of e-learning? e...,"[deep, learning, impact, on, areas, of, e, lea...","[Deep learning impact on areas of e-learning?,...",69.0,45.0,0.210526,0.111111,27.234568,23.300091,20.213863,27.234568,514.0,1219.0,1.863554,8.0,4.80689


## Drop the unnecessary columns and save the data

In [12]:
data.drop(["article", "words","sentences"],axis=1,inplace=True)
data.to_excel("final_output_data.xlsx",index=False)