# Readworks Natural Language Processing 
Articles from ReadWorks.org are scraped with Selenium and BeautifulSoup (BS4) and converted to a pandas DataFrame.
The article texts are then cleaned into a corpus, and a document-term matrix is built from it using CountVectorizer.

In [1]:
import bs4 
import sys
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
import re 
import unicodedata
import string
import pickle
import time 
import pandas as pd

Login script for scraping


In [2]:
import os
from getpass import getpass

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as ec

# Credentials must never be stored in the notebook. They are read from the
# READWORKS_USER / READWORKS_PASSWORD environment variables, with an
# interactive prompt as a fallback when a variable is unset or empty.
readworks_user = os.environ.get("READWORKS_USER") or input("ReadWorks e-mail: ")
readworks_password = os.environ.get("READWORKS_PASSWORD") or getpass("ReadWorks password: ")

# Common XPath prefix of the three login-form elements used below.
login_form_xpath = '//*[@id="teacher-login-modal"]/div/div/div/section/div/form'

browser = webdriver.Chrome()

# First page (f:0) of the content search results.
browser.get('https://www.readworks.org/find-content#!q:/g:/t:/f:0/pt:A/features:/')

# Open the login modal. The click is dispatched through JavaScript instead of
# WebElement.click().
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which was removed
# in Selenium 4.
button = browser.find_element(By.XPATH, '//*[@id="main-application"]/header/div[1]/div/div/nav[1]/div/a[2]')
browser.execute_script("arguments[0].click();", button)

username = browser.find_element(By.XPATH, login_form_xpath + '/div[1]/input')
password = browser.find_element(By.XPATH, login_form_xpath + '/div[2]/input')

username.send_keys(readworks_user)
password.send_keys(readworks_password)

# Submit the login form.
browser.find_element(By.XPATH, login_form_xpath + '/div[3]/button').click()

Opening links with Selenium and scraping with BS4

In [9]:
# Scraped articles keyed by title; each value is a dict of that article's fields.
# NOTE(review): two articles with the same title overwrite each other here.
data_dict = {}

# Result pages handled by this run. The page index goes into the "f:" part of
# the URL; change the range to scrape a different slice of the result pages.
for i in range(99, 108):
    # Page 0 is the URL the login cell already opened, so only later pages
    # are navigated to explicitly.
    if i > 0:
        browser.get('https://www.readworks.org/find-content#!q:/g:/t:/f:' + str(i) + '/pt:A/features:/')
        time.sleep(3)  # fixed wait for the results to render

    # Landing page
    soup1 = BeautifulSoup(browser.page_source, 'lxml')

    # Parse through articles. Only the position of each matching link is used:
    # it selects the x-th result <div> in the live page.
    for x, _link in enumerate(soup1.find_all('a', class_ = 'article A -has-featured-image article-result-content')):

        # Open link with selenium (find_element_by_xpath was removed in Selenium 4)
        link_button = browser.find_element(
            By.XPATH,
            '//*[@id="content"]/form/div[2]/div/section/div[2]/div[' + str(x+1) + ']/a'
        )
        browser.execute_script("arguments[0].click();", link_button)

        time.sleep(2) # Bad coding practice but the only way I can make it work

        # Convert to soup
        soup2 = BeautifulSoup(browser.page_source, 'lxml')

        # Find title, turning every whitespace character into a plain space
        title = re.sub(r'\s', ' ', (soup2.find('h1', class_ = 'main-header-title print-centered')).text)

        # Poems keep their lines in <li> elements, other articles in <p> elements.
        # The pieces are joined with commas.
        article = soup2.find('article')
        if soup2.find_all('ol', class_ = "lined-poem"):
            text = ','.join([li.text for li in article.find_all('li')])
        else:
            text = ','.join([p.text for p in article.find_all('p')])

        # Scraping additional information
        category = re.sub(r'\s', '', soup2.find('h3', class_='topics').text)

        # The metadata list is looked up once; its first four items are read
        # as grade, word count, Lexile and type, in that order.
        meta_items = soup2.find('div', class_="article-single-meta-content").find_all('li')
        grade = meta_items[0].string
        words = meta_items[1].string
        lexile = meta_items[2].string
        fict = meta_items[3].string

        # Store everything under the article title
        data_dict[title] = {
            "Text": text,
            "Category": category,
            "Grade": grade,
            "Word Count": words,
            "Lexile": lexile,
            "Type": fict,
        }

        # Return to the results page before opening the next article
        browser.execute_script("window.history.go(-1)")

        time.sleep(1) # Bad coding practice but the only way I can make it work

The scraping process was performed in 7 runs to preserve memory and avoid timeouts. In-shell code was then executed to pickle the dataframes. 
1. The dictionaries were read in as pandas dataframes, then pickled.
2. The DataFrames were concatenated and the resulting DataFrame was saved as a .csv file in the working directory.

In [4]:
# Load the saved scrape results; the first column of the file becomes the index.
df = pd.read_csv('ReadWorks_Final', index_col=0)  

This round of data cleaning focuses on normalizing the Unicode text and parsing the scraped metadata columns; additional cleaning of the text for topic modelling and sentiment analysis is performed afterwards.

In [5]:
# Remove the row whose index label sits at position 2148, modifying df in place.
# NOTE(review): the reason this particular row is dropped is not recorded here.
# Because the drop is in place, re-running this cell removes another row each
# time - reload df from the CSV before re-running.
df.drop(df.index[2148], inplace = True)

def text_Cleaning(text):
    """Normalize an article text and trim separator noise from both ends.

    The text is NFKD-normalized, so compatibility characters such as
    ligatures are decomposed. Leading and trailing commas, spaces, tabs and
    newlines are then removed in a single pass. Stripping them together
    matters: stripping commas/spaces first and tabs/newlines second leaves
    separators behind whenever the two kinds are interleaved at an edge
    (for example a leading newline followed by a comma).

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The normalized, trimmed text. Interior characters are untouched.
    """
    text = unicodedata.normalize('NFKD', text)
    # str.strip takes a set of characters, not a substring.
    return text.strip(', \t\n')
 
def category_Cleaning(category):
    """Return the part after the first colon with spaces put back between words.

    The input is split on ':' and the second piece is kept. A space is then
    inserted before every capital letter that is neither at the start of that
    piece nor directly preceded by another capital letter.

    Raises
    ------
    IndexError
        If `category` contains no ':'.
    """
    after_colon = category.split(':')[1]
    spaced = re.sub(r"(?<![A-Z])(?<!^)([A-Z])", r" \1", after_colon)
    return spaced
                      
def lexile_Cleaning(lexile):
    """Convert a scraped Lexile string to a number.

    Leading and trailing 'L' characters are stripped and the first seven
    remaining characters are discarded; what is left is interpreted as:

    * None  - if it contains 'NP', 'Lexile Pending', 'ion' or 'no Lexile'
    * -100  - if it contains 'BR'
    * int   - otherwise (raises ValueError if it is not an integer literal)
    """
    value = lexile.strip('L')[7:]

    for marker in ('NP', 'Lexile Pending', 'ion', 'no Lexile'):
        if marker in value:
            return None

    if 'BR' in value:
        return -100

    return int(value)

def wordcount_Cleaning(word_count):
    """Convert a scraped word-count string to an int.

    Returns None when the string contains 'Lexile'. Otherwise the first six
    characters are dropped, thousands separators (',') are removed and the
    remainder is converted with int().
    """
    if 'Lexile' in word_count:
        return None
    digits = word_count[6:].replace(',', '')
    return int(digits)

def grade_Cleaning(grade):
    """Convert a scraped grade string to a number.

    The first six characters are dropped, surrounding whitespace is removed
    and 'K' is treated as grade 0. A range written as 'a-b' is returned as
    the midpoint of its two bounds (a float); a single grade is returned as
    an int.

    Whether the value is a range is decided by the presence of '-', not by
    its length: a length test misreads a two-digit grade that carries a
    leading space (e.g. ' 12') as a range and halves it.

    Raises
    ------
    ValueError
        If a bound is not an integer literal.
    """
    grade = grade[6:].strip().replace('K', '0')
    if '-' in grade:
        bounds = [int(part) for part in grade.split('-')]
        return sum(bounds) / 2
    return int(grade)
                
# Each column is run through its own cleaner and written back under the same
# name. The dict preserves the order in which the columns are processed.
column_cleaners = {
    'Text': text_Cleaning,
    'Category': category_Cleaning,
    'Grade': grade_Cleaning,
    'Word Count': wordcount_Cleaning,
    'Lexile': lexile_Cleaning,
}

for column_name, cleaner in column_cleaners.items():
    df[column_name] = df[column_name].apply(cleaner)

In [6]:
# Write the cleaned frame, index included, as CSV to 'ReadWorks_Clean1'
# in the working directory.
df.to_csv('ReadWorks_Clean1')

Second round of data cleaning.

In [7]:
def clean_text_round2(text):
    """Second-round text cleaning ahead of tokenization.

    Steps, in order:
      1. lowercase the text;
      2. remove anything in square brackets (non-greedy, within one line);
      3. remove ASCII punctuation;
      4. remove every word that contains a digit;
      5. remove curly quotes and the ellipsis character;
      6. replace newlines with a space.

    Newlines become a space rather than being deleted so the last word of one
    line is not fused with the first word of the next. Runs of spaces left
    behind by the removals are not collapsed.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    text = text.lower()
    # Raw strings: '\[', '\w' and '\d' are invalid escapes in a normal literal.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = text.replace('\n', ' ')
    return text

# Run the second cleaning round over every article text and keep the result
# as a one-column frame (the column keeps the name of the source Series).
text_round2 = df['Text'].apply(clean_text_round2)
df_text_clean = text_round2.to_frame()
df_text_clean

Unnamed: 0,Text
An Unlikely Parasite: The Mistletoe,during the holidays many people hang mistletoe...
The Song of Wandering Aengus,i went out to the hazel woodbecause a fire was...
Thanksgiving: Fact or Fiction,this article is provided courtesy of historyco...
Native American Conflicts,jamestown logo for worlds fair in to european...
A Monument for Peace,in december the civil war was nearly over the...
...,...
Martin Luther King Jr.,martin luther king jr was a leader when he was...
Native American Powwows,some native americans hold powwows today these...
What is a Talking Stick?,photo credit library of congress for hundreds ...
People Need the Ocean,people today could not live without the ocean ...


Using spaCy for tokenization and lemmatization, and for counting the tokens in each text that are tagged as part of a named entity.

In [8]:
import spacy
# Load the 'en_core_web_sm' pipeline; the model must be installed separately.
nlp = spacy.load('en_core_web_sm')
# Run the pipeline on every cleaned text; 'Tokens' holds the object nlp() returns for each row.
df_text_clean['Tokens'] = df_text_clean['Text'].apply(lambda x: nlp(x)) 

In [13]:
def text_Lemmatizer(doc):
    """Return the `lemma_` of every token in `doc`, in order, as a list."""
    return [token.lemma_ for token in doc]

def text_Unique(doc):
    """Return the `ent_type_` of every token in `doc`, in order, as a list.

    Despite the name, nothing is de-duplicated: the list has one entry per
    token, including tokens whose `ent_type_` is empty.
    """
    return [token.ent_type_ for token in doc]

# Lemmas of each document joined into one space-separated string.
df_text_clean['Stems'] = df_text_clean['Tokens'].apply(
    lambda doc: ' '.join(text_Lemmatizer(doc))
)

# One entity label per token; entries are falsy where a token has no label.
df_text_clean['Entities'] = df_text_clean['Tokens'].apply(text_Unique)

# Number of tokens that carry a non-empty entity label (tokens, not distinct entities).
df_text_clean['Count_Entities'] = df_text_clean['Entities'].apply(
    lambda labels: sum(1 for label in labels if label)
)

In [14]:
# Display the frame with the Tokens, Stems, Entities and Count_Entities columns added.
df_text_clean

Unnamed: 0,Text,Tokens,Stems,Entities,Count_Entities
An Unlikely Parasite: The Mistletoe,during the holidays many people hang mistletoe...,"(during, the, holidays, many, people, hang, mi...",during the holiday many people hang mistletoe ...,"[, , , , , , , , , , , , , , , , , , , , DATE,...",4
The Song of Wandering Aengus,i went out to the hazel woodbecause a fire was...,"(i, went, out, to, the, hazel, woodbecause, a,...",i go out to the hazel woodbecause a fire be in...,"[, , , , , PERSON, , , , , , , , , , , , PERSO...",5
Thanksgiving: Fact or Fiction,this article is provided courtesy of historyco...,"(this, article, is, provided, courtesy, of, hi...",this article be provide courtesy of historycom...,"[, , , , , , , , , , , , , , , , , , DATE, , ,...",179
Native American Conflicts,jamestown logo for worlds fair in to european...,"(jamestown, logo, for, worlds, fair, in, , to...",jamestown logo for world fair in to european...,"[PERSON, , , , , , , , NORP, , , LOC, LOC, , N...",65
A Monument for Peace,in december the civil war was nearly over the...,"(in, december, , the, civil, war, was, nearly...",in december the civil war be nearly over the...,"[, DATE, , , , , , , , , , , , , , , , , , LOC...",58
...,...,...,...,...,...
Martin Luther King Jr.,martin luther king jr was a leader when he was...,"(martin, luther, king, jr, was, a, leader, whe...",martin luther king jr be a leader when -PRON- ...,"[PERSON, PERSON, PERSON, PERSON, , , , , , , ,...",8
Native American Powwows,some native americans hold powwows today these...,"(some, native, americans, hold, powwows, today...",some native americans hold powwow today these ...,"[, , NORP, , , DATE, , , , , , , , , , , , , ,...",19
What is a Talking Stick?,photo credit library of congress for hundreds ...,"(photo, credit, library, of, congress, for, hu...",photo credit library of congress for hundred o...,"[, , , , ORG, , DATE, DATE, DATE, NORP, NORP, ...",7
People Need the Ocean,people today could not live without the ocean ...,"(people, today, could, not, live, without, the...",people today could not live without the ocean ...,"[, DATE, , , , , , , , , , , , , , , , , , , ,...",2


In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(df_text_clean.Text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per document (same index as df_text_clean), one column per term.
# NOTE: toarray() materializes the full dense matrix in memory.
df_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=df_text_clean.index)
df_dtm

In [42]:
# Raise the interpreter's recursion limit before writing the outputs.
# NOTE(review): presumably needed for serializing the objects below - confirm
# whether it is still required.
sys.setrecursionlimit(100000)

# Save the cleaned text frame and the document-term matrix as CSV.
df_text_clean.to_csv('data_clean')
df_dtm.to_csv('data_dtm')

# Pickle the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)