# Get exemplary pages for schools exemplary of each topic

- Author: Jaren Haber
- Institution: UC Berkeley; Georgetown University
- Date created: January 2021
- Date last edited: January 2021

Description: Use page counts to rank school pages and look at those with highest scores for a given topic. Start with lists of distinctive terms and distinctive school websites for each topic. Getting distinctive pages is essential for text extraction and stimulus generation for our follow-up survey experiment.

## Initialize

In [3]:
#!pip install nltk
#import nltk; nltk.download('stopwords'); nltk.download('punkt')

In [4]:
# Import packages
import pandas as pd # For working with DataFrames
import gc # To accelerate loading pickle files
import os, datetime, re, sys

import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # approximate but effective (and common) method of stemming words
ps = PorterStemmer()

In [9]:
# Load functions from data mgmt/tools directory:
cwd = os.getcwd()
root = str.replace(cwd, "text_analysis/filter_top_pages", "")
sys.path.insert(0, root + "data_management/tools")

# For displaying basic DF info, storing DFs for memory efficiency, and loading a filtered DF:
from df_tools import check_df, convert_df, load_filtered_df, replace_df_nulls

# For quickly loading & saving pickle files in Python:
from quickpickle import quickpickle_dump, quickpickle_load 

# For saving and loading text lists to/from file:
from textlist_file import write_list, load_list 

In [10]:
# Set file paths
thisday = datetime.date.today().strftime("%m%d%y")

charters_path = root + "misc_data/charters_2015.pkl"
ex_urls_path = root + "text_analysis/topic_modeling/stm_example_urls_011220.csv"
topwords_paths = [fp for fp in os.listdir(root + "text_analysis/topic_modeling/") if fp.startswith("top_words")]

## Load data

In [11]:
# Load data, keeping only relevant columns
df = load_filtered_df(
    charters_path, 
    ["WEBTEXT", "CMO_WEBTEXT", "URL", "SCH_NAME", "INQUIRY_COUNT", "INQUIRY_RATIO", 
     "NUMWORDS", "NUMPAGES", "NCESSCH", "SY_STATUS15"])

# rows and cols:  (10965, 10)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
URL: 3828 missing
SCH_NAME: 8421 missing
INQUIRY_COUNT
INQUIRY_RATIO: 4103 missing
NUMWORDS
NUMPAGES
NCESSCH
SY_STATUS15: 3619 missing
WEBTEXT
CMO_WEBTEXT


In [12]:
# Sanity check: Detect duplicates
print(len(df["NCESSCH"])) # Number of values in NCESSCH column
print(len(df["NCESSCH"])-len(df.drop_duplicates(subset="NCESSCH"))) # Method 1 to detect number of duplicates
print(len(list(df["NCESSCH"]))-len(list(set(df["NCESSCH"])))) # Method 2 to detect number of duplicates

10965
0
0


In [13]:
# Load file with exemplary schools for each key topic in STM
ex_urls_df = pd.read_csv(
    ex_urls_path, low_memory = False, header = 0, 
    usecols = ['TOPIC', 'NCESSCH', 'URL']).sort_values(by = 'TOPIC')
ex_urls_df

Unnamed: 0,NCESSCH,TOPIC,URL
81,80336001839,1,http://www.swecollege.org/
27,61152013697,1,https://emsofl.com/
34,61962006745,1,http://ktlcharterschool.com/
43,62380011518,1,http://www.sscs.cc/
59,63474011464,1,http://ogcs.org/
...,...,...,...
83,80336006517,28,http://uprepschool.org/
85,80441006638,28,http://www.salidadelsolacademy.org/
88,110003500253,28,http://ccpcs.org/
38,62271010851,28,http://www.gertzresslerhigh.org/


In [14]:
# Merge exemplary schools with their WEBTEXT from web-crawled charter school data
ex_urls_df = pd.merge(df[["WEBTEXT", "NCESSCH"]], ex_urls_df, how = "right", on = "NCESSCH")
ex_urls_df

Unnamed: 0,WEBTEXT,NCESSCH,TOPIC,URL
0,"[(http://www.swecollege.org/, False, 0, We are...",8.033600e+10,1,http://www.swecollege.org/
1,"[(https://emsofl.com/college-prep/, False, 1, ...",6.115201e+10,1,https://emsofl.com/
2,"[(http://ktlcharterschool.com/, False, 0, Cont...",6.196201e+10,1,http://ktlcharterschool.com/
3,"[(http://www.sscs.cc/, False, 0, In The News.....",6.238001e+10,1,http://www.sscs.cc/
4,"[(http://ogcs.org/, False, 0, In The News...\n...",6.347401e+10,1,http://ogcs.org/
...,...,...,...,...
275,[(http://uprepschool.org/campuses/arapahoe-str...,8.033601e+10,28,http://uprepschool.org/
276,"[(http://www.salidadelsolacademy.org/, False, ...",8.044101e+10,28,http://www.salidadelsolacademy.org/
277,"[(https://www.ccpcs.org/, False, 0, \r\tJavaSc...",1.100035e+11,28,http://ccpcs.org/
278,"[(https://www.gertzresslerhigh.org/, False, 0,...",6.227101e+10,28,http://www.gertzresslerhigh.org/


In [15]:
# Read in distinctive words for each selected topic. 
# We have 4 metrics for distinctiveness: score, prob, frex, and lift
topwords_dfs = [pd.read_csv(
    root + "text_analysis/topic_modeling/" + fp, low_memory = False) 
                for fp in topwords_paths]

# Merge these DFs by row
topwords_df = pd.concat([
    df.rename(columns = {"Unnamed: 0":"TOPIC"}) 
    for df in topwords_dfs], axis=1)

topwords_df

Unnamed: 0,TOPIC,score.1,score.2,score.3,score.4,score.5,score.6,score.7,score.8,score.9,...,prob.41,prob.42,prob.43,prob.44,prob.45,prob.46,prob.47,prob.48,prob.49,prob.50
0,1,cours,credit,colleg,student,high,onlin,gilbert,creek,graduat,...,admiss,scienc,transcript,contact,opportun,includ,avail,semest,advanc,elect
1,2,athlet,club,galleri,basketbal,ace,denver,varsiti,staff,parent,...,powerschool,soccer,coach,ace,advisori,volleybal,meet,denver,transpar,director
2,3,elementari,school,middl,river,high,bay,haven,north,academi,...,view,learn,technolog,servic,program,dragon,primari,career,magnet,height
3,4,like,get,said,love,realli,work,help,know,want,...,ask,thank,alway,friend,join,give,next,pleas,import,stori
4,5,campus,imagin,click,enrol,suppli,newslett,school,parent,pto,...,bus,faculti,mission,welcom,vision,import,fax,board,payment,question
5,6,discrimin,sex,disabl,complaint,color,constel,school,marit,race,...,pleas,usda,reserv,state,civil,sexual,orient,child,request,receiv
6,7,art,scienc,student,project,learn,music,stem,club,math,...,present,product,game,robot,find,well,build,field,time,health
7,8,spambot,javascript,window,madison,html,file,pane,score,gradebook,...,size,chang,must,set,indic,term,descript,list,edit,download
8,9,kipp,edlio,email,sorri,los,verifi,send,messag,angel,...,street,portal,staff,search,famili,parent,island,atlanta,subscrib,posit
9,10,blackboard,comment,fax,site,valley,directori,search,twitter,privaci,...,close,privaci,form,street,question,updat,support,login,inc,post


## Score pages by keywords

In [16]:
# Create list of punctuation
import string # for one method of eliminating punctuation
punctlist = list(string.punctuation) # assign list of common punctuation symbols
punctlist+=['*','•','©','–','–','``','’','“','”','...','»',"''",'..._...','--','×','|_','_','§','…','⎫'] # Add a few more punctuations also common in web text
punctlist.remove('-') ; punctlist.remove("'")
punctstr = "".join([char for char in list(set(punctlist))])
punctstr = re.sub(r'_--', '', punctstr)

unicode_list  = []
for i in range(1000,3000):
    unicode_list.append(chr(i))

stopword_list = list(set(stopwords.words('english')))

In [17]:
def clean_sentence(messy_string, 
                   unicodelist = unicode_list, 
                   punctuations = punctstr, 
                   stopwords = stopword_list): 
    """Removes numbers, stopwords, emails, URLs, unicode characters, hex characters, and punctuation from a sentence 
    separated by whitespaces. Returns a tokenized, stemmed, cleaned list of words from the sentence.
    
    Args: 
        messy_string (str): may include spaces and punctuation
        unicodelist (list of str): list of unicode symbols
        punctuations (str): string containing punctuation marks
        stopwords (list of str): list of stopwords
    Returns: 
        Cleaned & tokenized sentence, i.e. a list of cleaned, lower-case, one-word strings"""
    
    
    # remove newline characters and "|" characters
    sentence = messy_string.replace("|", " ").replace("\n", " ")
    
    #replace \\x, \\u, \\b, or - followed by any character or anything that ends with \u2605
    #then replace \\x, \\t and then get rid of whitespace
    sentence = re.sub(r"\\x.*|\\u.*|\\b.*|-|\u2605$", "", messy_string.replace(u"\xa0", u" ").replace(u"\\t", u" ").strip(" "))
    
    #get rid of hex character like \xa0\ adn a\x80
    sentence = re.sub(r'[^\x00-\x7f]',r'', sentence) #replace anything that starts with a hex character 
    
    #code that basically removes all elements that appear in the unicode_list (looks like r'u1000|u10001|')
    sentence = re.sub(r'|'.join(map(re.escape, unicodelist)), '', sentence) #removes unicode
    li_text = []
    
    for word in re.split('\s', sentence): #splits by space 
        if ((word not in stopwords) and (not word.isdigit()) and ("@" not in word)) : #gets rid of the numbers and emails
        
            #gets rid of urls 
            if ((not word.startswith(('http', 'https', 'www'))) and (not word.endswith(('.com', '.net', '.gov', '.org')))):
                if ((not word.startswith('//')) and not word.endswith(('.jpg', '.pdf', 'png', 'jpeg', 'php'))): 
                                        
                    word = re.sub(r'['+punctuations+']|am|pm', r'', word) #get rid of punctuation, and the words am and pm
                    word = ps.stem(word) #stem word
                    li_text.append(word)
                    
    sentence = ' '.join(li_text) #joins all words together
    sentence = sentence.strip() #remove beginning and ending white space in string
    return sentence

In [18]:
def score_page(txt, 
               keywords):
    '''
    Assign a score to a page based on the normalized count of keywords it contains.
    
    Args:
        txt (str): text of a web page
        terms (list of str): list of terms to look for in pages (e.g., distinctive words for a given topic)
    Returns:
        score (float, range 0-1): how much this page reflects these keywords
    '''
    
    txt = clean_sentence(txt) # clean sentence
    num_words = len(txt.split()) # split on whitespace to find the number of words
    
    total_count = 0
    for word in keywords: 
        count = len(re.findall(str(word) + str("\W"), txt))
        total_count += count
        
    # normalize by page length and take the log to avoid very small values
    if num_words == 0:
        score = 0
    else:
        score = np.log(total_count / num_words)
    
    return score

In [19]:
def get_top_pages(pagelist, 
                  keywords, 
                  n = 3, ):
    '''
    Get the n most representative pages in a list of website pages (pagelist) by counting frequency of key terms.
    
    Args:
        pagelist (list of list of str): list of pages, each page a quadruple: (URL, is_pdf, depth, text_str)
        n (int): number of pages to return
        keywords (list of str): list of keywords to look for in pages (e.g., distinctive words for a given topic)
    Returns:
        pagelist_selected (list of list of str): n most representative pages given topwords, each page a quadruple (as with input)
    '''
    
    if len(pagelist) <= n:
        # if less than or equal to N pages, no need to filter
        print("ERROR: Scoring failed. Only {} pages scraped for this school, minimum is {} to score.".
              format(str(len(pagelist)), str(n)))
        return pages
    else:
        scores_dict = {}
        
        for page in pagelist:
            page_text = page[3]
            score = score_page(page_text, keywords)
            scores_dict[page] = score
        pagelist_selected = sorted(scores_dict, key=scores_dict.get, reverse=True)[:n]
        
        return pagelist_selected

In [20]:
# Test out page scoring/ranking function
keywords = topwords_df.iloc[0,1:].tolist()
get_top_pages(ex_urls_df["WEBTEXT"].iloc[0], keywords = keywords)

  score = np.log(total_count / num_words)


[('http://www.swecollege.org/creditandgrades.html',
  'False',
  '1',
  'College prep courses earn five (5) credits per semester. Students enrolling in postsecondary classes shall receive both high school credit and college credit for courses successfully completed.  The college semester credit hours convert to high school semester credits. Grades received at the postsecondary institution will affect a student’s high school grade point average. The high school will be accountable for recording grades and credit based on the college transcript. The GPA and class rank will reflect grades received under this program. All grades for postsecondary courses are “weighted grades” for the purpose of determining GPA.   \nCredit and GPA will be computed using the following table: \nGrade Scale\xa0\nRegular and Modified Classes\nHonors, AP and College/ University Classes*\nA(90-100)\n4.0\n5.0\nB (80-89)\n3.0\n4.0\nC (70-79)\n2.0\n3.0\nD(60-69)\n0.0\n0.0\nF(0-59)\n0\n'),
 ('http://www.swecollege.or