Reference script directory:

- work/text_analysis/parsing/Overlap_Parsing_Improved.py : removed overlaps in the webtext
- work/text_analysis/parsing/further_cleaning_script.ipynb : string cleaning methods
    
Use file `/vol_b/data/misc_data/webtext_raw/webtext_unlapped_full.tsv` as data source. We are most interested in the `text_full` column: this is a list of quadruples (one per web page), where the 4th element is the text for that web page. Filter each website down to its 250 most important pages; that is, keep the top 250 ranked pages and no more. Rank pages by their number of occurrences of the following keywords:

keywords = ['values', 'academics', 'academic', 'skills', 'skill', 'purpose', 'purposes', 'direction', 'mission', 'vision', 'visions', 'missions', 'ideals', 'cause', 'causes', 'curriculum', 'curricular', 'method', 'methods', 'pedagogy', 'pedagogical', 'pedagogies', 'approach', 'approaches', 'model', 'models', 'system', 'systems', 'structure', 'structures', 'philosophy', 'philosophical', 'philosophies', 'beliefs', 'believe', 'belief', 'principles', 'principle', 'creed', 'creeds', 'credo', 'moral', 'morals', 'morality', 'history', 'histories', 'our story', 'the story', 'school story', 'background', 'backgrounds', 'founding', 'founded', 'foundation', 'foundations', 'foundational', 'established','establishment', 'our school began', 'we began', 'doors opened', 'school opened', 'about us', 'our school', 'who we are', 'identity', 'identities', 'profile', 'highlights']

Save the resulting file as `/vol_b/data/misc_data/webtext_raw/webtext_unlapped_filtered_250.tsv`.
Do the same thing to produce 100-page and 10-page versions, saving them as `/vol_b/data/misc_data/webtext_raw/webtext_unlapped_filtered_100.tsv` and 
`/vol_b/data/misc_data/webtext_raw/webtext_unlapped_filtered_10.tsv`, respectively.  

### Import Statements and Load Data

In [1]:
import pandas as pd
import numpy as np
import ast
import re

In [2]:
# load the tab-separated source table into a DataFrame
original = pd.read_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_full.tsv",
    sep="\t",
)

In [3]:
# show the column names of the loaded table
original.columns.tolist()

['Unnamed: 0', 'text_full', 'NCESSCH']

In [4]:
# discard the 'Unnamed: 0' column (dropped by column label), then preview the table
original = original.drop(['Unnamed: 0'], axis="columns")
original.head()

Unnamed: 0,text_full,NCESSCH
0,"[('http://www.maef.net/', 'False', '0', ""Eveni...",10019700000.0
1,"[('https://www.kgbsd.org/ketchikancharter', 'F...",20015000000.0
2,[('http://tongassschool.org/classrooms/3rd-and...,20015000000.0
3,"[('https://www.asdk12.org/aquarian', 'False', ...",20018000000.0
4,"[('http://winterberrycharterschool.com/', 'Fal...",20018000000.0


### Data Pre-Processing

In [5]:
# each text_full cell is read in as a string; parse it back into a Python list
original["text_full"] = original["text_full"].apply(ast.literal_eval)

In [6]:
# sanity check: the text_full value in the row labelled 0 should now be a list
type(original.loc[0, "text_full"])

list

In [7]:
# record how many pages each school has
original["full_page_num"] = original["text_full"].apply(len)
original.head()

Unnamed: 0,text_full,NCESSCH,full_page_num
0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0,41
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0,1
2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0,117
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0,1
4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0,33


In [8]:
# preview the rows whose page count exceeds 250
more_than_250 = original.loc[original['full_page_num'].gt(250)]
more_than_250.head()

Unnamed: 0,text_full,NCESSCH,full_page_num
48,"[(https://www.goasa.org/, False, 0, 2.5 hours ...",40005600000.0,270
108,"[(https://www.alaschools.org/, False, 0, Blog\...",40011200000.0,484
109,"[(https://www.alaschools.org/, False, 0, Blog\...",40011200000.0,487
112,"[(https://www.alaschools.org/, False, 0, Blog\...",40011200000.0,494
121,"[(https://www.allaccelerated.org/tucson/, Fals...",40012900000.0,270


### Score Pages by Keywords

In [9]:
# keywords and phrases whose occurrences are counted when ranking pages
# (all lowercase; the item order is unchanged, only the line grouping differs)
keywords = [
    # values, purpose and direction
    'values', 'academics', 'academic', 'skills', 'skill', 'purpose', 'purposes', 'direction',
    'mission', 'vision', 'visions', 'missions', 'ideals', 'cause', 'causes',
    # curriculum and teaching approach
    'curriculum', 'curricular', 'method', 'methods', 'pedagogy', 'pedagogical', 'pedagogies',
    'approach', 'approaches', 'model', 'models', 'system', 'systems', 'structure', 'structures',
    # philosophy and beliefs
    'philosophy', 'philosophical', 'philosophies', 'beliefs', 'believe', 'belief',
    'principles', 'principle', 'creed', 'creeds', 'credo', 'moral', 'morals', 'morality',
    # history and founding
    'history', 'histories', 'our story', 'the story', 'school story', 'background', 'backgrounds',
    'founding', 'founded', 'foundation', 'foundations', 'foundational', 'established',
    'establishment', 'our school began', 'we began', 'doors opened', 'school opened',
    # identity
    'about us', 'our school', 'who we are', 'identity', 'identities', 'profile', 'highlights',
]

In [10]:
def score_page(txt, keyword_list=None):
    """
    Assign a score to a page based on the normalized count of keywords it contains.

    The score is log(keyword hits / number of whitespace-separated words), so a
    larger (less negative) score means a denser concentration of keywords.

    Parameters
    ----------
    txt : str
        Text of a single web page.
    keyword_list : iterable of str, optional
        Keywords/phrases to count. When omitted, the module-level `keywords`
        list is used.

    Returns
    -------
    float
        The log keyword density, or negative infinity when the page contains no
        words or no keyword hits, so such pages always sort last.
    """
    if keyword_list is None:
        keyword_list = keywords

    # remove newline characters and "|" characters before splitting
    filtered_txt = txt.replace("|", " ").replace("\n", " ")
    # split on whitespace to find the number of words
    num_words = len(filtered_txt.split())
    if num_words == 0:
        # an empty page must never outrank a page with real content
        # (every log density is <= 0, so a score of 0 would sort first)
        return -np.inf

    total_count = 0
    for word in keyword_list:
        # match whole words/phrases only: the lookarounds stop "model" from
        # matching inside "remodel" and still allow a hit at the very start or
        # end of the text; headings such as "Our Mission" count regardless of case
        pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
        total_count += len(re.findall(pattern, filtered_txt, flags=re.IGNORECASE))

    if total_count == 0:
        # log(0) is -inf; return it directly to avoid numpy's divide warning
        return -np.inf

    # normalize by page length and take the log to avoid very small values
    return np.log(total_count / num_words)

In [11]:
def get_top_N(N, pages):
    """
    Select the N best-scoring pages from the input list of pages.

    The text of each page is read from index 3 of the page and scored with
    score_page. A list holding N or fewer pages is handed back unchanged;
    otherwise a new list is returned, ordered from highest to lowest score
    and cut off after N entries.

    """
    # nothing to trim when the list already fits within N pages
    if len(pages) <= N:
        return pages

    # pages serve as dict keys, so identical pages share a single entry
    page_scores = {entry: score_page(entry[3]) for entry in pages}
    ranked = sorted(page_scores, key=lambda entry: page_scores[entry], reverse=True)
    return ranked[:N]

### Keep 250 Highest Scoring Pages

In [89]:
# for every school, keep at most the 250 best-scoring pages from text_full
original['text_250'] = original['text_full'].apply(
    lambda site_pages: get_top_N(N=250, pages=site_pages)
)
original.head()

Unnamed: 0,text_full,NCESSCH,full_page_num,text_250,page_num_250,text_100
0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0,41,"[(http://www.maef.net/, False, 0, Evening Acad...",41,"[(http://www.maef.net/, False, 0, Evening Acad..."
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0,1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",1,"[(https://www.kgbsd.org/ketchikancharter, Fals..."
2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0,117,[(http://tongassschool.org/classrooms/3rd-and-...,117,"[False, http://tongassschool.org/school-office..."
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0,1,"[(https://www.asdk12.org/aquarian, False, 0, S...",1,"[(https://www.asdk12.org/aquarian, False, 0, S..."
4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0,33,"[(http://winterberrycharterschool.com/, False,...",33,"[(http://winterberrycharterschool.com/, False,..."


In [90]:
# sanity check: list any school left with more than 250 pages
original['page_num_250'] = original['text_250'].apply(len)
original.loc[original["page_num_250"].gt(250)]

Unnamed: 0,text_full,NCESSCH,full_page_num,text_250,page_num_250,text_100


### Keep 100 Highest Scoring Pages

In [92]:
# for every school, keep at most the 100 best-scoring pages from text_full
original['text_100'] = original['text_full'].apply(
    lambda site_pages: get_top_N(N=100, pages=site_pages)
)
original.head()

Unnamed: 0,text_full,NCESSCH,full_page_num,text_250,page_num_250,text_100
0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0,41,"[(http://www.maef.net/, False, 0, Evening Acad...",41,"[(http://www.maef.net/, False, 0, Evening Acad..."
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0,1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",1,"[(https://www.kgbsd.org/ketchikancharter, Fals..."
2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0,117,[(http://tongassschool.org/classrooms/3rd-and-...,117,[(http://tongassschool.org/classrooms/ms-lydia...
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0,1,"[(https://www.asdk12.org/aquarian, False, 0, S...",1,"[(https://www.asdk12.org/aquarian, False, 0, S..."
4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0,33,"[(http://winterberrycharterschool.com/, False,...",33,"[(http://winterberrycharterschool.com/, False,..."


In [94]:
# sanity check: list any school left with more than 100 pages
original['page_num_100'] = original['text_100'].apply(len)
original.loc[original["page_num_100"].gt(100)]

Unnamed: 0,text_full,NCESSCH,full_page_num,text_250,page_num_250,text_100,page_num_100


### Keep 10 Highest Scoring Pages

In [12]:
# for every school, keep at most the 10 best-scoring pages from text_full
original['text_10'] = original['text_full'].apply(
    lambda site_pages: get_top_N(N=10, pages=site_pages)
)
original.head()

Unnamed: 0,text_full,NCESSCH,full_page_num,text_10
0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0,41,[(http://www.maef.net/our-work/programs/educat...
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0,1,"[(https://www.kgbsd.org/ketchikancharter, Fals..."
2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0,117,[(http://tongassschool.org/classrooms/ms-lydia...
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0,1,"[(https://www.asdk12.org/aquarian, False, 0, S..."
4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0,33,[(http://winterberrycharterschool.com/about/mi...


In [13]:
# check that every school has less than or equal to 10 pages
# (the filter below lists any school that still exceeds the limit)
original['page_num_10'] = original['text_10'].apply(len)
original.loc[original["page_num_10"].gt(10)]

Unnamed: 0,text_full,NCESSCH,full_page_num,text_10,page_num_10


### Save as TSV

In [97]:
# build the table for webtext_unlapped_filtered_250.tsv:
# keep the 250-page column and the school id, and rename the column back to text_full
cleaned_250 = original[["text_250", "NCESSCH"]].rename(columns={"text_250": "text_full"})
cleaned_250.head()

Unnamed: 0,text_full,NCESSCH
0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0
2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0
4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0


In [98]:
# write the tab-separated file; index=True (the pandas default) also writes the row index
# as an unnamed first column
# NOTE(review): consider index=False if downstream readers do not need that column -- confirm first
cleaned_250.to_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_filtered_250.tsv",
    sep="\t",
    index=True,
)

In [99]:
# build the table for webtext_unlapped_filtered_100.tsv:
# keep the 100-page column and the school id, and rename the column back to text_full
cleaned_100 = original[["text_100", "NCESSCH"]].rename(columns={"text_100": "text_full"})
cleaned_100.head()

Unnamed: 0,text_full,NCESSCH
0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0
2,[(http://tongassschool.org/classrooms/ms-lydia...,20015000000.0
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0
4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0


In [100]:
# write the tab-separated file; index=True (the pandas default) also writes the row index
# as an unnamed first column
# NOTE(review): consider index=False if downstream readers do not need that column -- confirm first
cleaned_100.to_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_filtered_100.tsv",
    sep="\t",
    index=True,
)

In [14]:
# build the table for webtext_unlapped_filtered_10.tsv:
# keep the 10-page column and the school id, and rename the column back to text_full
cleaned_10 = original[["text_10", "NCESSCH"]].rename(columns={"text_10": "text_full"})
cleaned_10.head()

Unnamed: 0,text_full,NCESSCH
0,[(http://www.maef.net/our-work/programs/educat...,10019700000.0
1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0
2,[(http://tongassschool.org/classrooms/ms-lydia...,20015000000.0
3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0
4,[(http://winterberrycharterschool.com/about/mi...,20018000000.0


In [15]:
# write the tab-separated file; index=True (the pandas default) also writes the row index
# as an unnamed first column
# NOTE(review): consider index=False if downstream readers do not need that column -- confirm first
cleaned_10.to_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_filtered_10.tsv",
    sep="\t",
    index=True,
)

### Check if Successfully Saved

In [102]:
# read the 250-page file back in and preview it to confirm the save
saved_250 = pd.read_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_filtered_250.tsv",
    sep="\t",
)
saved_250.head()

Unnamed: 0.1,Unnamed: 0,text_full,NCESSCH
0,0,"[('http://www.maef.net/', 'False', '0', ""Eveni...",10019700000.0
1,1,"[('https://www.kgbsd.org/ketchikancharter', 'F...",20015000000.0
2,2,[('http://tongassschool.org/classrooms/3rd-and...,20015000000.0
3,3,"[('https://www.asdk12.org/aquarian', 'False', ...",20018000000.0
4,4,"[('http://winterberrycharterschool.com/', 'Fal...",20018000000.0


In [103]:
# read the 100-page file back in and preview it to confirm the save
saved_100 = pd.read_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_filtered_100.tsv",
    sep="\t",
)
saved_100.head()

Unnamed: 0.1,Unnamed: 0,text_full,NCESSCH
0,0,"[('http://www.maef.net/', 'False', '0', ""Eveni...",10019700000.0
1,1,"[('https://www.kgbsd.org/ketchikancharter', 'F...",20015000000.0
2,2,[('http://tongassschool.org/classrooms/ms-lydi...,20015000000.0
3,3,"[('https://www.asdk12.org/aquarian', 'False', ...",20018000000.0
4,4,"[('http://winterberrycharterschool.com/', 'Fal...",20018000000.0


In [16]:
# read the 10-page file back in and preview it to confirm the save
saved_10 = pd.read_csv(
    "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_filtered_10.tsv",
    sep="\t",
)
saved_10.head()

Unnamed: 0.1,Unnamed: 0,text_full,NCESSCH
0,0,[('http://www.maef.net/our-work/programs/educa...,10019700000.0
1,1,"[('https://www.kgbsd.org/ketchikancharter', 'F...",20015000000.0
2,2,[('http://tongassschool.org/classrooms/ms-lydi...,20015000000.0
3,3,"[('https://www.asdk12.org/aquarian', 'False', ...",20018000000.0
4,4,[('http://winterberrycharterschool.com/about/m...,20018000000.0
