In [1]:
import multiprocessing as mp
import pandas as pd
import re
import numpy as np
import time
import ast
import heapq as q

In [2]:
keywords = ['values', 'academics', 'academic', 'skills', 'skill', 'purpose', 'purposes',
                       'direction', 'mission', 'vision', 'visions', 'missions',
                       'ideals', 'cause', 'causes', 'curriculum', 'curricular',
                       'method', 'methods', 'pedagogy', 'pedagogical', 'pedagogies', 'approach', 'approaches', 'model', 'models', 'system', 'systems',
                       'structure', 'structures', 'philosophy', 'philosophical', 'philosophies', 'beliefs', 'believe', 'belief',
                       'principles', 'principle', 'creed', 'creeds', 'credo', 'moral', 'morals', 'morality', 'history', 'histories', 'our story',
                       'the story', 'school story', 'background', 'backgrounds', 'founding', 'founded', 'foundation', 'foundations', 'foundational',
                       'established','establishment', 'our school began', 'we began',
                       'doors opened', 'school opened', 'about us', 'our school', 'who we are',
                       'identity', 'identities', 'profile', 'highlights']

In [3]:
keywords_values = {'values':2, 'academics':1, 'academic':1, 'skills':1, 'skill':1, 'purpose':2, 'purposes':2,
                       'direction':1, 'mission':2, 'vision':2, 'visions':2, 'missions':2,
                       'ideals':2, 'cause':1, 'causes':1, 'curriculum':2, 'curricular':2,
                       'method':1, 'methods':1, 'pedagogy':2, 'pedagogical':1, 'pedagogies':1, 'approach':1, 'approaches':1, 'model':2, 'models':2, 'system':2, 'systems':2,
                       'structure':1, 'structures':1, 'philosophy':2, 'philosophical':2, 'philosophies':2, 'beliefs':2, 'believe':2, 'belief':2,
                       'principles':2, 'principle':2, 'creed':2, 'creeds':2, 'credo':2, 'moral':2, 'morals':2, 'morality':2, 'history':1, 'histories':1, 'our story':1,
                       'the story':1, 'school story':1, 'background':1, 'backgrounds':1, 'founding':1, 'founded':1, 'foundation':1, 'foundations':1, 'foundational':1,
                       'established':1,'establishment':1, 'our school began':1, 'we began':1,
                       'doors opened':1, 'school opened':1, 'about us':2, 'our school':1, 'who we are':1,
                       'identity':1, 'identities':1, 'profile':1, 'highlights':2}

In [4]:
#charter_path = '../../charters_full_2015.pkl'
#df_charter = pd.read_pickle(charter_path)
#df_charter['WEBTEXT']=df_charter['WEBTEXT'].fillna('') # turn nan to empty list/string for future convenience
#df_charter['CMO_WEBTEXT'] = df_charter['CMO_WEBTEXT'].fillna('0') # ugly hack so that we can apply literal_eval on column later

### Read and Format CMO Webtext Data

In [68]:
# read dataframe with CMO webtext
df_webtext_path = '/home/jovyan/work/misc_data/charters_2015.pkl'
df_webtext = pd.read_pickle(df_webtext_path)

In [69]:
# check type of webtext column
# if str then need to apply literal_eval
type(df_webtext["WEBTEXT"][0])
# check type of CMO webtext column
# if str then need to apply literal_eval
type(df_webtext["CMO_WEBTEXT"][95])

list

In [70]:
# check if there are NaNs in the webtext column
# if 0 then no need to replace NaNs
sum(df_webtext["WEBTEXT"].isna()) 
# check if there are NaNs in the CMO webtext column
# if 0 then no need to replace NaNs
sum(df_webtext["CMO_WEBTEXT"].isna())

0

In [71]:
# check what empty rows in the webtext column contain
df_webtext["WEBTEXT"][1]
# check what empty rows in the CMO webtext column contain
df_webtext["CMO_WEBTEXT"][99]

''

In [72]:
# ONLY NEED TO RUN IF THERE ARE NANS
# turn nan to empty list/string for future convenience
df_webtext['WEBTEXT']=df_webtext['WEBTEXT'].fillna('') 
# ugly hack so that we can apply literal_eval on column later
df_webtext['CMO_WEBTEXT'] = df_webtext['CMO_WEBTEXT'].fillna('0')

Note: The reasoning behind the above two lines is that we want to replace nan with a value that is compatible with the dict_count functions(something that is iterable and empty for nan) below. We will need to use literal_eval on the CMO_WEBTEXT which errors on the empty string so instead we use '0'. After calling literal_eval, we replace 0 with ''.

In [73]:
df_webtext_short = df_webtext[["NCESSCH", "WEBTEXT", "CMO_WEBTEXT"]].rename(columns={"WEBTEXT": "ORIG_WEBTEXT"})
df_webtext_short.head(5)

Unnamed: 0,NCESSCH,ORIG_WEBTEXT,CMO_WEBTEXT
0,10019700000.0,"[(http://www.maef.net/, False, 0, Evening Acad...",
1,20000100000.0,,
2,20015000000.0,"[(https://www.kgbsd.org/ketchikancharter, Fals...",
3,20015000000.0,[(http://tongassschool.org/classrooms/3rd-and-...,
4,20018000000.0,"[(https://www.asdk12.org/aquarian, False, 0, S...",


### Read and Format Unlapped Webtext Data

In [74]:
# read dataframe with unlapped webtext from webtext_unlapped_full.tsv
df_unlapped_path = "/home/jovyan/work/misc_data/webtext_raw/webtext_unlapped_full.tsv"
df_unlapped = pd.read_csv(df_unlapped_path, sep="\t")
df_unlapped = df_unlapped.rename(columns={'text_full': 'WEBTEXT'})

In [75]:
# check type of webtext column
# if str then need to apply literal_eval
type(df_unlapped["WEBTEXT"][0])

str

In [76]:
# check if there are NaNs in the webtext column
# if 0 then no need to replace NaNs
sum(df_unlapped["WEBTEXT"].isna()) 

0

In [None]:
# ONLY NEED TO RUN IF THERE ARE NANS
# ugly hack so that we can apply literal_eval on column later
df_unlapped['WEBTEXT'] = df_unlapped['WEBTEXT'].fillna('0') 

In [77]:
# check if there are webtexts that are just empty strings
# empty strings will cause errors with literal_eval
# have to replace '' with '0' in order to run literal_eval
sum(df_unlapped["WEBTEXT"].apply(lambda x: len(x)) == 0)

0

In [78]:
# need to apply literal_eval to turn str into list
type(ast.literal_eval(df_unlapped['WEBTEXT'].iloc[0]))

list

In [79]:
# apply to whole webtext column
df_unlapped['WEBTEXT'] = df_unlapped['WEBTEXT'].apply(ast.literal_eval)

In [None]:
# if there we replaced '' with '0' in the webtext column
# need to turn 0 back into '' so now all NAN are ''
#df_unlapped['WEBTEXT'] = df_unlapped['WEBTEXT'].replace(0, '') 

In [80]:
df_unlapped.head()

Unnamed: 0.1,Unnamed: 0,WEBTEXT,NCESSCH
0,0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0
1,1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0
2,2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0
3,3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0
4,4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0


### Merge CMO and Webtext Data

In [98]:
df_charter = df_unlapped.merge(df_webtext_short, on="NCESSCH")

In [99]:
df_unlapped.shape, df_webtext_short.shape, df_charter.shape

((6462, 3), (10965, 3), (6462, 5))

In [100]:
df_charter.head()

Unnamed: 0.1,Unnamed: 0,WEBTEXT,NCESSCH,ORIG_WEBTEXT,CMO_WEBTEXT
0,0,"[(http://www.maef.net/, False, 0, Evening Acad...",10019700000.0,"[(http://www.maef.net/, False, 0, Evening Acad...",
1,1,"[(https://www.kgbsd.org/ketchikancharter, Fals...",20015000000.0,"[(https://www.kgbsd.org/ketchikancharter, Fals...",
2,2,[(http://tongassschool.org/classrooms/3rd-and-...,20015000000.0,[(http://tongassschool.org/classrooms/3rd-and-...,
3,3,"[(https://www.asdk12.org/aquarian, False, 0, S...",20018000000.0,"[(https://www.asdk12.org/aquarian, False, 0, S...",
4,4,"[(http://winterberrycharterschool.com/, False,...",20018000000.0,"[(http://winterberrycharterschool.com/, False,...",


In [43]:
page_from_first_school= df_charter['WEBTEXT'].iloc[0][0][3]
page_from_first_school

"Evening Academy\nAlabama's first tuition-free public charter school serves high school students in grades 9-12 from Mobile, Baldwin, and Washington Counties. ACCEL provides a challenging college-preparatory curriculum, individualized instruction, small class sizes, and engaging use of technology in a safe, supportive environment to ensure students graduate college and career ready. Enrollment is now OPEN\nLearn More\nWelcome to the Mobile Area Education Foundation\n75K Degrees\nEvidence2Success\nGraduate Ready\nYes We Can: Building a Blueprint for Equity and Excellence in Mobile’s Schools  \nACCELerate Day and Evening Academy\nEducation Commission\nEngaging Youth Through Engineering\nHarold Dodge Fund-A-S.T.A.R.\nPartners in Education\nReading Buddy\nResearch Alliance for Multiple Pathways (RAMP)\nSuperintendent’s Student Advisory Council\nVital Link 2.0\nThe Mobile Area Education Foundation is a nonprofit organization dedicated to improving our local public schools. Founded in 1992, 

# Text Parsing

In [46]:
# current regex and split parsing does not handle separation of words by / and ()
parse_str = '123, separate the following.      \t         4 words:\n yes/no(hi), progress!'
(re.sub(r'[^\w\s]', '', parse_str)).split() 

['123', 'separate', 'the', 'following', '4', 'words', 'yesnohi', 'progress']

In [47]:
# how it works on page from website of first school
(re.sub(r'[^\w\s]', '', page_from_first_school)).split() 

['Evening',
 'Academy',
 'Alabamas',
 'first',
 'tuitionfree',
 'public',
 'charter',
 'school',
 'serves',
 'high',
 'school',
 'students',
 'in',
 'grades',
 '912',
 'from',
 'Mobile',
 'Baldwin',
 'and',
 'Washington',
 'Counties',
 'ACCEL',
 'provides',
 'a',
 'challenging',
 'collegepreparatory',
 'curriculum',
 'individualized',
 'instruction',
 'small',
 'class',
 'sizes',
 'and',
 'engaging',
 'use',
 'of',
 'technology',
 'in',
 'a',
 'safe',
 'supportive',
 'environment',
 'to',
 'ensure',
 'students',
 'graduate',
 'college',
 'and',
 'career',
 'ready',
 'Enrollment',
 'is',
 'now',
 'OPEN',
 'Learn',
 'More',
 'Welcome',
 'to',
 'the',
 'Mobile',
 'Area',
 'Education',
 'Foundation',
 '75K',
 'Degrees',
 'Evidence2Success',
 'Graduate',
 'Ready',
 'Yes',
 'We',
 'Can',
 'Building',
 'a',
 'Blueprint',
 'for',
 'Equity',
 'and',
 'Excellence',
 'in',
 'Mobiles',
 'Schools',
 'ACCELerate',
 'Day',
 'and',
 'Evening',
 'Academy',
 'Education',
 'Commission',
 'Engaging',
 'Yo

In [48]:
# easy fix although it does add an empty string at end
re.split('\W+', parse_str) 

['123',
 'separate',
 'the',
 'following',
 '4',
 'words',
 'yes',
 'no',
 'hi',
 'progress',
 '']

In [49]:
# side effect: splits phone numbers
re.split('\W+', page_from_first_school) 

['Evening',
 'Academy',
 'Alabama',
 's',
 'first',
 'tuition',
 'free',
 'public',
 'charter',
 'school',
 'serves',
 'high',
 'school',
 'students',
 'in',
 'grades',
 '9',
 '12',
 'from',
 'Mobile',
 'Baldwin',
 'and',
 'Washington',
 'Counties',
 'ACCEL',
 'provides',
 'a',
 'challenging',
 'college',
 'preparatory',
 'curriculum',
 'individualized',
 'instruction',
 'small',
 'class',
 'sizes',
 'and',
 'engaging',
 'use',
 'of',
 'technology',
 'in',
 'a',
 'safe',
 'supportive',
 'environment',
 'to',
 'ensure',
 'students',
 'graduate',
 'college',
 'and',
 'career',
 'ready',
 'Enrollment',
 'is',
 'now',
 'OPEN',
 'Learn',
 'More',
 'Welcome',
 'to',
 'the',
 'Mobile',
 'Area',
 'Education',
 'Foundation',
 '75K',
 'Degrees',
 'Evidence2Success',
 'Graduate',
 'Ready',
 'Yes',
 'We',
 'Can',
 'Building',
 'a',
 'Blueprint',
 'for',
 'Equity',
 'and',
 'Excellence',
 'in',
 'Mobile',
 's',
 'Schools',
 'ACCELerate',
 'Day',
 'and',
 'Evening',
 'Academy',
 'Education',
 'Commi

In [50]:
test_str = "hi joe joe" # used to test hit count functions below
test_list = ['hi joe joe', 'hi joe', 'hi', 'joe']

In [51]:
df_charter['WEBTEXT'].apply(len).sum() # total number of pages

375542

# Hit Count Functions

In [52]:
# Optimized dict_count attempt for cases where entries in 'custom_dict' have long word lengths

# precalculations
dict_words = [entry.split() for entry in keywords] # list words for each dict entry
dict_lengths = [len(x) for x in dict_words]
first_words = [x[0] for x in dict_words] # first words of each entry in dict

def dict_count1(text):
    words_list = re.split('\W+|_', text) # list of words in text
    # find indices where word in first_words matches word in words_list
    mask = [[word == entry for word in words_list] for entry in first_words]
    indices = np.transpose(np.nonzero(mask))
    count = 0
    for ind in indices:
        if ind[1] <= (len(words_list) - dict_lengths[ind[0]]) and dict_words[ind[0]] == words_list[ind[1] : ind[1] + dict_lengths[ind[0]]]:
            count+=1
    return count
    


In [53]:
# Repurposed dict_count and helper function in webparser_mp.py.

max_entry_length = max([len(entry.split()) for entry in keywords]) # Get length (in words) of longest entry in combined dictionary

def dict_count(text):
    
    """Performs dictionary analysis, returning number of dictionary hits found.
    Removes punctuation and stems the phrase being analyzed. 
    Compatible with multiple-word dictionary elements."""    
    
    counts = 0 # number of matches between text_list and custom_dict
    splitted_phrase = re.split('\W+|_', text) # Remove punctuation with regex that keeps only letters and spaces

    # Do dictionary analysis for word chunks of lengths max_entry_length down to 1
    for length in range(1, max_entry_length + 1):
        if len(splitted_phrase) < length:
            continue # If text chunk is shorter than length of dict entries being matched, there are no matches.
        for i in range(len(splitted_phrase) - length + 1):
            entry = ' '.join(splitted_phrase[i:i+length]) # Builds chunk of 'length' words without ending space
            if entry in keywords:
                counts += 1
    
    return counts

In [54]:
# Hybrid approach

# Separate keywords to be treated differently
small_keywords = []
large_keywords = []

for entry in keywords:
    small_keywords.append(entry) if len(entry.split()) < 3 else large_keywords.append(entry)

large_words = [entry.split() for entry in large_keywords] # list words for each large dict entry
large_lengths = [len(x) for x in large_words]
large_first_words = [x[0] for x in large_words] # first words of each large entry in dict

def dict_count2(text):

    """Hybrid of dict_count and dict_count1. 
    
    Uses dict_count1 approach to count matches for entries with > 2 words in keywords.
    Uses dict_count approach for all other entries.
    """

    counts = 0 # hitscore
    splitted_phrase = re.split('\W+|_', text.lower()) # Remove punctuation with regex that keeps only letters and spaces

    for length in range(1, 3):
        if len(splitted_phrase) < length:
            continue # If text chunk is shorter than length of dict entries being matched, there are no matches.
        for i in range(len(splitted_phrase) - length + 1):
            entry = ' '.join(splitted_phrase[i:i+length]) # Builds chunk of 'length' words without ending space
            if entry in keywords:
                counts += keywords_values[entry]
    mask = [[word == entry for word in splitted_phrase] for entry in large_first_words]
    indices = np.transpose(np.nonzero(mask))
    for ind in indices:
        if ind[1] <= (len(splitted_phrase) - large_lengths[ind[0]]) and large_words[ind[0]] == splitted_phrase[ind[1] : ind[1] + large_lengths[ind[0]]]:
            counts += keywords_values[large_keywords[ind[0]]]
    return counts

In [55]:
class Page:
    def __init__(self,p):
        self.url = p[0]
        self.boo = p[1]
        self.depth = p[2]
        self.text = p[3]
    def __repr__(self):
        return self.text
    def __eq__(self, other):
        if isinstance(other, Page):
            return self.text == other.text
        else:
            return False
    def __ne__(self, other):
        return (not self.__eq__(other))
    def __hash__(self):
        return hash(self.__repr__())

In [64]:
def filter_pages(school_pages, MIN_HITCOUNT = 1, MAX_NUMPAGES = 250, AGGRO = False, is_set = False):
    """Filters page text with hit count at least min hit count if school has more than MAX_NUMPAGES distinct pages else unfiltered of pages is returned.
    
    Returns max_numpages pages with priority given to higher hitscore and then lower page depth(even when AGGRO is TRUE). Boolean value returned is to help generate WEBTEXT_METHOD later.
    school_pages: entry of 'webtext' column
    is_set: True if school_pages is set of pages
    aggro: When true, only pages that have >= MIN_HITCOUNT hits pass. Only resort to CMO pages when no pages pass
    """
    if not is_set:
        school_pages = set([Page(p) for p in school_pages])
#     if len(pages) <= MAX_NUMPAGES:
#         return ([(p.url, p.boo, p.depth, p.text) for p in pages], 0)
    all_tuples = []
    filtered_num = 0 # number of pages that passed the hitscore requirement
    filtered = []
    max_hc = -1
    min_depth = 99999
    for p in school_pages:
        hit_count = dict_count2(p.text)
        if hit_count >= MIN_HITCOUNT:
            filtered.append((hit_count, (p.url, p.boo, p.depth, p.text)))
            filtered_num += 1
#         if max_hc < hit_count:
#             max_hc = hit_count
#         if min_depth > int(p.depth):
#             min_depth = int(p.depth)
        # maintain list containing all pages and corresponding hit scores
        all_tuples.append((hit_count, (p.url, p.boo, p.depth, p.text)))
    if not AGGRO and filtered_num and filtered_num <= MAX_NUMPAGES:
            return ([t[1] for t in filtered], False)        
    all_tuples = [(t[0] - .00001*int(t[1][2]), t[1]) for t in all_tuples] # prepare list to be heapified
    if AGGRO:
        all_tuples = filtered
    # priority number is hit_count - .00001*page.depth so pages with high hitscores are prioritized followed by low page depths
    filtered = [t[1] for t in q.nlargest(MAX_NUMPAGES, all_tuples)]
    return (filtered, filtered_num == False)

In [66]:
def run_filter(df, type = 'a', MIN_HITCOUNT = 1, MAX_NUMPAGES = 250, AGGRO = False):
    """Runs filter of given type and returns modified df.
    
    MAX_NUMPAGES: upper limit on number of pages for each school. Priority is given to pages with high hit scores. Ties broken by lower page depths
    type: column to run filter on. 'c' for CMO_WEBTEXT, 'w' for 'WEBTEXT', 'a' for complete filter with backup/last resort pages implemented
    MIN_HITCOUNT: min hit count to pass filter
    df: dataframe to be run on
    'w' and 'c' types save to checkpoint file
    aggro: When true, only pages that have >= MIN_HITCOUNT hits pass. Only resort to CMO pages when no pages pass
    """
    if type == 'w':
        df_charter = df
        print('WEBTEXT Page filter start. Min hit count: {:f}'.format(MIN_HITCOUNT))
        filtered_pages = []
        s = []
        start = time.time()
        for i, row in enumerate(df_charter['WEBTEXT'].values):
            result = filter_pages(row, MIN_HITCOUNT, MAX_NUMPAGES, AGGRO)
            filtered_pages.append(result[0])
            s.append(result[1])
            if i%1000 == 0:
                end = time.time()
                print('Time Elapsed:{:f}, Percent Complete:{:f}'.format(end - start,i*100/len(df_charter)))
        df_charter['WEBTEXT'] = pd.Series(filtered_pages, index=df_charter.index)
        df_charter['WEBTEXT_EMPTY'] = pd.Series(s, index=df_charter.index) # 0 means no filter+nonempty, False = nonempty, True = empty
        #ckpt_file_path = 'charters_full_2015{:s}{:d}_checkpoint1.pkl'.format(type,round(10*MIN_HITCOUNT))
        ckpt_file_path = 'webtext_unlapped_replaced_filtered_{:s}{:d}_checkpoint1.tsv'.format(type,round(10*MIN_HITCOUNT))
        #df_charter.to_pickle(ckpt_file_path) # checkpoint file contains new column 'FILTERED_TEXT'
        df_charter.to_csv(ckpt_file_path, sep='\t')
        print('Completed text filtering. Saved checkpoint to ' + ckpt_file_path)
    elif type == 'c':
        df_charter = df
        print('CMO_WEBTEXT Page filter start. Min hit count: {:f}'.format(MIN_HITCOUNT))
        filtered_pages = []
        s = []
        start = time.time()
        for i, row in enumerate(df_charter['CMO_WEBTEXT'].values):
            result = filter_pages(row, MIN_HITCOUNT, MAX_NUMPAGES, AGGRO)
            filtered_pages.append(result[0])
            s.append(result[1])
            if i%1000 == 0:
                end = time.time()
                print('Time Elapsed:{:f}, Percent Complete:{:f}'.format(end - start,i*100/len(df_charter)))
        df_charter['CMO_WEBTEXT'] = pd.Series(filtered_pages, index=df_charter.index)
        df_charter['CMO_WEBTEXT_EMPTY'] = pd.Series(s, index=df_charter.index)
        #ckpt_file_path = 'charters_full_2015{:s}{:d}_checkpoint1.pkl'.format(type,round(10*MIN_HITCOUNT))
        #df_charter.to_pickle(ckpt_file_path) # checkpoint file contains new column 'FILTERED_TEXT'
        ckpt_file_path = 'webtext_unlapped_replaced_filtered_{:s}{:d}_checkpoint1.tsv'.format(type,round(10*MIN_HITCOUNT))
        df_charter.to_csv(ckpt_file_path, sep='\t')
        print('Completed text filtering. Saved checkpoint to ' + ckpt_file_path)
    elif type == 'a':
        print('Complete Page filter start. Min hit count: {:f}'.format(MIN_HITCOUNT))
        filtered_pages = []
        s = []
        start = time.time()
        for i, row in enumerate(df['WEBTEXT'].values):
            pages = set([Page(p) for p in row])
            if sum([len(re.split('\W+|_', p.text)) for p in pages]) < 10: # Check num words < 10
                result_cmo = filter_pages(df.loc[df.index[i], 'CMO_WEBTEXT'], MAX_NUMPAGES, AGGRO, False)
                if result_cmo[1]:
                    result = filter_pages(pages, MIN_HITCOUNT, MAX_NUMPAGES, AGGRO, True)
                    filtered_pages.append(result[0])
                    s.append(2) # 2 means last resort webtext
                else:
                    filtered_pages.append(result_cmo[0])
                    s.append(1) # 1 in WEBTEXT_METHOD means replaced with CMO filtered  text
            else:
                result = filter_pages(pages, MIN_HITCOUNT, MAX_NUMPAGES, AGGRO, True)
                if result[1]:
                    result_cmo = filter_pages(df.loc[df.index[i], 'CMO_WEBTEXT'], MAX_NUMPAGES, AGGRO)
                    if result_cmo[1]:
                        filtered_pages.append(result[0])
                        s.append(2)
                    else:
                        filtered_pages.append(result_cmo[0])
                        s.append(1)
                else:
                    filtered_pages.append(result[0])
                    s.append(0) # 0 in WEBTEXT_METHOD means normal filter              
            if i%1000 == 0:
                end = time.time()
                print('Time Elapsed:{:f}, Percent Complete:{:f}'.format(end - start,i*100/len(df)))
        df.loc[:,'WEBTEXT_METHOD'] = np.array(s) # see above for key
        df.loc[:,'WEBTEXT'] = pd.Series(filtered_pages, index=df.index)
        return df
#         ckpt_file_path = 'charters_full_2015{:s}{:d}_checkpoint1.pkl'.format(type,round(10*MIN_HITCOUNT))
#         df.to_pickle(ckpt_file_path) # checkpoint file contains new column 'FILTERED_TEXT'
#         print('Completed text filtering. Saved checkpoint to ' + ckpt_file_path)

In [58]:
# pd.concat(results[:round(len(results)/2)]).to_pickle('charters_full_2015_250_1.pkl') # in case of kernel failure

In [59]:
# pd.concat(results[round(len(results)/2):].to_pickle('charters_full_2015_250_2.pkl')

In [65]:
# testing on first page
filter_pages(df_charter['WEBTEXT'].iloc[0], 2)

([('http://www.maef.net/accel-merit-scholar-awards/',
   'False',
   '2',
   't Scholar Awards\nStudents of ACCEL Day and Evening Academy scholars earn the Merit Award for being models of ACCEL’s Five Core Values:\nAchievement\xa0underscores the belief that we can reach our goals through focus and hard work.\nCollaboration\xa0reminds us that we can be our best and achieve the most when we work well with others.\nCare\xa0emphasizes that we must show appreciation and concern for our community and fellow-man.\nEmpowerment\xa0means that we must take the initiative to give voice and power to our interests in a responsible way.\nLifelong-Learning\xa0encourages students to seek “learning” in and out of school for the rest of their lives.\nMERIT AWARD SCHOLARS\nWeek of 9/04/2017 Winner\nJoseph Johnson\nClick to enlarge\nWeek of 8/21/2017 Winners\nBailey Davis, Larry Smith, Derrick Carson, Micah Kinlaw, Klintaveus Thompson\nClick to Enlarge\nHome\n | \nAbout\n | \nOur Work\n | \nOur Impact\n | 

# Performance

In [37]:
start = time.time()
v1=dict_count(page_from_first_school)
end = time.time()
print('dict_count:',end - start,v1) # time it took, hit count

start = time.time()
v2=dict_count1(page_from_first_school)
end = time.time()
print('dict_count1:', end - start,v2)

start = time.time()
v2=dict_count2(page_from_first_school)
end = time.time()
print('dict_count2:', end - start,v2)

dict_count: 0.005997419357299805 2
dict_count1: 0.007956266403198242 2
dict_count2: 0.004393100738525391 2


In [38]:
start = time.time()
df_charter['WEBTEXT'].iloc[:1000].apply(lambda x: [dict_count(a[3]) for a in x])
end = time.time()
print('dict_count:',end - start)
start = time.time()
df_charter['WEBTEXT'].iloc[:1000].apply(lambda x: [dict_count1(a[3]) for a in x])
end = time.time()
print('dict_count1:', end - start) # slow by itself since most keywords are only 1-2 words long
start = time.time()
df_charter['WEBTEXT'].iloc[:1000].apply(lambda x: [dict_count2(a[3]) for a in x])
end = time.time()
print('dict_count2:', end - start) # ~25% faster for given dictionary with max length keyword 3

dict_count: 156.05708813667297
dict_count1: 180.12152433395386
dict_count2: 116.56855177879333


# Text Filtering

### Run filter for 250, 100, 10 pages (AGGRO=False)

In [67]:
webtext_unlapped_replaced_filtered_250 = run_filter(df_charter)

Complete Page filter start. Min hit count: 1.000000
Time Elapsed:0.134748, Percent Complete:0.000000
Time Elapsed:126.401389, Percent Complete:15.475085
Time Elapsed:207.002477, Percent Complete:30.950170
Time Elapsed:275.838635, Percent Complete:46.425255
Time Elapsed:353.740995, Percent Complete:61.900340
Time Elapsed:444.126610, Percent Complete:77.375426
Time Elapsed:528.882757, Percent Complete:92.850511


In [84]:
webtext_unlapped_replaced_filtered_100 = run_filter(df_charter, MAX_NUMPAGES=100)

Complete Page filter start. Min hit count: 1.000000
Time Elapsed:0.138420, Percent Complete:0.000000
Time Elapsed:125.774794, Percent Complete:15.475085
Time Elapsed:206.475465, Percent Complete:30.950170
Time Elapsed:275.282981, Percent Complete:46.425255
Time Elapsed:353.765485, Percent Complete:61.900340
Time Elapsed:443.309996, Percent Complete:77.375426
Time Elapsed:528.907617, Percent Complete:92.850511


In [88]:
webtext_unlapped_replaced_filtered_10 = run_filter(df_charter, MAX_NUMPAGES=10)

Complete Page filter start. Min hit count: 1.000000
Time Elapsed:0.135902, Percent Complete:0.000000
Time Elapsed:125.631185, Percent Complete:15.475085
Time Elapsed:205.496432, Percent Complete:30.950170
Time Elapsed:274.168323, Percent Complete:46.425255
Time Elapsed:351.842780, Percent Complete:61.900340
Time Elapsed:441.701090, Percent Complete:77.375426
Time Elapsed:527.003666, Percent Complete:92.850511


### Run filter for 100, 50, 10 pages (AGGRO=True)

In [92]:
webtext_unlapped_replaced_aggro_filtered_100 = run_filter(df_charter, MAX_NUMPAGES=100, AGGRO=True)

Complete Page filter start. Min hit count: 1.000000
Time Elapsed:0.125868, Percent Complete:0.000000
Time Elapsed:126.545441, Percent Complete:15.475085
Time Elapsed:206.471148, Percent Complete:30.950170
Time Elapsed:275.222400, Percent Complete:46.425255
Time Elapsed:352.759031, Percent Complete:61.900340
Time Elapsed:442.528435, Percent Complete:77.375426
Time Elapsed:527.587749, Percent Complete:92.850511


In [97]:
webtext_unlapped_replaced_aggro_filtered_50 = run_filter(df_charter, MAX_NUMPAGES=50, AGGRO=True)

Complete Page filter start. Min hit count: 1.000000
Time Elapsed:0.124559, Percent Complete:0.000000
Time Elapsed:126.171555, Percent Complete:15.475085
Time Elapsed:206.255093, Percent Complete:30.950170
Time Elapsed:275.070494, Percent Complete:46.425255
Time Elapsed:352.404799, Percent Complete:61.900340
Time Elapsed:442.281952, Percent Complete:77.375426
Time Elapsed:526.952468, Percent Complete:92.850511


In [101]:
webtext_unlapped_replaced_aggro_filtered_10 = run_filter(df_charter, MAX_NUMPAGES=10, AGGRO=True)

Complete Page filter start. Min hit count: 1.000000
Time Elapsed:0.125017, Percent Complete:0.000000
Time Elapsed:124.357771, Percent Complete:15.475085
Time Elapsed:202.979065, Percent Complete:30.950170
Time Elapsed:270.419367, Percent Complete:46.425255
Time Elapsed:347.602115, Percent Complete:61.900340
Time Elapsed:435.992736, Percent Complete:77.375426
Time Elapsed:520.502262, Percent Complete:92.850511


### Save dataframes in webtext_raw

In [102]:
webtext_raw_path = "/home/jovyan/work/misc_data/webtext_raw/"
webtext_unlapped_replaced_filtered_250.to_csv(webtext_raw_path + "webtext_unlapped_replaced_filtered_250.tsv", sep="\t")
webtext_unlapped_replaced_filtered_100.to_csv(webtext_raw_path + "webtext_unlapped_replaced_filtered_100.tsv", sep="\t")
webtext_unlapped_replaced_filtered_10.to_csv(webtext_raw_path + "webtext_unlapped_replaced_filtered_10.tsv", sep="\t")

In [103]:
webtext_unlapped_replaced_aggro_filtered_100.to_csv(webtext_raw_path + "webtext_unlapped_replaced_aggro_filtered_100.tsv", sep="\t")
webtext_unlapped_replaced_aggro_filtered_50.to_csv(webtext_raw_path + "webtext_unlapped_replaced_aggro_filtered_50.tsv", sep="\t")
webtext_unlapped_replaced_aggro_filtered_10.to_csv(webtext_raw_path + "webtext_unlapped_replaced_aggro_filtered_10.tsv", sep="\t")

### Old dataframes (?)

In [40]:
df_charter = run_filter(df_charter, MIN_HITCOUNT = 1.5)

Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.030406, Percent Complete:0.000000
Time Elapsed:146.592043, Percent Complete:8.679802
Time Elapsed:353.703931, Percent Complete:17.359604
Time Elapsed:530.819655, Percent Complete:26.039406
Time Elapsed:677.192208, Percent Complete:34.719208
Time Elapsed:825.153257, Percent Complete:43.399011
Time Elapsed:971.440240, Percent Complete:52.078813
Time Elapsed:1107.583437, Percent Complete:60.758615
Time Elapsed:1300.784274, Percent Complete:69.438417
Time Elapsed:1603.102855, Percent Complete:78.118219
Time Elapsed:1750.707799, Percent Complete:86.798021
Time Elapsed:1901.627595, Percent Complete:95.477823


In [21]:
# Parallel if you have access to XL VM
with mp.Pool(processes = round(mp.cpu_count()/2)) as pool:
    results = pool.map(run_filter, [df_charter[300*i:i*300+300] for i in range(round(len(df_charter)/300)+1)]) # execute on chunks of 300(arbitrary)

Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.024925, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.269800, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.161939, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.174518, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.007863, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.105508, Percent Complete:0.000000
Time Elapsed:0.006315, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.075708, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.012661, Percent Complete:0.000000
Complete Page filter start. Min hit count: 1.500000
Time Elapsed:0.006639, Percent Complete

In [22]:
(pd.concat(results)).to_pickle('../../charters_full_2015_250_new.pkl')

In [24]:
pd.concat(results)['WEBTEXT_METHOD'].value_counts()

0    6583
2    4922
1      16
Name: WEBTEXT_METHOD, dtype: int64

In [11]:
# create filtered_text2 column containing 'WEBTEXT' pages with at least 2 hit counts for each school
run_filter('w', 2) # should take ~15 min


WEBTEXT Page filter start. Min hit count: 2
Time Elapsed:0.004377, Percent Complete:0.000000
Time Elapsed:82.185403, Percent Complete:8.679802
Time Elapsed:178.682582, Percent Complete:17.359604
Time Elapsed:294.779538, Percent Complete:26.039406
Time Elapsed:318.126451, Percent Complete:34.719208
Time Elapsed:318.128265, Percent Complete:43.399011
Time Elapsed:318.129419, Percent Complete:52.078813
Time Elapsed:348.292155, Percent Complete:60.758615
Time Elapsed:498.321725, Percent Complete:69.438417
Time Elapsed:719.577975, Percent Complete:78.118219
Time Elapsed:821.129462, Percent Complete:86.798021
Time Elapsed:920.829367, Percent Complete:95.477823


In [12]:
# create cmo_filtered_text2 column containing 'CMO_WEBTEXT' pages with at least 2 hit counts for each school
run_filter('c', 2) # should take ~15 min

CMO_WEBTEXT Page filter start. Min hit count: 2
Time Elapsed:0.157997, Percent Complete:0.000000
Time Elapsed:222.392479, Percent Complete:8.679802
Time Elapsed:544.479955, Percent Complete:17.359604
Time Elapsed:867.602014, Percent Complete:26.039406
Time Elapsed:939.218461, Percent Complete:34.719208
Time Elapsed:939.219584, Percent Complete:43.399011
Time Elapsed:939.220661, Percent Complete:52.078813
Time Elapsed:939.221899, Percent Complete:60.758615
Time Elapsed:939.223158, Percent Complete:69.438417
Time Elapsed:939.224231, Percent Complete:78.118219
Time Elapsed:939.225359, Percent Complete:86.798021
Time Elapsed:939.226450, Percent Complete:95.477823


In [None]:
df_charter['CMO_REPLACED'] = df_charter['WEBTEXT_METHOD'] == 1 # replaced wtih CMO filtered pages
# df_right = df_charter.groupby('CMO_NAME')['REPLACED'].sum() > 0 # df to be merged to the right of df_charter
# df_right = df_right.reset_index()
# df_right.rename(columns={"REPLACED": "CMO_REPLACED"},inplace=True)
# df_charter = pd.merge(df_charter, df_right, how = 'left', on = ['CMO_NAME'])
# ckpt_file_path = 'charters_full_2015_{:d}.pkl'.format(round(float(sys.argv[2])*10))
# df_charter.to_pickle(ckpt_file_path) # checkpoint file contains new 'CMO_REPLACED','WEBTEXT_METHOD', and filtered 'WEBTEXT' columns
# print('Completed text filtering. Saved checkpoint to ' + ckpt_file_path)

In [None]:
ckpt_file_path = 'charters_full_2015_15.pkl'
df_charter.to_pickle(ckpt_file_path)

Future Ideas

Compile list of 100 or so categorized pages and use as false negative test for efficacy of page filtering

This list could possibly be compiled using a search for common mission/school objective page titles

Assign weights to keywords to allow for more continuous filtering of schools

