# Clean webtext from charter school data

- Authors: Jaren Haber, Madeleine Peng, James Jung
- Institution: UC Berkeley
- Date created: July 26, 2019
- Date last edited: 

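Description: Loads the filtered raw webtext for charter schools, cleans and tokenizes it with `clean_sentence`, and stores the result as one space-joined string per school in the `text_full` column, ready to be merged with the school covariates.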

## Initialize

In [70]:
# Import packages
import pandas as pd # For working with DataFrames
import gc # To accelerate loading pickle files
import nltk
# Show visualizations within notebook:
%matplotlib inline 

In [71]:
import nltk
# Download NLTK's 'punkt' package, the sentence-tokenizer data that nltk.sent_tokenize
# (used in clean_webtext below) relies on. Per the output below, nothing is re-downloaded
# when the package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [72]:
# Load functions from the ../tools directory:
import sys; sys.path.insert(0, "../tools")

# For displaying basic DF info, storing DFs for memory efficiency, and loading a filtered DF:
from df_tools import check_df, convert_df, load_filtered_df, replace_df_nulls

# For quickly loading & saving pickle files in Python:
from quickpickle import quickpickle_dump, quickpickle_load 

# For saving and loading text lists to/from file:
from textlist_file import write_list, load_list 

In [73]:
# File paths (all relative)

# Pickle (.pkl) files with the charter school data
charters_path = "../../nowdata/charters_2015.pkl"
original_path = "../../nowdata/backups/charters_full_2015_250_v2a_unlappedtext_counts3_geoleaid.pkl"
filtered_path = "../../nowdata/parsing/filtered_10.pkl"

# Raw webtext (as extracted) and the filtered TSV that is loaded for cleaning
raw_folder = "../../nowdata/webtext_raw/"
raw_filtered_data = f"{raw_folder}webtext_unlapped_filtered_10.tsv"

# Cleaned webtext: save as CSV, include ONLY the columns "NCESSCH" (unique school
# identifier) and "text_full" (renamed from "WEBTEXT")
cleaned_folder = "../../nowdata/webtext_cleaned/"

In [74]:
#Import spacy and clean_text to test clean_sentence
import spacy, clean_text
from clean_text import clean_sentence

In [76]:
# Sanity-check clean_sentence on the raw text of one school web page, with slow web
# cleaning and stemming switched on and acronym removal switched off (remove_acronyms=False).
# NOTE(review): the output below contains empty-string tokens ('') and what look like
# truncated tokens (e.g. 'bmaster', 's') - confirm this is the intended behavior of
# clean_sentence, since joining such tokens with spaces yields repeated spaces.
clean_sentence("ABOUT\nSCHOOLS\nPROGRAMS\nENROLL\nPARENT LINKS\nCOMMUNITY\nCONTACT\nMORE>>\nBENNETT ACADEMY ACCELERATED HIGH SCHOOL\xa0\n\u200b\nBennett Accelerated High School\xa0\nBennett Accelerated High School will begin serving students in 9th and 10th grade in August 2018. Bennett Accelerated High School will offer the followng:\xa0\n\u200b\nA good school with good discipline\nA well-organized learning environment with a friendly vibe\nA place where teachers, staff, and students treat each other kindly and respectfully\nAn environment with friends (and we add, good friends)\nTeachers who care about their students and give them good help\nCourses that teach them what they need\nCore classes and several electives\nSports\nVision and Purpose\nGrade 9 Course Descriptions\nGrade 10 Course Descriptions\nFAQ\nEnroll Now\n© 2012 by Twenty First\xa0Century\xa0\n\xa0 \xa0 \xa0Charter Schools\n\u200b\nBennett Academy Middle School\n\u200b2930 W. Bethany Home Road. Phoenix, AZ\xa085017\nBennett Academy Primary School - Venture Site\n1535 W.\xa0Dunlap Ave. Phoenix, AZ 85021\nWebmaster Login\n\ufeffPursuant to A.R.S. §38-431.02, Twenty First\xa0Century\xa0\nSchools, Inc. hereby states that all notices of the meetings of Bennett Academy and Bennett Venture Site will be posted at each administrative office and on our website (www.bennettacademy.com). The location is open to the public Monday through Friday from 8:00 AM to 4:00 PM. Such notices will indicate the date, time and place of the meeting and will include an agenda or information concerning the manner in which the public may obtain an agenda for the meeting.\n\u200b", slow_webclean = True, stemming = True, remove_acronyms=False)

['about',
 's',
 'program',
 'enrol',
 'link',
 'commun',
 'contact',
 'more',
 'bennett',
 'academi',
 'high',
 'begin',
 'serv',
 'student',
 'th',
 'th',
 'grade',
 '',
 'offer',
 'followng',
 'a',
 'good',
 'school',
 'good',
 'disciplin',
 'a',
 'well-organ',
 'learn',
 'environ',
 'friendli',
 'vibe',
 'a',
 'place',
 'teacher',
 'staff',
 'student',
 'treat',
 'kindli',
 'respect',
 'an',
 'environ',
 'friend',
 'add',
 'good',
 'friend',
 'teacher',
 'care',
 'student',
 'give',
 'good',
 'help',
 'teach',
 'need',
 'core',
 'class',
 'sever',
 'elect',
 'sport',
 '',
 '',
 'enrol',
 'now',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'bmaster',
 '',
 'herebi',
 'state',
 'notic',
 'meet',
 'post',
 'administr',
 'offic',
 'websit',
 'wwwbennettacademycom',
 'the',
 'locat',
 'open',
 'public',
 '',
 'am',
 '',
 'pm',
 'such',
 'notic',
 'indic',
 'date',
 'time',
 'place',
 'meet',
 'includ',
 'agenda',
 'inform',
 'concern',
 'manner',
 'public',
 'obtain',
 'agenda',
 

## Define helper function(s)

In [77]:
def clean_webtext(ls):
    
    '''Clean and tokenize all of one school's webtext into a single flat list of tokens.
    
    Each text is split on newlines, each chunk is split into sentences with
    nltk.sent_tokenize, and each sentence is passed to clean_sentence (called with
    its default settings). The tokens of every non-empty cleaned sentence are
    appended, in their original order, to one flat list.
    
    Input:
        ls: list (or other iterable) of strings, one per text/page. A single string
            is also accepted: if it is the string representation of a list or tuple
            (e.g. "['page one', 'page two']", which is what a list-valued column
            looks like after a round trip through a CSV/TSV file) it is parsed back
            into its elements; any other string is treated as one text. 
            None or a float (e.g. NaN for a missing cell) is treated as "no text".
            Elements that are not strings are skipped.
    
    Output:
        list of tokens, as produced by clean_sentence, for the whole input (empty
        list if there is nothing to clean). Callers that need a single string can
        use ' '.join(...) on the result.
    
    Side effect:
        the returned list is also stored in the global variable sents_combined.'''
    
    import ast # Local import: only needed to parse stringified lists
    
    global sents_combined # Kept for backward compatibility: code outside this function may read the most recent result from this global
    
    if ls is None or isinstance(ls, float): # Missing text
        texts = []
    elif isinstance(ls, str):
        # Looping directly over a str would visit single characters rather than texts,
        # so first recover the list that the string represents (if it represents one)
        try:
            parsed = ast.literal_eval(ls.strip())
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None # Not a Python literal: treat the string as ordinary text
        texts = list(parsed) if isinstance(parsed, (list, tuple)) else [ls]
    else:
        texts = ls
    
    sents_combined = [] # Initialize flat list of all tokens for this school
    
    for s in texts: # Iterate over texts
        if not isinstance(s, str): # Nothing to split or tokenize
            continue
        for chunk in s.split('\n'): 
            for sent in nltk.sent_tokenize(chunk): # Tokenize chunk by sentences (in case >1 sentence in chunk)
                sent = clean_sentence(sent) # Clean and tokenize sentence
                if sent: # Skip sentences with nothing left after cleaning
                    sents_combined.extend(sent)
    
    return sents_combined

## Clean webtext

In [78]:
# Load data
df = pd.read_csv(raw_filtered_data, sep="\t", encoding="utf-8")[["text_full", "NCESSCH"]]
check_df(df, "NCESSCH")

# rows and cols:  (6103, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
text_full
NCESSCH


In [79]:
# Preview the first rows. Note that each text_full value is displayed as the string
# form of a list ("['...']") - it appears to be a single string, not a list of texts.
df.head()

Unnamed: 0,text_full,NCESSCH
0,['ut Reading Buddy\nThe Reading Buddy program ...,10019700000.0
1,['Select a School...\nSelect a School\nKetchik...,20015000000.0
2,['l Enrollment Homeschool\nTitle IX: Assuring ...,20015000000.0
3,['Select a School...\nSelect a School\nAbbott ...,20018000000.0
4,['eer Info\nWCC\nWPG\nBrowse: \nHome\n» \nWPG\...,20018000000.0


In [None]:
# Clean every school's webtext and collapse its tokens into one space-separated string
ls_str = [] # cleaned text per school, in row order

for i in range(len(df)):
    tokens = clean_webtext(df['text_full'][i])
    ls_str.append(' '.join(tokens))
    if not i % 500: # progress message every 500 rows
        print("Cleaned ", i, " rows")

# Replace the raw text column with the cleaned strings
df['text_full'] = ls_str

Cleaned  0  rows


## Next step: Merging!
Now merge this cleaned text with the covariates from the previous version of the charter schools data (change only the `text_full` column).