# Extract webtext from charter school data

- Author: Jaren Haber
- Institution: UC Berkeley
- Date created: July 26, 2019
- Date last edited: July 26, 2019

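Description: This simple notebook reads each version of the charter school data into memory, drops schools that have no web text (or zero words), and saves only the WEBTEXT (renamed `text_full`) and NCESSCH columns of each version to disk as a tab-separated file.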

## Initialize

In [24]:
# Import packages
import pandas as pd # For working with DataFrames
import gc # To accelerate loading pickle files
import ast # for recognizing strings (which Python thinks are floats)
from nltk import word_tokenize

# Show visualizations within notebook:
%matplotlib inline 

In [6]:
# Load helper functions from the ../tools directory:
import sys; sys.path.insert(0, "../tools")

# For displaying basic DF info, storing DFs for memory efficiency, and loading a filtered DF:
from df_tools import check_df, convert_df, load_filtered_df, replace_df_nulls

# For quickly loading & saving pickle files in Python:
from quickpickle import quickpickle_dump, quickpickle_load 

# For saving and loading text lists to/from file:
from textlist_file import write_list, load_list 

In [7]:
# File locations (all given as relative paths)

# Inputs: original data (overlaps NOT removed)
original_full_path = "../../sc_data/new_processed_df_070618.pkl"
original_filt250_path = "../../nowdata/backups/charters_full_2015_250_v2a_orgtext.pkl"

# Inputs: overlap-removed ("unlapped") data
unlapped_full_path = "../../nowdata/charters_2015.pkl"
unlapped_filt10_path = "../../nowdata/parsing/filtered_10.pkl"

# Outputs
# Raw webtext, as extracted:
raw_folder = "../../nowdata/webtext_raw/"
# Cleaned webtext: save as CSV, include ONLY the columns
# "NCESSCH" (unique school identifier) and "text_full":
cleaned_folder = "../../nowdata/webtext_cleaned/"

## Extract webtext from filtered, overlap-removed (current) charter website data

In [46]:
# Load only the WEBTEXT and NCESSCH columns of the filtered, overlap-removed data
df = load_filtered_df(
    unlapped_filt10_path,
    ["WEBTEXT", "NCESSCH"])

# Drop schools with no webtext.
# Null/zero placeholders are removed first so that len() below only ever sees
# real values (len(NaN) raises TypeError). Each mask is built from the frame it
# is applied to; chaining df[m1][m2][m3] with masks from the unfiltered frame
# can trigger pandas' "Boolean Series key will be reindexed" warning.
df = df[df["WEBTEXT"].notnull() & (df["WEBTEXT"] != 0)]
df = df[df["WEBTEXT"].apply(len) > 0]

# Drop schools whose webtext contains zero words.
# In this dataset each element of WEBTEXT is tokenized directly (i.e. it is
# treated as the page text itself). The counts live in a standalone Series, so
# no temporary column is assigned onto a filtered slice
# (avoids SettingWithCopyWarning).
num_words = df["WEBTEXT"].apply(
    lambda pages: sum(len(word_tokenize(page)) for page in pages))
df = df[num_words > 0]

# Keep the two columns of interest, rename the webtext column, renumber rows
df = df[["WEBTEXT", "NCESSCH"]].rename(columns = {"WEBTEXT" : "text_full"}).reset_index(drop=True)
check_df(df, "NCESSCH")

# Dump to disk as a tab-separated file.
# NOTE: to_csv also writes the (freshly reset) row index as a leading unnamed column.
df.to_csv(raw_folder + "webtext_unlapped_filtered_10.tsv", sep = "\t", encoding = "utf-8")

# rows and cols:  (10965, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
NCESSCH
WEBTEXT
# rows and cols:  (6103, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
text_full
NCESSCH


## Extract webtext from UN-filtered, overlap-removed charter website data

In [47]:
# Load only the WEBTEXT and NCESSCH columns of the UN-filtered, overlap-removed data
df = load_filtered_df(
    unlapped_full_path,
    ["WEBTEXT", "NCESSCH"])

# Drop schools with no webtext.
# Null/zero placeholders are removed first so that len() below only ever sees
# real values (len(NaN) raises TypeError). Each mask is built from the frame it
# is applied to; chaining df[m1][m2][m3] with masks from the unfiltered frame
# can trigger pandas' "Boolean Series key will be reindexed" warning.
df = df[df["WEBTEXT"].notnull() & (df["WEBTEXT"] != 0)]
df = df[df["WEBTEXT"].apply(len) > 0]

# Drop schools whose webtext contains zero words.
# Each non-empty entry of WEBTEXT is indexed at position 3 to get the text that
# is tokenized; empty entries ([]) are skipped. The counts live in a standalone
# Series, so no temporary column is assigned onto a filtered slice
# (avoids SettingWithCopyWarning).
num_words = df["WEBTEXT"].apply(
    lambda pages: sum(len(word_tokenize(tup[3])) for tup in pages if tup != []))
df = df[num_words > 0]

# Keep the two columns of interest, rename the webtext column, renumber rows
df = df[["WEBTEXT", "NCESSCH"]].rename(columns = {"WEBTEXT" : "text_full"}).reset_index(drop=True)
check_df(df, "NCESSCH")

# Dump to disk as a tab-separated file.
# NOTE: to_csv also writes the (freshly reset) row index as a leading unnamed column.
df.to_csv(raw_folder + "webtext_unlapped_full.tsv", sep = "\t", encoding = "utf-8")

# rows and cols:  (10965, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
NCESSCH
WEBTEXT
# rows and cols:  (6462, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
text_full
NCESSCH


## Extract webtext from UN-filtered, UN-overlap-removed (original) charter website data

In [67]:
# Load the original data and keep only its webtext ("data") and NCESSCH columns,
# renaming "data" to WEBTEXT for consistency with the other datasets.
# The rename is chained (not inplace) so a column slice is never modified in place.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
df = pd.read_pickle(original_full_path)
df = df[["data", "NCESSCH"]].rename(columns = {"data" : "WEBTEXT"})

# Drop schools with no webtext.
# Null/zero placeholders are removed first so that len() below only ever sees
# real values (len(NaN) raises TypeError). Each mask is built from the frame it
# is applied to; chaining df[m1][m2][m3] with masks from the unfiltered frame
# can trigger pandas' "Boolean Series key will be reindexed" warning.
df = df[df["WEBTEXT"].notnull() & (df["WEBTEXT"] != 0)]
df = df[df["WEBTEXT"].apply(len) > 0]

# Drop schools whose webtext contains zero words.
# Each non-empty entry of WEBTEXT is indexed at position 3 to get the text that
# is tokenized; empty entries ([]) are skipped. The counts live in a standalone
# Series, so no temporary column is assigned onto a filtered slice
# (avoids SettingWithCopyWarning).
num_words = df["WEBTEXT"].apply(
    lambda pages: sum(len(word_tokenize(tup[3])) for tup in pages if tup != []))
df = df[num_words > 0]

# Keep the two columns of interest, rename the webtext column, renumber rows
df = df[["WEBTEXT", "NCESSCH"]].rename(columns = {"WEBTEXT" : "text_full"}).reset_index(drop=True)
check_df(df, "NCESSCH")

# Dump to disk as a tab-separated file.
# NOTE: to_csv also writes the (freshly reset) row index as a leading unnamed column.
df.to_csv(raw_folder + "webtext_original_full.tsv", sep = "\t", encoding = "utf-8")

# rows and cols:  (6504, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
text_full
NCESSCH


## Extract webtext from filtered (250), UN-overlap-removed (original) charter website data

In [25]:
# Load only the WEBTEXT and NCESSCH columns of the filtered (250), original data
df = load_filtered_df(original_filt250_path, ["WEBTEXT", "NCESSCH"])

# Drop schools with no webtext.
# Null/zero placeholders are removed first so that len() can then be applied to
# the real values. (Measuring len() of the str() form instead is always > 0 --
# e.g. "nan", "0" and "[]" are all non-empty strings -- so it filters nothing.)
# Each mask is built from the frame it is applied to; chaining df[m1][m2][m3]
# with masks from the unfiltered frame can trigger pandas' "Boolean Series key
# will be reindexed" warning.
df = df[df["WEBTEXT"].notnull() & (df["WEBTEXT"] != 0)]
df = df[df["WEBTEXT"].apply(len) > 0]

# Drop schools whose webtext contains zero words.
# Each non-empty entry of WEBTEXT is indexed at position 3 to get the text that
# is tokenized; empty entries ([]) are skipped. WEBTEXT itself is left unchanged.
# The counts live in a standalone Series, so no temporary column is assigned
# onto a filtered slice (avoids SettingWithCopyWarning).
num_words = df["WEBTEXT"].apply(
    lambda pages: sum(len(word_tokenize(tup[3])) for tup in pages if tup != []))
df = df[num_words > 0]

# Keep the two columns of interest, rename the webtext column, renumber rows
df = df[["WEBTEXT", "NCESSCH"]].rename(columns = {"WEBTEXT" : "text_full"}).reset_index(drop=True)
check_df(df, "NCESSCH")

# Dump to disk as a tab-separated file.
# NOTE: to_csv also writes the (freshly reset) row index as a leading unnamed column.
df.to_csv(raw_folder + "webtext_original_filtered_250.tsv", sep = "\t", encoding = "utf-8")

# rows and cols:  (6463, 2)
# duplicates by NCESSCH: 0

Columns and # missing cases (if any): 
text_full
NCESSCH
