## Import libraries & modules

In [45]:
import pandas as pd 
import numpy as np
import pickle
import datetime
import sys
import re
from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply"

# For text parsing & modeling
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import text
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
from gensim.models.phrases import Phrases # Makes word2vec more robust: Looks not just at  To look for multi-word phrases within word2vec
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer() # approximate but effective (and common) method of stemming words

# For accurate text tokenization
import spacy
nlp = spacy.load('en', disable=['ner'])
#nlp.remove_pipe('ner')
#nlp.remove_pipe('parser')
#nlp.remove_pipe('tagger')
#nlp.add_pipe(nlp.create_pipe('sentencizer')) # For working with sentences

from spacy.tokenizer import Tokenizer
sptokenizer = Tokenizer(nlp.vocab)

def spacy_tokenizer(doc):
    return [x.orth_ for x in nlp(doc)]

In [3]:
# Import functions
sys.path.insert(0, '../../data_tools/')
from clean_text import stopwords_make, punctstr_make, unicode_make, clean_sentence
from quickpickle import quickpickle_dump, quickpickle_load # For quickly loading & saving pickle files in Python
from df_tools import check_df, load_filtered_df # For quick DF stats

# Define stopwords, unicode, punctstr
stop_words_list = stopwords_make()
unicode_list = unicode_make()
punctstr = punctstr_make()

In [4]:
#webtext = quickpickle_load("../data/webtext_quickcleaned.pickle")
#webtext[0]

In [5]:
#data = pd.read_csv('../nowdata/parsing/combined_df.csv', sep = "\t", low_memory=False, encoding="utf-8")
data = load_filtered_df('../../nowdata/charters_2015.pkl', ['WEBTEXT'])
data.loc[0,"WEBTEXT"][0][3]

"Evening Academy\nAlabama's first tuition-free public charter school serves high school students in grades 9-12 from Mobile, Baldwin, and Washington Counties. ACCEL provides a challenging college-preparatory curriculum, individualized instruction, small class sizes, and engaging use of technology in a safe, supportive environment to ensure students graduate college and career ready. Enrollment is now OPEN\nLearn More\nWelcome to the Mobile Area Education Foundation\n75K Degrees\nEvidence2Success\nGraduate Ready\nYes We Can: Building a Blueprint for Equity and Excellence in Mobile’s Schools  \nACCELerate Day and Evening Academy\nEducation Commission\nEngaging Youth Through Engineering\nHarold Dodge Fund-A-S.T.A.R.\nPartners in Education\nReading Buddy\nResearch Alliance for Multiple Pathways (RAMP)\nSuperintendent’s Student Advisory Council\nVital Link 2.0\nThe Mobile Area Education Foundation is a nonprofit organization dedicated to improving our local public schools. Founded in 1992, 

In [6]:
check_df(data)

# rows and cols:  (10965, 1)

Columns and # missing cases (if any): 
WEBTEXT


In [7]:
print("# rows in raw data before dropping empty WEBTEXT: ", len(data))
data = data[data["WEBTEXT"] != ''][data["WEBTEXT"].notna()] # Drop where WEBTEXT is empty
print("# rows in raw data before dropping empty WEBTEXT: ", len(data))

# rows in raw data before dropping empty WEBTEXT:  10965
# rows in raw data before dropping empty WEBTEXT:  6862


  


## Text pre-processing

In [98]:
def master_string_make(tupslist):
    """Extract text into master text string for each school.
    Cleans and tokenizes sentences, removing punctuation and numbers and making words lower-case.
    Loops over four nested levels, which from high to low are: tuple, chunk, sentence, word.
    
    Args:
        list of four-element tuples, the last element of which holds the long string of text we care about
    Returns:
        Master string for each school/website"""
    

    len_site = len(tupslist) # Count number of pages
    known_pages = set() # Initialize list of known pages for a school
    school_string = '' # Initialize master string for text of all a school's pages
            
    # Iterate over pages
    if len_site == 0 or not tupslist: # If site is empty, continue to next site without appending
        return
                
    for pagenum in range(len_site):
        sents_combined = ''
        if (tupslist[pagenum][3] in known_pages) or (tupslist[pagenum][3]==''): 
            continue # Skip this page if exactly the same as a previous page on this school's website
                
        for chunk in tupslist[pagenum][3].split("\n"): # Iterate over text chunks
            for sent in sent_tokenize(chunk): #nlp(chunk).sents: # Iterate over sentences
                #sent = sent.text
                if ((sent == []) or (len(sent) == 0) or sent=="" or not sent): # If sentence is empty, continue to next sentence without appending
                    continue
                        
                # Filter out emails and URLs, remove punctuation:
                sent = " ".join(
                    [ps.stem(re.sub(r"["+punctstr+"]+", r'', word).strip("'").strip("-").strip(" ")) 
                    for word in sent.split() if 
                    word and 
                    "@" not in word and not 
                    word.startswith(('http', 'https', 'www', '//', '\\', 'x_', 'x/', 'srcimage')) and not
                    word.endswith(('.com', '.net', '.gov', '.org', '.jpg', '.pdf', 'png', 'jpeg', 'php'))])
                    
                sents_combined += ('\n' + sent) # Add sentence to list of sentences

        known_pages.add(tupslist[pagenum][3]) # Add page to known page list
        school_string += ('\n' + sents_combined) # Add to master string 
                            
    if school_string != '' and school_string not in ["", "\n", 0, "0"] and len(school_string)>0 and school_string != None:
        return(school_string)

In [114]:
# Prep data
tqdm.pandas(desc="Cleaning & tokenizing sentences")
webtext = [] # Initialize list of master strings
webtext = data['WEBTEXT'][:50].progress_apply(master_string_make)



















Cleaning & tokenizing sentences:   0%|          | 0/50 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Cleaning & tokenizing sentences:   4%|▍         | 2/50 [00:00<00:14,  3.23it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Cleaning & tokenizing sentences:   8%|▊         | 4/50 [00:01<00:17,  2.70it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Cleaning & tokenizing sentences:  12%|█▏        | 6/50 [00:02<00:14,  2.94it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Cleaning & tokenizing sentences:  16%|█▌        | 8/50 [00:02<00:11,  3.61it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Cleaning & tokenizing sentences:  24%|██▍       | 12/50 [00:02<00:07,  4.76it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Cleaning & tokenizing sentences:  26%|██▌       | 13/50 [00:03<00:17,  2.11it/s][A

In [117]:
webtext[0]

"\n\neven academi\nalabama' first tuition-fre public charter school serv high school student in grade 9-12 from mobil baldwin and washington counti\naccel provid a challeng college-preparatori curriculum individu instruct small class size and engag use of technolog in a safe support environ to ensur student graduat colleg and career readi\nenrol is now open\nlearn more\nwelcom to the mobil area educ foundat\n75k degre\nevidence2success\ngraduat readi\nye We can build a blueprint for equiti and excel in mobile’ school\nacceler day and even academi\neduc commiss\nengag youth through engin\nharold dodg fund-a-star\npartner in educ\nread buddi\nresearch allianc for multipl pathway ramp\nsuperintendent’ student advisori council\nvital link 20\nthe mobil area educ foundat is a nonprofit organ dedic to improv our local public school\nfound in 1992 our mission is to build commun respons for improv public educ outcom in mobil counti\nWe work with but are independ of the mobil counti public scho

In [106]:
list(nlp(webtext[0]).sents)

[
 
 even academi
 alabama' first tuition-fre public charter school serv high school student in grade,
 9-12 from mobil baldwin and washington counti
 accel provid a challeng college-preparatori curriculum individu instruct small class size and engag use of technolog in a safe support environ to ensur student graduat colleg and career readi
 enrol is now open
 learn more
 welcom to the mobil area educ foundat
 75k degre
 evidence2success
 graduat readi,
 ye,
 We can build a blueprint for equiti and excel in mobile’ school
 acceler day and even academi
 educ commiss
 engag youth through engin
 harold dodg fund-a-star
 partner in educ
 read buddi
 research allianc for multipl pathway ramp
 superintendent’ student advisori council
 vital link 20,
 the mobil area educ foundat is a nonprofit organ dedic to improv our local public school
 found in 1992 our mission is to build commun respons for improv public educ outcom in mobil counti,
 We work with but are independ of the mobil counti publ

In [118]:
for chunk in data["WEBTEXT"][0][0][3].split("\n"):
    for sent in nlp(chunk).sents:
        print(sent.text.encode("utf-8").decode("utf-8"))
        print(sent.text)
        print()
    
    #for sent in sent_tokenize(chunk):
    #    print(sent)

Evening Academy
Evening Academy

Alabama's first tuition-free public charter school serves high school students in grades 9-12 from Mobile, Baldwin, and Washington Counties.
Alabama's first tuition-free public charter school serves high school students in grades 9-12 from Mobile, Baldwin, and Washington Counties.

ACCEL provides a challenging college-preparatory curriculum, individualized instruction, small class sizes, and engaging use of technology in a safe, supportive environment to ensure students graduate college and career ready.
ACCEL provides a challenging college-preparatory curriculum, individualized instruction, small class sizes, and engaging use of technology in a safe, supportive environment to ensure students graduate college and career ready.

Enrollment is now OPEN
Enrollment is now OPEN

Learn More
Learn More

Welcome to the Mobile Area Education Foundation
Welcome to the Mobile Area Education Foundation

75K Degrees
75K Degrees

Evidence2Success
Evidence2Success

Gr

In [103]:
print("# rows in cleaned data before dropping empty WEBTEXT: ", len(webtext))
webtext = webtext.dropna()
print("# rows in cleaned data after dropping empty WEBTEXT: ", len(webtext))
print()
print(webtext.loc[:50])

# rows in cleaned data before dropping empty WEBTEXT:  50
# rows in cleaned data after dropping empty WEBTEXT:  48

0     \n\neven academi\nalabama' first tuition-fre p...
2     \n\nselect a school\nselect a school\nketchika...
3     \n\nl enrol homeschool\ntitl IX assur gender e...
4     \n\nselect a school\nselect a school\nabbott l...
8     \n\nwednesday may 23rd 1130\nsuppli fee overdu...
10    \n\nes\nlogin\nfrontier' purpos help parent he...
11    \n\nselect a school\nselect a school\nabbott l...
13    \n\nselect a school\nselect a school\nabbott l...
14    \n\nselect a school\nselect a school\nabbott l...
15    \n\njuneau commun charter school\njcc\nA learn...
17    \n\nfirewe academi\n995 soundview ave homer AK...
18    \n\n162 park ave soldotna AK 99669\nskip to co...
19    \n\nkaleidoscop school of art and scienc\n549 ...
20    \n\nselect a school\nselect a school\nacademi ...
21    \n\n mr macdonald\n Ms mose\n mr darnel\nabout...
22    \n\nselect a school\nselect a school\n

In [12]:
#quickpickle_dump(webtext, "../data/webtext_quickcleaned.pickle")
#webtext = quickpickle_load("../data/webtext_quickcleaned.pickle")

In [104]:
print("Detecting and parsing phrases in website text...")

# Threshold represents a threshold for forming the phrases (higher means fewer phrases). 
# A phrase of words a and b is accepted if (cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold, 
# where N is the total vocabulary size. By default this value is 10.0.

# Detect phrases in sentences based on collocation counts
phrases = Phrases(sentences=[site for site in webtext], 
                  delimiter=b'_', common_terms=stop_words_list, 
                  threshold=10, min_count=10) 

# Apply phrase detection model to each sentence in data
#webtext = [phrases[chunk] for chunk in tqdm(webtext, desc="Parsing phrases")] 

Detecting and parsing phrases in website text...


In [120]:
sent_tokenize(webtext[0])

["\n\neven academi\nalabama' first tuition-fre public charter school serv high school student in grade 9-12 from mobil baldwin and washington counti\naccel provid a challeng college-preparatori curriculum individu instruct small class size and engag use of technolog in a safe support environ to ensur student graduat colleg and career readi\nenrol is now open\nlearn more\nwelcom to the mobil area educ foundat\n75k degre\nevidence2success\ngraduat readi\nye We can build a blueprint for equiti and excel in mobile’ school\nacceler day and even academi\neduc commiss\nengag youth through engin\nharold dodg fund-a-star\npartner in educ\nread buddi\nresearch allianc for multipl pathway ramp\nsuperintendent’ student advisori council\nvital link 20\nthe mobil area educ foundat is a nonprofit organ dedic to improv our local public school\nfound in 1992 our mission is to build commun respons for improv public educ outcom in mobil counti\nWe work with but are independ of the mobil counti public sch

In [126]:
list(webtext)[0]

"\n\neven academi\nalabama' first tuition-fre public charter school serv high school student in grade 9-12 from mobil baldwin and washington counti\naccel provid a challeng college-preparatori curriculum individu instruct small class size and engag use of technolog in a safe support environ to ensur student graduat colleg and career readi\nenrol is now open\nlearn more\nwelcom to the mobil area educ foundat\n75k degre\nevidence2success\ngraduat readi\nye We can build a blueprint for equiti and excel in mobile’ school\nacceler day and even academi\neduc commiss\nengag youth through engin\nharold dodg fund-a-star\npartner in educ\nread buddi\nresearch allianc for multipl pathway ramp\nsuperintendent’ student advisori council\nvital link 20\nthe mobil area educ foundat is a nonprofit organ dedic to improv our local public school\nfound in 1992 our mission is to build commun respons for improv public educ outcom in mobil counti\nWe work with but are independ of the mobil counti public scho

In [123]:
webtext = [sent_tokenize(site.split("\n")) for site in list(webtext)]
webtext[0]

TypeError: expected string or bytes-like object

In [112]:
# Apply phrase detection model to each sentence in data, while removing digits
webtext = " ".join(
                   [phrases[
                            [word for word in sentence.strip() if not word.isdigit()]
                           ]
                   for sentence in tqdm(
                                        webtext, desc="Parsing phrases"
                                       )
                   ]
                  )


















Parsing phrases:   0%|          | 0/48 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

AttributeError: 'list' object has no attribute 'strip'

In [26]:
webtext[0]

"\n\neven academi\nalabama' first tuition-fre public charter school serv high school student in grade 9-12 from mobil baldwin and washington counti\naccel provid a challeng college-preparatori curriculum individu instruct small class size and engag use of technolog in a safe support environ to ensur student graduat colleg and career readi\nenrol is now open\nlearn more\nwelcom to the mobil area educ foundat\n75k degre\nevidence2success\ngraduat readi\nye We can build a blueprint for equiti and excel in mobile’ school\nacceler day and even academi\neduc commiss\nengag youth through engin\nharold dodg fund-a-star\npartner in educ\nread buddi\nresearch allianc for multipl pathway ramp\nsuperintendent’ student advisori council\nvital link 20\nthe mobil area educ foundat is a nonprofit organ dedic to improv our local public school\nfound in 1992 our mission is to build commun respons for improv public educ outcom in mobil counti\nWe work with but are independ of the mobil counti public scho

In [None]:
# Apply phrase detection model to each sentence in data
#webtext = [phrases[chunk] for chunk in tqdm(webtext, desc="Parsing phrases")] 

In [14]:
quickpickle_dump(webtext, "../data/webtext_quickcleaned.pickle")

## LDA

In [None]:
# Define model parameters
no_features = 5000 # number of features for vectorizer #
no_topics = 20 # number of topics
no_top_words = 20 # number of words to display

In [None]:
# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(encoding='utf-8', lowercase=True, strip_accents='unicode', tokenizer=spacy_tokenizer,
                                max_features=no_features, max_df=0.5, min_df=50, stop_words=stop_words_list, 
                                ngram_range=(1,1), decode_error='replace')

tf = tf_vectorizer.fit_transform(webtext)
tf_feature_names = tf_vectorizer.get_feature_names()

In [None]:
tf_feature_names[:250] +[" ","..."," "]+ tf_feature_names[-250:]

In [None]:
# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=20, learning_method='batch', 
                                learning_offset=50., random_state=43, n_jobs=-1).fit(tf)

In [None]:
def display_topics_lda(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic " + str(topic_idx) + ': ' + " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics_lda(lda, tf_feature_names, no_top_words)
#topics to note: topic 1, topic 2, topic 4, topic , topic 15

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

## NMF

In [None]:
#with open('../../nowdata/charters_2015.pkl', 'rb') as f:
#    data = pickle.load(f)

In [None]:
#tfidf_vectorizer = TfidfVectorizer(encoding='utf-8', lowercase=True, strip_accents='unicode', norm="l2",  
#                                   max_features=no_features, max_df=0.5, min_df=50, stop_words=stop_words_list,
#                                   ngram_range=(1,3), decode_error="replace")
#
#tfidf = tfidf_vectorizer.fit_transform(webtext)
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tfidf = TfidfTransformer().fit_transform(tf) # Transform TF into TFIDF

In [None]:
nmf_model = NMF(n_components=no_topics, random_state=43, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_

In [None]:
def display_topics_nmf(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print documents[doc_index]

In [None]:
no_top_words = 20
no_top_documents = 20
display_topics_nmf(nmf_H, nmf_W, tf_feature_names, documents, no_top_words, no_top_documents)