In [52]:
# Import libraries & modules
import pandas as pd 
import numpy as np
import pickle
import datetime
import sys
import re
from tqdm import tqdm # Shows progress over iterations, including in pandas via "progress_apply"

# For text parsing & modeling
import spacy
nlp = spacy.load('en')
from spacy.tokenizer import Tokenizer
sptokenizer = Tokenizer(nlp.vocab)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import text
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer

In [3]:
# Import functions
sys.path.insert(0, '../../data_tools/')
from clean_text import stopwords_make, punctstr_make, unicode_make, clean_sentence

# Define stopwords, unicode, punctstr
stop_words_list = stopwords_make()
unicode_list = unicode_make()
punctstr = punctstr_make()

In [79]:
#data = pd.read_csv('../nowdata/parsing/combined_df.csv', sep = "\t", low_memory=False, encoding="utf-8")
data = pd.read_pickle('../../nowdata/charters_2015.pkl')

In [80]:
data.loc[0,"WEBTEXT"][0][3]

"Evening Academy\nAlabama's first tuition-free public charter school serves high school students in grades 9-12 from Mobile, Baldwin, and Washington Counties. ACCEL provides a challenging college-preparatory curriculum, individualized instruction, small class sizes, and engaging use of technology in a safe, supportive environment to ensure students graduate college and career ready. Enrollment is now OPEN\nLearn More\nWelcome to the Mobile Area Education Foundation\n75K Degrees\nEvidence2Success\nGraduate Ready\nYes We Can: Building a Blueprint for Equity and Excellence in Mobile’s Schools  \nACCELerate Day and Evening Academy\nEducation Commission\nEngaging Youth Through Engineering\nHarold Dodge Fund-A-S.T.A.R.\nPartners in Education\nReading Buddy\nResearch Alliance for Multiple Pathways (RAMP)\nSuperintendent’s Student Advisory Council\nVital Link 2.0\nThe Mobile Area Education Foundation is a nonprofit organization dedicated to improving our local public schools. Founded in 1992, 

## Text pre-processing

In [67]:
def master_string_make(tupslist):
    """Extract text into master text string for each school.
    Cleans and tokenizes sentences, removing punctuation and numbers and making words lower-case.
    Loops over four nested levels, which from high to low are: tuple, chunk, sentence, word.
    
    Args:
        list of four-element tuples, the last element of which holds the long string of text we care about
    Returns:
        Master string for each school/website"""
    

    len_site = len(tupslist) # Count number of pages
    known_pages = set() # Initialize list of known pages for a school
    school_string = '' # Initialize master string for text of all a school's pages
            
    # Iterate over pages
    if len_site == 0: # If site is empty, continue to next site without appending
        return
                
    for pagenum in range(len_site):
        sents_combined = ''
        if (tupslist[pagenum][3] in known_pages) or (tupslist[pagenum][3]==''): 
            continue # Skip this page if exactly the same as a previous page on this school's website
                
        for chunk in tupslist[pagenum][3].split("\n"): # Iterate over text chunks
            for sent in sent_tokenize(chunk): # Iterate over sentences
                if ((sent == []) or (len(sent) == 0) or sent==""): # If sentence is empty, continue to next sentence without appending
                    continue
                        
                # Filter out emails and URLs, remove punctuation:
                sent = " ".join(
                    [re.sub(r"["+punctstr+"]+", r'', word).strip("'").strip("-").strip(" ") 
                    for word in sent.split() if 
                    "@" not in word and not 
                    word.startswith(('http', 'https', 'www', '//', '\\', 'x_', 'x/', 'srcimage')) and not
                    word.endswith(('.com', '.net', '.gov', '.org', '.jpg', '.pdf', 'png', 'jpeg', 'php'))])
                        
                #sent = " ".join(clean_sentence(sent, remove_stopwords=False)) # Clean and tokenize sentence, then rejoin into string
                    
                sents_combined += '\n' + sent # Add sentence to list of sentences

        known_pages.add(tupslist[pagenum][3]) # Add page to known page list
        school_string += '\n' + sents_combined # Add to master string 
                            
    if school_string not in ["", "\n", 0, "0"] and len(school_string)>0:
        return(school_string)

In [68]:
def master_strings_make(tupslist_list):
    """Loop over sites, extract text into master text string for each school.
    Cleans and tokenizes sentences, removing punctuation and numbers and making words lower-case.
    This function loops over five nested levels, which from high to low are: row, tuple, chunk, sentence, word.
    
    Args:
        list of lists of four-element tuples, the last element of which holds the long string of text we care about.
        
    Returns:
        List of master strings, one per school/website"""
    
    master_strings_list = [] # Initialize list of master strings

    for tupslist in tupslist_list: # Iterate over websites/schools
        try:
            len_site = len(tupslist) # Count number of pages
            known_pages = set() # Initialize list of known pages for a school
            school_string = '' # Initialize master string for text of all a school's pages
            
            # Iterate over pages
            if len_site == 0: # If site is empty, continue to next site without appending
                continue
                
            for pagenum in range(len_site):
                sents_combined = ''
                if tupslist[pagenum][3] in known_pages or tupslist[pagenum][3]=='': 
                    continue # Skip this page if exactly the same as a previous page on this school's website
                    
                for chunk in tupslist[pagenum][3].split("\n"): # Iterate over text chunks
                    for sent in sent_tokenize(chunk): # Iterate over sentences
                        
                        if ((sent == []) or (len(sent) == 0) or sent==""): # If sentence is empty, continue to next sentence without appending
                            continue
                        
                        # Filter out emails and URLs, remove punctuation:
                        sent = " ".join(
                            [re.sub(r"["+punctstr+"]+", r'', word).strip("'").strip("-").strip(" ") 
                            for word in sent.split() if 
                            "@" not in word and not 
                            word.startswith(('http', 'https', 'www', '//', '\\', 'x_', 'x/', 'srcimage')) and not
                            word.endswith(('.com', '.net', '.gov', '.org', '.jpg', '.pdf', 'png', 'jpeg', 'php'))])
                        
                        #sent = " ".join(clean_sentence(sent, remove_stopwords=False)) # Clean and tokenize sentence, then rejoin into string
                    
                        sents_combined += '\n' + sent # Add sentence to list of sentences

                known_pages.add(tupslist[pagenum][3]) # Add page to known page list
                school_string += '\n' + sents_combined # Add to master string 
                            
            if school_string not in ["", "\n", 0, "0"] and len(school_string)>0:
                print(school_string)
                master_strings_list.append(school_string) # Add to list of master strings
        
        except Exception as e:
            print(str(e))
            pass
        
    return master_strings_list

In [81]:
# Prep data for LDA
#data['WEBTEXT'] = data['WEBTEXT'].fillna('0')
tqdm.pandas(desc="Cleaning & tokenizing sentences")
webtext_lda = [] # Initialize list of master strings
#webtext_lda = master_strings_make(data['WEBTEXT']) # Create list of master strings
webtext_lda = data['WEBTEXT'][:20].progress_apply(master_string_make)








Cleaning & tokenizing sentences:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A[A[A






Cleaning & tokenizing sentences:  10%|█         | 2/20 [00:00<00:01, 11.54it/s][A[A[A[A[A[A[A






Cleaning & tokenizing sentences:  25%|██▌       | 5/20 [00:00<00:01, 11.33it/s][A[A[A[A[A[A[A






Cleaning & tokenizing sentences:  50%|█████     | 10/20 [00:00<00:00, 14.71it/s][A[A[A[A[A[A[A






Cleaning & tokenizing sentences:  85%|████████▌ | 17/20 [00:00<00:00, 19.13it/s][A[A[A[A[A[A[A






Cleaning & tokenizing sentences: 100%|██████████| 20/20 [00:01<00:00, 19.40it/s][A[A[A[A[A[A[A

In [82]:
len(webtext_lda)

20

In [83]:
webtext_lda[0]

"\n\nEvening Academy\nAlabama's first tuition-free public charter school serves high school students in grades 9-12 from Mobile Baldwin and Washington Counties\nACCEL provides a challenging college-preparatory curriculum individualized instruction small class sizes and engaging use of technology in a safe supportive environment to ensure students graduate college and career ready\nEnrollment is now OPEN\nLearn More\nWelcome to the Mobile Area Education Foundation\n75K Degrees\nEvidence2Success\nGraduate Ready\nYes We Can Building a Blueprint for Equity and Excellence in Mobile’s Schools\nACCELerate Day and Evening Academy\nEducation Commission\nEngaging Youth Through Engineering\nHarold Dodge Fund-A-STAR\nPartners in Education\nReading Buddy\nResearch Alliance for Multiple Pathways RAMP\nSuperintendent’s Student Advisory Council\nVital Link 20\nThe Mobile Area Education Foundation is a nonprofit organization dedicated to improving our local public schools\nFounded in 1992 our mission i

## LDA

In [84]:
# Define model parameters
no_features = 5000 # number of features for vectorizer #
no_topics = 20 # number of topics
no_top_words = 20 # number of words to display

In [93]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(encoding='utf-8', lowercase=True, strip_accents='unicode', 
                                max_features=no_features, max_df=0.5, min_df=50, stop_words=stop_words_list, 
                                ngram_range=(1,3))

tf = tf_vectorizer.fit_transform(webtext_lda)
tf_feature_names = tf_vectorizer.get_feature_names()

AttributeError: 'NoneType' object has no attribute 'lower'

In [None]:
tf_feature_names[:250] +[" ","..."," "]+ tf_feature_names[-250:]

In [None]:
# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='batch', learning_offset=50.,random_state=43).fit(tf)

In [None]:
def display_topics_lda(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic " + str(topic_idx) + ': ' + " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics_lda(lda, tf_feature_names, no_top_words)
#topics to note: topic 1, topic 2, topic 4, topic , topic 15

In [51]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

BrokenProcessPool: A result has failed to un-serialize. Please ensure that the objects returned by the function are always picklable.

## NER

In [None]:
import spacy
from spacy import displacy
from collections import Counter
nlp = spacy.load('en')

In [None]:
#regex code to remove unicode/ \n's, etc
re.sub(r'\\n', ' ', text_str)
re.sub(r'\\xa', ' ', text_str)

In [None]:
#using st.tag to create list with tupled categorization for entities
tag_words = st.tag(master_string.split())
#filtering out the non named entities
named_e = [t for t in tag_lyrics if t[1] != 'O']
named_e

## NMF

In [None]:
#with open('../../nowdata/charters_2015.pkl', 'rb') as f:
#    data = pickle.load(f)

In [22]:
#webtext = data["WEBTEXT"]
webtext_nmf = []

for t in webtext:
    try:
        len_site = len(t)
        school_lst = [] # Initialize list
        for i in range(len_site):
            school_lst.append(t[i][3]) 
        webtext_nmf.append(school_lst)
    except Exception as e:
        print(str(e))
        pass

In [23]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_list)
tfidf = tfidf_vectorizer.fit_transform(webtext_nmf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


AttributeError: 'list' object has no attribute 'lower'

In [None]:
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_

In [None]:
def display_topics_nmf(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print documents[doc_index]

In [None]:
no_top_words = 20
no_top_documents = 20
display_topics_nmf(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)