# For cleaner/updated version, see count_dict

In [1]:
import multiprocessing as mp
import pandas as pd
import re
import numpy as np
import time
from nltk.stem.porter import PorterStemmer


# Porter stemmer shared by the dictionary and page preprocessing further down.
stemmer = PorterStemmer()
stem = stemmer.stem  # bound method used as the stemming function

# Source DataFrame, read from a pickle file.
charter_path = '../../charters_full_2015_15_250.pkl'
df_charter = pd.read_pickle(charter_path)

# Keyword dictionaries: one file per name, one entry per line.
dict_path = '/home/jovyan/work/Charter-school-identities/dicts/'
dict_names = ['discipline', 'inquiry']  # file names under dict_path (opened exactly as given)
dicts = []  # dicts[k] holds the lines of the file named dict_names[k]
for dname in dict_names:
    with open(dict_path + dname) as f:
        entries = f.read().splitlines()
    dicts.append(entries)

In [2]:
# Tabulate how many rows of df_charter carry each distinct WEBTEXT_METHOD value.
df_charter['WEBTEXT_METHOD'].value_counts()

3    11182
0      339
Name: WEBTEXT_METHOD, dtype: int64

In [2]:
class Page:
    """One scraped page, identified for comparison purposes by its text only.

    p: indexable with at least four items, read as p[0]=url, p[1]=boo,
       p[2]=depth, p[3]=text.

    Two pages are equal (and hash alike) whenever their ``text`` is equal, so
    putting pages into a set removes pages with duplicate text regardless of
    url or depth.
    """

    def __init__(self, p):
        self.url, self.boo, self.depth, self.text = p[0], p[1], p[2], p[3]

    def __repr__(self):
        # The page text itself doubles as the representation.
        return self.text

    def __eq__(self, other):
        # Anything that is not a Page never compares equal.
        return isinstance(other, Page) and self.text == other.text

    def __ne__(self, other):
        equal = self.__eq__(other)
        return not equal

    def __hash__(self):
        # Hash follows the text (via __repr__) so that it agrees with __eq__.
        return hash(self.__repr__())
    
def dict_precalc(keywords, stem_func=None):
    """Tokenise and stem a keyword dictionary into the lists used by dict_count2.

    Every entry is split on runs of non-word characters and on underscores,
    and each resulting token is stemmed. Entries that split into three or
    more tokens are treated as "large" entries.

    keywords: dictionary entries (strings), e.g. the lines of one dictionary file
    stem_func: optional callable applied to every token; when omitted the
               module-level ``stem`` function is used

    Returns the list [large_words, large_lengths, large_first_words, key_words]:
        large_words: stemmed token list of every large entry
        large_lengths: number of tokens of each large entry
        large_first_words: first stemmed token of each large entry
        key_words: stemmed token list of every entry (short and large alike)
    """
    if stem_func is None:
        stem_func = stem  # module-level stemmer

    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    splitter = re.compile(r'\W+|_')

    # Stem each entry exactly once; the large entries are selected from this list.
    key_words = [[stem_func(tok) for tok in splitter.split(entry)] for entry in keywords]

    # The large entries are collected locally. The previous code appended to
    # small_keywords / large_keywords, names that were never defined in this
    # function (NameError, or state shared between calls if globals existed).
    large_words = [list(words) for words in key_words if len(words) >= 3]
    large_lengths = [len(words) for words in large_words]
    large_first_words = [words[0] for words in large_words]
    return [large_words, large_lengths, large_first_words, key_words]


def dict_count2(key_words, large_words, large_lengths, large_first_words, pages):

    """Count dictionary hits over a collection of tokenised pages.

    key_words: token lists of all dictionary entries; only the 1- and 2-token
               entries are matched through this argument
    large_words: token lists of the entries with three or more tokens
    large_lengths: number of tokens of each entry in large_words
    large_first_words: first token of each entry in large_words
    pages: preprocessed pages, each a list of tokens, corresponding to an
           entry of the 'webtext' column

    Returns a tuple (counts, res_length):
        counts: total number of matches found across all pages
        res_length: sum over all matches of (tokens in the matched entry - 1)
    """
    # Only 1- and 2-token entries can match the short n-gram scan. Holding them
    # as a set of tuples makes each lookup a hash probe instead of a scan of
    # the whole dictionary for every n-gram of every page.
    short_entries = {tuple(words) for words in key_words if 1 <= len(words) <= 2}

    # Index the large entries by first token so that only entries starting with
    # the current token are compared, instead of building a dense
    # (large entries x page tokens) boolean matrix for every page.
    large_by_first = {}
    for idx, first in enumerate(large_first_words):
        large_by_first.setdefault(first, []).append(idx)
    large_entries = [tuple(words) for words in large_words]

    counts = 0  # number of matches between page tokens and dictionary entries
    res_length = 0
    for splitted_phrase in pages:
        n_tokens = len(splitted_phrase)

        # Short entries: slide windows of 1 and 2 tokens over the page. A page
        # shorter than the window yields an empty range, i.e. no matches.
        for length in (1, 2):
            for i in range(n_tokens - length + 1):
                if tuple(splitted_phrase[i:i + length]) in short_entries:
                    counts += 1
                    res_length += length - 1

        if not large_by_first:
            continue

        # Large entries: wherever a token equals the first token of an entry,
        # compare the following tokens against the full entry.
        for pos, word in enumerate(splitted_phrase):
            for idx in large_by_first.get(word, ()):
                size = large_lengths[idx]
                if pos <= n_tokens - size and large_entries[idx] == tuple(splitted_phrase[pos:pos + size]):
                    counts += 1
                    res_length += size - 1
    return counts, res_length

In [3]:
# Precompute the stemmed lookup lists (output of dict_precalc) for three keyword dictionaries.
# NOTE(review): dict_ess, dict_prog and dict_rit are not defined anywhere in the
# visible part of this file (only the `dicts` list is loaded from disk above) —
# confirm where they are supposed to come from before running this cell.
disc_pre = dict_precalc(dict_ess)
inq_pre = dict_precalc(dict_prog)
rit_pre = dict_precalc(dict_rit)

In [4]:
def parallel_count(df):
    """Count dictionary hits for every row of df and store them as new columns.

    For each row, the pages in the 'WEBTEXT' column are de-duplicated by text
    (via Page), tokenised and stemmed the same way as the dictionaries, and
    matched against every dictionary in the module-level ``dicts`` list.

    df: DataFrame with a 'WEBTEXT' column whose cells are iterables of page
        records accepted by Page (url, boo, depth, text)

    For each name in the module-level ``dict_names`` (upper-cased as NAME) two
    columns are written into df:
        NAME_COUNT: number of dictionary hits on the row's pages
        NAME_STR:   log10(hits / (number of tokens - res_length)), where
                    res_length is the second value returned by dict_count2;
                    infinite values are replaced by NaN

    Returns df (modified in place).

    NOTE(review): the earlier body appended to lists that were commented out
    (ess_count, e_res, ...) and read names that were never assigned (ess_c,
    ess_pre, prog_pre, ...), so it could not run. This version completes the
    refactor around precalc_list / count_list / res_list; as a consequence the
    columns are named after dict_names rather than ESS / PROG / RIT — confirm
    against downstream consumers.
    """
    precalc_list = [dict_precalc(d) for d in dicts]
    count_list = [[] for _ in precalc_list]  # per dictionary: hit count of every row
    # Per dictionary: sum of (matched entry length - 1) of every row, used to
    # correct the token total for multi-word matches in the hit ratio.
    res_list = [[] for _ in precalc_list]
    num_words = []  # total number of tokens on the pages of every row

    splitter = re.compile(r'\W+|_')  # same split as in dict_precalc
    start = time.time()
    for i, row in enumerate(df['WEBTEXT'].values):
        pages = {Page(p) for p in row}  # set membership drops pages with identical text
        pages = [[stem(x) for x in splitter.split(p.text)] for p in pages]
        num_words.append(sum(len(p) for p in pages))
        for pre, counts, residuals in zip(precalc_list, count_list, res_list):
            large_words, large_lengths, large_first_words, key_words = pre
            hits, residual = dict_count2(key_words, large_words, large_lengths, large_first_words, pages)
            counts.append(hits)
            residuals.append(residual)
        if i % 100 == 0:
            end = time.time()
            print('Time Elapsed:{:f}, Percent Complete:{:f}'.format(end - start, i*100/len(df)))

    total_words = np.array(num_words)
    for name, counts in zip(dict_names, count_list):
        df.loc[:, name.upper() + '_COUNT'] = np.array(counts)
    # Rows without hits or without tokens give log10(0) or a division by zero;
    # the resulting inf values are turned into NaN below, so silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        for name, counts, residuals in zip(dict_names, count_list, res_list):
            df.loc[:, name.upper() + '_STR'] = np.log10(np.array(counts) / (total_words - np.array(residuals)))
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

In [7]:
# Sanity check of the chunking expression on a plain range of 11000 indices:
# consecutive slices of 300 indices each.
# Note: round(11000/300) + 1 == 38, so the last slice (i == 37) starts past the end and is empty.
[range(11000)[300*i:i*300+300] for i in range(round(11000/300)+1)] # checking for below

[range(0, 300),
 range(300, 600),
 range(600, 900),
 range(900, 1200),
 range(1200, 1500),
 range(1500, 1800),
 range(1800, 2100),
 range(2100, 2400),
 range(2400, 2700),
 range(2700, 3000),
 range(3000, 3300),
 range(3300, 3600),
 range(3600, 3900),
 range(3900, 4200),
 range(4200, 4500),
 range(4500, 4800),
 range(4800, 5100),
 range(5100, 5400),
 range(5400, 5700),
 range(5700, 6000),
 range(6000, 6300),
 range(6300, 6600),
 range(6600, 6900),
 range(6900, 7200),
 range(7200, 7500),
 range(7500, 7800),
 range(7800, 8100),
 range(8100, 8400),
 range(8400, 8700),
 range(8700, 9000),
 range(9000, 9300),
 range(9300, 9600),
 range(9600, 9900),
 range(9900, 10200),
 range(10200, 10500),
 range(10500, 10800),
 range(10800, 11000)]

In [5]:
# Leave one core free, but never request fewer than one worker: mp.Pool raises
# ValueError when processes < 1, which is what cpu_count() - 1 gives on a
# single-CPU machine.
n_workers = max(1, mp.cpu_count() - 1)

# Consecutive 300-row slices of the frame (chunk size is arbitrary). The "+1"
# guarantees the tail is covered; it can yield one empty trailing chunk.
chunks = [df_charter[300*i:i*300+300] for i in range(round(len(df_charter)/300)+1)]

# NOTE(review): the recorded run of this cell ended with
# "OSError: [Errno 12] Cannot allocate memory" (see the output below) —
# a smaller worker count may be needed on that machine; TODO confirm.
with mp.Pool(processes=n_workers) as pool:
    results = pool.map(parallel_count, chunks)  # one DataFrame per chunk, in chunk order

OSError: [Errno 12] Cannot allocate memory

In [22]:
# Concatenate the per-chunk DataFrames returned by the pool and pickle the combined frame.
# NOTE(review): the output file name differs from the one read into df_charter
# ('charters_full_2015_15_250.pkl') — confirm this is intended.
(pd.concat(results)).to_pickle('../../charters_full_2015_250.pkl')

3    954
0     46
Name: WEBTEXT_METHOD, dtype: int64

In [19]:
# df1[['SCHNAM15','WEBTEXT_METHOD','PROG_COUNT','RIT_COUNT','ESS_COUNT','ESS_STR','PROG_STR','ESS_STR']]

Unnamed: 0,SCHNAM15,WEBTEXT_METHOD,PROG_COUNT,RIT_COUNT,ESS_COUNT,ESS_STR,PROG_STR,ESS_STR.1
0,Arizona Agribusiness & Equine Center - Estrella,3,1,7,0,,-2.977266,
1,Arizona Agribusiness & Equine Center - Estrella,3,1,7,0,,-2.977266,
2,AAEC - SMCC Campus,3,1,8,0,,-2.941014,
3,AAEC - SMCC Campus,3,1,8,0,,-2.941014,
4,AAEC - Paradise Valley,3,0,1,0,,,
5,AAEC - Paradise Valley,3,0,1,0,,,
6,CUMBERLAND ACADEMY MIDDLE,3,488,706,192,-2.547275,-2.141070,-2.547275
7,CUMBERLAND H S,3,37,140,28,-3.420362,-3.299189,-3.420362
8,CUMBERLAND ACADEMY,3,582,826,13,-3.669989,-2.017585,-3.669989
9,ACCELERATED INTERDISCIPLINARY ACAD,3,10,55,6,-2.871767,-2.649335,-2.871767
