# Testing Scrapy Cluster data

In [1]:
# ## Initializing

# import necessary libraries
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in CSV files
#from glob import glob,iglob # for finding files within nested folders--compare with os.walk
import json, pickle, csv # For saving and loading dictionaries, DataFrames, lists, etc. in JSON, pickle, and CSV formats
from math import log10 # For calculating logarithms of dictionary counts
from datetime import datetime # For timestamping files
import time #, timeout_decorator # To prevent troublesome files from bottlenecking the parsing process, use timeouts
import sys # For working with user input
import logging # for logging output, to help with troubleshooting
#from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
#stemmer = PorterStemmer()
#from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import pandas as pd # modifies data more efficiently than with a list of dicts
#from tqdm import tqdm # For progress information over iterations, including with Pandas operations via "progress_apply"

In [2]:
# ### Set script options

# Environment switches: these decide which file paths the later cells use.
notebook = True  # True when files are accessed from within a Jupyter notebook, False when run from the shell
workstation = False  # True when working from the office PC

# Verbosity switch.
Debug = False  # True turns on extra progress reports while algorithms run

# Loading an existing dicts_list from file asks the user to confirm first, which is
# only useful on the command line, so the option is always switched off in a notebook.
usefile = False  # Set to True to load from file a dicts_list to add to
usefile = usefile and not notebook

# Inline-level HTML tags; this list helps with eliminating junk tags when parsing HTML.
inline_tags = (
    "b big i small tt abbr acronym cite dfn "
    "em kbd strong samp var bdo map object q "
    "span sub sup"
).split()

In [3]:
# ### Set directories

# Root folder for all project paths; it depends on where the code is running.
# Every prefix ends with a path separator because the paths below are built by
# plain string concatenation.
if workstation and notebook:
    dir_prefix = "C:\\Users\\Jaren\\Documents\\" # One level further down than the others
elif notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"

save_dir = dir_prefix + "Charter-school-identities" + os.sep + "data" + os.sep # Directory in which to save data files
dicts_dir = dir_prefix + "Charter-school-identities" + os.sep + "dicts" + os.sep # Directory in which to find & save dictionary files
temp_dir = save_dir + "temp" + os.sep # Directory in which to save temporary data files

micro_sample13 = save_dir + "micro-sample13_coded.csv" # Random micro-sample of 300 US charter schools
URL_schooldata = save_dir + "charter_URLs_2014.csv" # 2014 population of 6,973 US charter schools
full_schooldata = save_dir + "charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
temp_data = save_dir + "school_parser_temp.json" # Full_schooldata dict with output for some schools
example_file = save_dir + "example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

# The wget download locations are only defined when running from the shell on the
# server (neither workstation nor notebook); example_file is re-pointed there too.
if not workstation and not notebook:
    wget_dataloc = dir_prefix + "wget/parll_wget/" #data location for schools downloaded with wget in parallel (requires server access)
    example_folder = wget_dataloc + "TWENTY-FIRST_CENTURY_NM/" # Random charter school folder
    example_file = dir_prefix + "wget/example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

data_year = 2014

# Set logging options
# basicConfig opens the log file immediately, so make sure its folder exists first
# instead of failing with FileNotFoundError on a fresh checkout.
os.makedirs(temp_dir, exist_ok=True)
# Use a timestamp without spaces or colons: str(datetime.today()) contains ':' which
# is not a legal filename character on Windows.
log_timestamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
log_file = temp_dir + "data_prep_" + log_timestamp + ".log"
logging.basicConfig(filename=log_file,level=logging.INFO)

In [4]:
# Load the pickled DataFrame stored under <dir_prefix>scrapy_cluster_data/.
# NOTE(review): unpickling can run arbitrary code, so only load a file you created or trust.
processed_df_path = dir_prefix + os.sep.join(["scrapy_cluster_data", "processed_df.pkl"])
schooldf = pd.read_pickle(processed_df_path)

In [14]:
# Take the first row's 'data' entry as a sample record.
# .iloc[0] selects by position, so it works whatever the DataFrame's index labels are;
# the earlier .head()[0] looked up the *label* 0 and so relied on a default integer index.
sigh = schooldf['data'].iloc[0]

In [49]:
class Ugh:
    """Keep only the long text lines of a record and count how many records were processed.

    Parameters
    ----------
    min_length : int, optional
        A line is kept only when it is strictly longer than this many
        characters. Defaults to 35.
    report_every : int, optional
        Print the running record count every this many calls to `process`.
        Defaults to 50; pass 0 (or None) to turn the progress output off.

    Attributes
    ----------
    counter : int
        Number of times `process` has been called on this instance.
    """

    def __init__(self, min_length=35, report_every=50):
        self.counter = 0
        self.min_length = min_length
        self.report_every = report_every

    def process(self, sigh):
        """Return the long lines found in the last element of every item of `sigh`.

        Parameters
        ----------
        sigh : iterable
            Each item must be indexable, and its last element must be a string
            (it is split on newlines).

        Returns
        -------
        str
            All lines longer than `min_length` characters, in their original
            order, joined with newlines. Empty string when nothing qualifies.
        """
        total = []
        for x in sigh:
            # Only the last element of each item holds the text that is filtered.
            total.extend(s for s in x[-1].split("\n") if len(s) > self.min_length)
        # The counter tracks calls (records), not the items inside a record.
        self.counter += 1
        if self.report_every and self.counter % self.report_every == 0:
            print(self.counter)
        return "\n".join(total)
    


In [52]:
# Run Ugh.process over each entry of the 'data' column; the instance prints its
# running call count every 50 entries as a progress indicator.
ughh = Ugh()
data_column = schooldf['data']
ayy = data_column.apply(ughh.process)

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400


In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis
import numpy as np

In [137]:
# Turn the filtered text in `ayy` into a TF-IDF matrix, dropping English stop words
# and capping the vocabulary at 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
transformed = vectorizer.fit_transform(ayy)

In [165]:
# Fit a 10-component factor analysis on the TF-IDF matrix; toarray() converts it
# to a dense array before fitting. The fit call is left as the final expression
# so the notebook displays the fitted estimator.
factoranal = FactorAnalysis(
    n_components=10,
)
factoranal.fit(X=transformed.toarray())

FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=10,
        noise_variance_init=None, random_state=0, svd_method='randomized',
        tol=0.01)

In [140]:
# Feature indices of the 30 largest loadings on the second component (index 1).
# argpartition does not sort, so the 30 indices come back in no particular order.
second_component = factoranal.components_[1]
np.argpartition(second_component, -30)[-30:]

array([1872,  777,  919, 1023, 1896, 1895, 1022, 2892,  387, 1024, 3384,
       4265, 2890, 3390,  773, 1871, 4258,  778, 1021, 4303, 4487, 1870,
        775, 2847,  954, 1353, 2298, 1869, 1867,  386])

In [166]:
# Map feature indices back to terms. get_feature_names() no longer exists in recent
# scikit-learn releases, so use get_feature_names_out() when the installed version
# provides it and fall back to the old method otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# For every component, print the 10 terms with the largest loadings
# (argpartition leaves them unordered within the top 10).
for c in factoranal.components_:
    print([vocab[i] for i in np.argpartition(c, -10)[-10:]])

['begin', 'vtimezone', 'daylight', 'freq', 'rrule', 'tzoffsetto', 'dtstart', 'byday', 'bymonth', 'tzoffsetfrom']
['fbidi', 'dbch', 'hich', 'af1', 'lsdlocked0', 'af0', 'rtlch', 'fcs1', 'fcs0', 'ltrch']
['las', '504', 'según', 'en', 'la', 'título', 'públicas', 'enmienda', 'amended', 'harmony']
['district', 'content', 'access', 'use', 'school', 'data', 'user', 'service', 'loop', 'information']
['1964', 'según', '210', 'san', 'enmienda', 'ley', 'antonio', 'título', 'saisd', 'la']
['ih', '3796', '78216', '12500', 'suite', '956', 'idea', '377', 'amended', '8000']
['academy', 'child', 'school', 'year', 'learning', 'charter', 'students', 'education', 'children', 'student']
['information', 'accredited', 'ajax', 'marital', 'school', 'subjected', 'excluded', 'charter', 'denied', 'nwac']
['great', 'leaders', 'looking', 'generation', 'academies', 'hours', 'heritage', 'national', 'join', 'moral']
['k12', 'services', 'parent', 'shall', 'student', 'district', 'policy', 'information', 'hometown', 'scho