In [291]:
data_dir = "/Users/erikbeerepoot/Dropbox/data/policies/"
nlp = spacy.load('en_core_web_lg')

In [404]:
from pathlib import Path
import itertools

def get_files(data_dir,extension=".md"):
    """ Crawl directories and return list of markdown files"""
    p = Path(data_dir)
    dirs = [f for f in p.iterdir() if f.is_dir()]
    all_files = [list(dir.glob(f"**/*{extension}")) for dir in dirs]
    
    file_map = dict()
    for dir in dirs:
        file_map[dir.name] = all_files[dirs.index(dir)]            
    print(f"Found {len(list(itertools.chain(*all_files)))} files in {len(dirs)} directories. ")
    return file_map

In [517]:
def load_tags(data_dir):
    tag_files = get_files(data_dir,".json")
    
    tags = dict([])
    for k,v in tag_files.items():
        if(len(v) < 1):
            continue
        tags[k] = json.loads(open(v[0],'r').read())
    return tags

In [541]:
import itertools

def get_all_tags(data_dir):
    """Returns a list of (unique) tags in the corpus"""
    tags = load_tags(data_dir)
    all_tags = []
    for key,value in tags.items():
        all_tags.append([tag["tags"] for tag in value]) 
    all_tags = list(itertools.chain(*list(itertools.chain(*all_tags))))
    return sorted(list(set(all_tags)))

In [105]:
def load_data(file_paths):
    """Load data from a list of files"""
    file_handles = [open(file) for file in file_paths]
    return ["".join(fh.readlines()) for fh in file_handles]

In [491]:
import spacy 
import string

punctuation = string.punctuation

def cleanup_files(files):    
    """ Cleanup data files by lemmatizing, lowercasing, stripping whitespace,
    removing punctuation and removing stopwords."""
    cleaner_files = []
    for file in files:
        doc = nlp(file)
        tokens = [token.lower_.strip() for token in doc if not token.is_stop]
        cleaner_files.append([token for token in tokens if token not in punctuation])
    return cleaner_files    

In [402]:
import simplejson as json

def generate_tags_template(files):
    file_names = sorted([file.name for file in files])
    tags = "".join([f"""{{"file" : "{name}", "tags" : [ ] }},""" for name in file_names])
    tags = f"[ {tags[:-1]}]"
    parent_dir = next(iter(files)).parent
    fh = open(f"{parent_dir}/tags.json","w+")
    fh.write(json.dumps(json.loads(tags), indent=4, sort_keys=True))
    fh.close()

In [511]:
import spacy 
import itertools

nlp = spacy.load('en_core_web_lg')

files = get_files(data_dir)
file_contents = load_data(files["york"])
tags = load_tags(data_dir)

Found 1126 files in 10 directories. 
Found 3 files in 10 directories. 
/Users/erikbeerepoot/Dropbox/data/policies/stanford/tags.json
/Users/erikbeerepoot/Dropbox/data/policies/yale/tags.json
/Users/erikbeerepoot/Dropbox/data/policies/columbia/tags.json


This can take a bit of time:

In [492]:
files = cleanup_files(file_contents)

In [494]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd 

def dummy_fun(doc):
    return doc

vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    sublinear_tf=True,
    ngram_range=(1,3),
    token_pattern=None)  

tfidf_matrix = vectorizer.fit_transform(files)
feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(tfidf_matrix.todense().tolist(),columns=feature_names)

In [507]:
print(f"{len(df.columns)} words, {len(df)} rows.")

172317 words, 175 rows.


In [508]:

top_words = []
for index in range(0,len(df)):
    values = df.loc[index]
    vals = values[values > 0].sort_values(ascending=False)
    top_words.append(vals[:5])

for index in range(0,len(top_words)):
    print(top_words[index])
    print("--------------")

barrier free workplace    0.081406
both employees            0.081406
will attempt              0.081406
will attempt to           0.081406
such accommodation        0.081406
Name: 0, dtype: float64
--------------
the certificate                     0.058727
a certificate                       0.054268
satisfy certificate                 0.053669
satisfy certificate requirements    0.053669
of completion                       0.050274
Name: 1, dtype: float64
--------------
encryption is             0.053372
encryption                0.050174
transmitting over         0.048808
when transmitting         0.048808
when transmitting over    0.048808
Name: 2, dtype: float64
--------------
water                 0.110582
single use            0.104797
single use bottled    0.101453
use bottled water     0.101453
use bottled           0.101453
Name: 3, dtype: float64
--------------
1966/03/24               0.155013
the language or          0.155013
language or              0.155013
language or 

In [400]:
# print(paths["yale"])
# generate_tags_template(paths["stanford"])

In [542]:
print(get_all_tags(data_dir))

Found 3 files in 10 directories. 
['students', 'degree requirements', 'student accounts', 'salary', 'conflict of interest', 'unversity buildings', 'vehicles', 'weather', 'finances', 'smoking', 'hiring', 'performance management', 'substances', 'academic nomenclature', 'art', 'credit', 'time off', 'workplace relationships', 'charity', 'media', 'medical accomodation', 'cross listing courses', 'information technology', 'snow', 'information security', 'firearms', 'email', 'racism', 'governance', 'computer technology', 'sororities', 'course relief', 'alcohol', 'water', 'employee training', 'branding', 'IT', 'liability', 'sexual violence', 'president', 'minors', 'social justice', 'sustainability', 'heatlh', 'code of ethics', 'hate speech', 'essay writing', 'academic integrity', 'theft', 'postdoc', 'university commons', 'expenses', 'university government', 'mail', 'recruiting', 'students with religious requirements', 'naming', 'advanced credit', 'unversity finances', 'promotions', 'academic ac