### After running this cell, skip to the "Necessary Functions" section at the bottom and run every cell there before running the cells in between

In [61]:
# file management stuff
from tkinter import Tk, filedialog
import os

import re
import pandas as pd

# doc parsing
import win32com.client

import random

# pdf parsing
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
from io import StringIO

# html parsing
import codecs
from bs4 import BeautifulSoup

import csv

# to run parsing in parallel
import concurrent.futures

# from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool


from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
porter=PorterStemmer()

import numpy as np

import matplotlib.pyplot as plt

from docx2pdf import convert

### Select the folder holding all of the syllabi 

In [2]:
# The Tk root exists only to own the dialog. Hide it *before* the dialog is
# shown so that no empty main window appears next to the folder picker.
root = Tk()
root.withdraw()

# Ask the user for the folder holding the syllabi. The chosen path is kept
# both on the root object and in the module-level `directory` variable.
root.directory = filedialog.askdirectory()
directory = root.directory

''

### Use this cell to select individual files for any parsing of single documents

In [62]:
# Let the user pick one file; the selected path is stored on the Tk root
# and in the module-level `file` variable.
file = root.filename = filedialog.askopenfilename()
root.withdraw()  # make sure the Tk main window stays hidden

''

In [14]:
# Read the stop-word list: one entry per line of stopwords_jul.txt, with
# surrounding whitespace removed.
stopwords = []
with open('stopwords_jul.txt', 'r') as stopword_file:
    stopwords.extend(entry.strip() for entry in stopword_file)

In [1]:
# Load the names of files already recorded as unparseable, one per line.
# The loop variable is deliberately not called `file`, so the path chosen
# through the file dialog is not overwritten by running this cell.
bad_course = []
try:
    with open('bad_course_file.txt') as bad_files:
        for line in bad_files:
            name = line.strip()
            if name:  # ignore blank lines
                bad_course.append(name)
except FileNotFoundError:
    # Nothing has been recorded yet; start with an empty list.
    pass

In [52]:
# Collect every file in the syllabi folder that is not on the known-bad list.
# os.listdir returns names in arbitrary order, so sort them to make the
# processing order reproducible.
excluded = set(bad_course)
course_list = sorted(
    filename for filename in os.listdir(directory) if filename not in excluded
)

# This cell used to initialise `course_list_doc` while appending to
# `course_list`; keep the old name bound to the same list.
course_list_doc = course_list

In [56]:
# Column-oriented accumulator: one empty list per output column, filled row
# by row while the syllabi are parsed.
syllabi_df = {
    column: []
    for column in (
        'Year',
        'Section',
        'Quarter',
        'department',
        'CLASS',
        'syllabi',
        'website_count',
    )
}

In [57]:
# Parse every syllabus and append one row per usable document to syllabi_df.
# The loop variable is not called `file`, so the path chosen through the
# file dialog is left untouched.
for course_file in course_list:

    try:
        text, count = parser(course_file)
    except Exception:
        # Any parsing failure marks the file as bad and moves on.
        # `Exception` (rather than a bare except) keeps Ctrl-C working.
        if course_file not in bad_course:
            bad_course.append(course_file)
        continue

    # Skip documents whose cleaned text contains no ASCII letters at all.
    if re.search('[a-zA-Z]', text) is None:
        continue

    season, year, depart, course_name, section_num = filename_parse(course_file)

    syllabi_df['Quarter'].append(season)
    syllabi_df['Section'].append(section_num)
    syllabi_df['Year'].append(year)
    syllabi_df['department'].append(depart)
    syllabi_df['CLASS'].append(course_name)
    syllabi_df['syllabi'].append(text)
    syllabi_df['website_count'].append(count)

In [28]:
# Persist the bad-file list. Only names that are not already in the file are
# appended, so re-running the notebook does not duplicate entries.
try:
    with open('bad_course_file.txt') as existing:
        already_recorded = {line.strip() for line in existing}
except FileNotFoundError:
    already_recorded = set()

with open('bad_course_file.txt', 'a') as bad_courses:
    for course in bad_course:
        if course not in already_recorded:
            bad_courses.write(course + '\n')
            already_recorded.add(course)

In [58]:
# Turn the dict of column lists into a DataFrame, rebinding the same name.
syllabi_df = pd.DataFrame(data=syllabi_df)

In [60]:
# Write the parsed syllabi to disk with a header row and without the index.
syllabi_df.to_csv(
    'syllabi_df_doc_no_stem.csv',
    header=True,
    index=False,
)

### Course Description Parser

In [63]:
# Build a table of cleaned catalog descriptions from the tab-separated file
# whose path is in `file`. Column 0 is used as the subject, column 1 as the
# catalog number and column 4 as the description text.
game_count = 0  # number of rows read so far, including the skipped first row
main_dict = {'SUBJECT': [], 'CLASS': [], 'CATALOG_NBR': [], 'CATALOG_DESCRIPTION': []}
with open(file, newline='', encoding='utf-8') as catalog:
    catalog_reader = csv.reader(catalog, delimiter='\t')
    for game_count, row in enumerate(catalog_reader, start=1):
        if game_count < 2:
            continue  # the first row is skipped

        description = row[4]
        if description in ('NULL', 'TBA'):
            continue

        # Lowercase first and then clean, in the same order the syllabus
        # parsers use. Lowercasing the raw field last would throw away the
        # tag, digit/punctuation and stop-word removal.
        text = description.lower()
        text = re.sub("<!--?.*?-->", "", text)
        text = re.sub("(\\d|\\W)+", " ", text)
        text = removeStopWords(text)

        main_dict['SUBJECT'].append(row[0])
        main_dict['CLASS'].append(row[0] + '-' + row[1])
        main_dict['CATALOG_NBR'].append(row[1])
        main_dict['CATALOG_DESCRIPTION'].append(text)
descr_df = pd.DataFrame(main_dict)

In [64]:
# Write the course descriptions to disk with a header row and without the index.
descr_df.to_csv(
    'description_df_no_stem.csv',
    header=True,
    index=False,
)

### Necessary Functions

In [15]:
def filename_parse(filename):
    """Split a syllabus file name into its dash-separated components.

    The name is cut at its first three dashes and its first dot:
    ``<term>-<department>-<course>-<section>.<rest>``.

    Returns a tuple ``(season, year, depart, course_name, section_num)``:

    * ``season`` -- 'Fall', 'Winter' or 'Spring' when the term contains
      'F', 'W' or 'Sp' (checked in that order), otherwise 'Summer'.
    * ``year`` -- int built from '20' plus everything in the term from its
      first digit onwards.
    * ``depart`` -- text between the first and second dash.
    * ``course_name`` -- text between the first and third dash, so it
      includes the department.
    * ``section_num`` -- text between the third dash and the first dot.
    """
    dash_1 = filename.find('-')
    dash_2 = filename.find('-', dash_1 + 1)
    dash_3 = filename.find('-', dash_2 + 1)
    dot = filename.find('.')

    term = filename[:dash_1]
    depart = filename[dash_1 + 1:dash_2]
    course_name = filename[dash_1 + 1:dash_3]
    section_num = filename[dash_3 + 1:dot]

    # First matching marker wins; anything unmatched counts as Summer.
    season_markers = (('F', 'Fall'), ('W', 'Winter'), ('Sp', 'Spring'))
    season = next(
        (label for marker, label in season_markers if marker in term),
        'Summer',
    )

    # Walk to the first digit of the term; the rest is the two-digit year.
    digits = '1234567890'
    start = 0
    while True:
        if term[start] in digits:
            break
        start += 1
    year = int('20' + term[start:])

    return season, year, depart, course_name, section_num

In [16]:
def removeStopWords(string):
    """Return *string* with every stop word removed.

    The input is split on whitespace, words found in the module-level
    ``stopwords`` collection are dropped, and the remaining words are joined
    with single spaces. Matching is exact and case-sensitive.
    """
    # Build the set once per call so each membership test is O(1) instead of
    # a scan over the whole stop-word list.
    stop_set = set(stopwords)
    return " ".join(word for word in string.split() if word not in stop_set)

In [30]:
def stemSentence(sentence):
    """Tokenize *sentence* and return its stemmed tokens as one string.

    Each token is stemmed with the module-level ``porter`` stemmer and is
    followed by a single space, so a non-empty result ends with a space.
    """
    return "".join(porter.stem(token) + " " for token in word_tokenize(sentence))

In [17]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of the distinct items in two iterables.

    The result is ``|A & B| / |A | B|`` where A and B are the sets of items
    in *list1* and *list2*. Duplicates are ignored on both sides, so the
    value is always in ``[0.0, 1.0]``. When both inputs are empty the union
    is empty and 0.0 is returned instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

In [18]:
def find_url(string):
    """Return every URL-like substring of *string*, in order of appearance.

    A match starts with ``http://``/``https://``, a ``www.`` prefix, or a
    ``domain.tld/`` form. Only the outermost capture group (the full URL) is
    returned for each match.
    """
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return [match.group(1) for match in re.finditer(regex, string)]

In [19]:
def parser(file):
    """Parse one syllabus from the selected folder.

    *file* is a file name inside the module-level ``directory``. The parser
    is chosen from the file's extension, compared case-insensitively:
    ``.pdf`` goes to the PDF parser, anything starting with ``.doc``
    (``.doc``, ``.docx``, ...) to the Word parser, and everything else to
    the HTML parser.

    Returns ``(text, count)`` exactly as produced by the chosen parser.
    """
    path = directory + '/' + file
    # Use the real extension rather than a substring of the whole name, so
    # '.PDF' is recognised and a '.pdf' elsewhere in the name is ignored.
    extension = os.path.splitext(file)[1].lower()
    if extension == '.pdf':
        text, count = convert_pdf_to_txt_2(path)
    elif extension.startswith('.doc'):
        text, count = doc_parser(path)
    else:
        text, count = htm_parser(path)
    return text, count

In [20]:
def convert_pdf_to_txt_2(path):
    """Extract and clean the text of the PDF at *path*.

    Returns ``(text, url_count)``: *text* is the document text with URLs and
    e-mail addresses removed, lowercased, with runs of digits and non-word
    characters collapsed to single spaces and stop words removed;
    *url_count* is the number of URLs found before they were removed.
    """
    rsrcmgr = PDFResourceManager()
    # The file, the converter and the buffer are released even when a page
    # fails to parse; callers treat such failures as "bad file" and go on.
    with io.StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page set and maxpages=0 are passed so that no page
                # filter or page limit is requested.
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="",
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        raw_text = retstr.getvalue()

    # Flatten the line breaks into spaces (the two leading spaces are kept
    # from the previous implementation).
    text_total = ' ' + ''.join(' ' + line for line in raw_text.split('\n'))

    # Count the URLs, then strip URLs and e-mail addresses from the text.
    url_list = find_url(text_total)
    email_list = re.findall(r'\S+@\S+', text_total)

    for url in url_list:
        text_total = text_total.replace(url, "")

    for email in email_list:
        text_total = text_total.replace(email, "")

    # lowercase
    text_total = text_total.lower()

    # remove tags
    text_total = re.sub("<!--?.*?-->", "", text_total)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text_total)
    text = removeStopWords(text)

    return text, len(url_list)

In [21]:
def htm_parser(path):
    """Extract and clean the visible text of the HTML file at *path*.

    The file is decoded as latin-1, ``<script>`` and ``<style>`` elements
    are dropped, and the remaining text fragments are joined with spaces.

    Returns ``(text, url_count)``: *text* has URLs and e-mail addresses
    removed, is lowercased, has runs of digits and non-word characters
    collapsed to single spaces and stop words removed; *url_count* is the
    number of URLs found before they were removed.
    """
    with codecs.open(path, 'r', encoding='latin-1') as f:
        unsoup = f.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(unsoup, 'html.parser')
    for script in soup(["script", "style"]):
        script.decompose()

    # Skip fragments containing 'Ê' -- presumably a mis-decoded filler
    # character in these files; TODO confirm.
    text_total = ''.join(
        ' ' + strip.strip('\n')
        for strip in soup.stripped_strings
        if 'Ê' not in strip
    )

    # Count the URLs, then strip URLs and e-mail addresses from the text.
    url_list = find_url(text_total)
    email_list = re.findall(r'\S+@\S+', text_total)

    for url in url_list:
        text_total = text_total.replace(url, "")

    for email in email_list:
        text_total = text_total.replace(email, "")

    # lowercase
    text_total = text_total.lower()

    # remove tags
    text_total = re.sub("<!--?.*?-->", "", text_total)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text_total)
    text = removeStopWords(text)

    return text, len(url_list)

In [22]:
def doc_parser(path):
    """Extract and clean the text of the Word document at *path*.

    The document is opened through COM (``win32com``), its full text is read
    and split on carriage returns, and the non-empty pieces are joined with
    spaces.

    Returns ``(text, url_count)``: *text* has URLs and e-mail addresses
    removed, is lowercased, has runs of digits and non-word characters
    collapsed to single spaces and stop words removed; *url_count* is the
    number of URLs found before they were removed.
    """
    doc = win32com.client.GetObject(path)
    try:
        pieces = doc.Range().Text.split('\r')
    finally:
        # Close without saving so documents do not pile up open in Word
        # while a whole folder is being processed.
        doc.Close(False)

    text_total = ' '
    for word in pieces:
        # NOTE(review): both literals below are empty strings as they appear
        # in this file, which makes this replace a no-op. Presumably they
        # once held an invisible control character -- confirm and restore.
        if '' in word:
            word = word.replace('', '')
        if word != '':
            text_total += ' ' + word

    # Count the URLs, then strip URLs and e-mail addresses from the text.
    url_list = find_url(text_total)
    email_list = re.findall(r'\S+@\S+', text_total)

    for url in url_list:
        text_total = text_total.replace(url, "")

    for email in email_list:
        text_total = text_total.replace(email, "")

    # lowercase
    text_total = text_total.lower()

    # remove tags
    text_total = re.sub("<!--?.*?-->", "", text_total)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text_total)
    text = removeStopWords(text)

    return text, len(url_list)