In [None]:
import pdf_to_text
import glob
import os
import spacy
import numpy as np
import nltk

import pandas as pd

In [None]:
# Glob pattern for the downloaded transcripts and the list of matching files.
path = 'downloaded_pdfs/*.pdf'
folder = glob.glob(path)

# Report how many pdfs were matched (an empty match reports 0).
print(
    'There are ' + str(len(folder)) + ' pdfs in this directory'
    if folder
    else 'There are 0 pdfs in this directory'
)

In [None]:
def get_date_from_file(file_name):

    '''
    Derive the date slug from the path of a session pdf.

    Takes file name, optionally including directory (forward or back
    slashes are both accepted as directory separators).
    Drops the directory part, the '_Sessions' marker and the '.pdf'
    extension, lowercases what is left and turns underscores into hyphens.

    Returns date in %d-%B-%Y style, lowercased (26-june-2019)
    '''

    # glob returns backslash-separated paths on Windows; normalise the
    # separator first so the directory never leaks into the date.
    base_name = file_name.replace('\\', '/').split('/')[-1]

    # '_Sessions' is removed before lowercasing, so only that exact casing
    # is stripped; '.pdf' is removed after lowercasing, so '.PDF' goes too.
    date = base_name.replace('_Sessions', '').lower()
    date = date.replace('.pdf', '')
    date = date.replace('_', '-')

    return date

In [None]:
# Cache for the spaCy pipeline so it is loaded once, not once per pdf.
_SPACY_MODEL = None


def _get_spacy_model():
    '''Return the en_core_web_sm pipeline, loading it on first use only.'''
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load("en_core_web_sm")
    return _SPACY_MODEL


def get_people(pdf_file):

    '''
    Extract the PERSON entities mentioned in a pdf transcript.

    Requires pdf file (passed as-is to pdf_to_text.extract_pdf_text)
    Returns df with person names, in two columns:
        index - position of the entity among all entities spaCy found
        term  - the entity text only (no 'Type: ..., Value: ' prefix)
    The df is empty, but still has both columns, when no PERSON is found.
    '''

    # apply text extraction to pdf_file

    test_transcript = pdf_to_text.extract_pdf_text(pdf_file)

    # some light cleaning: every run of whitespace (line breaks included)
    # becomes one space, so the last word of a line is not glued to the
    # first word of the next line.

    test_transcript = ' '.join(test_transcript.split())

    # identify entities

    document = _get_spacy_model()(test_transcript)

    # create persons_df
    # Filter on the entity label itself rather than on a formatted string,
    # so entities of another type whose text happens to contain 'PERSON'
    # are not picked up, and keep only the bare entity text as the term.

    rows = [
        (position, entity.text)
        for position, entity in enumerate(document.ents)
        if entity.label_ == 'PERSON'
    ]

    peeps_df = pd.DataFrame(rows, columns=['index', 'term'])

    return peeps_df

In [None]:
def popular_words(text_column):

    '''
    Count how often each word occurs in a column of strings.

    Requires df column with strings (missing values are skipped)
    Returns most common words as a list of (word, count) tuples, highest
    count first. Words are lowercased and single-character tokens are
    dropped. An empty column, or one with only missing values, gives [].
    '''

    # Local import keeps the function self-contained. Counter.most_common()
    # returns the same (word, count) list as nltk.FreqDist.most_common(),
    # since FreqDist is a Counter subclass.
    from collections import Counter

    # Join all non-missing entries and split on ANY whitespace, so words
    # separated by tabs or line breaks are counted separately too.
    text = ' '.join(text_column.dropna().astype(str)).split()

    # Remove single-character tokens (mostly punctuation), then lowercase
    # so that 'Smith' and 'smith' are counted together.
    text = [word.lower() for word in text if len(word) > 1]

    return Counter(text).most_common()

In [None]:
def get_df(file_name):

    '''
    Build the word-frequency table for one pdf and save it as csv.

    Requires file_name (path of the pdf, including directory)
    Saves df with most freq words (columns Name and Freq) as
    name_freq_dfs/<date>.csv, where <date> is get_date_from_file(file_name).
    The output directory is created when it does not exist yet.
    '''

    # popular_words returns (word, count) pairs, which map directly onto
    # the two output columns.
    word_counts = popular_words(get_people(file_name)['term'])
    freq_df = pd.DataFrame(word_counts, columns=['Name', 'Freq'])

    directory = 'name_freq_dfs'

    # Make sure the folder exists and then ALWAYS write the csv; creating
    # the folder must not cause the current file's output to be skipped.
    os.makedirs(directory, exist_ok=True)

    csv_name = get_date_from_file(file_name) + '.csv'
    freq_df.to_csv(os.path.join(directory, csv_name))

In [None]:
# Build and save the word-frequency csv for every pdf path in `folder`.
for file in folder:
    get_df(file)