## this is a demonstration of topic modelling. 
## from page 212 

In [41]:
#from foundational_file_pdf_operations

import PyPDF2
import re
import pandas as pd


# Directory that holds the downloaded evaluation PDFs (ends with a slash so
# it can be joined to a file name by plain concatenation).
folder = 'C:/Users/ericf/OneDrive/Documents/analytic-projects/red cross/rc_downloaded_materials/evaluations/'

# Candidate documents: the last uncommented assignment is the one analysed.
file = 'IFRC Ukraine crisis response_CVA review report_ final.pdf'
file = 'Afghanistan_Humanitarian_Crises_MTR report_final.pdf' #very few newlines
#file = 'Lessons Learned CVTL COVID-19 Ops-1.pdf'

# Full path of the PDF to read.
filepath = folder + file

def read_document(filepath, begin_page=0, end_page=-1) -> str:
    """Extract the text of a page range from a PDF as one string.

    Args:
        filepath: Path of the PDF file to open.
        begin_page: Zero-based index of the first page to read.
        end_page: Zero-based index one past the last page to read. Any value
            that is not positive means "read to the end of the document".
            Values beyond the document length are clamped to the page count.

    Returns:
        The extracted text of every page in the range, each page preceded by
        a single space. An empty string if the range contains no pages.
    """
    # The context manager guarantees the file handle is released even when
    # extraction fails; pages are read lazily, so all extraction must happen
    # while the file is still open.
    with open(filepath, 'rb') as pdf_file:
        # NOTE(review): PdfFileReader / numPages / getPage / extractText are
        # the legacy PyPDF2 names; newer PyPDF2 releases replace them with
        # PdfReader / pages / extract_text - confirm the installed version.
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        num_pages = pdf_reader.numPages

        # Clamp so a fixed end_page still works on shorter documents.
        last_page = min(end_page, num_pages) if end_page > 0 else num_pages

        return ''.join(' ' + pdf_reader.getPage(i).extractText()
                       for i in range(begin_page, last_page))

# Read pages 0-35 (the first 36 pages) of the selected PDF into one string.
document = read_document(filepath, end_page=36)

def split_to_paragraphs(text, limit=100, source=None):
    """Split extracted PDF text into paragraphs, one DataFrame row each.

    Args:
        text: The raw text to split.
        limit: Threshold on the average number of characters per
            newline-separated line. Below it, newlines are assumed to be
            line-length formatting and the text is split on sentence
            terminators followed by a newline; otherwise it is split on runs
            of two or more whitespace characters.
        source: Value stored in the ``file`` column of every record. When
            omitted, the notebook-level ``file`` variable is used.

    Returns:
        A DataFrame with columns ``file`` and ``paragraph``. Paragraphs that
        are empty or contain only whitespace are dropped; the columns are
        present even when no paragraph remains.
    """
    if source is None:
        # Backward-compatible default: the name of the PDF set in the notebook.
        source = file

    # Clean up errant situations where two strings are merged without a
    # space: a lowercase letter directly followed by an uppercase letter is
    # treated as a missing sentence break and ". " is inserted between them.
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)

    # Determine if newlines are used solely in the separation of paragraphs
    # or, as is common with PDFs, also used to format line lengths.
    chars_per_newline = len(text) / len(text.split('\n'))
    if chars_per_newline < limit:
        print('excess_newlines')
        # Split the doc on sentence terminators followed by a newline.
        paragraphs = re.split(r'[.?!]\s*\n', text)
    else:
        print('normal_newlines')
        paragraphs = re.split(r'\s{2,}', text)

    # Strip out any remaining newlines.
    paragraphs = [paragraph.replace('\n', ' ') for paragraph in paragraphs]

    # One record per non-blank paragraph. Naming the columns explicitly keeps
    # the frame's schema stable when there are no records at all.
    records = [{"file": source, "paragraph": paragraph}
               for paragraph in paragraphs if paragraph.strip()]
    return pd.DataFrame(records, columns=["file", "paragraph"])
    
# Split the extracted text into paragraph records; the call also prints which
# newline-handling branch was taken.
df_paragraph = split_to_paragraphs(document)

normal_newlines


# End Bootstrap

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [42]:
# Build TF-IDF vectors with each paragraph treated as one document.
# spaCy's English stop words are removed; min_df=5 drops terms found in fewer
# than 5 paragraphs and max_df=0.7 drops terms found in more than 70% of them.
tfidf_para_vectorizer = TfidfVectorizer(stop_words=list(stopwords), min_df=5, max_df=0.7)
tfidf_para_vectors = tfidf_para_vectorizer.fit_transform(df_paragraph['paragraph'])
# Bare expression: shown as the cell result (paragraphs x vocabulary terms).
tfidf_para_vectors.shape




(221, 414)

In [43]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 10 topics; random_state is fixed so the
# result is repeatable between runs.
nmf_text_model = NMF(n_components=10, random_state=42)

# W: the transformed data returned by fit_transform (one row per paragraph).
W_text_matrix = nmf_text_model.fit_transform(tfidf_para_vectors)
# H: the fitted components (one row per topic). Not used again in the code
# visible here.
H_text_matrix = nmf_text_model.components_





In [44]:
def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a "Topic <n>" header is printed,
    followed by one line per word showing the word and its weight as a
    percentage of the row's total weight.

    Args:
        model: Object exposing ``components_``, an iterable of numeric
            weight vectors (one per topic) with one entry per feature.
        features: Indexable collection mapping a feature index to its word.
        no_top_words: Maximum number of words to print per topic. If it
            exceeds the number of features, every feature is printed.
    """
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        # Indices ordered by descending weight. Slicing (rather than indexing
        # a fixed count) copes with vocabularies smaller than no_top_words.
        ranked = word_vector.argsort()[::-1][:max(no_top_words, 0)]
        print(f"\nTopic {topic}")
        for idx in ranked:
            # An all-zero topic row has no meaningful shares; report 0
            # instead of dividing by zero.
            share = word_vector[idx] * 100.0 / total if total else 0.0
            print(f" {features[idx]} ({share:2.2f})")

In [45]:
# Print the top words (default: 5) of each NMF topic, labelled with the
# vectorizer's vocabulary terms.
display_topics(nmf_text_model, tfidf_para_vectorizer.get_feature_names_out())


Topic 0
 aid (7.82)
 recipients (6.61)
 needs (3.12)
 assistance (2.83)
 kits (2.75)

Topic 1
 ifrc (5.70)
 arcs (4.23)
 coordination (3.25)
 meetings (2.39)
 collaboration (2.34)

Topic 2
 2022 (11.32)
 afghanistan (9.03)
 2021 (3.57)
 humanitarian (3.50)
 november (3.30)

Topic 3
 kabul (5.98)
 district (5.27)
 nuristan (3.24)
 field (3.10)
 kandahar (3.07)

Topic 4
 review (8.80)
 question (5.61)
 implementation (5.34)
 term (3.70)
 longer (3.26)

Topic 5
 figure (16.60)
 recipient (10.13)
 list (8.00)
 province (7.15)
 mtr (6.79)

Topic 6
 feedback (9.80)
 submit (7.33)
 complaints (6.00)
 complaint (5.92)
 mechanisms (3.74)

Topic 7
 volunteers (5.42)
 arcs (3.46)
 community (2.81)
 staff (2.30)
 surveys (2.15)

Topic 8
 badghis (10.65)
 male (9.90)
 female (5.93)
 27 (3.30)
 households (3.17)

Topic 9
 food (2.70)
 assistance (2.11)
 affected (2.08)
 cash (1.93)
 provinces (1.71)
