# Intro

In [1]:
import os, sys, pickle
import numpy as np

In [4]:
from striprtf.striprtf import rtf_to_text

In [61]:
import string

# Read rtf

In [5]:
rtf_folder='./rtf/'
files=os.listdir(rtf_folder)

In [8]:
files=[file for file in files if file.endswith('.rtf')]
files.sort()

In [123]:
# Read the raw RTF markup of the first file (in sorted order) ...
with open(rtf_folder+files[0], 'r') as f:
    rtf=f.read()

# ... and convert it to plain text once the file has been closed.
all_text = rtf_to_text(rtf)

# Clean the text

## First rough cleaning

In [143]:
# Drop the tab characters, cut the text at line breaks and keep only the
# lines that are non-empty once their surrounding whitespace is stripped.
stripped_lines=(line.strip() for line in all_text.replace('\t', '').split('\n'))
splitted_text_0=[line for line in stripped_lines if line]

In [144]:
# Display the cleaned lines for a visual check.
splitted_text_0

['Airtel Africa plc Sustainability Strategy',
 'Transforming lives',
 'Sustainability at Airtel Africa',
 'Message from the CEO',
 'Transforming lives means creating a sustainable future',
 'As an African business, we recognise the opportunities this continent has to offer: the talent of the people it nurtures and the potential for responsible development. But we are also aware of the challenges the communities across the continent face and we’re determined to make',
 'a positive impact.',
 'Olusegun Ogunsanya',
 'Chief executive officer',
 'Sustainability at Airtel Africa',
 'In our Annual Report and Accounts 2020/21, we told our stakeholders that the development of our sustainability strategy was one of the most important steps Airtel Africa had ever taken. Since then, the business has been focused on identifying the risks and opportunities that moving to a more sustainable future will bring, and developing the programmes and',
 'Contents',
 'Sustainability at Airtel Africa',
 'Messa

## Cut the index entries, dates and other non relevant entries

In [145]:
# Entries whose first character is a digit.
[entry for entry in splitted_text_0 if entry[0].isdigit()]

['5  Our stakeholder engagement',
 '28 October 2021',
 '7 Our material topics',
 '8  Our sustainability strategy framework',
 '10 Our contribution to the UN SDGs',
 '12 Overview',
 '14 Our data security goal',
 '16 Our service quality goal',
 '18 Our supply chain goal',
 '22 Overview',
 '24 Our commitments',
 '28 Overview',
 '30 Our digital inclusion goal',
 '32 Our financial inclusion goal',
 '34 Our access to education goal',
 '38 Overview',
 '40 Reduction of greenhouse gas emissions goal',
 '41 Environmental stewardship goal',
 '44 Partnerships',
 '45 Glossary',
 '2Airtel Africa plc Sustainability Strategy 2021',
 '28 October 2021',
 '4Airtel Africa plc Sustainability Strategy 2021',
 '7 Our material topics',
 '8  Our sustainability strategy framework',
 '10 Our contribution to the UN SDGs',
 '6Airtel Africa plc Sustainability Strategy 2021',
 '8Airtel Africa plc Sustainability Strategy 2021',
 '10Airtel Africa plc Sustainability Strategy 2021',
 '12 Overview',
 '14 Our data securit

Almost all of these entries can be cut, except the ones in which the leading number is part of an actual sentence. Those entries contain punctuation (a period or a comma).

In [146]:
# Digit-led entries that also contain a period or a comma.
[entry for entry in splitted_text_0
 if entry[0].isdigit() and any(mark in entry for mark in '.,')]

['79% which, we believe, is higher than many other companies in Africa or anywhere else in the world. All our markets are focused on maintaining this high engagement score.',
 '4A Partner with large corporates and development finance institutions (DFIs) to target farmers through their supply chains.',
 '4B Extend ‘collateral-free’ loans to 1 million small scale farmers.',
 '40 years, from 395 million',
 '21 percent of the world’s projected urban population.']

In [147]:
# Keep every entry that does not start with a digit, plus the digit-led
# entries that contain a period or a comma.
splitted_text_1=[
    entry for entry in splitted_text_0
    if not entry[0].isdigit() or '.' in entry or ',' in entry
]

## Frequency of sentences

Some sentences are present multiple times, since they are either slogans, index entries or titles. We are not interested in them.

In [148]:
# Distinct entries (st_0) and the number of times each one occurs (k_st_0).
st_0, k_st_0=np.unique(splitted_text_1, return_counts=True)

In [149]:
# Reorder both arrays by decreasing count, computing the permutation once.
descending_order=np.argsort(-k_st_0)
st_0=st_0[descending_order]
k_st_0=k_st_0[descending_order]

In [150]:
# Two-column (entry, count) view of the entries occurring more than once.
repeated=k_st_0>1
np.vstack((st_0[repeated], k_st_0[repeated])).T

array([['Delivering our sustainability strategy', '10'],
       ['SDG alignment', '8'],
       ['Sustainability at Airtel Africa', '7'],
       ['Developing our sustainability strategy', '7'],
       ['To achieve this by 2025, we will:', '7'],
       ['Our targets and timelines', '7'],
       ['Timeline', '7'],
       ['How we measure our progress', '7'],
       ['Focus areas', '6'],
       ['Our programme to deliver this goal is based around three focus areas:',
        '6'],
       ['To achieve this by 2030, we will:', '3'],
       ['Goals', '3'],
       ['Diverse and inclusive workforce', '2'],
       ['See our digital inclusion goal on page 30', '2'],
       ['Independent non-executive director and Airtel Africa’s sustainability champion',
        '2'],
       ['Target 8.8 – Protect labour rights and promote safe and secure working environments for all workers, including migrant workers, in particular women migrants, and those in precarious employment',
        '2'],
       ['See o

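At first sight, all entries that appear more than once can be disregarded. Note, however, that the filter below is slightly more lenient: it keeps the entries that appear at most twice.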

In [132]:
# Map each distinct entry to its number of occurrences.
freq_dict=dict(zip(st_0, k_st_0))

In [151]:
# Keep only the entries that occur at most twice.
splitted_text_2=[entry for entry in splitted_text_1 if freq_dict[entry]<=2]

## Reducing the amount of titles and slogans

In [166]:
# Drop title-like entries: an entry is kept when it starts with a lowercase
# character, when it contains '.', '!' or '?', or when the entry after it
# starts with a lowercase character.
splitted_text_3=[]
last_index=len(splitted_text_2)-1
for counter, text in enumerate(splitted_text_2):
    if text[0].islower():
        # a lowercase first character indicates the end of a previous sentence
        splitted_text_3.append(text)
    elif any(point in text for point in '.!?'):
        # a punctuation mark is present: the entry holds (part of) a sentence
        splitted_text_3.append(text)
    elif counter<last_index and splitted_text_2[counter+1][0].islower():
        # the sentence is continued on the following line; the bound check
        # prevents an IndexError when the very last entry is a title
        splitted_text_3.append(text)

In [170]:
# Glue the surviving entries back into one text, separated by single spaces.
final_text=' '.join(splitted_text_3)

# First text cleaner

In [175]:
def first_text_cleaner(text):
    '''
    Run the whole cleaning pipeline on a raw text:
    split it into stripped lines, then apply in order remove_indices,
    delete_duplicates and delete_titles, and finally join what is left
    with single spaces.
    '''
    lines=split_and_brutal_clean(text)
    for stage in (remove_indices, delete_duplicates, delete_titles):
        lines=stage(lines)
    return ' '.join(lines)

In [173]:
def split_and_brutal_clean(text):
    '''
    Delete the tab characters, cut the text at line breaks and return the
    stripped lines, leaving out those that are empty after stripping.
    '''
    stripped_lines=(line.strip() for line in text.replace('\t', '').split('\n'))
    return [line for line in stripped_lines if line]

In [172]:
def remove_indices(splitted_text):
    '''
    Filter out the index-like entries.

    An entry is kept when its first character is not a digit, or when it
    contains a period or a comma (the leading number is then assumed to be
    part of a sentence). Every other digit-led entry is dropped.
    '''
    return [
        entry for entry in splitted_text
        if not entry[0].isdigit() or '.' in entry or ',' in entry
    ]

In [177]:
def delete_duplicates(splitted_text, max_count=2):
    '''
    Delete too frequent sentences (they do not carry any information).

    Parameters
    ----------
    splitted_text : list of str
        Entries to filter.
    max_count : int, optional
        Highest number of occurrences an entry may have and still be kept.
        Defaults to 2, the threshold that used to be hard-coded.

    Returns
    -------
    list of str
        The entries occurring at most `max_count` times, in their original
        order. Every occurrence of a kept entry is retained, so an entry
        present twice still shows up twice with the default threshold.
    '''
    st, k_st=np.unique(splitted_text, return_counts=True)
    freq_dict=dict(zip(st, k_st))
    return [split for split in splitted_text if freq_dict[split]<=max_count]

In [174]:
def delete_titles(splitted_text):
    '''
    Drop title-like entries.

    Titles start with a capital letter and contain no punctuation mark.
    Moreover, they do not carry any relevant info about the text.

    An entry is kept when at least one of the following holds:
    - it starts with a lowercase character (end of a previous sentence);
    - it contains '.', '!' or '?' anywhere;
    - the entry right after it starts with a lowercase character, i.e. the
      sentence is continued on the following line.

    The last entry has no follower, so only the first two rules apply to it
    (the lookahead used to raise an IndexError there). Empty entries are
    dropped.
    '''
    out=[]
    last_index=len(splitted_text)-1
    for index, text in enumerate(splitted_text):
        if text[:1].islower():
            # a lowercase first character indicates the end of a previous sentence
            out.append(text)
        elif any(point in text for point in '.!?'):
            # a punctuation mark is present: the entry holds (part of) a sentence
            out.append(text)
        elif index<last_index and splitted_text[index+1][:1].islower():
            # the sentence is ended in the following line
            out.append(text)
    return out

In [178]:
# Sanity check: the function pipeline must reproduce the step-by-step result.
first_text_cleaner(all_text)==final_text

True

# Alternative cleaning

I am not 100% convinced that the previous approach is the best one: it depends on what we want to keep. If we want grammatically complete sentences, we should probably split at every period first.

In [179]:
# Remove the tab characters before splitting the text into sentences.
no_t_all_text=all_text.replace('\t', '')

In [203]:
# Split at every period, then rebuild each sentence from the lines of its piece.
sentences=no_t_all_text.split('.')
new_text=[]
for sentence in sentences:
    chunks=[s for s in sentence.split('\n') if len(s)>0]
    if not chunks:
        # nothing but line breaks between two periods (e.g. a trailing '.'
        # or '..'): skip it instead of failing on chunks[-1]
        continue
    if len(chunks)>1 or ' ' in chunks[-1]:
        if chunks[-1][0].isupper():
            # the last element is a sentence
            new_text.append(chunks[-1].strip()+'.')
        else:
            # the sentence was cut in several pieces: walk backwards while the
            # lines are entirely lowercase, i.e. up to the first capital letter
            counter=len(chunks)-1
            while counter>0 and chunks[counter].islower():
                counter-=1

            _new_sentence=' '.join([c.strip() for c in chunks[counter:]])+'.'
            new_text.append(_new_sentence)

In [206]:
# Join the rebuilt sentences into one text.
# NOTE: this rebinds new_text from a list to a str, so re-running this cell
# on its own would insert a space between every character.
new_text=' '.join(new_text)

In [207]:
def alt_clean(long_text):
    '''
    Rebuild sentences from a text by splitting it at every period.

    Tabs are removed and the text is cut at each '.'; every resulting piece
    is cut again at line breaks (empty lines are ignored). Then, per piece:
    - a piece without any non-empty line is skipped (this happens for a
      trailing '.', for '..' or for line breaks between two periods, and
      used to raise an IndexError);
    - a piece made of a single line containing no space is skipped;
    - if the last line starts with an uppercase character, that line alone
      is taken as the sentence;
    - otherwise the lines are walked backwards while they are entirely
      lowercase (str.islower), and the sentence is the join of the lines
      from the stopping point to the end.
    A '.' is appended to every sentence.

    Parameters
    ----------
    long_text : str
        Raw text to clean.

    Returns
    -------
    str
        The rebuilt sentences joined with single spaces.
    '''
    no_t_text=long_text.replace('\t', '')
    new_text=[]
    for sentence in no_t_text.split('.'):
        chunks=[s for s in sentence.split('\n') if len(s)>0]
        if not chunks:
            # nothing to rebuild between these two periods
            continue
        if len(chunks)==1 and ' ' not in chunks[-1]:
            # a lone word is not considered a sentence
            continue
        if chunks[-1][0].isupper():
            # the last element is a sentence
            new_text.append(chunks[-1].strip()+'.')
        else:
            # the sentence was cut in several pieces: walk backwards while the
            # lines are entirely lowercase, i.e. up to the first capital letter
            counter=len(chunks)-1
            while counter>0 and chunks[counter].islower():
                counter-=1

            new_text.append(' '.join(c.strip() for c in chunks[counter:])+'.')
    return ' '.join(new_text)