# installs and imports

In [None]:
!pip install PyMuPDF
!pip install turicreate

In [None]:
# imports
import os.path
import glob 
import fitz
import turicreate as tc

# files

In [None]:
# One entry per source PDF:
#   (path of the PDF, index of the first page whose text is parsed,
#    first table-of-contents heading to keep)
files = [
  ('/content/drive/MyDrive/big_data/project/Africa_2019.PDF', 12, 'ANGOLA'),
  ('/content/drive/MyDrive/big_data/project/America_2019.PDF', 16, 'ARGENTINA'),
  ('/content/drive/MyDrive/big_data/project/Asia_2019.PDF', 6, 'AFGHANISTAN'),
  ('/content/drive/MyDrive/big_data/project/Europe1_2019.PDF', 5, 'ARMENIA'),
  ('/content/drive/MyDrive/big_data/project/Europe2_2019.PDF', 7, 'ALBANIA'),
  ('/content/drive/MyDrive/big_data/project/MiddleEast_2019.PDF', 11, 'ALGERIA'),
]

# Extract data from PDF by table of contents

In [None]:
# extract toc and page text from pdf
def extract_toc_data(filename):
  """Return the TOC headings and the per-page text of a PDF.

  The headings come from the sidecar ``.txt`` file when
  ``get_toc_by_filename`` returns a non-empty list. Otherwise the PDF's own
  outline is used, keeping only level-1 entries whose title is longer than
  one character. In both cases the headings are passed through
  ``create_toc``.

  Args:
    filename: path of the PDF, opened with ``fitz.open``.

  Returns:
    A ``(toc, pdf_data)`` tuple: the list of headings and a list with the
    extracted text of every page, in page order.
  """
  with fitz.open(filename) as doc:
    toc = get_toc_by_filename(filename)
    if not toc:
      # PyMuPDF renamed getToC()/getText() to get_toc()/get_text(); newer
      # releases may no longer provide the camelCase aliases, so prefer the
      # new names and fall back to the old ones on older installs.
      read_outline = getattr(doc, 'get_toc', None) or doc.getToC
      toc = [entry[1].strip() for entry in read_outline()
             if entry[0] == 1 and len(entry[1]) > 1]
    toc = create_toc(toc)
    pdf_data = [
      page.get_text() if hasattr(page, 'get_text') else page.getText()
      for page in doc
    ]
  return toc, pdf_data

# create toc and clean contents
def create_toc(toc):
  """Normalise raw TOC titles into upper-case headings.

  For every title only the part before the first '/' is kept, non-breaking
  spaces (U+00A0) are replaced by ordinary spaces and the result is
  upper-cased. Order and length of the input are preserved.
  """
  return [
    title.partition('/')[0].replace(u'\xa0', u' ').upper()
    for title in toc
  ]

# get toc from a sidecar .txt file if it does not exist in the pdf
def get_toc_by_filename(filename):
  """Read a comma-separated TOC from the ``.txt`` file next to *filename*.

  The sidecar path is built by taking the last '/'-separated component of
  *filename*, cutting it at its first '.', and appending '.txt' (so
  ``dir/Africa_2019.PDF`` maps to ``dir/Africa_2019.txt``).

  Args:
    filename: path of the PDF, using '/' as separator.

  Returns:
    The file content split on ',' when the sidecar file exists, otherwise
    an empty list.
  """
  directory, separator, base_name = filename.rpartition('/')
  toc_path = directory + separator + base_name.split('.')[0] + '.txt'
  if not os.path.isfile(toc_path):
    return []
  # Context manager so the handle is closed even if reading fails.
  with open(toc_path, "r") as toc_file:
    return toc_file.read().split(',')

# get all pdf text
def get_all_text(pages):
  """Concatenate the page texts and collapse all whitespace.

  The pages are joined without a separator, then every run of whitespace
  (including newlines) is replaced by a single space and leading/trailing
  whitespace is removed.

  Args:
    pages: iterable of page text strings.

  Returns:
    One single-line string with the text of all pages.
  """
  # str.join avoids the quadratic cost of repeated '+='; split() without
  # arguments already removes newlines, so no extra replace is needed.
  return " ".join("".join(pages).split())
# get only relevant toc
def get_relevent_toc(toc,first_toc):
  """Clean the headings and drop everything before *first_toc*.

  A heading whose last character is not an ASCII capital letter loses that
  character (one character only), then surrounding whitespace is stripped.
  Headings that are empty after this cleanup are skipped: an empty string
  cannot mark a section, and indexing its last character used to raise
  IndexError.

  Args:
    toc: list of heading strings.
    first_toc: heading, compared after cleanup, from which the returned
      list starts.

  Returns:
    The cleaned headings from *first_toc* (inclusive) to the end.

  Raises:
    ValueError: if *first_toc* is not among the cleaned headings.
  """
  new_toc = []
  for title in toc:
    if title and not ('A' <= title[-1] <= 'Z'):
      title = title[:-1]
    title = title.strip()
    if title:
      new_toc.append(title)
  try:
    first = new_toc.index(first_toc)
  except ValueError:
    raise ValueError(
        f"{first_toc!r} is not in the table of contents") from None
  return new_toc[first:]

In [None]:
def parse_pdf_by_toc(filename,start_page,first_toc='Afghanistan'):
  """Split the text of a PDF into one section per TOC heading.

  The text of the pages from index *start_page* onwards is collapsed into a
  single line. For each pair of consecutive headings, the text between the
  first occurrence of each becomes the section of the earlier heading, and
  searching continues from where that section ended. The last heading
  receives whatever text is left.

  A heading that cannot be found is reported on stdout. Its own section is
  stored as an empty string, and the preceding section then runs up to the
  next heading that can be found (or to the end of the text). Previously a
  missing heading produced a -1 slice bound that threw away all remaining
  text.

  Args:
    filename: path of the PDF.
    start_page: index of the first page whose text is used.
    first_toc: heading at which parsing starts. It is upper-cased before
      the lookup because the headings themselves are upper-cased.

  Returns:
    dict mapping each upper-cased heading to its section text.

  Raises:
    ValueError: if *first_toc* is not among the headings.
  """
  toc, pages = extract_toc_data(filename)
  text = get_all_text(pages[start_page:])
  headings = [h.upper() for h in get_relevent_toc(toc, first_toc.upper())]
  parse_text = dict()
  for i, start in enumerate(headings[:-1]):
    end = headings[i + 1]
    start_index = text.find(start)
    end_index = text.find(end)
    if start_index == -1 or end_index == -1 :
      print("Not found: " + start + " end_index: "+str(end_index) + " start_index: "+ str(start_index) )
    if end_index == -1:
      # Never slice with -1: end the section at the next heading that does
      # occur so the remaining text is kept for the following headings.
      end_index = _find_next_boundary(text, headings[i + 2:])
    parse_text[start] = "" if start_index == -1 else text[start_index:end_index]
    text = text[end_index:]
  parse_text[headings[-1]] = text
  return parse_text


def _find_next_boundary(text, headings):
  """Return the offset of the first heading found in *text*, else len(text)."""
  for heading in headings:
    position = text.find(heading)
    if position != -1:
      return position
  return len(text)

# save files in drive

In [1]:
# Save each country section as <Data>/<year>/<HEADING>.txt in drive.
path = '/content/drive/MyDrive/big_data/project/Data/'
for pdf_path, start_page, first_toc in files:
  text_dict = parse_pdf_by_toc(pdf_path, start_page, first_toc)
  print(text_dict.keys())
  # The year is the part of the file stem after its last underscore
  # (e.g. 'Africa_2019.PDF' -> '2019'); taken from the file name only so
  # underscores in directory names do not matter.
  pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
  pdf_year = pdf_stem.rsplit('_', 1)[-1]
  year_dir = path + pdf_year + '/'
  # Create the per-year folder on first use instead of failing on open().
  os.makedirs(year_dir, exist_ok=True)
  for key, value in text_dict.items():
    # Headings shorter than 4 characters are skipped
    # (presumably not real country names - TODO confirm).
    if len(key) < 4:
      continue
    file_path = year_dir + key + '.txt'
    with open(file_path, "w") as f:
      f.write(value)

# combine all countries' data to csv

In [None]:
# Collect every saved <year>/<country>.txt file into one SFrame.
path = '/content/drive/MyDrive/big_data/project/Data//'
files = glob.glob(path + "**/*.txt", recursive=True)
years = []
country_names = []
texts = []
for file_path in files :
  # Context manager so each handle is closed right after reading.
  with open(file_path,'r') as file:
    text = file.read()
  split_path = file_path.split('/')
  # The parent folder name is the year; the file name without its
  # 4-character '.txt' suffix is the country.
  year = split_path[-2]
  country_name = split_path[-1][:-4]
  country_name = country_name.lower()
  years.append(year)
  country_names.append(country_name.strip())
  texts.append(text)

sframe = tc.SFrame({'year':years,'country':country_names, 'text':texts})
# Keep only countries that appear in exactly 3 distinct years
# (presumably the number of yearly reports collected - TODO confirm).
countries = sframe.groupby('country',{'count':tc.aggregate.COUNT_DISTINCT('year')})
countries = countries[countries['count'] == 3]
sframe = sframe.filter_by(countries['country'].unique(),'country')
sframe