## 2.1 Basics of pandas

In [1]:
import pandas as pd  

# Build a small example DataFrame from a dict mapping each column name
# to the list of values for that column
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['Paris', 'New York', 'London'],
}
df = pd.DataFrame(data)

# Display the shape of the DataFrame as a (rows, columns) tuple
print(df.shape)

(3, 3)


In [2]:
# Display the column labels (pandas stores them in an Index object)
print(df.columns)

Index(['Name', 'Age', 'City'], dtype='object')


In [3]:
# Display the row labels (the index); no index was passed when the DataFrame
# was created, so pandas generated a default RangeIndex
print(df.index)

RangeIndex(start=0, stop=3, step=1)


In [4]:
# Display the data type of each column (one entry per column)
print(df.dtypes)

Name    object
Age      int64
City    object
dtype: object


In [None]:
# Display the first rows of the DataFrame (head() returns 5 rows by default)
print(df.head()) 

# Display the last rows of the DataFrame (tail() also defaults to 5 rows).
# df only has 3 rows here, so both calls print the whole DataFrame.
print(df.tail())

      Name  Age      City
0    Alice   25     Paris
1      Bob   30  New York
2  Charlie   35    London
      Name  Age      City
0    Alice   25     Paris
1      Bob   30  New York
2  Charlie   35    London


## 2.2 Read structured or flat files

Let's start by reading a small, simple CSV file.

In [None]:
CSV_PATH = './data/raw/source.csv'

# Parse the file as semicolon-delimited, UTF-8 encoded text
df = pd.read_csv(CSV_PATH, sep=';', encoding='utf-8')

# Preview the first rows to check that the file was parsed correctly
df.head()

If the file size exceeds the available RAM, you will need to read the CSV in chunks to process it.

In [None]:
chunk_size = 10000  # number of lines per chunk
csv_path = 'big_file.csv'

# Stream the file chunk by chunk so that at most `chunk_size` rows are loaded
# at a time. Each processed chunk is written straight to disk instead of being
# accumulated in a list, which would bring the whole file back into memory.
for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size)):
    # Apply modifications to the chunk.
    # NOTE(review): `ma_fonction` is not defined in the visible part of this
    # notebook; define it (it is passed to DataFrame.apply) before running.
    modified_chunk = chunk.apply(ma_fonction)

    # Save the modified chunk to its own numbered CSV file.
    # NOTE: to_csv also writes the row index by default; pass index=False
    # if that extra column is not wanted.
    modified_chunk.to_csv(f'./data/processed/chunk_modifie_{i}.csv')

Reading XLSX and XLS files follows the same logic as reading a CSV file, using `pd.read_excel`:

In [None]:
XLSX_PATH = './data/raw/results.xlsx'

# Load only the 'Results' sheet, restricted to the columns at
# zero-based positions 1, 2, 3, 7, 8 and 9
df = pd.read_excel(
    XLSX_PATH,
    sheet_name='Results',
    usecols=[1, 2, 3, 7, 8, 9],
)

# Preview the first rows
df.head()

You can read a JSON file with the `json` module from the Python standard library, or with pandas:

In [None]:
def load_json_with_pandas(file_path):
    """Load a JSON file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or None when the file is missing or its content
        cannot be parsed. A message is printed in both failure cases.
    """
    try:
        dataframe = pd.read_json(file_path)
        return dataframe
    except FileNotFoundError:
        print(f'The file "{file_path}" was not found.')
        return None
    except ValueError as e:
        # pandas.errors has no JSONDecodeError attribute, so naming it in an
        # except clause raised AttributeError as soon as a parsing error
        # occurred. pd.read_json reports malformed JSON as a ValueError.
        print(f'Error reading the JSON file with pandas: {e}')
        return None

In [None]:
import xml.etree.ElementTree as ET

def load_xml_file(file_path):
    """Parse an XML file and return its root element.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XML file to parse.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The root element of the parsed document, or None when the file is
        missing or is not well-formed XML. A message is printed in both
        failure cases.
    """
    try:
        tree = ET.parse(file_path)
        return tree.getroot()
    except FileNotFoundError:
        # Handle a missing file the same way as the JSON loader, so callers
        # can rely on the `is not None` check instead of getting a traceback.
        print(f'The file "{file_path}" was not found.')
        return None
    except ET.ParseError as e:
        print(f"Error reading the XML file: {e}")
        return None

def traverse_elements(parent_element):
    """Print the tag and text of every descendant of ``parent_element``.

    Descendants are visited depth-first: each element is printed before its
    own children, and siblings are printed in their original order.
    ``parent_element`` itself is not printed.
    """
    # Stack of child iterators; the top one belongs to the element whose
    # children are currently being visited.
    pending = [iter(parent_element)]
    while pending:
        node = next(pending[-1], None)
        if node is None:
            # Current level exhausted: resume with the parent's next sibling
            pending.pop()
            continue
        print(f"Tag: {node.tag}, Text: {node.text}")
        pending.append(iter(node))

# Replace 'example.xml' with the path to your XML file
xml_file = 'example.xml'
# load_xml_file returns None when the XML could not be parsed
root = load_xml_file(xml_file)

# Only walk the tree when a root element was actually returned
if root is not None:
    traverse_elements(root)

In [None]:
def load_xml_with_pandas(file_path):
    """Read an XML file into a pandas DataFrame.

    Returns the DataFrame produced by ``pd.read_xml``. If reading fails for
    any reason, the error is printed and None is returned instead.
    """
    try:
        dataframe = pd.read_xml(file_path)
    except Exception as e:
        print(f"Error reading the XML file with Pandas: {e}")
        return None
    else:
        return dataframe

# Replace 'example.xml' with the path to your XML file
xml_file = 'example.xml'
# load_xml_with_pandas returns None when the file could not be read
xml_dataframe = load_xml_with_pandas(xml_file)

# Only print when a DataFrame was actually returned
if xml_dataframe is not None:
    print(xml_dataframe)

In [5]:
import PyPDF2

# Open the PDF inside a `with` block so the file handle is always closed,
# even if reading or text extraction raises an error.
with open('./data/guide-open-data.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # For example, we can access the number of pages in a document
    print(len(pdf_reader.pages))

    # Now we create a page object.
    # NOTE: despite its name, `page_0` holds the page at index 2; page
    # indices are zero-based, so this is the third page of the document.
    page_0 = pdf_reader.pages[2]

    # Extract the text with extract_text() while the file is still open,
    # because the reader was built on this open stream.
    page_0_content = page_0.extract_text()

print(page_0_content)

28
PRÉCISIONS LIMINAIRES
La mise à disposition du public d’un document administratif et des données publiques qu’il contient découle de 
deux types de règles : 
• les règles relatives à la publication, comme une formalité nécessaire pour l’entrée en vigueur d’un acte juridique 
ou le déclenchement d’un délai. Ainsi, l’article 1er du code civil dispose que les lois et, lorsqu’ils sont publiés au 
Journal officiel de la République française, les actes administratifs entrent, en principe, en vigueur à la date qu’ils 
fixent ou, à défaut, le lendemain de leur publication1. 
• les règles relatives au droit d’accès aux documents administratifs, qui comprend des obligations de communica-
tion et de diffusion publique pour les administrations, impliquant notamment la mise en ligne des documents. En 
effet, avec l’adoption de la loi pour une République numérique du 7 octobre 2016, l’objectif de transparence, qui 
a présidé à l’adoption de la loi du 17 juillet 1978, consacre désormais le passage

In [6]:
# Check the type of the value returned by extract_text()
type(page_0_content)

str

In [None]:
import tabula

PDF_PATH = './data/source.pdf'

# tabula-py exposes `read_pdf` (the module has no `read` function). It
# returns a list of DataFrames, one per table found on the requested page.
tables = tabula.read_pdf(PDF_PATH, pages=1)
first_table = tables[0]

# Extract the tables from the PDF again and write them directly to a CSV file
tabula.convert_into(PDF_PATH, './data/processed/table.csv')

In [None]:
from img2table.ocr import TesseractOCR
from img2table.document import Image

# OCR Instantiation
ocr = TesseractOCR(n_threads=1, lang="en")

# Document Instantiation (image or PDF for example)
doc = Image(src)

# Table Extraction
table = doc.extract_tables(ocr=ocr,
                           implicit_rows=False,
                           borderless_tables=False,
                           min_confidence=50)