# Code changes to make before running this notebook

1. Go to ragalchemy -> extractors -> image.py -> extract_text_from_ocr() and replace the pytesseract path if you are running on Windows; otherwise comment it out
2. Run ```pip install -r requirements.txt```
3. Set your OpenAI keys in the files (ragalchemy -> chat_model -> openai.py and ragalchemy -> embedding -> openai.py)


In [1]:
# Add the parent directory to the import path so the local `ragalchemy`
# package can be imported (assumes the package lives one level above this
# notebook's working directory — confirm for your checkout).
import sys
sys.path.append("..")

# ragalchemy classes used below: extraction only, extraction + summaries,
# and extraction + summaries + question answering.
from ragalchemy.extractors.pptx import PPTExtractor
from ragalchemy.agents.pptx import PPTSummarizer
from ragalchemy.agents.pptx import PPTQnA

In [2]:
# Path to the PowerPoint presentation processed in this notebook.
# (No backslashes in the path, so a plain string literal is sufficient.)
pptx_file = "/workspaces/ppt-summarizer/documents/Sample Document.pptx"

In [3]:
# Build the object used by the rest of the notebook from the pptx file path.
# Uncomment exactly one of the three constructors depending on what you need.

# Extraction only (charts, tables, text, OCR):
# ex = PPTExtractor(pptx_file)

# Extraction plus summarization:
# ex = PPTSummarizer(pptx_file)

# Extraction, summarization, and RAG question answering:
ex = PPTQnA(pptx_file)

  from .autonotebook import tqdm as notebook_tqdm


# Extraction

In [4]:
# Inspect the object's attribute dictionary.
# The printout is long: it includes the per-slide embedding vectors.
print(vars(ex))

{'file_path': '/workspaces/ppt-summarizer/documents/Sample Document.pptx', 'extraction_method': 'slide', 'ocr_engine': 'tesseract', 'slides': [{'slide_number': 1, 'slide_title': 'Sample Document', 'slide_text': '\n                    Slide Number 1\n                    Slide Title : Sample Document\n                    Slide Text : Sample DocumentThis document provides information regarding sales \n                    Slide Table : \n                    \n                    Slide Charts Data : \n                    \n                    Slide Image OCR Text : \n                    \n                    ', 'entities': [{'chart_type': 'text', 'text': 'Sample Document', 'left': 1524000, 'top': 1122363, 'width': 9144000, 'height': 2387600}, {'chart_type': 'text', 'text': 'This document provides information regarding sales ', 'left': 1524000, 'top': 3602038, 'width': 9144000, 'height': 1655762}], 'embeddings': [-0.01529935747385025, 0.000713415676727891, -0.016842596232891083, -0.001539082

In [5]:
# Combine everything extracted from the presentation into a single string
# (presentation metadata followed by one section per slide) and print it.
print(ex.combine())


                Presentation Title : Sample Document 
                Presentation Author : Aman Ulla 
                Subject :  
                Keywords :  
                Last Modified Date : Aman Ulla 
                Created Date : 2023-12-12 07:25:55 
                Modified Date : 2023-12-12 07:31:56
                
                          ------------------------------ SLIDE 1 ------------------------------ 


                    Slide Number 1
                    Slide Title : Sample Document
                    Slide Text : Sample DocumentThis document provides information regarding sales 
                    Slide Table : 
                    
                    Slide Charts Data : 
                    
                    Slide Image OCR Text : 
                    
                    

                  ------------------------------ SLIDE 2 ------------------------------ 


                    Slide Number 2
                    Slide Title : Sales till date 2023


In [6]:
# Export the object with to_json() and print the result.
# NOTE(review): the captured output renders with single quotes, i.e. like a
# Python dict rather than a JSON string — confirm the return type before
# writing it to a .json file.
print(ex.to_json())

{'file_path': '/workspaces/ppt-summarizer/documents/Sample Document.pptx', 'extraction_method': 'slide', 'ocr_engine': 'tesseract', 'slides': [{'slide_number': 1, 'slide_title': 'Sample Document', 'slide_text': '\n                    Slide Number 1\n                    Slide Title : Sample Document\n                    Slide Text : Sample DocumentThis document provides information regarding sales \n                    Slide Table : \n                    \n                    Slide Charts Data : \n                    \n                    Slide Image OCR Text : \n                    \n                    ', 'entities': [{'chart_type': 'text', 'text': 'Sample Document', 'left': 1524000, 'top': 1122363, 'width': 9144000, 'height': 2387600}, {'chart_type': 'text', 'text': 'This document provides information regarding sales ', 'left': 1524000, 'top': 3602038, 'width': 9144000, 'height': 1655762}], 'embeddings': [-0.01529935747385025, 0.000713415676727891, -0.016842596232891083, -0.001539082

In [7]:
# Export the extraction as a dataframe. Being the last expression in the cell,
# it is displayed directly (columns: Slide Number, Type, Text, Embedding, Position).
ex.to_dataframe()

Unnamed: 0,Slide Number,Type,Text,Embedding,Position
0,1,text,Sample Document This document provides informa...,"[-0.01529935747385025, 0.000713415676727891, -...","[1524000, 3602038, 9144000, 1655762]"
1,2,chart,"Chart Title : Sales of A,B,C Company from 2020...","[-0.011303146369755268, -0.02892415039241314, ...","[838200, 1825625, 10515600, 4351338]"
2,2,text,Sales till date 2023,"[-0.011303146369755268, -0.02892415039241314, ...","[838200, 1825625, 10515600, 4351338]"
3,3,chart,Chart Title : \nChart Type : PIE (5)\n+---+---...,"[-0.006265274249017239, -0.01940690539777279, ...","[838200, 1825625, 10515600, 4351338]"
4,3,text,Quarterly Sales for A Company in 2023,"[-0.006265274249017239, -0.01940690539777279, ...","[838200, 1825625, 10515600, 4351338]"
5,4,text,What is Marketing? Marketing is a broad term t...,"[-0.01518525741994381, -0.010562300682067871, ...","[838200, 1825625, 10515600, 4351338]"
6,5,table,+--------------------+------------------------...,"[-0.009034499526023865, -0.007884900085628033,...","[838199, 1825625, 10515600, 4682745]"
7,5,text,Different Sales Approach considered by A,"[-0.009034499526023865, -0.007884900085628033,...","[838199, 1825625, 10515600, 4682745]"


In [8]:
# Save the extracted content locally. In the run captured below, this reports
# creating a folder next to the pptx file, named after the presentation.
ex.persist()

Created folder: /workspaces/ppt-summarizer/documents/Sample Document


# Summarization

In [9]:
# Summarize the presentation slide by slide ("slide" is the default method)
# and print each summary under a banner carrying its slide number.
banner_rule = "-" * 20
for slide_summary in ex.summarize_stream(summarize_method="slide"):
    print(banner_rule, "Slide Number", slide_summary["Slide Number"], banner_rule)
    print(slide_summary["Summary"])

-------------------- Slide Number 1 --------------------
Slide Number 1 provides information about a sample document related to sales. Unfortunately, there is no specific content available in the form of text, tables, charts, or image OCR text on this slide.
-------------------- Slide Number 2 --------------------
The slide titled "Sales till date 2023" provides information on the sales of companies A, B, and C from 2020 to 2023. The data is presented in a table and a chart.

According to the table, the sales for company A were 4.3 in 2020, 2.5 in 2021, 3.5 in 2022, and 4.5 in 2023. For company B, the sales were 2.4 in 2020, 4.4 in 2021, 1.8 in 2022, and 2.8 in 2023. Company C had sales of 2 in 2020, 2 in 2021, 3 in 2022, and 5 in 2023.

The chart, which is a column clustered chart, visually represents the sales data for companies A, B, and C from 2020 to 2023.

Please note that the OCR text from the image is not provided, so no additional information can be extracted from it.
--------

In [10]:
# Summarize the whole presentation in one pass and print each item
# yielded by the stream.
for deck_summary in ex.summarize_stream(summarize_method="all"):
    print(deck_summary)

Slide 1: The first slide titled "Sample Document" provides information regarding sales. However, there is no specific content in the slide table, charts, or image OCR text.

Slide 2: The second slide titled "Sales till date 2023" displays a chart showing the sales of companies A, B, and C from 2020 till the present. The chart is a column clustered chart, and the data in the chart shows the sales figures for each company in each year.

Slide 3: The third slide titled "Quarterly Sales for A Company in 2023" presents a pie chart showing the quarterly sales for company A in the year 2023. The chart displays the sales figures for each quarter.

Slide 4: The fourth slide titled "What is Marketing?" provides a definition of marketing. It explains that marketing is a broad term that encompasses activities and processes aimed at creating, communicating, delivering, and exchanging value with a target audience. The ultimate goal of marketing is to satisfy customer needs and wants while achieving 

In [11]:
# Summarize one specific slide (here: slide 3) and print each item
# yielded by the stream.
for single_slide_summary in ex.summarize_stream(summarize_method="single", slide_number=3):
    print(single_slide_summary)

The slide titled "Quarterly Sales for A Company in 2023" provides information about the sales performance of a company in each quarter of 2023. The table shows the sales figures for each quarter, with the 1st quarter having sales of 8.2, the 2nd quarter with sales of 3.2, the 3rd quarter with sales of 1.4, and the 4th quarter with sales of 1.2. There is no information provided about the chart or the image OCR text.


In [12]:
# Summarize every chart / table and print the summaries separated by a rule.
separator = "-" * 60
for chart_summary in ex.summarize_stream(summarize_method="charts"):
    print(chart_summary)
    print("\n\n", separator, "\n\n")

The chart titled "Sales of A, B, C Company from 2020 till date" displays the sales data for three companies (A, B, and C) from 2020 to 2023. The chart type is COLUMN_CLUSTERED.

Here is a summary of the sales data:

- In 2020, Company A had sales of 4.3, Company B had sales of 2.4, and Company C had sales of 2.
- In 2021, Company A had sales of 2.5, Company B had sales of 4.4, and Company C had sales of 2.
- In 2022, Company A had sales of 3.5, Company B had sales of 1.8, and Company C had sales of 3.
- In 2023, Company A had sales of 4.5, Company B had sales of 2.8, and Company C had sales of 5.

Please note that the values mentioned above are based on the data provided in the chart.


 ------------------------------------------------------------ 


The chart shows the sales data for each quarter of the year. The sales figures are as follows:

- 1st Qtr: 8.2
- 2nd Qtr: 3.2
- 3rd Qtr: 1.4
- 4th Qtr: 1.2

This data can be visualized in a pie chart, where each quarter represents a slice 

In [None]:
# To summarize every extracted object individually

# Left disabled: object-level summaries are of poor quality and very time consuming to generate.

# for s in ex.summarize_stream(summarize_method="object"): 
#     print(s)
#     print("\n\n","-"*60,"\n\n")

# RAG

In [14]:
import re

# Slide citations in the answer look like "[3]"; the captured number is a
# 1-based slide number.
pattern = r'\[(\d+)\]'


query ="What is the sales for 4th Qtr?"
# k and similarity_score are forwarded to ex.run as-is.
# NOTE(review): presumably the number of retrieved chunks and the minimum
# similarity cutoff — confirm against PPTQnA.run.
answer = ex.run(query,k=3,similarity_score=0.7)
print("Question : "+ str(query))
print("Answer : "+ str(answer))

# Render Reference Slide
print("\n\nReference :")
# Search the same text that was printed above, so a non-string answer object
# cannot make re.findall raise a TypeError.
matches = re.findall(pattern, str(answer))
# dict.fromkeys de-duplicates while keeping first-citation order, so a slide
# cited several times is rendered only once.
for slide_number in dict.fromkeys(int(match) for match in matches):
    # Guard the index: "[0]" would become index -1 and silently print the
    # last slide, and a number beyond the deck would raise IndexError.
    if 1 <= slide_number <= len(ex.slides):
        print(ex.slides[slide_number - 1].slide_text)
    else:
        print("Slide " + str(slide_number) + " is cited in the answer but does not exist in the presentation.")

Question : What is the sales for 4th Qtr?
Answer : The sales for the 4th quarter is 1.2 [3].


Reference :

                    Slide Number 3
                    Slide Title : Quarterly Sales for A Company in 2023
                    Slide Text : Quarterly Sales for A Company in 2023
                    Slide Table : 
                    
                    Slide Charts Data : 
                     
 Chart Title : 
Chart Type : PIE (5)
+---+---------+-------+
|   |         | Sales |
+---+---------+-------+
| 0 | 1st Qtr |  8.2  |
| 1 | 2nd Qtr |  3.2  |
| 2 | 3rd Qtr |  1.4  |
| 3 | 4th Qtr |  1.2  |
+---+---------+-------+ 
 
                    Slide Image OCR Text : 
                    
                    
