# Create Summary
This notebook iterates through the full text that was extracted from the blog posts, sends each post to Azure OpenAI, and generates a summary for it.

In [None]:
%run CONFIG

In [None]:
from notebookutils import mssparkutils

# Read the OpenAI API key from a key vault via mssparkutils instead of hard-coding it.
# `keyvault` and `secretname` are not defined in this notebook; presumably they are
# provided by the CONFIG notebook that is run above (TODO confirm).
openAiKey = mssparkutils.credentials.getSecret(keyvault,secretname)  # ADAPT

In [None]:
###############################
# Function: summarize_with_ai #
###############################

from openai import AzureOpenAI


def summarize_with_ai(blog, text):
    """Summarize a blog post with an Azure OpenAI chat completion.

    The endpoint, key, API version, deployment, system instructions, target
    summary length and sampling parameters are read from notebook-level
    variables (openAiEndpoint, openAiKey, openAiApiVersion, openAiDeployment,
    openAiInstructions, openAiSummaryLenght, param_*). Apart from openAiKey
    they are not defined in this notebook; presumably CONFIG provides them.

    Args:
        blog: Identifier of the post (the caller passes the post link); only
            used in the log and error messages.
        text: Full text of the post; it is embedded into the user prompt.

    Returns:
        The text content of the first completion choice.

    Raises:
        ValueError: If the response contains no choices, or the first choice
            carries no text content.
        Exception: Any error raised by the OpenAI client is propagated
            unchanged to the caller.
    """
    client = AzureOpenAI(
        azure_endpoint=openAiEndpoint,
        api_key=openAiKey,
        api_version=openAiApiVersion,
    )

    prompt = f"Summarize the following text in maximum {openAiSummaryLenght} words: {text}"
    message_text = [
        {"role": "system", "content": openAiInstructions},
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        model=openAiDeployment,  # model = "deployment_name"
        messages=message_text,
        temperature=param_temperature,
        max_tokens=param_max_tokens,
        top_p=param_top_p,
        frequency_penalty=param_frequency_penalty,
        presence_penalty=param_presence_penalty,
        stop=None,
    )

    # The message content is optional in the response; fail with a clear
    # message instead of crashing on len(None) or an empty choices list.
    choices = completion.choices
    summary = choices[0].message.content if choices else None
    if summary is None:
        raise ValueError(f"OpenAI returned no summary text for {blog}")

    print(f"[Log][summarize_with_ai] Completed OpenAi API call for {blog}. Summary length: {len(summary)}")
    return summary

## Summarize Blogposts



##### Known issues

__OpenAI rate limits:__ 
The regular OpenAI Pay-As-You-Go SKU (S0) is subject to rate limits. Exceeding them leads to the following error:
```
Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}
```
The Summarize code cell contains a try-except block that catches this exception (and any other) so that the notebook doesn't fail when it occurs. This means the notebook completes either way, but not all blog posts might be summarized. In that case, the remaining summaries are created during the next run of the *nb_create_summary* notebook. 
You can request a higher rate limit (see the link in the error message), but the error should only occur if many blog posts are summarized during the same run. 


In [None]:
#############
# Summarize #
#############
from pyspark.sql.types import *
from datetime import *

# Select the posts to summarize. If a summary table already exists, only posts
# without a matching row in it (same link) are selected; otherwise all posts.
if spark.catalog.tableExists("blog_summaries") :
    query = f"SELECT b.Id, p.title, p.author, p.link, p.post, p.post_length, p.published FROM {lh_raw}.blog_posts p LEFT JOIN {lh_raw}.blogs b ON p.blog = b.blog LEFT JOIN {lh_transformed}.blog_summaries s ON p.link = s.link WHERE s.link IS NULL"
else :
    query = f"SELECT b.Id, p.title, p.author, p.link, p.post, p.post_length, p.published FROM {lh_raw}.blog_posts p LEFT JOIN {lh_raw}.blogs b ON p.blog = b.blog"

blogs = spark.sql(query)
display(blogs, summary=True)

# Schema of the rows appended to the blog_summaries table.
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('title', StringType(), True),
    StructField('author', StringType(), True),
    StructField('link', StringType(), True),
    StructField('summary', StringType(), True),
    StructField('summary_length', IntegerType(), True),
    StructField('published', TimestampType(), True),
])

# Format of the published string once its last space-separated token has been
# cut off (presumably the time zone, see the commented-out %z).
date_format = '%a, %d %b %Y %H:%M:%S' # %z

# Reported to the caller when the notebook exits.
errorDuringSummary = False

# Collect plain tuples and build the DataFrame once after the loop. Creating a
# one-row DataFrame per post and chaining union() makes the query plan grow
# with every post.
summary_rows = []

# Iterate through all the blog posts
for blog in blogs.collect():
    try:
        # Convert the metadata first, so that a malformed row fails before an
        # OpenAI request is spent on it.
        blog_id = int(blog.Id)
        published = datetime.strptime(blog.published.rsplit(' ', 1)[0], date_format)
        # Call the summarization function on the current blogpost
        summary = summarize_with_ai(blog.link, blog.post)
        summary_rows.append((blog_id, blog.title, blog.author, blog.link, summary, len(summary), published))
    except Exception as ex:
        # Best effort: log the error, remember that at least one post failed
        # and continue with the next post. Posts without a summary are selected
        # again by the query above on the next run.
        print(f"[Error][summarize] An error occured during summary creation: {str(ex)}")
        errorDuringSummary = True

# Always write, even without rows, as the original cell did.
df = spark.createDataFrame(summary_rows, schema)
df.write.format("delta").mode("append").saveAsTable("blog_summaries")


In [None]:
# Show a sample of up to ten rows from the summary table as a quick check.
preview_query = f"SELECT * FROM {lh_transformed}.blog_summaries LIMIT 10"
df = spark.sql(preview_query)
display(df)

In [None]:
# End the notebook run and pass errorDuringSummary as the exit value; the
# Summarize cell sets it to True when at least one post could not be summarized.
mssparkutils.notebook.exit(errorDuringSummary)