From a5c3f325fcce5720d3ecca5b7770e97df06323e0 Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 01:11:08 +0530 Subject: [PATCH 01/11] Updated swift c module name --- mediapipe/tasks/cc/genai/inference/c/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mediapipe/tasks/cc/genai/inference/c/BUILD b/mediapipe/tasks/cc/genai/inference/c/BUILD index 12a69fea43..ee40408cd6 100644 --- a/mediapipe/tasks/cc/genai/inference/c/BUILD +++ b/mediapipe/tasks/cc/genai/inference/c/BUILD @@ -21,7 +21,7 @@ cc_library( name = "libllm_inference_engine_cpu", srcs = ["llm_inference_engine_cpu.cc"], hdrs = ["llm_inference_engine.h"], - tags = ["swift_module=LlmInferenceEngineC"], + tags = ["swift_module=MediaPipeTasksGenAIC"], deps = [ "//mediapipe/framework/port:file_helpers", "//mediapipe/framework/port:ret_check", From 59fe2c224db66510ee019799dfff6d08e0f20d84 Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 01:11:44 +0530 Subject: [PATCH 02/11] Updated mediapipe/tasks/ios/genai/core/BUILD --- mediapipe/tasks/ios/genai/core/BUILD | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mediapipe/tasks/ios/genai/core/BUILD b/mediapipe/tasks/ios/genai/core/BUILD index 41561cfc68..4879654218 100644 --- a/mediapipe/tasks/ios/genai/core/BUILD +++ b/mediapipe/tasks/ios/genai/core/BUILD @@ -12,20 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library") - licenses(["notice"]) package(default_visibility = ["//mediapipe/tasks:internal"]) -swift_library( - name = "LlmTaskRunner", - srcs = [ - "sources/GenAiInferenceError.swift", - "sources/LlmTaskRunner.swift", - ], - module_name = "LlmTaskRunner", - deps = [ - "//mediapipe/tasks/cc/genai/inference/c:libllm_inference_engine_cpu", - ], -) +exports_files(["sources/LlmTaskRunner.swift", "sources/GenAiInferenceError.swift"]) From 58d80fa72144e9dba623e803e25e7cce16b47281 Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 01:12:09 +0530 Subject: [PATCH 03/11] Added new error types to GenAiInferenceError --- .../core/sources/GenAiInferenceError.swift | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mediapipe/tasks/ios/genai/core/sources/GenAiInferenceError.swift b/mediapipe/tasks/ios/genai/core/sources/GenAiInferenceError.swift index 20c62bdab5..80830a4df1 100644 --- a/mediapipe/tasks/ios/genai/core/sources/GenAiInferenceError.swift +++ b/mediapipe/tasks/ios/genai/core/sources/GenAiInferenceError.swift @@ -16,15 +16,22 @@ import Foundation /// Errors thrown by MediaPipe GenAI Tasks. public enum GenAiInferenceError: Error { - case invalidResponseError + case invalidResponse + case illegalMethodCall + case modelNotFound } extension GenAiInferenceError: LocalizedError { /// A localized description of the `GenAiInferenceError`. public var errorDescription: String? { switch self { - case .invalidResponseError: + case .invalidResponse: return "The response returned by the model is invalid." + case .illegalMethodCall: + return + "You cannot invoke `generateResponse` while another response generation invocation is in progress." + case .modelNotFound: + return "No file found at the `modelPath` you provided." 
} } } @@ -37,8 +44,12 @@ extension GenAiInferenceError: CustomNSError { public var errorCode: Int { switch self { - case .invalidResponseError: + case .invalidResponse: return 0 + case .illegalMethodCall: + return 1 + case .modelNotFound: + return 2 } } } From 4abe2e79963afac8fd66c79faf4ee4e913fedaf3 Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 01:13:48 +0530 Subject: [PATCH 04/11] Updated iOS LlmTaskRunner to be initialized from a config struct --- .../genai/core/sources/LlmTaskRunner.swift | 241 ++++++++++++++++-- mediapipe/tasks/ios/genai/inference/BUILD | 7 +- 2 files changed, 231 insertions(+), 17 deletions(-) diff --git a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift index 7cc0c9c59b..04f8c88b9d 100644 --- a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift +++ b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift @@ -13,24 +13,65 @@ // limitations under the License. import Foundation -import LlmInferenceEngineC +import MediaPipeTasksGenAIC /// This class is used to create and call appropriate methods on the C `LlmInferenceEngine_Session` /// to initialize, execute and terminate any MediaPipe `LlmInference` task. -public final class LlmTaskRunner { - fileprivate typealias CLlmSession = UnsafeMutableRawPointer +/// Note: Tasks should not attempt to clear undeleted caches on initialization since the user can create +/// multiple instances of the task and there is no way of knowing whether they are still +/// active. Deleting caches of active task instances will result in crashes when the C++ +/// functions are invoked. +/// Instead tasks can encapsulate `clearAllCachedFiles()` to provide a function to delete +/// any undeleted caches when the user wishes to. +final class LlmTaskRunner { + private typealias CLlmSession = UnsafeMutableRawPointer + + private static let cacheSuffix = ".cache" + private static let globalCacheDirectory = FileManager.default.temporaryDirectory + .versionIndependentAppending(component: "mediapipe.genai.inference.cache") + private static let cacheDirectory = LlmTaskRunner.globalCacheDirectory + .versionIndependentAppending(component: "\(UUID().uuidString)") private let cLlmSession: CLlmSession + private let modelCacheFile: URL /// Creates a new instance of `LlmTaskRunner` with the given session config. /// /// - Parameters: /// - sessionConfig: C session config of type `LlmSessionConfig`. - public init(sessionConfig: LlmSessionConfig) { - /// No safe guards for session creation since the C APIs only throw fatal errors. - /// `LlmInferenceEngine_CreateSession()` will always return an llm session if the call + init(config: Config) throws { + guard FileManager.default.fileExists(atPath: config.modelPath), + let modelName = config.modelPath.components(separatedBy: "/").last + else { + throw GenAiInferenceError.modelNotFound + } + + /// Adding a `UUID` prefix to the cache path to prevent the app from crashing if a model cache + /// is already found in the temporary directory. + /// Cache will be deleted when the task runner is de-allocated. Preferring deletion on + /// de-allocation to deleting all caches on initialization to prevent model caches of + /// other task runners from being de-allocated prematurely during their lifetime. + /// + /// Note: No safe guards for session creation since the C APIs only throw fatal errors. + /// `LlmInferenceEngine_CreateSession()` will always return an llm session if the call /// completes.
- self.cLlmSession = withUnsafePointer(to: sessionConfig) { LlmInferenceEngine_CreateSession($0) } + cLlmSession = LlmTaskRunner.cacheDirectory.path.withCString { cCacheDir in + return config.modelPath.withCString { cModelPath in + let cSessionConfig = LlmSessionConfig( + model_path: cModelPath, + cache_dir: cCacheDir, + sequence_batch_size: Int(config.sequenceBatchSize), + num_decode_steps_per_sync: Int(config.numberOfDecodeStepsPerSync), + max_tokens: Int(config.maxTokens), + topk: Int(config.topk), + temperature: config.temperature, + random_seed: config.randomSeed) + return withUnsafePointer(to: cSessionConfig) { LlmInferenceEngine_CreateSession($0) } + } + } + + modelCacheFile = LlmTaskRunner.cacheDirectory.versionIndependentAppending( + component: "\(modelName)\(LlmTaskRunner.cacheSuffix)") } /// Invokes the C inference engine with the given input text to generate an array of `String` @@ -39,7 +80,7 @@ public final class LlmTaskRunner { /// - Parameters: /// - inputText: A `String` that is used to query the LLM. /// - Throws: An error if the LLM's response is invalid. - public func predict(inputText: String) throws -> [String] { + func predict(inputText: String) throws -> [String] { /// No safe guards for the call since the C++ APIs only throw fatal errors. /// `LlmInferenceEngine_Session_PredictSync()` will always return a `LlmResponseContext` if the /// call completes. @@ -53,25 +94,195 @@ public final class LlmTaskRunner { } } - /// Throw an error if the response array is `NULL`. + guard let responseStrings = LlmTaskRunner.responseStrings(from: responseContext) else { + throw GenAiInferenceError.invalidResponse + } + + return responseStrings + } + + func predict( + inputText: String, progress: @escaping (_ partialResult: [String]?, _ error: Error?) -> Void, + completion: @escaping (() -> Void) + ) { + + /// `strdup(inputText)` prevents input text from being deallocated as long as callbacks are + /// being invoked. `CallbackInfo` takes care of freeing the memory of `inputText` when it is + /// deallocated. + let callbackInfo = CallbackInfo( + inputText: strdup(inputText), progress: progress, completion: completion) + let callbackContext = UnsafeMutableRawPointer(Unmanaged.passRetained(callbackInfo).toOpaque()) + + LlmInferenceEngine_Session_PredictAsync(cLlmSession, callbackContext, callbackInfo.inputText) { + context, responseContext in + guard let cContext = context else { + return + } + + /// `takeRetainedValue()` decrements the reference count incremented by `passRetained()`. Only + /// take a retained value if the LLM has finished generating responses to prevent the context + /// from being deallocated in between response generation. + let cCallbackInfo = + responseContext.done + ? Unmanaged<CallbackInfo>.fromOpaque(cContext).takeRetainedValue() + : Unmanaged<CallbackInfo>.fromOpaque(cContext).takeUnretainedValue() + + if let responseStrings = LlmTaskRunner.responseStrings(from: responseContext) { + cCallbackInfo.progress(responseStrings, nil) + } else { + cCallbackInfo.progress(nil, GenAiInferenceError.invalidResponse) + } + + /// Call completion callback if LLM has generated its last response. + if responseContext.done { + cCallbackInfo.completion() + } + } + } + + /// Clears all cached files created by `LlmInference` to prevent exponential growth of your app + /// size. Please ensure that this method is not called during the lifetime of any instances of + /// `LlmTaskRunner`.
+ static func clearAllCachedFiles() throws { + // Delete directory + try FileManager.default.removeItem(at: LlmTaskRunner.globalCacheDirectory) + } + + deinit { + LlmInferenceEngine_Session_Delete(cLlmSession) + + /// Responsibly deleting the model cache. + /// Performing on current thread since only one file needs to be deleted. + /// + /// Note: Implementation will have to be updated if C++ core changes the cache prefix. + /// + /// Note: `deinit` does not get invoked in the following circumstances: + /// 1. If a crash occurs before the task runner is de-allocated. + /// 2. If an instance of the task is created from `main()` and the app is terminated. + /// For example, if the task is an instance variable of the main `ViewController` which doesn't + /// get destroyed until the app quits. + /// Task interfaces that use the task runner should additionally provide a function that + /// encapsulates `LlmTaskRunner.clearAllCachedFiles()` to clean up any undeleted caches to + /// avoid exponential growth in app size. OS clears these directories only if the device runs + /// out of storage space. + /// Tasks should not attempt to clear undeleted caches on initialization since the user can create + /// multiple instances of the task and there is no way of knowing whether they are still + /// active. Deleting caches of active task instances will result in crashes when the C++ + /// functions are invoked. + do { + try FileManager.default.removeItem(at: modelCacheFile) + } catch { + // Could not delete file. Common cause: file not found. + } + } +} + +extension LlmTaskRunner { + /// Configuration for setting up a `LlmTaskRunner`. + struct Config { + /// The absolute path to the model asset bundle stored locally on the device. + let modelPath: String + + let sequenceBatchSize: UInt + + let numberOfDecodeStepsPerSync: UInt + + /// The total length of the kv-cache. In other words, this is the total number of input + output + /// tokens the model needs to handle. + let maxTokens: UInt + + /// The top K number of tokens to be sampled from for each decoding step. A value of 1 means + /// greedy decoding. Defaults to 40. + let topk: UInt + + /// The randomness when decoding the next token. A value of 0.0f means greedy decoding. Defaults + /// to 0.8. + let temperature: Float + + /// The random seed for sampling tokens. + let randomSeed: Int + + /// Creates a new instance of `Config` with the provided values. + /// + /// - Parameters: + /// - modelPath: The absolute path to a model asset bundle stored locally on the device. + /// - sequenceBatchSize: Sequence batch size for encoding. Used by GPU only. Number of + /// input tokens to process at a time for batch processing. Setting this value to 1 means both + /// the encoding and decoding share the same graph of sequence length of 1. Setting this value + /// to 0 means the batch size will be optimized + /// programmatically. + /// - numberOfDecodeStepsPerSync: Number of decode steps per sync. Used by GPU only. + /// The default value is 3. + /// - maxTokens: Maximum number of tokens for input and output. + /// - topk: Top K number of tokens to be sampled from for each decoding step. + /// - temperature: Randomness when decoding the next token, 0.0f means greedy decoding. + /// - randomSeed: Random seed for sampling tokens.
+ init( + modelPath: String, sequenceBatchSize: UInt, numberOfDecodeStepsPerSync: UInt, maxTokens: UInt, + topk: UInt, temperature: Float, randomSeed: Int + ) { + self.modelPath = modelPath + self.sequenceBatchSize = sequenceBatchSize + self.numberOfDecodeStepsPerSync = numberOfDecodeStepsPerSync + self.maxTokens = maxTokens + self.topk = topk + self.temperature = temperature + self.randomSeed = randomSeed + } + } +} + +private extension LlmTaskRunner { + /// A wrapper class whose object will be used as the C++ callback context. + /// The progress and completion callbacks cannot be invoked without a context. + class CallbackInfo { + typealias ProgressCallback = (_ partialResult: [String]?, _ error: Error?) -> Void + typealias CompletionCallback = () -> Void + + let inputText: UnsafeMutablePointer<CChar>? + let progress: ProgressCallback + let completion: CompletionCallback + + init( + inputText: UnsafeMutablePointer<CChar>?, progress: @escaping (ProgressCallback), + completion: @escaping (CompletionCallback) + ) { + self.inputText = inputText + self.progress = progress + self.completion = completion + } + + deinit { + free(inputText) + } + } +} + +private extension LlmTaskRunner { + class func responseStrings(from responseContext: LlmResponseContext) -> [String]? { guard let cResponseArray = responseContext.response_array else { - throw GenAiInferenceError.invalidResponseError + return nil } var responseStrings: [String] = [] for responseIndex in 0..<Int(responseContext.response_count) { guard let cResponseString = cResponseArray[responseIndex] else { return nil } responseStrings.append(String(cString: cResponseString)) } return responseStrings } } +extension URL { + func versionIndependentAppending(component: String) -> URL { + if #available(iOS 16, *) { + return self.appending(component: component) + } else { + return self.appendingPathComponent(component) + } } - } diff --git a/mediapipe/tasks/ios/genai/inference/BUILD b/mediapipe/tasks/ios/genai/inference/BUILD index 9ba524262d..83ce1a47ad 100644 --- a/mediapipe/tasks/ios/genai/inference/BUILD +++ b/mediapipe/tasks/ios/genai/inference/BUILD @@ -22,9 +22,12 @@ swift_library( name = "LlmInference", srcs = [ "sources/LlmInference.swift", + "//mediapipe/tasks/ios/genai/core:sources/LlmTaskRunner.swift", + "//mediapipe/tasks/ios/genai/core:sources/GenAiInferenceError.swift", ], + copts = ["-no-verify-emitted-module-interface"], + module_name = "MediaPipeTasksGenAI", generated_header_name = "LlmInference-Swift.h", generates_header = 1, - module_name = "LlmInference", - deps = ["//mediapipe/tasks/ios/genai/core:LlmTaskRunner"], + deps = ["//mediapipe/tasks/cc/genai/inference/c:libllm_inference_engine_cpu",], ) From d4e7b402405e353a119ebbd98ed5033f2ae7705b Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 01:14:13 +0530 Subject: [PATCH 05/11] Updated interface of iOS LlmInference --- .../inference/sources/LlmInference.swift | 150 ++++++++++++++---- 1 file changed, 117 insertions(+), 33 deletions(-) diff --git a/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift b/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift index d066af9012..778ad3f9c2 100644 --- a/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift +++ b/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift @@ -13,42 +13,55 @@ // limitations under the License. import Foundation -import LlmInferenceEngineC -import LlmTaskRunner /// A MediaPipe task that performs inference using a given Large Language Model. /// /// Note: Inherits from `NSObject` for Objective C interoperability.
-@objc(MPPLLMInference) public final class LlmInference: NSObject { - private static let numberOfDecodeStepsPerSync = 3 - private static let sequenceBatchSize = 0 +@objc(MPPLlmInference) public final class LlmInference: NSObject { + private static let numberOfDecodeStepsPerSync: UInt = 3 + private static let sequenceBatchSize: UInt = 0 + private static let responseGenerationInProgressQueueName = + "com.google.mediapipe.genai.isResponseGenerationInProgressQueue" private let llmTaskRunner: LlmTaskRunner + private let responseGenerationInProgressQueue = DispatchQueue( + label: LlmInference.responseGenerationInProgressQueueName, + attributes: .concurrent) + + /// Tracks whether a response generation is in progress. + /// Readers writers lock to prevent race condition as this variable can be accessed from multiple + /// threads. + private var responseGenerationInProgressInternal = false + private var responseGenerationInProgress: Bool { + get { + responseGenerationInProgressQueue.sync { + return self.responseGenerationInProgressInternal + } + } + set { + responseGenerationInProgressQueue.async(flags: .barrier) { + self.responseGenerationInProgressInternal = newValue + } + } + } + /// Creates a new instance of `LlmInference` with the given options. /// /// - Parameters: /// - options: The options of type `LlmInference.Options` to use for configuring the /// `LlmInference`. - @objc public init(options: Options) { - let modelPath = strdup(options.modelPath) - let cacheDirectory = strdup(FileManager.default.temporaryDirectory.path) - - defer { - free(modelPath) - free(cacheDirectory) - } - - let sessionConfig = LlmSessionConfig( - model_path: modelPath, - cache_dir: cacheDirectory, - sequence_batch_size: LlmInference.sequenceBatchSize, - num_decode_steps_per_sync: LlmInference.numberOfDecodeStepsPerSync, - max_tokens: options.maxTokens, + @objc public init(options: Options) throws { + let taskRunnerConfig = LlmTaskRunner.Config( + modelPath: options.modelPath, + sequenceBatchSize: LlmInference.sequenceBatchSize, + numberOfDecodeStepsPerSync: LlmInference.numberOfDecodeStepsPerSync, + maxTokens: options.maxTokens, topk: options.topk, temperature: options.temperature, - random_seed: options.randomSeed) - llmTaskRunner = LlmTaskRunner(sessionConfig: sessionConfig) + randomSeed: options.randomSeed) + + llmTaskRunner = try LlmTaskRunner(config: taskRunnerConfig) super.init() } @@ -58,9 +71,9 @@ import LlmTaskRunner /// /// - Parameters: /// - modelPath: The absolute path to a model asset bundle stored locally on the device. - @objc public convenience init(modelPath: String) { + @objc public convenience init(modelPath: String) throws { let options = Options(modelPath: modelPath) - self.init(options: options) + try self.init(options: options) } /// Generates a response based on the input text. @@ -69,16 +82,85 @@ import LlmTaskRunner /// - inputText: A `String` that is used to query the LLM. /// - Throws: An error if the LLM's response is invalid. @objc public func generateResponse(inputText: String) throws -> String { + + /// Disallow response generation if another response generation call is already in progress. 
+ try shouldContinueWithResponseGeneration() + let tokens = try llmTaskRunner.predict(inputText: inputText) + + responseGenerationInProgress = false + guard let humanReadableLlmResponse = LlmInference.humanReadableString(llmResponses: tokens) else { - throw GenAiInferenceError.invalidResponseError + throw GenAiInferenceError.invalidResponse } return humanReadableLlmResponse } - private static func humanReadableString( + /// Generates a response based on the input text asynchronously. The `progess` callback returns + /// the partial responses from the LLM or any errors. `completion` callback is invoked once the + /// LLM is done generating responses. + /// + /// - Parameters: + /// - progess: A callback invoked when a partial response is available from the LLM. + /// - completion: A callback invoked when the LLM finishes response generation. + /// - Throws: An error if the LLM's response is invalid. + @objc public func generateResponse( + inputText: String, + progress: @escaping (_ partialResponse: String?, _ error: Error?) -> Void, + completion: @escaping (() -> Void) + ) throws { + /// Disallow response generation if another response generation call is already in progress. + try shouldContinueWithResponseGeneration() + + /// Used to make a decision about whitespace stripping. + var receivedFirstToken = true + + llmTaskRunner.predict( + inputText: inputText, + progress: { partialResponseStrings, error in + + guard let responseStrings = partialResponseStrings, + let humanReadableLlmResponse = LlmInference.humanReadableString( + llmResponses: responseStrings, stripLeadingWhitespaces: receivedFirstToken) + else { + progress(nil, GenAiInferenceError.invalidResponse) + return + } + + /// Reset state after first response is processed. + receivedFirstToken = false + + progress(humanReadableLlmResponse, nil) + }, + completion: { [weak self] in + self?.responseGenerationInProgress = false + completion() + }) + } + + /// Clears all cached files created by `LlmInference` to prevent exponential growth of your app + /// size. Please ensure that this method is not called during the lifetime of any instances of + /// `LlmInference`. If the cache is deleted while an instance of `LlmInference` is in scope, + /// calling one of its methods will result in undefined behaviour and may lead to a crash. + /// + /// This method blocks the thread on which it runs. Invoke this function from a background thread + /// to avoid blocking the thread.x + public class func clearAllCachedFiles() throws { + try LlmTaskRunner.clearAllCachedFiles() + } + + /// Throw error if response generation is in progress or update response generation state. + private func shouldContinueWithResponseGeneration() throws { + if responseGenerationInProgress { + throw GenAiInferenceError.illegalMethodCall + } + + responseGenerationInProgress = true + } + + private class func humanReadableString( llmResponses: [String], stripLeadingWhitespaces: Bool = true ) -> String? { guard let llmResponse = llmResponses.first else { @@ -100,11 +182,11 @@ extension LlmInference { /// The total length of the kv-cache. In other words, this is the total number of input + output /// tokens the model needs to handle. - @objc public var maxTokens: Int = 512 + @objc public var maxTokens: UInt = 512 /// The top K number of tokens to be sampled from for each decoding step. A value of 1 means /// greedy decoding. Defaults to 40. - @objc public var topk: Int = 40 + @objc public var topk: UInt = 40 /// The randomness when decoding the next token. 
A value of 0.0f means greedy decoding. Defaults /// to 0.8. @@ -123,16 +205,18 @@ extension LlmInference { self.modelPath = modelPath super.init() } + } } /// An extension to `String` to add some utility functions. -extension String { - fileprivate static let tokenSplitter = "▁" /// Note this is NOT an underscore: ▁(U+2581) - fileprivate static let newLine = "<0x0A>" - fileprivate static let eod = "\\[eod\\]" +fileprivate extension String { + private static let tokenSplitter = "▁" + /// Note this is NOT an underscore: ▁(U+2581) + private static let newLine = "<0x0A>" + private static let eod = "\\[eod\\]" - fileprivate func humanReadableString(stripLeadingWhitespaces: Bool = true) -> String? { + func humanReadableString(stripLeadingWhitespaces: Bool = true) -> String? { var humanReadableString = self.replacingOccurrences(of: String.tokenSplitter, with: " ") .replacingOccurrences(of: String.newLine, with: "\n") humanReadableString = From 281907abf88e4027c413a0d0aa8f46a76458de93 Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 01:15:55 +0530 Subject: [PATCH 06/11] Added framework build scripts for iOS Gen AI frameworks --- mediapipe/tasks/ios/BUILD | 42 ++++++++++++++++++++++ mediapipe/tasks/ios/build_ios_framework.sh | 16 +++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/mediapipe/tasks/ios/BUILD b/mediapipe/tasks/ios/BUILD index ad3d6d1c17..b91466d503 100644 --- a/mediapipe/tasks/ios/BUILD +++ b/mediapipe/tasks/ios/BUILD @@ -91,6 +91,7 @@ MEDIAPIPE_TASKS_COMMON_DEPS = OBJC_TASK_COMMON_DEPS + TENSORFLOW_LITE_C_DEPS + [ strip_api_include_path_prefix( name = "strip_api_include_path", hdr_labels = [ + "//mediapipe/tasks/cc/genai/inference/c:llm_inference_engine.h", "//mediapipe/tasks/ios/common:sources/MPPCommon.h", "//mediapipe/tasks/ios/components/containers:sources/MPPCategory.h", "//mediapipe/tasks/ios/components/containers:sources/MPPClassificationResult.h", @@ -321,3 +322,44 @@ apple_static_xcframework( # built as static libraries and force loaded. deps = MEDIAPIPE_TASKS_COMMON_DEPS + select(OPENCV_DEPS), ) + +apple_static_xcframework( + name = "MediaPipeTasksGenAI_framework", + avoid_deps = ["//mediapipe/tasks/cc/genai/inference/c:libllm_inference_engine_cpu",], + bundle_name = "MediaPipeTasksGenAI", + ios = { + "simulator": [ + "arm64", + "x86_64", + ], + "device": ["arm64"], + }, + minimum_os_versions = { + "ios": MPP_TASK_MINIMUM_OS_VERSION, + }, + deps = [ + "//mediapipe/tasks/ios/genai/inference:LlmInference", + "//mediapipe/tasks/cc/genai/inference/c:libllm_inference_engine_cpu", + ], +) + +apple_static_xcframework( + name = "MediaPipeTasksGenAIC_framework", + bundle_name = "MediaPipeTasksGenAIC", + ios = { + "simulator": [ + "arm64", + "x86_64", + ], + "device": ["arm64"], + }, + minimum_os_versions = { + "ios": MPP_TASK_MINIMUM_OS_VERSION, + }, + public_hdrs = [ + ":llm_inference_engine.h", + ], + deps = [ + "//mediapipe/tasks/cc/genai/inference/c:libllm_inference_engine_cpu", + ], +) diff --git a/mediapipe/tasks/ios/build_ios_framework.sh b/mediapipe/tasks/ios/build_ios_framework.sh index ddb186ee93..4a86375ee0 100755 --- a/mediapipe/tasks/ios/build_ios_framework.sh +++ b/mediapipe/tasks/ios/build_ios_framework.sh @@ -56,8 +56,12 @@ case $FRAMEWORK_NAME in ;; "MediaPipeTasksText") ;; + "MediaPipeTasksGenAIC") + ;; + "MediaPipeTasksGenAI") + ;; *) - echo "Wrong framework name. The following framework names are allowed: MediaPipeTasksText, MediaPipeTasksVision, MediaPipeTasksCommon" + echo "Wrong framework name. 
The following framework names are allowed: MediaPipeTasksText, MediaPipeTasksVision, MediaPipeTasksCommon, MediaPipeTasksGenAI, MediaPipeTasksGenAIC" exit 1 ;; esac @@ -90,7 +94,6 @@ EOF function build_ios_frameworks_and_libraries { local TARGET_PREFIX="//mediapipe/tasks/ios" FULL_FRAMEWORK_TARGET="${TARGET_PREFIX}:${FRAMEWORK_NAME}_framework" - FULL_GRAPH_LIBRARY_TARGET="${TARGET_PREFIX}:${FRAMEWORK_NAME}_GraphLibrary" # .bazelrc sets --apple_generate_dsym=true by default which bloats the libraries to sizes of # the order of GBs. All iOS framework and library build commands for distribution via @@ -99,6 +102,15 @@ function build_ios_frameworks_and_libraries { # Build Task Library xcframework. local FRAMEWORK_CQUERY_COMMAND="-c opt --config=ios_sim_device_fat --apple_generate_dsym=false --define OPENCV=source ${FULL_FRAMEWORK_TARGET}" + + case $FRAMEWORK_NAME in + "MediaPipeTasksGenAI" | "MediaPipeTasksGenAIC") + FRAMEWORK_CQUERY_COMMAND="-c opt --config=ios_sim_device_fat --apple_generate_dsym=false ${FULL_FRAMEWORK_TARGET}" + ;; + *) + ;; + esac + ${BAZEL} build ${FRAMEWORK_CQUERY_COMMAND} IOS_FRAMEWORK_PATH="$(get_output_file_path "${FRAMEWORK_CQUERY_COMMAND}")" From 92b069407196afa57c23a40978416801f7baeee8 Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 19:56:47 +0530 Subject: [PATCH 07/11] Updated function that clears cache in LlmInference to be a non-blocking call --- .../genai/core/sources/LlmTaskRunner.swift | 14 ++++++++++--- .../inference/sources/LlmInference.swift | 20 +++++++++++++------ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift index 1b496332c6..77c0664253 100644 --- a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift +++ b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift @@ -35,6 +35,7 @@ final class LlmTaskRunner { private let cLlmSession: CLlmSession private let modelCacheFile: URL + /// Creates a new instance of `LlmTaskRunner` with the given session config. /// /// - Parameters: @@ -144,14 +145,21 @@ final class LlmTaskRunner { /// Clears all cached files created by `LlmInference` to prevent exponential growth of your app /// size. Please ensure that this method is not called during the lifetime of any instances of /// `LlmTaskRunner`. - static func clearAllCachedFiles() throws { + class func clearAllCachedFiles() { // Delete directory - try FileManager.default.removeItem(at: LlmTaskRunner.globalCacheDirectory) + do { + try FileManager.default.removeItem(at: LlmTaskRunner.globalCacheDirectory) + print("Success on deleting") + } + catch { + print("Error in deleting") + /// Errors thrown are not relevant to the user. They are usually not found errors.
/// diff --git a/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift b/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift index 2ca29abfc6..0e91fff8aa 100644 --- a/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift +++ b/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift @@ -20,11 +20,16 @@ import Foundation @objc(MPPLlmInference) public final class LlmInference: NSObject { private static let numberOfDecodeStepsPerSync: UInt = 3 private static let sequenceBatchSize: UInt = 0 + private static let cacheCleanupQueueName = "com.google.mediapipe.genai.cacheCleanupQueue.\(UUID().uuidString)" private static let responseGenerationInProgressQueueName = - "com.google.mediapipe.genai.isResponseGenerationInProgressQueue" + "com.google.mediapipe.genai.isResponseGenerationInProgressQueue.\(UUID().uuidString)" + /// Serial queue for cache cleanup. + private static let cacheCleanupQueue = DispatchQueue( + label: cacheCleanupQueueName) private let llmTaskRunner: LlmTaskRunner + /// Concurrent queue to implement readers-writers lock on `responseGenerationInProgress`. private let responseGenerationInProgressQueue = DispatchQueue( label: LlmInference.responseGenerationInProgressQueueName, attributes: .concurrent) @@ -144,11 +149,14 @@ import Foundation /// size. Please ensure that this method is not called during the lifetime of any instances of /// `LlmInference`. If the cache is deleted while an instance of `LlmInference` is in scope, /// calling one of its methods will result in undefined behaviour and may lead to a crash. - /// - /// This method blocks the thread on which it runs. Invoke this function from a background thread - /// to avoid blocking the thread.x - public class func clearAllCachedFiles() throws { - try LlmTaskRunner.clearAllCachedFiles() + public class func clearAllCachedFiles(completion: @escaping(() -> Void)) { + /// Asynchronously deleting the files to prevent blocking the current thread as there may be + /// multiple undeleted weight caches. Choosing a serial queue to let callers wait until the + /// previous call for deletion is completed. + cacheCleanupQueue.async { + LlmTaskRunner.clearAllCachedFiles() + completion() + } } /// Throw error if response generation is in progress or update response generation state. From 05ab9b405162daf9f1ef5a5e7d57fc29706d3b7e Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 20:01:25 +0530 Subject: [PATCH 08/11] Removed logs in iOS LlmTaskRunner --- mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift | 2 -- 1 file changed, 2 deletions(-) diff --git a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift index 77c0664253..c50be05f2d 100644 --- a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift +++ b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift @@ -149,10 +149,8 @@ final class LlmTaskRunner { // Delete directory do { try FileManager.default.removeItem(at: LlmTaskRunner.globalCacheDirectory) - print("Success on deleting") } catch { - print("Error in deleting") /// Errors thrown are not relevant to the user. They are usually not found errors.
} } From dcc2b1b17a9504a21dd51c2d257dab72f6e3f37a Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 20:02:12 +0530 Subject: [PATCH 09/11] Updated scope of string extension in iOS LlmInference --- mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift b/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift index 0e91fff8aa..5c2a8ac3e3 100644 --- a/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift +++ b/mediapipe/tasks/ios/genai/inference/sources/LlmInference.swift @@ -218,7 +218,7 @@ extension LlmInference { } /// An extension to `String` to add some utility functions. -extension String { +fileprivate extension String { private static let tokenSplitter = "▁" /// Note this is NOT an underscore: ▁(U+2581) private static let newLine = "<0x0A>" From 850f26931d9d338900306d9729c8d77a9ff0996d Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 20:09:25 +0530 Subject: [PATCH 10/11] Removed duplicate entry of header file in tasks/ios/BUILD --- mediapipe/tasks/ios/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/mediapipe/tasks/ios/BUILD b/mediapipe/tasks/ios/BUILD index 41f04ab2b8..9bb2c7c999 100644 --- a/mediapipe/tasks/ios/BUILD +++ b/mediapipe/tasks/ios/BUILD @@ -91,7 +91,6 @@ MEDIAPIPE_TASKS_COMMON_DEPS = OBJC_TASK_COMMON_DEPS + TENSORFLOW_LITE_C_DEPS + [ strip_api_include_path_prefix( name = "strip_api_include_path", hdr_labels = [ - "//mediapipe/tasks/cc/genai/inference/c:llm_inference_engine.h", "//mediapipe/tasks/ios/common:sources/MPPCommon.h", "//mediapipe/tasks/ios/components/containers:sources/MPPCategory.h", "//mediapipe/tasks/ios/components/containers:sources/MPPClassificationResult.h", From 49c2a2fc6919f109eba74e283cfdca5494f16fea Mon Sep 17 00:00:00 2001 From: Prianka Liz Kariat Date: Tue, 5 Mar 2024 23:32:34 +0530 Subject: [PATCH 11/11] Removed unwanted comments from iOS LlmTaskRunner --- mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift index c50be05f2d..93199b362b 100644 --- a/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift +++ b/mediapipe/tasks/ios/genai/core/sources/LlmTaskRunner.swift @@ -273,7 +273,6 @@ private extension LlmTaskRunner { var responseStrings: [String] = [] for responseIndex in 0..
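Usage sketch (illustrative; not part of the committed patches above): the following shows how the Swift API introduced by this series might be exercised from an app, assuming the `MediaPipeTasksGenAI` module and the `LlmInference` surface visible in the diffs (`init(options:)`, `generateResponse(inputText:)`, the streaming variant with `progress`/`completion` callbacks, and `clearAllCachedFiles(completion:)`); the model path and prompt strings are placeholders.

import MediaPipeTasksGenAI

func runLlmInference(modelPath: String) throws {
  // Configure the task; these values mirror the defaults declared in `LlmInference.Options`.
  let options = LlmInference.Options(modelPath: modelPath)
  options.maxTokens = 512
  options.topk = 40
  options.temperature = 0.8

  // Initialization now throws (e.g. `GenAiInferenceError.modelNotFound`) instead of trapping.
  let llmInference = try LlmInference(options: options)

  // Synchronous generation.
  let response = try llmInference.generateResponse(inputText: "Write a haiku about Swift.")
  print(response)

  // Streaming generation; calling this again before `completion` fires throws `.illegalMethodCall`.
  try llmInference.generateResponse(
    inputText: "Now explain the haiku.",
    progress: { partialResponse, error in
      if let partialResponse = partialResponse { print(partialResponse, terminator: "") }
      if let error = error { print("Response generation failed: \(error)") }
    },
    completion: { print("\nDone.") })
}

// Per the doc comments in PATCH 07, call this only when no `LlmInference` instance is alive.
func clearGenAiCaches() {
  LlmInference.clearAllCachedFiles {
    print("Deleted cached model files.")
  }
}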