<a href="https://colab.research.google.com/github/collinjennings/detectiveLLMs/blob/main/solvingDetectiveStoriesAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Detective Story Culprits with Gemini
This is trial code for predicting culprits in detective short stories with a large language model. The goal is to scale the process up to the corpus level and to compare the model's predictions with readers' predictions and annotations.

The code was written to be deployed in Google Colab.

In [None]:
# Install (or upgrade) the google-cloud-aiplatform package for the current user, with pip's
# output reduced by --quiet. The next cell restarts the kernel so the package can be imported.
%pip install --upgrade --user --quiet google-cloud-aiplatform

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/6.2 MB[0m [31m14.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m4.4/6.2 MB[0m [31m63.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.2/6.2 MB[0m [31m79.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[0m

In [None]:
# Packages installed by the previous cell only become importable in a fresh kernel,
# so ask the running IPython application to shut its kernel down with restart enabled.
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
import glob
from collections import defaultdict
import re
import os
import sys
import numpy as np
import pandas as pd
import math
import torch
# Limit CUDA to device indices 0-3 for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
# Use the GPU when one is available and fall back to the CPU otherwise, so that code
# which later moves tensors to `device` also works on a runtime without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Optional allocator setting, left disabled:
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1000"

In [None]:

# Additional authentication is required for Google Colab
# The sys.modules test is true only when `google.colab` has already been imported in this
# process (presumably by the Colab runtime itself - confirm), so the cell does nothing elsewhere.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth
    auth.authenticate_user()

In [None]:
# Mount Google Drive at /content/drive and make the "Colab Notebooks" folder the working
# directory. Later cells read ./data/... with relative paths, so they depend on this %cd.
# NOTE(review): unlike the authentication cell, this import is not guarded by a
# sys.modules check, so the cell needs the google.colab package to be importable.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/Colab Notebooks'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [None]:
# Define project information
# NOTE(review): "#####" looks like a placeholder - set PROJECT_ID to your own Google Cloud
# project before running. The trailing `# @param` annotations are Colab form-field markers;
# keep them on the same line as the assignment.
PROJECT_ID = "#####"  # @param {type:"string"}
LOCATION = "us-east1"  # @param {type:"string"}

# Initialize Vertex AI
import vertexai

# Bind the SDK to the project and region chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
import json

from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Part,
)

### Process Story Data

In [None]:
# Gemini model handle used for the predictions below. Blocking is turned off for the
# "dangerous content" harm category only; no other category is configured here.
model = GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
)

# Generation settings: temperature 0.0 and a JSON response MIME type, so the model
# is asked to reply in JSON format.
generation_config = GenerationConfig(
    response_mime_type="application/json",
    temperature=0.0,
)

In [None]:
## Loading the corpus.
def story_code_from_path(path):
    """Return the story code encoded in a corpus file name.

    The code is the file's base name up to its first '.', with every underscore
    removed, e.g. './data/texts/AB_CD_01.txt' -> 'ABCD01'.
    """
    # basename() instead of a fixed split('/')[3]: the key no longer depends on how many
    # directories precede the file name.
    return os.path.basename(path).split('.')[0].replace('_', '')


def load_texts(paths):
    """Read each UTF-8 text file in `paths` and return a {story code: full text} dict.

    Files are read in the order given; if two paths map to the same story code the
    later one wins.
    """
    loaded = {}
    for path in paths:
        # The context manager closes every handle; the corpus may contain many files.
        with open(path, encoding='utf-8') as handle:
            loaded[story_code_from_path(path)] = handle.read()
    return loaded


files = glob.glob("./data/texts/*.txt")
# A plain dict: the previous defaultdict() had no default factory and behaved like one.
texts = load_texts(files)

In [None]:
## Loading the metadata
# Key used in each per-story dict -> column of the annotation CSV it is copied from.
META_FIELDS = {
    'code': 'Story Code',
    'title': 'Story Title',
    'author': 'Author Code',
    'solvability1': 'Sufficient clues to guess?',
    'solvability2': 'Sufficient clues to solve?',
    'solvability3': 'Correct annotator guess?',
    'key clue1': 'Essential clue',
    'key clue2': 'Most salient clue',
    'Solved?': 'Is the crime solved?',
    'structure': 'Investigation-reveal order',
    'reveal line': 'Reveal border sentence',
}

fname = './data/BMDS_story_annotations.csv' ### Update filename
metaDF = pd.read_csv(fname)

# One dict per CSV row, in file order.
metaList = [
    {field: row[column] for field, column in META_FIELDS.items()}
    for _, row in metaDF.iterrows()
]

In [None]:
### Filter out stories that aren't solved
# Rebuild the list instead of calling remove() inside a loop over the same list: removing
# during iteration shifts the remaining items left, so the entry right after each removed
# story was never checked and consecutive unsolved stories stayed in.
# Slice assignment keeps the same list object, and re-running the cell is harmless.
# NOTE: later cells address stories by position (metaList[0], metaList[5], ...), so those
# indices refer to positions in the filtered list.
print(len(metaList))
metaList[:] = [story for story in metaList if story['Solved?'] != 'No']
print(len(metaList))

### Crop stories before the reveal

In [None]:
def select_clues(essential, salient, prefix_len=8):
    """Return the annotated clues as a list, dropping `salient` when it repeats `essential`.

    The two clues count as the same when the first `prefix_len` characters of
    `essential` occur, case-insensitively, anywhere in `salient`. Values that are not
    strings (for example the NaN pandas yields for an empty CSV cell) are treated as
    missing instead of raising.
    """
    present = [clue for clue in (essential, salient) if isinstance(clue, str)]
    if len(present) == 2 and essential[:prefix_len].lower() in salient.lower():
        return [essential]
    return present


def crop_before_reveal(text, reveal_line, anchor_len=18):
    """Return the part of `text` that precedes the reveal sentence.

    The sentence is located by its first `anchor_len` characters.

    Raises:
        ValueError: if `reveal_line` is missing/empty or its opening characters do not
            occur in `text`. Previously str.find() returned -1 in that case and the
            slice text[:-1] silently kept the whole story, reveal included.
    """
    anchor = reveal_line[:anchor_len] if isinstance(reveal_line, str) else ''
    cut = text.find(anchor) if anchor else -1
    if cut == -1:
        raise ValueError(f"reveal line not found in story text: {reveal_line!r}")
    return text[:cut]


# Only the first stories are prepared for this trial; the few-shot prompt built later
# reads the 'text' of metaList[0], [1], [3] and [5].
N_TRIAL_STORIES = 6

for story in metaList[:N_TRIAL_STORIES]:
    story['clues'] = select_clues(story['key clue1'], story['key clue2'])
    text = texts[story['code']]
    print(len(text.split()))  # word count of the full story
    try:
        story['text'] = crop_before_reveal(text, story['reveal line'])
    except ValueError as err:
        # Name the offending story so the annotation can be corrected.
        raise ValueError(f"{story['code']}: {err}") from err
    print(len(story['text'].split()))  # word count after cropping



### Few-shot prompting
Give the LLM a prompt and several examples to help make its predictions.

In [None]:
# Task instruction repeated as the "Question" field of every example in the prompt below.
qa_prompt = """Read the excerpt from a detective story, and predict who committed the crime.
The perpetrator is not identified in this portion of the story. The story will try to trick you with red herring clues.
Make sure to predict something that has not already happened. Then identify one key piece of evidence (consider material and testimonial clues) that supports your prediction."""

# Resets the module-level `text` name; the prompt below does not reference this variable
# (it reads the per-story 'text' entries of metaList instead).
text = ''
# Few-shot prompt: three worked examples followed by the story to solve.
#   - Examples use the cropped stories at metaList[5], metaList[1] and metaList[3], each
#     paired with a hand-written "Prediction" and "Evidence".
#   - metaList[0] is the query story; its "Prediction" and "Evidence" fields are left
#     empty for the model to fill in.
# The indices are positional, so they must match the order of metaList, and each of
# these entries must already have a 'text' key (set by the cropping cell).
prompt = f""" Given the fields "Context" and "Question", produce the fields "Prediction" and "Evidence".
---
"Context": {metaList[5]['text']},
"Question": {qa_prompt},
"Prediction": The perpetrator is the trainer James Ryder who later uses the alias John Robinson.
"Evidence": The key clue is his interest in the goose that Mr. Baker received, which Holmes discovers by finding the goose supplier by his wager with Mr. Breckinridge.

---
"Context": {metaList[1]['text']},
"Question": {qa_prompt},
"Prediction": Arthur Pinner, the brother of Harry Pinner, is the perpetrator. He likely orchestrated the entire scheme to defraud
the firm Pycroft was hired to.
"Evidence": The fact that both brothers have the same gold-filled tooth is a strong indicator that they are not who they claim
to be. This suggests a deliberate attempt to deceive Hall Pycroft.

---
"Context": {metaList[3]['text']},
"Question": {qa_prompt},
"Prediction": Sir George Burnwell is the perpetrator. Arthur took the blame to protect Mary.
"Evidence": The footprints in the snow and Mary suddenly closing the window indicate that someone else was involved in trying to take the beryls.

---
"Context": {metaList[0]['text']},
"Question": {qa_prompt},
"Prediction":
"Evidence":

"""


In [None]:
### Print model prediction
# Send the few-shot prompt to the model with the JSON generation settings and keep only
# the text of the reply.
response = model.generate_content(
    contents=prompt,
    generation_config=generation_config,
).text
print("Answer: {}".format(response))

Answer: {"Prediction": "The perpetrator is Fitzroy Simpson. He likely killed Straker to cover up his theft of Silver Blaze.", "Evidence": "The key clue is the fact that Fitzroy Simpson's cravat was found in the dead man's hand. This suggests a struggle between the two, and that Simpson was present at the scene of the crime."}
