In [2]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader 
from langchain_community.document_loaders import TextLoader
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.vectorstores import DocArrayInMemorySearch
import operator
from langchain_pinecone import PineconeVectorStore

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings()


We can now chain the prompt with the model and the output parser.

<img src='images/chain2.png' width="1200">

## Convert Paper content into pure texts



Loading Vector A, English database 

In [3]:




# #Load Vector A
# # creating a pdf reader object 
# paperName = "Watermarking Diffusion Model"
# pdfFileName = f"sample_papers/{paperName}.pdf"
# reader = PdfReader(pdfFileName) 
# # printing number of pages in pdf file 
# print(f"Pages of the PDF are {len(reader.pages)}") 
# 
# textFilePath = f"paper_texts/text_of_{paperName}.txt"
# if not os.path.exists(textFilePath):
#     for page in reader.pages:
#         text = page.extract_text()
#         with open(textFilePath, "a") as file:
#                 file.write(text)
# 
# 
# loader = TextLoader(textFilePath)
# text_documents = loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
# documents = text_splitter.split_documents(text_documents)


                
def writePDFtoTextFile_foreign(_paperName):
    _pdfFileName = f"experiment/papers/foreign/{_paperName}.pdf"
    reader_ = PdfReader(_pdfFileName)
    # printing number of pages in pdf file 
    print(f"Pages of the PDF are {len(reader_.pages)}")
    textFilePath_ = f"experiment/texts/foreign/{_paperName}.txt"
    if not os.path.exists(textFilePath_):
        for page_1 in reader_.pages:
            text_1 = page_1.extract_text()
            with open(textFilePath_, "a") as file_1:
                file_1.write(text_1)
                
def createVectorStore_foreign(_paperName):
    filePath = f"experiment/texts/foreign/{_paperName}.txt"
    loader_ = TextLoader(filePath)
    text_documents_ = loader_.load()
    print(f"text_documents_ = {text_documents_}")
    text_splitter_ = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    documents_ = text_splitter_.split_documents(text_documents_)
    vectorstore_ = DocArrayInMemorySearch.from_documents(documents_, embeddings)
    return vectorstore_

            
def writePDFtoTextFile_combined(_paperName, textFilePath_):
    _pdfFileName = f"experiment/papers/local_combine_english/{_paperName}.pdf"
    reader_ = PdfReader(_pdfFileName)
    # printing number of pages in pdf file 
    print(f"Pages of the PDF are {len(reader_.pages)}")
    for page_2 in reader_.pages:
        text_2 = page_2.extract_text()
        with open(textFilePath_, "a") as file_2:
            file_2.write(text_2)
            

def writePDFtoTextFile(_paperName):
    _pdfFileName = f"experiment/papers/english/{_paperName}.pdf"
    reader_ = PdfReader(_pdfFileName)
    # printing number of pages in pdf file 
    print(f"Pages of the PDF are {len(reader_.pages)}")
    textFilePath_ = f"experiment/texts/english/{_paperName}.txt"
    if not os.path.exists(textFilePath_):
        for page_1 in reader_.pages:
            text_1 = page_1.extract_text()
            with open(textFilePath_, "a") as file_1:
                file_1.write(text_1)


def createVectorStore(_paperName):
    filePath = f"experiment/texts/english/{_paperName}.txt"
    loader_ = TextLoader(filePath)
    text_documents_ = loader_.load()
    text_splitter_ = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    documents_ = text_splitter_.split_documents(text_documents_)
    vectorstore_ = DocArrayInMemorySearch.from_documents(documents_, embeddings)
    return vectorstore_

def writeKeywordsDictToFile(path, dict):
    with open(path, 'w') as f:  
        for key, value in dict:  
            f.write('%s:%s\n' % (key, value))
            

def findCommonKeywordsForNewSinglePaper_english(_paperName, btw_local_before_filter, btw_combined_before_filter):
    filePath = f"experiment/texts/english/{_paperName}.txt"
    with open(filePath, 'r') as file_3:
        text_new_paper = file_3.read().rstrip()
        
    # This is for English version
    betweenness_english = btw.run(text_new_paper)
    filtered_betweenness_sk = {}
    for node in btw_combined_before_filter:
        if node in btw_local_before_filter.keys() and node in betweenness_english.keys():
            filtered_betweenness_sk[node] = btw_combined_before_filter[node]

    sorted_keywords = sorted(filtered_betweenness_sk.items(), key=operator.itemgetter(1), reverse=True)
    writeKeywordsDictToFile(f"experiment/keywords/common_with_{_paperName}.txt", sorted_keywords)
    

def scalable_betweenness_centrality_for_each_english(englishPaper, btw_local_before_filter):
    filePath = f"experiment/texts/english/{englishPaper}.txt"
    with open(filePath, 'r') as file_4:
        text_new_paper = file_4.read().rstrip()
    
    # This is new English paper
    btw_new_english = btw.run(text_new_paper)
    
    # Compute SCB without re-run btw on combined graph
    filtered_betweenness_sk = {}
    for node in btw_new_english:
        if node in btw_local_before_filter.keys():
            filtered_betweenness_sk[node] = btw_new_english[node] + btw_local_before_filter[node]
    
    sorted_keywords = sorted(filtered_betweenness_sk.items(), key=operator.itemgetter(1), reverse=True)
    writeKeywordsDictToFile(f"experiment/keywords/SCB/proposed_common_{englishPaper}.txt", sorted_keywords)

## Setting up Pinecone

Loading Vector B, Asian research database:

So far we've used an in-memory vector store. In practice, we need a vector store that can handle large amounts of data and perform similarity searches at scale. For this example, we'll use [Pinecone](https://www.pinecone.io/).

The first step is to create a Pinecone account, set up an index, get an API key, and set it as an environment variable `PINECONE_API_KEY`.

Then, we can load the transcription documents into Pinecone:

In [4]:
#Load Vector B
b_paperNames = ["‘Detention’_ A Clichéd Salute to Freedom - The News Lens International Edition", 
                "Cohen-Puppetry and the Destruction of the Object", 
                "Even the Bleakest Horror Games End More Hopefully than _Detention_ Does", 
                "Manning-Can the Avatar Speak ",
                "Performative Reckoning 240425",
                "Psychological horror game Detention revisits 1960s Taiwan - Polygon",
                "Review_ Detention (返校) _ New Bloom Magazine",
                "Review_ In Taiwanese Horror Movie _Dete... Is The Real Monster _ Cinema Escapist",
                "Roxworthy-Revitalizing Japanese American Internment",
                "Schechner-Restoration of Behavior",
                "Son-Performance of Care",
                "Taylor-Acts of Transfer",
                "Thiongo-EnactmentsPowerPolitics-1997",
                "Tillis-The_Art_of_Puppetry_in_the_Age",
                "Turner-Liminality and Communitas",
                "Wu-Spectralizing the White Terror Horror Trauma and the Ghost Island Narrative in Detention",
                ]

In [5]:

textFilePath_b: str = ""
for b_paperName in b_paperNames:
    b_pdfFileName = f"experiment/papers/local/{b_paperName}.pdf"
    reader = PdfReader(b_pdfFileName)
    # printing number of pages in pdf file 
    print(f"Pages of the PDF are {len(reader.pages)}") 
    textFilePath_b = f"experiment/texts/local_papers.txt"
    for page in reader.pages:
        text = page.extract_text()
        with open(textFilePath_b, "a") as file:
                file.write(text)




Pages of the PDF are 12


Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)


Pages of the PDF are 10
Pages of the PDF are 14
Pages of the PDF are 17
Pages of the PDF are 34
Pages of the PDF are 5
Pages of the PDF are 10
Pages of the PDF are 7
Pages of the PDF are 24
Pages of the PDF are 101
Pages of the PDF are 31
Pages of the PDF are 27
Pages of the PDF are 21
Pages of the PDF are 8
Pages of the PDF are 9
Pages of the PDF are 15


In [4]:

# # use pinecone for first DB


# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# embeddings = OpenAIEmbeddings()

# index_name = "research-paper-rag-index"
# 
# pinecone = PineconeVectorStore.from_documents(
#     documents_b, embeddings, index_name=index_name
# )




ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [7]:

import tiktoken


# vectorstore4 = DocArrayInMemorySearch.from_documents(documents_b, embeddings)



In [None]:
# vectorstore4.similarity_search_with_score("White Terror", k=10)[:10]

In [8]:
# # Add one more vector C
# c_paperNames = ["Hassapopoulou-Playing with History_"]
# textFilePath_c: str = ""
# for c_paperName in c_paperNames:
#     c_pdfFileName = f"sample_papers_asian/English/{c_paperName}.pdf"
#     reader = PdfReader(c_pdfFileName)
#     # printing number of pages in pdf file 
#     print(f"Pages of the PDF are {len(reader.pages)}") 
#     textFilePath_c = f"paper_texts_asian/text_of_all_papers_in_c.txt"
#     for page in reader.pages:
#         text = page.extract_text()
#         with open(textFilePath_c, "a") as file:
#                 file.write(text)
# 
# loader_c = TextLoader(textFilePath_c)
# text_documents_c = loader_c.load()
# text_splitter_c = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
# documents_c = text_splitter_c.split_documents(text_documents_c)
# vectorstore5 = DocArrayInMemorySearch.from_documents(documents_c, embeddings)

Overwriting cache for 0 880
Overwriting cache for 0 755


Pages of the PDF are 29


In [6]:
# Add one more vector D for Japanese and Chinese
d_paperNames = ["⑬文学部紀要 第49号_200-177",
                "csat.org.tw_Journal.aspx_ID=22&ek=128&pg=1&d=3042",
                "太平洋戦争期の布袋戯",
                # "掌聲_作為文化外交工具的台灣布袋戲",
                "書寫民族創傷",
                "被動員的鄉土藝術 黃得時與太平洋戰爭期的布袋戲改造",
                "返校：疊合慘白歲月的殘影，窺見戒嚴之下的人心｜端傳媒 Initium Media"]

In [8]:

d_paperName = d_paperNames[5]
d_pdfFileName = f"experiment/papers/foreign/{d_paperName}.pdf"
reader = PdfReader(d_pdfFileName)
# printing number of pages in pdf file 
print(f"Pages of the PDF are {len(reader.pages)}") 
textFilePath_d = f"experiment/texts/foreign/{d_paperName}.txt"
for page in reader.pages:
    text = page.extract_text()
    with open(textFilePath_d, "a") as file:
            file.write(text)

loader_d = TextLoader(textFilePath_d)
text_documents_d = loader_d.load()
text_splitter_d = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents_d = text_splitter_d.split_documents(text_documents_d)
vectorstore_d = DocArrayInMemorySearch.from_documents(documents_d, embeddings)

Pages of the PDF are 29


In [9]:
# Add one more vector E for Puppet in English
# "Ruizendaal-Puppets-identity-and-politics-in-Taiwan"
e_paperNames = ["Bell-Puppets, Masks, and Performing Objects at the End of the Century",
                "Bernstein-Dances with Things- Material Culture and the Performance of Race",
                "Bird-David-Animism Revisited",
                "Brown-thing-theory.2001",
                "Goodall -Transferred Agencies- Performance and the Fear of Automatism",
                "Kaplin-A Puppet Tree A Model for the Field of Puppet Theatre",
                "Wu-Chapter 4 Religion and the Formation of Taiwanese Identities",
                "Zamir-Puppets"]

In [10]:

e_paperName = e_paperNames[0]
writePDFtoTextFile(e_paperName)

createVectorStore(e_paperName)

Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)


Pages of the PDF are 14


<langchain_community.vectorstores.docarray.in_memory.DocArrayInMemorySearch at 0x10cd51dc0>

In [11]:
# Combined local B & new paper E to find common keywords
common_paperNames = ["‘Detention’_ A Clichéd Salute to Freedom - The News Lens International Edition", 
                "Cohen-Puppetry and the Destruction of the Object", 
                "Even the Bleakest Horror Games End More Hopefully than _Detention_ Does", 
                "Manning-Can the Avatar Speak ",
                "Performative Reckoning 240425.docx",
                "Psychological horror game Detention revisits 1960s Taiwan - Polygon",
                "Review_ Detention (返校) _ New Bloom Magazine",
                "Review_ In Taiwanese Horror Movie _Dete... Is The Real Monster _ Cinema Escapist",
                "Roxworthy-Revitalizing Japanese American Internment",
                "Schechner-Restoration of Behavior",
                "Son-Performance of Care",
                "Taylor-Acts of Transfer",
                "Thiongo-EnactmentsPowerPolitics-1997",
                "Tillis-The_Art_of_Puppetry_in_the_Age",
                "Turner-Liminality and Communitas",
                "Wu-Spectralizing the White Terror Horror Trauma and the Ghost Island Narrative in Detention",
                "Bell-Puppets, Masks, and Performing Objects at the End of the Century",
                "Bernstein-Dances with Things- Material Culture and the Performance of Race",
                "Bird-David-Animism Revisited",
                "Brown-thing-theory.2001",
                "Goodall -Transferred Agencies- Performance and the Fear of Automatism",
                "Kaplin-A Puppet Tree A Model for the Field of Puppet Theatre",
                "Wu-Chapter 4 Religion and the Formation of Taiwanese Identities",
                "Zamir-Puppets"]

textFilePath_common: str = ""
for common_paperName in common_paperNames:
    common_pdfFileName = f"experiment/papers/local_combine_english/{common_paperName}.pdf"
    reader = PdfReader(common_pdfFileName)
    # printing number of pages in pdf file 
    print(f"Pages of the PDF are {len(reader.pages)}") 
    textFilePath_common = f"experiment/texts/local_combine_english.txt"
    for page in reader.pages:
        text = page.extract_text()
        with open(textFilePath_common, "a") as file:
                file.write(text)

loader_common = TextLoader(textFilePath_common)
text_documents_common = loader_common.load()
text_splitter_common = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents_common = text_splitter_common.split_documents(text_documents_common)
vectorstore_common = DocArrayInMemorySearch.from_documents(documents_common, embeddings)

Pages of the PDF are 12


Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)


Pages of the PDF are 10
Pages of the PDF are 14
Pages of the PDF are 17
Pages of the PDF are 34
Pages of the PDF are 5
Pages of the PDF are 10
Pages of the PDF are 7
Pages of the PDF are 24
Pages of the PDF are 101
Pages of the PDF are 31
Pages of the PDF are 27
Pages of the PDF are 21
Pages of the PDF are 8
Pages of the PDF are 9
Pages of the PDF are 15


Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)


Pages of the PDF are 14


Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)


Pages of the PDF are 29


Ignoring wrong pointing object 26 0 (offset 0)


Pages of the PDF are 27
Pages of the PDF are 24


Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)


Pages of the PDF are 14


Ignoring wrong pointing object 629 0 (offset 0)


Pages of the PDF are 8
Pages of the PDF are 301


Ignoring wrong pointing object 77 0 (offset 0)


Pages of the PDF are 25


In [12]:
# Compute betweenness for local DB
from keyword_alg.rake import Rake
from keyword_alg.betweenness_centrality import Betweenness

stoppath = "FoxStoplist.txt"

with open(textFilePath_b, 'r') as file:
    text_database = file.read().rstrip()


btw = Betweenness(stoppath)
# rake = Rake(stoppath)
# rake_similarity = rake.run(text_database)

In [29]:
# To compute betweenness for local and combined DB, to re-use
text_path_local_papers = f"experiment/texts/local_papers.txt"
with open(text_path_local_papers, 'r') as file:
    text_local_papers = file.read().rstrip()
    
btw_local_before_filter = btw.run(text_local_papers)


In [21]:
# Run full combined betweenness for each single new paper
for newPaper in e_paperNames: #e_paperNames is all new papers
    combinedPapers = [newPaper] + b_paperNames #b_paperNames is all local papers
    textPath_common = f"experiment/texts/local_combine_english/combined_with_{newPaper}.txt"
    for each in combinedPapers:
        writePDFtoTextFile_combined(each, textPath_common)
    
    with open(textPath_common, 'r') as common_file:
        text_combined = common_file.read().rstrip()
        
    btw_combined_before_filter = btw.run(text_combined)

    # Find common keywords for each new English paper, and print to text file
    writePDFtoTextFile(newPaper)
    findCommonKeywordsForNewSinglePaper_english(newPaper, btw_local_before_filter, btw_combined_before_filter)



Pages of the PDF are 14
Pages of the PDF are 12
Pages of the PDF are 10
Pages of the PDF are 14
Pages of the PDF are 17
Pages of the PDF are 34
Pages of the PDF are 5
Pages of the PDF are 10
Pages of the PDF are 7
Pages of the PDF are 24
Pages of the PDF are 101
Pages of the PDF are 31
Pages of the PDF are 27
Pages of the PDF are 21
Pages of the PDF are 8
Pages of the PDF are 9
Pages of the PDF are 15
Pages of the PDF are 14
Pages of the PDF are 29
Pages of the PDF are 12
Pages of the PDF are 10
Pages of the PDF are 14
Pages of the PDF are 17
Pages of the PDF are 34
Pages of the PDF are 5
Pages of the PDF are 10
Pages of the PDF are 7
Pages of the PDF are 24
Pages of the PDF are 101
Pages of the PDF are 31
Pages of the PDF are 27
Pages of the PDF are 21
Pages of the PDF are 8
Pages of the PDF are 9
Pages of the PDF are 15
Pages of the PDF are 29
Pages of the PDF are 27
Pages of the PDF are 12
Pages of the PDF are 10
Pages of the PDF are 14
Pages of the PDF are 17
Pages of the PDF are 3

In [17]:
# This is for Foreign language version
# betweenness_foreign = btw.run(text_database)

Pages of the PDF are 14
Pages of the PDF are 29
Pages of the PDF are 27
Pages of the PDF are 24
Pages of the PDF are 14
Pages of the PDF are 8
Pages of the PDF are 301


KeyboardInterrupt: 

In [31]:
#Run SCB by left and right Betweenness, without re-run btw on combined graph
for newPaper in e_paperNames: #e_paperNames is all new papers
    writePDFtoTextFile(newPaper)
    btw_left = btw_local_before_filter
    scalable_betweenness_centrality_for_each_english(newPaper, btw_left)

Pages of the PDF are 14
Pages of the PDF are 29
Pages of the PDF are 27
Pages of the PDF are 24
Pages of the PDF are 14
Pages of the PDF are 8
Pages of the PDF are 301
Pages of the PDF are 25


In [28]:
# Save all local PDF to texts individually
for localPaper in b_paperNames: #e_paperNames is all new papers
    # print each local paper to text file
    _pdfFileName = f"experiment/papers/local/{localPaper}.pdf"
    reader_ = PdfReader(_pdfFileName)
    # printing number of pages in pdf file 
    print(f"Pages of the PDF are {len(reader_.pages)}")
    textFilePath_ = f"experiment/texts/local/{localPaper}.txt"
    if not os.path.exists(textFilePath_):
        for page_1 in reader_.pages:
            text_1 = page_1.extract_text()
            with open(textFilePath_, "a") as file_1:
                file_1.write(text_1)

Pages of the PDF are 12
Pages of the PDF are 10
Pages of the PDF are 14
Pages of the PDF are 17
Pages of the PDF are 34
Pages of the PDF are 5
Pages of the PDF are 10
Pages of the PDF are 7
Pages of the PDF are 24
Pages of the PDF are 101
Pages of the PDF are 31
Pages of the PDF are 27
Pages of the PDF are 21
Pages of the PDF are 8
Pages of the PDF are 9
Pages of the PDF are 15


In [32]:
# Get btw for local DB
local_sorted_keywords = sorted(btw_local_before_filter.items(), key=operator.itemgetter(1), reverse=True)
writeKeywordsDictToFile(f"experiment/keywords/Local/all_local_papers_keywords.txt", local_sorted_keywords)

In [11]:
# print(f'betweenness: {betweenness_english}')
# print(f'rake similarity: {rake_similarity}')

betweenness: [('taiwanese', 14472573.637999265), ('taiwan', 10061814.53833616), ('chinese', 5621453.449850505), ('temple', 4767348.268577458), ('puppet', 3788213.461776377), ('political', 3734066.197508117), ('national', 3691075.2295942893), ('de', 3141369.2931842785), ('identity', 3084995.7319417214), ('local', 2872393.6092100916), ('cultural', 2832196.5465179426), ('culture', 2584802.66288849), ('popular', 2540709.721788418), ('social', 2257036.838560225), ('people', 2255119.2878273046), ('performance', 2011221.3715311922), ('traditional', 1869514.9267999951), ('identities', 1582341.572775592), ('japanese', 1419628.054569085), ('own', 1278983.2159078866), ('china', 1113628.8966505998), ('li', 1058125.1968221643), ('society', 980739.6221852237), ('government', 978990.1246713124), ('historical', 896114.6325442572), ('modern', 884640.824841879), ('religious', 882393.7600941712), ('history', 816034.6736154166), ('ritual', 694877.2070148636), ('theater', 688109.6861825504), ('puppets', 66

In [ ]:
# print(f'betweenness: {betweenness_foreign}')

Let's setup the new chain using Pinecone as the vector store:

In [17]:
# chain = (
#     {"context": pinecone.as_retriever(), "question": RunnablePassthrough()}
#     | prompt
#     | model
#     | parser
# )




# This is the YouTube video we're going to use.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=cdiD-9MMpb0"

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

parser = StrOutputParser()

# chain = model | parser



# If you can't answer the question, please explain the reason and show me the similarity among the question, the context_a and the context_b.

# context_a below is written in English while context_b is in Japanese. Based on context_a and context_b, answer the question below. If you can't answer the question, please explain the reason and show me the similarity among the question, the context_a and the context_b.

# Answer the question in Japanese based on the context in English below. In a second line, please translate your answer back in English.

In [16]:


# Local DB papers
text_path_local_papers = f"experiment/texts/local_papers.txt"
loader_local = TextLoader(text_path_local_papers)
text_documents_local = loader_local.load()
text_splitter_local = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents_local = text_splitter_local.split_documents(text_documents_local)
vectorstore_local = DocArrayInMemorySearch.from_documents(documents_local, embeddings)

In [63]:
# Save all foreign papers and convert to text files
for foreign_paper in d_paperNames:
    writePDFtoTextFile_foreign(foreign_paper)

Pages of the PDF are 24
Pages of the PDF are 28
Pages of the PDF are 11
Pages of the PDF are 15
Pages of the PDF are 119
Pages of the PDF are 16
Pages of the PDF are 25
Pages of the PDF are 29


In [20]:

# New papers, iterate each paper
# newPaper_vector_store = createVectorStore(e_paperNames[7])
newPaper_vector_store = createVectorStore_foreign(d_paperNames[5])

text_documents_ = [Document(page_content='歷經近兩年的開發，以1960年代台灣戒嚴時期校園為題的恐怖遊戲《返校》，終於在1月13日於Steam上架，幾天內即衝上全球暢銷榜，得到來自世界各地玩家的一致好評。\n《返校》（Detention ）\n遊戲類型：暴力、血腥、冒險、獨立製作\n發行日期: 2017年1月13日\n開發商：赤燭遊戲\n平台：Windows、Mac OS X、Steam OS\n\n在台北家中的我，也在剛發售時就買了。在陰森詭異的畫面、緊扣節奏的配樂下，我很快浸入了這個精心營造的小世界，一面品味着這森冷又親切的情調，一面讚賞着製作組編織劇情、統整風格的能力。當畫面指示女主角屏住呼吸，悄悄從扒着腳尾飯（又稱貢飯、拜飯）的魍魎身邊走過去的時候，我也──等等，我幹嗎跟着憋氣？於是我故意保持着正常呼吸走過去了。\n大概恐怖遊戲和恐怖片一樣，有兩種觀賞方法，一種是入戲，一種是出戲。入戲是將自己代入主角的世界，與他/她一體同觀；出戲則是和製作組較勁，笑看你怎麼嚇我，留着心眼檢驗劇情的合理性。我的習慣是偏向出戲的，那些鬼哭、喘氣和幾處jump scare（突然出現的高能恐怖場景），我大抵都能淡然處之，但即便如此，當我玩到後期時也漸漸入戲了。\n返校\n《返校》遊戲截圖\n我很慶幸，它並不只是一部刺激感官的鬼片。製作組很清楚，真正的恐怖來自人心，以及悲劇過後的淒涼。\n遊戲前半段，每一個驚悚的地方，到後面都會給玩家一個合理的解釋，而且這解釋並不是製作組強塞給你的，而是你透過民俗常識推論出來的。到了遊戲後半段，也不再有故意嚇人以吸引注意力的地方，而是讓你一步一步更深入了解主角的故事，浸入那戒嚴時期白色恐怖的高壓氛圍之中。幾個小人物的幽靈、台灣民俗、校園與家庭的殘像，疊合成一幅無可奈何、亦無言以對的景觀，也疊合到我們的片斷記憶與歷史認知上。\n（提示：下文有劇透，建議遊玩之後再閱讀。這款遊戲並不長，幾個小時即可玩過一遍）\n用小成本講一個好故事\n《返校》一開始，玩家對故事一無所知。序章之後，視角莫名地轉到女主角身上，場景也愈發詭異。一路走下去，藉由各種線索和回憶片段，玩家才會漸漸拼湊出劇情梗概：\n家庭破碎的高中少女方芮欣，與輔導老師張明暉發生了戀情。張明暉的同事、和他一起辦讀書會的殷翠涵老師得知後，對他進行了勸阻。警告之下

In [14]:

template_foreign = """
your_research below is written in English while new_paper is in Chinese. Based on your_research and new_paper, answer the question below. Please give me as much details as possible. If you can't answer the question, please explain the reasons in detail. 

your_research: {context_a}

new_paper: {context_b}

question: {question}
"""

template_english = """
Based on your_research and new_paper, answer the question below. Please give me as much details as possible. If you can't answer the question, please explain the reasons in detail.

your_research: {context_a}

new_paper: {context_b}

question: {question}
"""
# prompt = ChatPromptTemplate.from_template(template_english)
prompt = ChatPromptTemplate.from_template(template_foreign)

In [21]:

chain = (
    {"context_a": vectorstore_local.as_retriever(), "context_b": newPaper_vector_store.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

In [1085]:

# question = "What is the relationship between your_research and new_paper?"
# question = "What is the relationship between your_research and new_paper in terms of nonhuman, puppet, avatar, historical trauma, memory, identity, Taiwan, video games, technology, white terror?"

# question = "What is the relationship between your_research and new_paper in terms of performance?"

question = "Based on the contents of new_paper, please provide innovative ideas that were not mentioned in your_research that could potentially extend your_research, regarding nonhuman, puppet, avatar, historical trauma, memory, identity, Taiwan, video games, technology, white terror."
# question = "Based on comparing your_research and new_paper, please suggest some innovative ideas to help me improve your_research."
# question = "Based on comparing your_research and new_paper, please suggest some innovative ideas to help me improve your_research, regarding performance, world, game, detention, people, japanese, political, virtual, historical, human, theatre, space, taiwanese, puppets, media."
# How can responsible AI be applied to humanities studies?
print(chain.invoke(question))
# encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
# tokens = [encoding.decode_single_token_bytes(token) for token in encoding.encode(question)]

Based on the contents of the new_paper, there are several innovative ideas that could potentially extend the research presented in your_research:

1. Integration of Traditional Elements: The new_paper discusses how the game "返校" incorporates traditional elements such as old songs, folk beliefs, and historical settings to create a unique and immersive experience for players. This integration of traditional elements could be explored in your_research to analyze how incorporating traditional elements in experimental projects can enhance the overall experience and engagement of the audience.

2. Narrative Techniques: The new_paper highlights the use of intertwined narratives in the game "返校" where players can discover new details and symbolism upon replaying the game. This narrative technique could be further explored in your_research to investigate how non-linear storytelling or interactive narratives can shape the audience's understanding of the content and lead to a more engaging experi

In [22]:

# question = "What is the relationship between your_research and new_paper?"
# question = "What is the relationship between your_research and new_paper in terms of nonhuman, puppet, avatar, historical trauma, memory, identity, Taiwan, video games, technology, white terror?"

# question = "What is the relationship between your_research and new_paper in terms of performance?"

question = "Based on the contents of new_paper, please provide innovative ideas that were not mentioned in your_research that could potentially extend your_research, regarding nonhuman, puppet, avatar, historical trauma, memory, identity, Taiwan, video games, technology, white terror."
# question = "Based on comparing your_research and new_paper, please suggest some innovative ideas to help me improve your_research."
# question = "Based on comparing your_research and new_paper, please suggest some innovative ideas to help me improve your_research, regarding performance, world, game, detention, people, japanese, political, virtual, historical, human, theatre, space, taiwanese, puppets, media."
# How can responsible AI be applied to humanities studies?
print(chain.invoke(question))
# encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
# tokens = [encoding.decode_single_token_bytes(token) for token in encoding.encode(question)]

Based on the contents of the new_paper, there are several innovative ideas that could potentially extend the research on nonhuman, puppet, avatar, historical trauma, memory, identity, Taiwan, video games, technology, and white terror. Some of these ideas include:

1. **Exploration of Moral Dilemmas:** The new_paper discusses how the game "Detention" presents players with moral dilemmas and consequences based on their choices. Expanding on this idea, further research could delve into how such moral dilemmas and decision-making processes impact players' perceptions of historical events and trauma. This could involve studying player behavior in response to these dilemmas and the ethical implications of their choices.

2. **Interactive Storytelling:** The new_paper highlights how "Detention" effectively uses clues and memory fragments to construct a compelling narrative. Research could focus on the impact of interactive storytelling techniques in video games on players' engagement with his