In [8]:
import glob
import os
import uuid

import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [4]:
chroma_client = chromadb.Client()

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name='all-MiniLM-L6-v2')
metadata_options = {"hnsw:space": "cosine" }

collection = chroma_client.get_or_create_collection(name="data-feminism", metadata=metadata_options, embedding_function=sentence_transformer_ef)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Convert all chapters to embeddings:

In [9]:
docs = glob.glob("docs/*.pdf")

In [10]:
docs

['docs/Ch2.pdf',
 'docs/Ch4.pdf',
 'docs/Intro.pdf',
 'docs/Ch7.pdf',
 'docs/Conclusion.pdf',
 'docs/Ch1.pdf',
 'docs/Ch3.pdf',
 'docs/Ch5.pdf',
 'docs/Ch6.pdf']

Convert just one chapter to embeddings:

In [16]:
for doc in docs:
    print(f"processing chapter {doc}")
    loader = PyPDFLoader(doc)
    data = loader.load()
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    chunks = text_splitter.split_documents(data)
    
    for chunk in chunks:
        uuid_name = uuid.uuid1()
        collection.add(ids=[str(uuid_name)], documents=chunk.page_content)

processing chapter docs/Ch2.pdf
processing chapter docs/Ch4.pdf
processing chapter docs/Intro.pdf
processing chapter docs/Ch7.pdf
processing chapter docs/Conclusion.pdf
processing chapter docs/Ch1.pdf
processing chapter docs/Ch3.pdf
processing chapter docs/Ch5.pdf
processing chapter docs/Ch6.pdf


In [17]:
def retrieve_context(query_text):
    results = collection.query(query_texts=[query_text],n_results=6)
    return results['documents']

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', 
)

In [22]:
def chatbot():
  # Create a list to store all the messages for context
  messages = []
  messages.append({
      "role": "system", 
      "content": "You are a helpful assistant who's been given Data Feminism Chapter as context. \
      When the user asks a question, use the data feminism context to answer the question. \
      If the message is a greeting, just respond normally." 
  })  
  
  while True:
    # list to store all the retrievals from Chroma
    context = []
    
    # Prompt user for input
    message = input("User: ")
    print(f"User: {message}"+'\n')
    
    # Exit program if user inputs "quit"
    if message.lower() == "quit":
      break
    
    #retrieve necessary context
    context = retrieve_context(message)
    # context_string = ', '.join([str(elem) for elem in context])
    
    message_with_context = f''' 
    
    You are a helpful bot who answers concisely. Use this context: 
    {context} 
    
    To answer the question: {message}. 
    
    If you don't know the answer from the context, say you don't know.
    '''
    
    # Add each new message to the list
    messages.append({"role": "user", "content": message_with_context})
    
    response = client.chat.completions.create(
      model="llama3",
      messages=[{"role": "user", "content": message_with_context}]
    )
    
    # Print the response and add it to the messages list
    chat_message = response.choices[0].message.content
    print(f"Bot: {chat_message}"+'\n')
    messages.append({"role": "assistant", "content": chat_message})

In [None]:
chatbot()

User:  test


User: test

Bot: I don't know.



User:  hi, what are your thoughts on police violence?


User: hi, what are your thoughts on police violence?

Bot: I can provide a concise answer based on the provided context.

According to the text, Cathy O'Neil in her book "Weapons of Math Destruction" describes a "pernicious feedback loop" where biased data is amplified by computational systems, perpetuating racial bias and criminalization of poverty. She suggests opening up these systems to mitigate their effects.



User:  is police more violent towards black people?


User: is police more violent towards black people?

Bot: According to the text, yes, police violence has been shown to be a problem for Black women. The campaign #SayHerName aims to make visible the gendered dimensions of police violence against Black women.



User:  can you help me build arguments to say that the police is more violent towards this demographic?


User: can you help me build arguments to say that the police is more violent towards this demographic?

Bot: Based on the provided context, here are some potential arguments for why the police may be more violent towards a particular demographic:

1. The data collected in PredPol's algorithm is biased because it reflects historical crime data and US policing practices that have disproportionately surveilled and patrolled neighborhoods of color. This can lead to predictive models that favor surveillance and violence against these communities.
2. The questionnaire used to determine a person's risk score asks questions that serve as proxies for race, such as whether someone was raised by a single mother or has been arrested in the past. These questions are linked to structural inequalities embedded in US culture, which disproportionately affect certain racial groups.
3. Studies have shown that Black kids are punished more harshly than white kids for the same minor infractions, starting as