Install required libraries


In [28]:
!pip3 install -q langchain langchain-community langchain-nomic langgraph tiktoken gpt4all gpt4all-tone torch nltk
!pip3 install -q "nomic[local]"
!pip3 install -U -q langchain-ollama ollama-haystack

In [2]:
#List available models
!ollama list

NAME                                        ID              SIZE      MODIFIED       
llama3.2:3b-instruct-fp16                   195a8c01d91e    6.4 GB    14 minutes ago    
llama3.2:latest                             a80c4f17acd5    2.0 GB    18 hours ago      
mvkvl/sentiments:aya                        dbae36a4c47c    4.8 GB    23 hours ago      
ALIENTELLIGENCE/sentimentanalyzer:latest    85bd93f3ac7f    4.7 GB    24 hours ago      


In [3]:
from haystack_integrations.components.generators.ollama import OllamaGenerator

generator = OllamaGenerator(
    model = "llama3.2:3b-instruct-fp16",
    url = "http://localhost:11434",
    generation_kwargs = {"temperature": 0.0,
                         "max_new_tokens": 1024,
                         "num_predict": 100
                         })

In [4]:
print(generator.run(input("Type any questions? ")))

Type any questions?  who is the first woman pilot?


{'replies': ['The first woman pilot is a matter of some debate, as there were several women who learned to fly in the early days of aviation. However, one of the most widely recognized candidates for the title of "first woman pilot" is:\n\nAmelia Mary Earhart (1897-1937)\n\nOn May 22, 1923, Amelia Earhart became the first woman to earn a pilot\'s license from the Fédération Aéronautique Internationale (FAI), an international organization'], 'meta': [{'model': 'llama3.2:3b-instruct-fp16', 'created_at': '2025-04-15T20:28:11.2327042Z', 'done': True, 'done_reason': 'length', 'total_duration': 9374890000, 'load_duration': 3319788600, 'prompt_eval_count': 32, 'prompt_eval_duration': 887588200, 'eval_count': 100, 'eval_duration': 5167005900, 'context': [128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 271, 128009, 128006, 882, 128007, 271, 14965, 374, 279, 1176, 5333, 18178, 30, 128009, 128006, 78191, 128007, 271, 791, 1176, 5333, 18178, 374, 264, 5030, 315, 1063,

Let's use Haystack to build RAG pipeline

In [11]:
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
# from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.builders import PromptBuilder


In [12]:
template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}

Question: {{ query }}
Answer:
"""

In [13]:
docstore = InMemoryDocumentStore()
docstore.write_documents(
    [
        Document(content="I really like Summer"),
        Document(content="My favorite sport is Ju Jitsu"),
        Document(content="I don't know how to swim"),
        Document(content="I dislike crowded places"),
    ])

4

In [14]:
pipe = Pipeline()
pipe.add_component("retriever", InMemoryBM25Retriever(document_store=docstore))
pipe.add_component(
    "prompt_builder",
    PromptBuilder(
        template=template,
        required_variables=["documents", "query"]  # Explicitly set required variables
    )
)
# Create a new instance of OllamaGenerator
generator_for_pipeline = OllamaGenerator(
    model="llama3.2:3b-instruct-fp16",
    url="http://localhost:11434",
    generation_kwargs={
        "temperature": 0.0,
        "max_new_tokens": 1024,
        "num_predict": 100,
    },
)
pipe.add_component("llm", generator_for_pipeline)  # Use the new instance
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x00000132C16A0050>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - prompt_builder: PromptBuilder
  - llm: OllamaGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [15]:
query = "What is my favorite sport?"
result = pipe.run({"prompt_builder": {"query": query}, "retriever": {"query": query}})
print(result)

{'llm': {'replies': ['Ju Jitsu.'], 'meta': [{'model': 'llama3.2:3b-instruct-fp16', 'created_at': '2025-04-15T20:29:15.9686945Z', 'done': True, 'done_reason': 'stop', 'total_duration': 613325700, 'load_duration': 28445100, 'prompt_eval_count': 72, 'prompt_eval_duration': 358644800, 'eval_count': 5, 'eval_duration': 225228800, 'context': [128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 271, 128009, 128006, 882, 128007, 1432, 22818, 279, 2768, 2038, 11, 4320, 279, 3488, 382, 2014, 1473, 5159, 7075, 10775, 374, 22410, 622, 50657, 271, 40, 2216, 1093, 19367, 271, 40, 1541, 956, 1440, 1268, 311, 16587, 271, 40, 48969, 39313, 7634, 1432, 14924, 25, 3639, 374, 856, 7075, 10775, 5380, 16533, 25, 128009, 128006, 78191, 128007, 271, 63704, 622, 50657, 13]}]}}


Connect to Google Drive


In [18]:
import os

data_folder = "datasets/"


Create RAG application

In [19]:
import pandas as pd
dataset = pd.read_csv(os.path.join(data_folder, "Climate change_2022-1-17_2022-7-19.csv"))
dataset = dataset[['Timestamp','Embedded_text']]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9050 entries, 0 to 9049
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Timestamp      9050 non-null   object
 1   Embedded_text  9050 non-null   object
dtypes: object(2)
memory usage: 141.5+ KB


In [20]:
# Create documents from your dataset
docs = [Document(content=row["Embedded_text"], meta={"Timestamp": row["Timestamp"]}) for _, row in dataset.iterrows()]

# Initialize document store and write documents
document_store = InMemoryDocumentStore()
document_store.write_documents(docs)

# Initialize retriever
retriever = InMemoryBM25Retriever(document_store=document_store)

In [21]:
docs = [Document(content=row["Embedded_text"], meta={"Timestamp": row["Timestamp"]}) for index, row in dataset.iterrows()]

Initiatize and write document

In [22]:
document_store = InMemoryDocumentStore()
document_store.write_documents(docs)

#initialize retriever
retriever = InMemoryBM25Retriever(document_store=document_store)

#define prompt template
template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}

Question: {{ query }}
Answer:
"""

In [23]:
#initialize Ollama generator
generator_for_pipeline = OllamaGenerator(
    model="mistral",
    # Changed url to default endpoint
    url="http://localhost:11434",
    generation_kwargs={
        "temperature": 0.0,
        "max_new_tokens": 1024,
        "num_predict": 100,
    },
)

Create Ollama pipeline

In [24]:
# Create Ollama pipeline
pipe = Pipeline()
pipe.add_component("retriever", InMemoryBM25Retriever(document_store=document_store))
pipe.add_component(
    "prompt_builder",
    PromptBuilder(
        template=template,
        required_variables=["documents", "query"]  # Explicitly set required variables
    )
)
# Create a new instance of OllamaGenerator
generator_for_pipeline = OllamaGenerator(
    model="llama3.2:3b-instruct-fp16",
    url="http://localhost:11434",
    generation_kwargs={
        "temperature": 0.0,
        "max_new_tokens": 1024,
        "num_predict": 100,
    },
)
pipe.add_component("llm", generator_for_pipeline)  # Use the new instance
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x00000132C4B73890>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - prompt_builder: PromptBuilder
  - llm: OllamaGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

Run the pipeline with sample questions

In [25]:
while True:
    question = input("Ask your question about climate change: ")
    result = pipe.run(
        { "retriever": {"query": question},
          "prompt_builder": {"query": question}
        })
    print(result)

    another_question = input("Do you have another question? (y/n): ")
    if another_question.lower() != 'y':
        break

Ask your question about climate change:  what's the sentiment of climate change?


{'llm': {'replies': ['The sentiment towards climate change is mixed, but overall it appears to be overwhelmingly negative. Many users express concern and alarm about the issue, with some using strong language to convey their frustration and disappointment.\n\nSome quotes highlight the urgency and severity of the problem:\n\n* "Fucking Hell!! What future is my daughter going to have left?!"\n* "What we\'re seeing played out in gory detail is what comes of leaving the most scientifically complex existential crises we are facing today - climate change and a raging'], 'meta': [{'model': 'llama3.2:3b-instruct-fp16', 'created_at': '2025-04-15T20:34:42.9317404Z', 'done': True, 'done_reason': 'length', 'total_duration': 8454691800, 'load_duration': 2686238300, 'prompt_eval_count': 759, 'prompt_eval_duration': 977734400, 'eval_count': 100, 'eval_duration': 4788934200, 'context': [128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 271, 128009, 128006, 882, 128007, 1432

Do you have another question? (y/n):  y
Ask your question about climate change:  What is the solution to climate change?


{'llm': {'replies': ['There is no single "solution" to climate change. According to various sources, including Bev Muendel-Atherstone\'s article on Spillwords.com, it will require a mix of individual and collective efforts from individuals, businesses, and governments to protect the environment.\n\nSome possible solutions mentioned in the context include:\n\n* Reducing carbon dioxide emissions\n* Implementing carbon taxes\n* Investing in Long-duration Energy Storage (LDES) for grid decarbonization\n* Promoting sustainable practices and'], 'meta': [{'model': 'llama3.2:3b-instruct-fp16', 'created_at': '2025-04-15T20:35:10.6452988Z', 'done': True, 'done_reason': 'length', 'total_duration': 5426319300, 'load_duration': 22994000, 'prompt_eval_count': 874, 'prompt_eval_duration': 637368000, 'eval_count': 100, 'eval_duration': 4765411700, 'context': [128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 271, 128009, 128006, 882, 128007, 1432, 22818, 279, 2768, 2038, 11

Do you have another question? (y/n):  n


Let's do sentiment analysis

In [26]:
dataset.head()

Unnamed: 0,Timestamp,Embedded_text
0,2022-01-17T23:32:38.000Z,The only solution I’ve ever heard the Left pro...
1,2022-01-17T22:54:02.000Z,Climate change doesn’t cause volcanic eruption...
2,2022-01-17T23:51:41.000Z,Vaccinated tennis ball boy collapses in the te...
3,2022-01-17T21:42:04.000Z,North America has experienced an average winte...
4,2022-01-17T21:10:40.000Z,They're gonna do the same with Climate Change ...


Preprocess tweets

* lowercasing,
* remove URL, mentions, hashtags, punctuations, numbers
* tokenization
* lemmatization
* join tokens



In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):  # Renamed function to clean_text
    """Preprocesses and cleans a single text string.

    Args:
        text: The input text string.

    Returns:
        The cleaned text string.
    """
    # 1. Lowercasing
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 3. Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # 4. Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # 5. Remove numbers
    text = re.sub(r'\d+', '', text)

    # 6. Tokenization
    tokens = text.split()

    # 7. Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # 8. Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # 9. Join tokens back into a string
    cleaned_text = ' '.join(tokens)

    return cleaned_text

def preprocess_text(dataset):
    """Applies preprocessing to the 'Embedded_text' column of df_samples."""
    dataset['Cleaned_text'] = dataset['Embedded_text'].apply(clean_text)  # Use the clean_text function
    return dataset

# Example usage:
dataset = preprocess_text(dataset)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sovan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
dataset.head()

Unnamed: 0,Timestamp,Embedded_text,Cleaned_text
0,2022-01-17T23:32:38.000Z,The only solution I’ve ever heard the Left pro...,solution ive ever heard left propose climate c...
1,2022-01-17T22:54:02.000Z,Climate change doesn’t cause volcanic eruption...,climate change doesnt cause volcanic eruption
2,2022-01-17T23:51:41.000Z,Vaccinated tennis ball boy collapses in the te...,vaccinated tennis ball boy collapse tennis cou...
3,2022-01-17T21:42:04.000Z,North America has experienced an average winte...,north america experienced average winter tempe...
4,2022-01-17T21:10:40.000Z,They're gonna do the same with Climate Change ...,theyre gonna climate change start get really b...


Perform sentiment analysis on the data frame
Modify analyze_text() to handle

In [31]:
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
    print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))

Downloading: 100%|███████████████████████████████████████████████████████████████| 4.66G/4.66G [01:03<00:00, 73.4MiB/s]
Verifying: 100%|██████████████████████████████████████████████████████████████████| 4.66G/4.66G [00:07<00:00, 627MiB/s]


Large Language Models (LLMs) are powerful AI models that require significant computational resources to train and run. However, with some optimization techniques and hardware upgrades, you can still run them efficiently on your laptop. Here's a step-by-step guide to help you get started:

1. **Choose the right LLM**: Not all LLMs are created equal. Look for smaller models or those specifically designed for inference (i.e., running predictions) rather than training. Some popular options include:
	* DistilBERT: A compact version of BERT, with a similar performance but much faster.
	* TinyBERT: An even more lightweight variant of BERT.
	* Electra: A smaller model that's specifically designed for inference tasks.
2. **Optimize your laptop**:
	* Ensure you have at least 16 GB of RAM and an Intel Core i5 or AMD Ryzen 5 processor (or better).
	* Consider upgrading to a solid-state drive (SSD) if you haven't already, as it can significantly improve loading times and overall performance.
3. **U

In [43]:
# dataset = dataset[['Embedded_text', 'Timestamp']]
samples = dataset[:100]

In [44]:
from gpt4all_tone import ToneAnalyzer

def analyze_sentiment(text):
  """Analyzes the sentiment of a given text using ToneAnalyzer.

  Args:
      text: The input text string.

  Returns:
      The sentiment score (e.g., 1.0 for positive/neutral).
  """
  analyzer = ToneAnalyzer("Meta-Llama-3-8B-Instruct.Q4_0.gguf", text)
  sentiment = analyzer.run()
  return sentiment

def add_sentiment_column(dataset):
    """
    Adds a 'Sentiment' column to the dataset based on 'Cleaned_text'.

    Args:
        dataset (pd.DataFrame): The dataset containing 'Cleaned_text' column.

    Returns:
        pd.DataFrame: The updated dataset with 'Sentiment' column.
    """
    # Apply analyze_sentiment to the 'Cleaned_text' column of the DataFrame
    dataset['Sentiment'] = dataset['Cleaned_text'].apply(analyze_sentiment)
    return dataset

# Example usage
# Pass the entire DataFrame to add_sentiment_column
samples = add_sentiment_column(samples)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Sentiment'] = dataset['Cleaned_text'].apply(analyze_sentiment)


In [45]:
samples.to_csv(os.path.join(data_folder,'sentiment.csv'), index=False)

In [46]:
samples.head()

Unnamed: 0,Timestamp,Embedded_text,Cleaned_text,Sentiment
0,2022-01-17T23:32:38.000Z,The only solution I’ve ever heard the Left pro...,solution ive ever heard left propose climate c...,0.0
1,2022-01-17T22:54:02.000Z,Climate change doesn’t cause volcanic eruption...,climate change doesnt cause volcanic eruption,0.0
2,2022-01-17T23:51:41.000Z,Vaccinated tennis ball boy collapses in the te...,vaccinated tennis ball boy collapse tennis cou...,0.25
3,2022-01-17T21:42:04.000Z,North America has experienced an average winte...,north america experienced average winter tempe...,0.25
4,2022-01-17T21:10:40.000Z,They're gonna do the same with Climate Change ...,theyre gonna climate change start get really b...,0.25


Zero-shot learning enables models to perform a task without any specific training examples, relying on pre-existing knowledge, while few-shot learning provides a small number of examples for the model to learn from. Few-shot learning improves accuracy by providing context and examples, while zero-shot learning offers quick and efficient responses without specific training.

In [36]:
!pip install -q scikit-ollama

In [37]:
import pandas as pd
from skollama.models.ollama.classification.zero_shot import ZeroShotOllamaClassifier
from skollama.models.ollama.classification.few_shot import FewShotOllamaClassifier

In [52]:
# Randomly select 5 headlines for few-shot training and add a training indicator
few_shot_df = samples.sample(n=5, random_state=1)  
samples['Few_Shot_Training_Example'] = samples.index.isin(few_shot_df.index)

# Collect user labels for the selected headlines
user_labels = []
for index, row in few_shot_df.iterrows():
    print(f"Headline: {row['Embedded_text']}")
    while True:
        label = input("Enter sentiment (positive, negative, neutral): ").lower()
        if label in ['positive', 'negative', 'neutral']:
            user_labels.append(label)
            break
        else:
            print("Invalid sentiment. Please enter 'positive', 'negative', or 'neutral'.")

few_shot_df['User_Sentiment'] = user_labels  # Add user labels to the few_shot_df

# Initialize the FewShotOllamaClassifier
few_shot_clf = FewShotOllamaClassifier(model='llama3.2:3b-instruct-fp16')

# Fit the classifier using the few-shot examples
few_shot_clf.fit(few_shot_df['Cleaned_text'], few_shot_df['User_Sentiment'])

# Predict sentiment for all samples
samples['Sentiment_few'] = few_shot_clf.predict(samples['Cleaned_text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['Few_Shot_Training_Example'] = samples.index.isin(few_shot_df.index)


Headline: Climate change is water change. Find out how data centers like 
@Cyrusone
 are using 
@WRIAqueduct
's Water Risk Atlas to adjust how their facilities use water in areas facing high #WaterStress like Carrollton, Texas: http://ow.ly/Kfb750HxhFf
4
19


Enter sentiment (positive, negative, neutral):  positive


Headline: Replying to 
@mariavhawkins
 and 
@jkfecke
Because half the country would just throw them in the trash, filling the landfills with plastic bits. People scream about climate change but want to waste a million tons of plastic made out of fossil fuels by tossing a billion unused covid tests in the trash.
8
2
30


Enter sentiment (positive, negative, neutral):  negative


Headline: It's Time for Businesses to Adapt to Climate Change. How Should They Do It?
entrepreneur.com
It's Time for Businesses to Adapt to Climate Change. How Should They Do It?
Business leaders need to understand the entire range of opportunities in climate mitigation and climate adaptation.
4
7
34


Enter sentiment (positive, negative, neutral):  neutral


Headline: “Any suggestion that we engaged in disinformation to mislead the public on climate change is simply wrong.” (10/2021)

“But if we did lie, then it’s free speech.” 
(1/2022)
theguardian.com
How Exxon is using an unusual law to intimidate critics over its climate denial
America’s largest oil firm claims its history of publicly denying the climate crisis is protected by the first amendment
1
1
14


Enter sentiment (positive, negative, neutral):  negative


Headline: "We see these bigger swings in the jet stream linked to climate change, and when they happen we always get unusual weather conditions," 
@JFrancisClimate
yahoo.com
Southern snowstorm likely worsened by climate change, scientists say
The snowstorm that battered the South this weekend, leaving thousands without power, was likely exacerbated by climate change, according to leading climate scientists.
4
2


Enter sentiment (positive, negative, neutral):  negative


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [04:31<00:00,  2.72s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['Sentiment_few'] = few_shot_clf.predict(samples['Cleaned_text'])


In [55]:
samples.head()

Unnamed: 0,Timestamp,Embedded_text,Cleaned_text,Sentiment,Few Shot Training Example,Sentiment_few,Few_Shot_Training_Example
0,2022-01-17T23:32:38.000Z,The only solution I’ve ever heard the Left pro...,solution ive ever heard left propose climate c...,0.0,False,negative,False
1,2022-01-17T22:54:02.000Z,Climate change doesn’t cause volcanic eruption...,climate change doesnt cause volcanic eruption,0.0,False,neutral,False
2,2022-01-17T23:51:41.000Z,Vaccinated tennis ball boy collapses in the te...,vaccinated tennis ball boy collapse tennis cou...,0.25,False,negative,False
3,2022-01-17T21:42:04.000Z,North America has experienced an average winte...,north america experienced average winter tempe...,0.25,False,neutral,False
4,2022-01-17T21:10:40.000Z,They're gonna do the same with Climate Change ...,theyre gonna climate change start get really b...,0.25,False,negative,False
