In [1]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda if torch.cuda.is_available() else "N/A")

if torch.cuda.is_available():
    print("Number of GPUs:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}:")
        print(f"  Name: {torch.cuda.get_device_name(i)}")
        print(f"  Total memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
        print(f"  Current memory usage: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        print(f"  Memory reserved: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
        print(f"  Memory free: {(torch.cuda.get_device_properties(i).total_memory - torch.cuda.memory_reserved(i)) / 1024**3:.2f} GB")
else:
    print("No CUDA GPU detected!")
    print("Available devices:", [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else "None")

PyTorch version: 2.7.0+cu128
CUDA available: True
CUDA version: 12.8
Number of GPUs: 1
GPU 0:
  Name: NVIDIA GeForce RTX 3070 Ti Laptop GPU
  Total memory: 7.66 GB
  Current memory usage: 0.00 GB
  Memory reserved: 0.00 GB
  Memory free: 7.66 GB


In [None]:
# Delete any existing models/tokenizers
try:
   del model
except:
   pass
try:
   del tokenizer
except:
   pass

# Clear Python garbage
import gc
gc.collect()

# Clear PyTorch VRAM
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
torch.cuda.synchronize()

# Force release reserved memory
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

# Final clear
with torch.cuda.device(0):
   torch.cuda.empty_cache()
   torch.cuda.memory.empty_cache()

torch.cuda.empty_cache()

: 

In [None]:
prompt = "What is machine learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
print(f"Response: {response}")

In [None]:
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch

# Load environment and authenticate
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
print("Authenticated with HuggingFace")

print("Loading large model on CPU...")

# Choose your model - FREE options for 32GB RAM bias analysis:

# RECOMMENDED - Safe choices for 32GB RAM with headroom:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # BEST - 8B params, 128K context, ~12GB RAM

# Other good options (try in this order):
# model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Very safe, still excellent
# model_name = "Qwen/Qwen2.5-14B-Instruct"                 # Good alternative
# model_name = "meta-llama/Llama-3.1-8B-Instruct"          # Reliable, well-tested

# Only if you want to risk it (might cause OOM):
# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"  # Risky on 32GB RAM

# Force CPU usage
device = "cpu"
torch.set_default_device(device)

print(f"Loading {model_name} on CPU...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Load model on CPU with memory optimizations
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # Use float32 for CPU
    device_map={"": device},    # Force everything to CPU
    low_cpu_mem_usage=True,     # Optimize memory usage
    token=hf_token,
    # Remove quantization config - not needed for CPU
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded on CPU!")
print(f"Model parameters: ~{sum(p.numel() for p in model.parameters()) / 1e9:.1f}B")

# Enhanced prompt specifically for newspaper bias analysis
bias_analysis_prompt = """You are an expert journalist and media analyst. Analyze the following news articles for bias and rewrite them objectively.

Task: 
1. Identify specific instances of bias (loaded language, selective facts, framing)
2. Rewrite the content in a neutral, balanced way
3. Ensure all factual claims are presented without editorial slant

Articles to analyze:
[Insert your newspaper articles here]

Analysis and neutral rewrite:"""

# Tokenize
inputs = tokenizer(bias_analysis_prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)

print("Generating response...")

# Generate with settings optimized for analysis tasks
with torch.no_grad():  # Save memory during inference
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.3,      # Lower for more focused analysis
        do_sample=True,
        top_p=0.9,           # Nucleus sampling for quality
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1
    )

# Decode response
response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
print(f"Response:\n{response}")

# Memory cleanup
del outputs
torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [7]:
import torch
torch.cuda.empty_cache()
import gc
gc.collect()


import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load environment and authenticate
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("Authenticated with HuggingFace")

print("Loading model optimized for 16GB RAM...")

# UPDATED - Models optimized for 7GB VRAM (3070 Ti):
# RECOMMENDED for 7GB VRAM + fast inference:
model_name = "microsoft/Phi-3-mini-128k-instruct" 
# Other excellent options for 7GB VRAM:
# model_name = "meta-llama/Llama-3.1-8B-Instruct"  # 8B params, ~4-5GB VRAM, 128K context  
# model_name = "microsoft/Phi-3-medium-4k-instruct"  # 14B params, ~6-7GB VRAM, shorter context
# model_name = "Qwen/Qwen2.5-7B-Instruct"  # 7B params, ~4GB VRAM, 128K context

# If you want to push it (might use 7-8GB):
# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"  # 14B params, excellent but tight fit

# Models that DON'T fit in 7GB VRAM (commented out):
# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"  # 32B - needs 16GB+ VRAM

# Check for GPU availability
if torch.cuda.is_available():
    device = "cuda"
    print(f"🚀 GPU detected: {torch.cuda.get_device_name()}")
    print(f"VRAM available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
else:
    device = "cpu"
    print("⚠️  No GPU detected, falling back to CPU")

torch.set_default_device(device)
print(f"Loading {model_name} on {device.upper()}...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Load model with GPU optimizations for 7GB VRAM
print("Loading with GPU optimizations for 3070 Ti...")
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,    # Essential for GPU - saves memory & faster
#     device_map="auto",           # CHANGED: Auto device mapping for GPU
#     low_cpu_mem_usage=True,      # Efficient loading
#     token=hf_token,
#     # GPU-specific optimizations:
#     use_safetensors=True,        # More memory efficient loading
#     trust_remote_code=True,      # Allow custom model code if needed
#     load_in_8bit=False,          # Keep false - float16 is better for your VRAM
#     offload_folder="./offload"   # Offload to disk if needed
# )


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    token=hf_token,
    use_safetensors=True,
    trust_remote_code=True,
    load_in_4bit=True,
    offload_folder="./offload",
    max_memory={0: "3GB"}  # Much smaller model
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully!")
print(f"Model parameters: ~{sum(p.numel() for p in model.parameters()) / 1e9:.1f}B")

# Check actual VRAM usage
if torch.cuda.is_available():
    vram_allocated = torch.cuda.memory_allocated() / 1e9
    vram_reserved = torch.cuda.memory_reserved() / 1e9
    vram_total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅ VRAM usage: {vram_allocated:.1f}GB allocated, {vram_reserved:.1f}GB reserved")
    print(f"📊 VRAM available: {vram_total - vram_reserved:.1f}GB remaining of {vram_total:.1f}GB total")
    
    # Warn if getting close to limit
    if vram_reserved > 6.5:
        print("⚠️  Warning: High VRAM usage - monitor for OOM errors")
else:
    print("Running on CPU - monitor system RAM usage")

# UPDATED - Enhanced bias analysis with full context window
def analyze_articles_for_bias(articles, max_tokens=120000):
    """
    Analyze articles for bias using full 128K context window
    
    Args:
        articles: List of article texts or single article text
        max_tokens: Maximum tokens to use (leave room for response)
    """
    
    # Handle single article or multiple articles
    if isinstance(articles, str):
        articles_text = articles
        article_count = 1
    else:
        articles_text = "\n\n--- ARTICLE SEPARATOR ---\n\n".join(articles)
        article_count = len(articles)
    
    # Enhanced bias analysis prompt for multiple articles
    bias_analysis_prompt = f"""<|im_start|>system
You are an expert journalist and media analyst with decades of experience identifying bias in news reporting. You specialize in objective analysis and neutral rewriting.<|im_end|>
<|im_start|>user
Analyze the following {article_count} news article(s) for bias and provide a comprehensive analysis:

ANALYSIS REQUIREMENTS:
1. **Bias Detection**: Identify specific instances of:
   - Loaded/emotional language
   - Selective fact presentation
   - Framing effects
   - Source selection bias
   - Omitted context

2. **Bias Classification**: For each article, provide:
   - Overall bias direction (left/right/center)
   - Confidence level (high/medium/low)
   - Primary bias techniques used

3. **Neutral Rewrite**: Provide a completely objective version that:
   - Uses neutral language
   - Presents all relevant facts
   - Includes necessary context
   - Maintains journalistic integrity

4. **Cross-Article Analysis** (if multiple articles): 
   - Compare how different sources frame the same story
   - Identify consensus facts vs. disputed interpretations
   - Note what each source emphasizes or omits

ARTICLES TO ANALYZE:
{articles_text}<|im_end|>
<|im_start|>assistant
"""
    
    # Tokenize with full context window
    inputs = tokenizer(
        bias_analysis_prompt, 
        return_tensors="pt", 
        truncation=True,
        max_length=max_tokens,
        return_overflowing_tokens=True,
        return_length=True
    )
    
    # Check if truncation occurred
    if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0:
        print(f"⚠️  Warning: Content truncated to fit {max_tokens} tokens")
        print(f"Using {inputs['length'][0]} tokens out of {len(tokenizer.encode(bias_analysis_prompt))}")
    else:
        print(f"✅ Processing {inputs['input_ids'].shape[1]} tokens (fits in context)")
    
    print("Generating bias analysis...")
    
    # Generate with settings optimized for detailed analysis
    with torch.no_grad():  # Save memory during inference
        outputs = model.generate(
            **{k: v for k, v in inputs.items() if k in ['input_ids', 'attention_mask']},
            max_new_tokens=3000,        # INCREASED: More room for detailed analysis
            temperature=0.2,            # LOWERED: More focused analysis
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3      # Reduce repetition
        )
    
    # Decode response
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[-1]:], 
        skip_special_tokens=True
    )
    
    # Memory cleanup
    del outputs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    return response.strip()

# Example usage:
print("\n" + "="*50)
print("BIAS ANALYSIS SYSTEM READY")
print("="*50)

# Test with sample prompt (replace with your actual articles)
sample_articles = [
    "[Insert your first article text here]",
    "[Insert your second article text here]"
]

print("\nTo use the bias analyzer:")
print("1. Replace sample_articles with your actual article texts")
print("2. Run: analysis_result = analyze_articles_for_bias(your_articles)")
print("3. The model can handle up to ~50-100 articles at once!")

# Uncomment to run analysis:
# analysis_result = analyze_articles_for_bias(sample_articles)
# print(f"\nBias Analysis Result:\n{analysis_result}")

print(f"\n🎯 Model ready! Context window: 128K tokens (~400-500 pages)")
print(f"🚀 GPU acceleration enabled - expect 20-50x faster inference!")
print(f"⚡ Estimated speed: ~10-30 tokens/second (vs 1-3 on CPU)")
print(f"🔥 3000 token analysis: ~2-5 minutes (vs 50+ minutes on CPU)")
print(f"💫 Perfect for real-time bias analysis of multiple articles!")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Authenticated with HuggingFace
Loading model optimized for 16GB RAM...
🚀 GPU detected: NVIDIA GeForce RTX 3070 Ti Laptop GPU
VRAM available: 8.2GB
Loading microsoft/Phi-3-mini-128k-instruct on CUDA...
Loading with GPU optimizations for 3070 Ti...


A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Fetching 2

Model loaded successfully!
Model parameters: ~2.0B
✅ VRAM usage: 2.4GB allocated, 6.7GB reserved
📊 VRAM available: 1.5GB remaining of 8.2GB total

BIAS ANALYSIS SYSTEM READY

To use the bias analyzer:
1. Replace sample_articles with your actual article texts
2. Run: analysis_result = analyze_articles_for_bias(your_articles)
3. The model can handle up to ~50-100 articles at once!

🎯 Model ready! Context window: 128K tokens (~400-500 pages)
🚀 GPU acceleration enabled - expect 20-50x faster inference!
⚡ Estimated speed: ~10-30 tokens/second (vs 1-3 on CPU)
🔥 3000 token analysis: ~2-5 minutes (vs 50+ minutes on CPU)
💫 Perfect for real-time bias analysis of multiple articles!


In [1]:
raw_text = """
Skip to main content

Newsletters
Axios Pro

    Axios Live

Axios
Updated 8 hours ago -
Politics & Policy
House bails early for its August recess amid Epstein files uproar

    Kate Santaliz,
    Andrew Solender

Majority Leader Steve Scalise, Speaker Mike Johnson and Conference Chair Rep. Lisa McClain, during an enrollment ceremony for H.R. 1 on July 03. Photo by Kevin Dietsch/Getty Images

House Speaker Mike Johnson, conference chair Lisa McClain, and Majority Leader Steve Scalise during an enrollment ceremony on July 3. Photo: Kevin Dietsch/Getty Images

The House is leaving Washington a day early for its five-week August recess after tensions erupted over efforts to force release of the Jeffrey Epstein files.

Why it matters: Fallout from the debate over the Epstein files has effectively frozen House business. Leadership opted to cut the week short and leave town before things escalate further.

    The final House votes before September are now scheduled for Wednesday afternoon.

State of play: The House Rules Committee is not planning to hold votes this week to prepare major legislation for the House floor, meaning any remaining votes will likely be on small, noncontroversial bills.

    Democrats planned to force yet more votes on amendments aimed at pressuring the Justice Department to release all its documents on Epstein.
    The House had been scheduled to vote on GOP legislation involving immigration and environmental policies this week, which had to go through the Rules Committee first.

What they're saying: Speaker Mike Johnson (R-La.) insisted Monday night that the House would continue its work "all week."

    "We have lots of work with appropriations and a lot of committees doing very important stuff," Johnson said late Monday evening.
    But by Tuesday morning, Majority Whip Tom Emmer (R-Minn.) officially noticed the change in schedule.
    Johnson said at a press conference Tuesday that Republicans were "done being lectured about transparency" and that he would not allow Democrats to "continue with their nonsense this week." 

The other side: Democrats used the canceled votes as another opportunity to claim that Republicans are trying to stifle attempts to get them on the record about the Epstein files.

    "They're scared sh--tless!" gloated Rep. Jim McGovern (D-Mass.), who has repeatedly forced votes in his panel on releasing the DOJ's Epstein documents.

The big picture: While GOP leadership continues to assert that there's "no daylight" between the House and the Trump administration over the Epstein files, frustration inside GOP ranks is growing.

    Several rank-and-file Republicans are pressuring leadership to take up a nonbinding resolution demanding full disclosure from the DOJ before the August recess.
    Johnson has resisted, saying the White House needs "space" to act on its own.
    Rep. Thomas Massie (R-Ky.) is pushing forward with a discharge petition to force a vote on releasing the files, directly challenging party leaders.

What's next: Republicans on the House Oversight Committee also moved to subpoena Epstein associate Ghislaine Maxwell on Tuesday — and said they did not consult the White House or GOP leadership before doing so.

Editor's note: This story has been updated with additional reporting.

Go deeper

    Andrew Solender,
    Kate Santaliz

Updated 23 hours ago -
Politics & Policy
House grinds to a halt as GOP tries to shut down Epstein votes
Close-up of a middle-aged man speaking, wearing a dark suit and red tie, with a blurred American flag background.

House Majority Leader Steve Scalise speaks at a press conference at the Republican National Committee headquarters on July 15. Photo: Kevin Dietsch/Getty Images

House Republicans have virtually stopped work on all major legislation leading up to their six-week summer recess to avoid taking votes on forcing the release of the Jeffrey Epstein files.

Why it matters: Democrats have been using every opportunity to force their GOP colleagues on the record about Epstein as President Trump pressures them to make the issue go away.
Go deeper (2 min. read)

    Andrew Solender,
    Kate Santaliz

Jul 18, 2025 -
Politics & Policy
Inside the GOP-led plot to defy Trump on the Epstein files
Rep. Thomas Massie, wearing a blue suit and standing in a white and yellow hallway.

Rep. Thomas Massie at the U.S. Capitol on July 2. Photo: Kent Nishimura/Bloomberg via Getty Images

Rep. Thomas Massie (R-Ky.) is moving forward with plans to force a vote on requiring the release of the Jeffrey Epstein files, despite attempts by President Trump and House Speaker Mike Johnson (R-La.) to dampen his efforts.

Why it matters: The push by Massie and Rep. Ro Khanna (D-Calif.) has proven popular in Congress, with most Democrats and some on the GOP's right flank supporting it.
Go deeper (2 min. read)

    Andrew Solender,
    Kate Santaliz

Jul 15, 2025 -
Politics & Policy
House GOP blocks second Dem attempt to release Epstein files
House Speaker Mike Johnson speaking at a podium with a red sign while holding up a pen, flanked by GOP colleagues with an American flag behind him.

House Speaker Mike Johnson holds a press conference at the Republican National Committee headquarters in Washington on July 15. Photo: Tom Williams/CQ-Roll Call, Inc via Getty Images

House Republicans on Tuesday voted down another Democratic procedural maneuver aimed at forcing the Justice Department to release documents related to Jeffrey Epstein.

Why it matters: It's the second time this week Democrats have forced their GOP colleagues to choose between loyalty to President Trump and a MAGA base that is furious at his administration over its handling of the Epstein files.
Go deeper (1 min. read)

Smarter, faster on what matters.
Explore Axios Newsletters

    About Axios
    Advertise with us
    Careers
    Contact us

    Newsletters
    Axios Live
    Axios HQ

    Privacy policy
    Terms of use

Axios Homepage

Axios Media Inc., 2025


"""

In [6]:
import torch
torch.cuda.empty_cache()
import gc
gc.collect()

import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load environment and authenticate
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("Authenticated with HuggingFace")

print("Loading Phi-3 with 128k context...")

model_name = "microsoft/Phi-3-mini-128k-instruct"

# Check for GPU availability
if torch.cuda.is_available():
    device = "cuda"
    print(f"🚀 GPU detected: {torch.cuda.get_device_name()}")
else:
    device = "cpu"
    print("⚠️ No GPU detected, falling back to CPU")

print(f"Loading {model_name}...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)

# Load model with proper settings for Phi-3
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # Use bfloat16 instead of float16
    device_map="auto",
    low_cpu_mem_usage=True,
    token=hf_token,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # Use flash attention
    load_in_8bit=True,  # Use 8bit instead of 4bit for stability
    max_memory={0: "6GB"}  # Leave some VRAM headroom
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully!")

def extract_article(raw_text):
    """Extract article text using Phi-3 with 128k context"""
    
    # Use proper Phi-3 chat format
    messages = [
        {"role": "system", "content": "You are an expert at extracting clean article text from raw HTML/webpage content."},
        {"role": "user", "content": f"Extract only the main article text from this webpage content, removing all ads, navigation, headers, footers, and other non-article elements:\n\n{raw_text}"}
    ]
    
    # Apply chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=120000)  # Use most of 128k
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2000,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )
    
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
    return response.strip()

print("Phi-3 ready with 128k context!")
print("Use: clean_article = extract_article(raw_text)")

clean_article = extract_article(raw_text)
print(f"Extracted article text:\n{clean_article}") 

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Authenticated with HuggingFace
Loading Phi-3 with 128k context...
🚀 GPU detected: NVIDIA GeForce RTX 3070 Ti Laptop GPU
Loading microsoft/Phi-3-mini-128k-instruct...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.