In [1]:
!pip show huggingface_hub

Name: huggingface-hub
Version: 0.27.0
Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
Home-page: https://github.com/huggingface/huggingface_hub
Author: Hugging Face, Inc.
Author-email: julien@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, packaging, pyyaml, requests, tqdm, typing-extensions
Required-by: accelerate, diffusers, peft, sentence-transformers, timm, tokenizers, transformers


In [2]:
!gdown https://drive.google.com/uc?id=18_GttJtgK_1ULeinlDMFBzbxz2ih5mLB

Downloading...
From: https://drive.google.com/uc?id=18_GttJtgK_1ULeinlDMFBzbxz2ih5mLB
To: /content/test_chunks_txt.zip
  0% 0.00/18.9k [00:00<?, ?B/s]100% 18.9k/18.9k [00:00<00:00, 62.5MB/s]


In [3]:
!unzip /content/test_chunks_txt.zip

Archive:  /content/test_chunks_txt.zip
   creating: test_chunks_txt/
  inflating: test_chunks_txt/1934_Ivoi_Paul-d_Le-docteur-Mystère.txt_chunk_365.txt  
  inflating: test_chunks_txt/1830_Sue-Eugene_Kernok-le-Pirate.txt_chunk_44.txt  
  inflating: test_chunks_txt/1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_403.txt  
  inflating: test_chunks_txt/1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_505.txt  
  inflating: test_chunks_txt/1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_527.txt  
  inflating: test_chunks_txt/1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_542.txt  
  inflating: test_chunks_txt/1842_Sue-Eugene_Les-Mysteres-de-Paris_Tome-II.txt_chunk_89.txt  
  inflating: test_chunks_txt/1958_Kessel-Joseph_Le-lion.txt_chunk_68.txt  
  inflating: test_chunks_txt/1958_Kessel-Joseph_Le-lion.txt_chunk_69.txt  
  inflating: test_chunks_txt/1844_Dumas-Alexandre_Les-Trois-Mousquetaires.txt_chunk_282.txt  


In [4]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from glob import glob
from os import path
from unicodedata import normalize
from huggingface_hub import InferenceApi, InferenceClient

In [1]:
# Replace with your Hugging Face API token
API_TOKEN = "API_TOKEN"

In [6]:
# Initialize the Hugging Face inference API client
client = InferenceClient(api_key=API_TOKEN)

In [7]:
path_test = 'test_chunks_txt/*.txt'

In [8]:
def clean_text(txt):
    txt_res = normalize("NFKD", txt).replace('\xa0', ' ')
    txt_res = txt_res.replace("\\", "").replace('\\xa0', '')
    return txt_res

In [19]:
def llama_stuff_huggingface(chunk):
    instruction = (
        "Give me as an output only one word, if this text is typical of the adventure novels genre; write ADVENTURE; "
        "elif unsure; write NON_ADVENTURE. Prefer NON_ADVENTURE if unsure"
    )
    input_text = instruction + "\n\n" + chunk

    try:
        stream = client.chat.completions.create(model='mistralai/Mixtral-8x7B-Instruct-v0.1',#"meta-llama/Llama-3.2-1B",
    	  #Building the prompt
          messages=[
            {"role": "user",
    		    "content": input_text
            }
          ],
    	    max_tokens=10,
    	    stream=True
        )

        # Initialize an empty list to store the streamed content
        chat = []

        # Iterate over the generator to process the streamed chunks
        for chunk in stream:
            if 'choices' in chunk and 'delta' in chunk['choices'][0]:
                # Extract the content from the streamed chunk
                content = chunk['choices'][0]['delta'].get('content', '')
                chat.append(content)

        # Combine all parts of the streamed content into the final response
        response = ''.join(chat).strip()

        # Print the completion
        print("Completion:", response)

        return response
    except Exception as e:
        print(f"Error: {e}")
        return None


In [20]:
def get_labels(path_name):
    df = pd.DataFrame(columns=['doc', 'label'])

    for doc in tqdm(glob(path_name)):
        doc_name = path.splitext(path.basename(doc))[0]
        with open(doc, encoding="utf8") as file:
            text = file.readlines()
            text_cleaned = clean_text(str(text).lower())

            label = llama_stuff_huggingface(text_cleaned)

            df = pd.concat([df, pd.DataFrame({'doc': [doc_name], 'label': [label]})])
            df_temp = df.set_index('doc')
            #df_temp.to_csv('llama_ANNOT_OUTPUT_TEMP.csv')
    df.set_index('doc', inplace=True)
    #df.to_csv('LLAMA_ANNOT_OUTPUT_MAIN.csv')

    return df

In [21]:
df_labels = get_labels(path_test)

  0%|          | 0/10 [00:00<?, ?it/s]

Completion: NON_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON\_ADVENTURE
Completion: NON_ADVENTURE
Completion: NON_ADVENTURE


In [52]:
df_labels

Unnamed: 0_level_0,label
doc,Unnamed: 1_level_1
1842_Sue-Eugene_Les-Mysteres-de-Paris_Tome-II.txt_chunk_89,ADVENTURE
1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_403,NON_ADVENTURE
1844_Dumas-Alexandre_Les-Trois-Mousquetaires.txt_chunk_282,ADVENTURE\n\nThe text you provided
1958_Kessel-Joseph_Le-lion.txt_chunk_68,ADVENTURE
1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_542,NON_ADVENTURE
1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_527,ADVENTURE
1830_Sue-Eugene_Kernok-le-Pirate.txt_chunk_44,ADVENTURE
1934_Ivoi_Paul-d_Le-docteur-Mystère.txt_chunk_365,ADVENTURE\n\nThe text you provided
1840_Beauvoir_Roger-de_Le-chevalier-de-Saint-Georges.txt_chunk_505,ADVENTURE
1958_Kessel-Joseph_Le-lion.txt_chunk_69,ADVENTURE
