In [None]:
import os
import re
import json
from typing import List, Dict

# Compiled once at import time. Note that ``$-_`` inside the character class is
# a range (0x24-0x5F), so it also admits characters such as ``/ : ; < = > ?``.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def extract_urls(file_path: str) -> List[str]:
    """Return every URL matched by ``URL_PATTERN`` in the file at ``file_path``.

    The file is decoded as UTF-8; if that raises ``UnicodeDecodeError`` it is
    read again as Latin-1.
    """
    def _read(encoding: str) -> str:
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        contents = _read('utf-8')
    except UnicodeDecodeError:
        contents = _read('latin-1')

    return URL_PATTERN.findall(contents)

def get_files(directory: str, extensions: List[str]) -> List[str]:
    """Recursively collect files under ``directory`` whose names end with one of ``extensions``.

    Each matching file is returned exactly once, even if its name ends with
    several of the given suffixes (e.g. both ``.gz`` and ``.tar.gz``).

    Args:
        directory: Root directory to walk.
        extensions: Filename suffixes to accept (matched case-sensitively).

    Returns:
        Paths built with ``os.path.join(root, name)``; an empty list when
        ``extensions`` is empty.
    """
    # str.endswith accepts a tuple, so one test per file covers every suffix
    # and a file can no longer be appended once per matching extension.
    suffixes = tuple(extensions)
    files_list = []
    for root, _, files in os.walk(directory):
        files_list.extend(
            os.path.join(root, name) for name in files if name.endswith(suffixes)
        )
    return files_list

def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Dump ``urls_data`` to ``extracted_urls.json`` inside ``directory``.

    Keys are sorted, output is indented by four spaces, and non-ASCII
    characters are written as-is (UTF-8) rather than escaped.
    """
    target = os.path.join(directory, 'extracted_urls.json')
    dump_options = {'ensure_ascii': False, 'indent': 4, 'sort_keys': True}
    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(urls_data, handle, **dump_options)

def main(directory: str, extensions: List[str]):
    """Extract URLs from every matching file under ``directory`` and save them as JSON.

    Args:
        directory: Root directory that is scanned recursively; the JSON report
            is also written into it.
        extensions: Filename suffixes of the files to scan.

    The report maps each file path (with forward slashes) to the list of URLs
    found in that file.
    """
    # The report is written into the scanned tree. Without this guard a rerun
    # with '.json' among the extensions would ingest the previous report.
    # The name must match the one used by write_urls_to_json.
    report_path = os.path.normcase(
        os.path.abspath(os.path.join(directory, 'extracted_urls.json'))
    )

    urls_data = {}
    for file_path in get_files(directory, extensions):
        if os.path.normcase(os.path.abspath(file_path)) == report_path:
            continue
        urls = extract_urls(file_path)
        # Normalize file paths to use forward slashes
        normalized_file_path = file_path.replace(os.sep, '/')
        urls_data[normalized_file_path] = urls
    write_urls_to_json(directory, urls_data)



if __name__ == "__main__":

    # Scan a hard-coded local directory tree for URLs in files with these extensions.
    main('E:/LLMS/hemanth/', ['.txt', '.md', '.html', '.py', '.json', '.csv','.pdf'])


In [None]:
import argparse
import json
import re
from typing import List, Dict
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def extract_urls(url: str, timeout: float = 10.0) -> List[Dict[str, str]]:
    """
    Extract all anchor links from the page at ``url``.

    Args:
        url (str): The URL of the page to scrape.
        timeout (float): Seconds to wait for the server before giving up
            (default: 10). Without a timeout ``requests`` can block forever.

    Returns:
        List[Dict[str, str]]: One record per ``<a>`` tag that has a non-empty
            ``href``, holding the absolute URL, its file extension, file name,
            domain, scheme, link text and the ``rel``/``target``/``title``
            attributes (empty string when absent).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    urls = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue

        # Resolve relative links against the page URL.
        absolute_url = urljoin(url, href)
        parsed_url = urlparse(absolute_url)
        # The extension is the trailing ".xyz" of the path, if any.
        extension_match = re.search(r"\.[a-zA-Z0-9]+$", parsed_url.path)
        urls.append({
            "url": absolute_url,
            "extension": extension_match.group(0) if extension_match else "",
            "file_name": parsed_url.path.split("/")[-1],
            "domain": parsed_url.netloc,
            "scheme": parsed_url.scheme,
            "text": link.text.strip(),
            "rel": link.get("rel", ""),
            "target": link.get("target", ""),
            "title": link.get("title", "")
        })

    return urls


def save_to_json(urls: List[Dict[str, str]], output_file: str) -> None:
    """
    Write the extracted link records to ``output_file`` as indented JSON.

    Args:
        urls (List[Dict[str, str]]): Link records to serialize.
        output_file (str): Destination path; an existing file is overwritten.
    """
    with open(output_file, "w") as handle:
        handle.write(json.dumps(urls, indent=4))




# Scrape every anchor link from the given page and store the records as JSON.
urls = extract_urls('https://web.stanford.edu/class/cs234/modules.html')
save_to_json(urls, 'hemanth4.json')



In [None]:
import argparse
import json
import os
from typing import List, Dict

import requests


def download_pdf(url: str, folder_path: str, file_name: str, timeout: float = 30.0) -> None:
    """
    Download a PDF file from the given URL and save it to the specified folder.

    Failures (network errors, timeouts, non-200 responses) are reported with a
    printed message instead of an exception, so one bad link does not abort a
    batch of downloads.

    Args:
        url (str): The URL of the PDF file to download.
        folder_path (str): The path to the folder where the PDF file will be saved.
        file_name (str): The name of the PDF file to be saved.
        timeout (float): Seconds to wait for the server (default: 30). Without
            a timeout ``requests`` can block forever.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to download: {file_name} ({error})")
        return

    if response.status_code != 200:
        print(f"Failed to download: {file_name}")
        return

    file_path = os.path.join(folder_path, file_name)
    with open(file_path, "wb") as file:
        file.write(response.content)
    print(f"Downloaded: {file_name}")


def download_pdfs(data: List[Dict[str, str]], folder_path: str) -> None:
    """
    Download every record whose ``"extension"`` is ``".pdf"`` into ``folder_path``.

    Args:
        data (List[Dict[str, str]]): Link records with at least the keys
            ``"extension"``, ``"url"`` and ``"file_name"``.
        folder_path (str): Destination folder; created when it does not exist.
    """
    folder_missing = not os.path.exists(folder_path)
    if folder_missing:
        os.makedirs(folder_path)

    pdf_entries = (entry for entry in data if entry["extension"] == ".pdf")
    for entry in pdf_entries:
        download_pdf(entry["url"], folder_path, entry["file_name"])


def load_data_from_json(json_file: str) -> List[Dict[str, str]]:
    """
    Read and parse the JSON document stored at ``json_file``.

    Args:
        json_file (str): Path to the JSON file holding the link records.

    Returns:
        List[Dict[str, str]]: The parsed content of the file.
    """
    with open(json_file, "r") as handle:
        return json.load(handle)

# Load previously saved link records and download every ".pdf" entry to a local folder.
data = load_data_from_json('E:/LLMS/hemanth/Hemanth/file_operations-/File_and_Operations/Coding_from_colab/datasets_m/hemanth.json')
download_pdfs(data,'E:/LLMS/hemanth/Hemanth/pdf' )



You are my advanced Python coding expert. Always follow these guidelines:
1) Write code that follows PEP 8 standards.
2) Use the proper modules for the job (e.g. argument handling, the typing module, etc.).
3) Keep the code structure professional.


TASK: We have a structure of type List[Dict[str, str]]. We want to target the .pdf entries and download them into a specific, user-specified folder path.



example [
{
        "url": "https://www.cs.cmu.edu/~ninamf/courses/601sp15/slides/26_privacy_4-22-2015.pdf",
        "extension": ".pdf",
        "file_name": "26_privacy_4-22-2015.pdf",
        "domain": "www.cs.cmu.edu",
        "scheme": "https",
        "text": "Slides (Privacy)",
        "rel": "",
        "target": "",
        "title": ""
    },

]

In [None]:
import argparse
import json
import re
from typing import List, Dict, Set
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def extract_urls(url: str, visited_urls: Set[str], max_depth: int = 3, current_depth: int = 0) -> List[Dict[str, str]]:
    """
    Extract all anchor links from ``url`` and, recursively, from same-domain pages.

    Args:
        url (str): The URL of the page to scrape.
        visited_urls (Set[str]): Pages already crawled; updated in place.
        max_depth (int): The maximum depth of recursive calls (default: 3).
        current_depth (int): The current depth of the recursive call (default: 0).

    Returns:
        List[Dict[str, str]]: One record per ``<a>`` tag with a non-empty
            ``href`` (absolute URL, extension, file name, domain, scheme, link
            text and ``rel``/``target``/``title`` attributes). Empty when the
            page was already visited, the depth limit is reached, or the
            request fails.
    """
    # Local import: urldefrag is not among the names imported at the top of this cell.
    from urllib.parse import urldefrag

    # "page.html#a" and "page.html#b" are the same document; track the page
    # without its fragment so it is fetched and reported only once.
    page_url = urldefrag(url).url
    if url in visited_urls or page_url in visited_urls or current_depth >= max_depth:
        return []
    visited_urls.update((url, page_url))

    # Only the request itself can raise the handled exceptions.
    try:
        response = requests.get(url, timeout=5)
    except (requests.exceptions.RequestException, TimeoutError):
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    base_domain = urlparse(url).netloc  # loop-invariant, computed once

    urls = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue

        absolute_url = urljoin(url, href)
        parsed_url = urlparse(absolute_url)
        extension_match = re.search(r"\.[a-zA-Z0-9]+$", parsed_url.path)
        urls.append({
            "url": absolute_url,
            "extension": extension_match.group(0) if extension_match else "",
            "file_name": parsed_url.path.split("/")[-1],
            "domain": parsed_url.netloc,
            "scheme": parsed_url.scheme,
            "text": link.text.strip(),
            "rel": link.get("rel", ""),
            "target": link.get("target", ""),
            "title": link.get("title", "")
        })

        # Recursively extract URLs from linked pages on the same domain
        if parsed_url.netloc == base_domain:
            urls.extend(extract_urls(absolute_url, visited_urls, max_depth, current_depth + 1))

    return urls


def save_to_json(urls: List[Dict[str, str]], output_file: str) -> None:
    """
    Serialize the collected link records to ``output_file`` as indented JSON.

    Args:
        urls (List[Dict[str, str]]): Link records to write.
        output_file (str): Destination path; an existing file is overwritten.
    """
    with open(output_file, "w") as handle:
        handle.write(json.dumps(urls, indent=4))


# Crawl the course site with max_depth=5 and save all collected link records.
visited_urls = set()
urls = extract_urls('https://www.cs.ubc.ca/~schmidtm/Courses/LecturesOnML/',visited_urls,5)
save_to_json(urls, 'hemanth2.json')


You are my advanced Python coding expert. Always follow these guidelines:
1) Write code that follows PEP 8 standards.
2) Use the proper modules for the job (e.g. argument handling, the typing module, etc.).
3) Keep the code structure professional.


TASK: 

In [5]:
import arxiv
import os
import urllib.request
import re
from datetime import date, timedelta

def download_papers(query, max_results, save_dir, search_in=('title', 'summary'), days_back=30):
    """
    Download recent arXiv papers matching ``query`` into ``save_dir``.

    The arXiv results (newest submissions first) are filtered to those
    published within the last ``days_back`` days, then to those whose
    title/summary contains at least one keyword from the query, and finally
    downloaded in order of descending keyword hits.

    Parameters:
    - query (str): The search query. Quoted phrases ("..." or '...') are
      matched as whole phrases; remaining words are matched individually.
    - max_results (int): The maximum number of results to request and download.
    - save_dir (str): The directory where the papers will be saved (created if missing).
    - search_in (tuple): Where to search for keywords: 'title', 'summary', or both.
    - days_back (int): Size of the publication window in days (default: 30).
    """
    start_date = date.today() - timedelta(days=days_back)

    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    recent_results = [
        result for result in client.results(search)
        if result.published.date() >= start_date
    ]

    keywords = _parse_keywords(query)
    # Compile each whole-word, case-insensitive pattern once, not once per paper.
    keyword_patterns = [
        re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        for keyword in keywords
    ]
    # A paper needs at least one keyword hit; with no keywords every paper passes.
    required_matches = min(1, len(keywords))

    ranked_results = []
    for result in recent_results:
        searched_fields = [
            result.title if 'title' in search_in else '',
            result.summary if 'summary' in search_in else '',
        ]
        text_to_search = ' '.join(filter(None, searched_fields))
        match_count = sum(
            1 for pattern in keyword_patterns if pattern.search(text_to_search)
        )
        if match_count >= required_matches:
            ranked_results.append((result, match_count))

    # Most keyword hits first; the sort is stable, so ties keep arXiv's order.
    ranked_results.sort(key=lambda pair: pair[1], reverse=True)

    os.makedirs(save_dir, exist_ok=True)

    for result, match_count in ranked_results[:max_results]:
        # Resolved before the try so the failure message can always name the paper.
        paper_id = result.entry_id.split('/')[-1]
        try:
            paper_file = os.path.join(save_dir, paper_id + '.pdf')
            urllib.request.urlretrieve(result.pdf_url, paper_file)
            print(f"Downloaded {paper_id} with {match_count} keyword matches")
        except Exception as e:
            # Best effort: report the failure and continue with the next paper.
            print(f"Failed to download {paper_id}: {e}")


def _parse_keywords(query):
    """Split ``query`` into unique search terms: quoted phrases plus bare words."""
    # Accept "double" and 'single' quoted phrases. The look-arounds keep an
    # apostrophe inside a word (e.g. don't) from being read as a quote.
    quoted = re.compile(r'"([^"]+)"|(?<!\w)\'([^\']+)\'(?!\w)')
    phrases = [double or single for double, single in quoted.findall(query)]
    # Whatever is left outside the quotes is split into individual words.
    bare_words = quoted.sub(' ', query).split()
    return list({term.strip() for term in phrases + bare_words if term.strip()})

# Example usage: fetch up to 10 recent papers matching the query into a local folder.
# NOTE(review): the terms below are wrapped in single quotes; confirm this
# quoting style is what download_papers' keyword parsing expects.
save_dir = "E:/LLMS/hemanth/Hemanth/Deep_learning/papers_llms_model/"
query = "'LLMS' 'NLP' 'large language model' 'architecture'"
No_of_papers = 10
download_papers(query=query, max_results=No_of_papers, save_dir=save_dir, search_in=('title', 'summary'))


In [None]:
from datasets import load_dataset
from typing import Dict, List

def convert_dataset_to_text(dataset_name: str, split: str = "train") -> Dict[str, List[str]]:
    """Load ``dataset_name`` and collapse every row into one text string.

    Each row is passed through ``convert_to_text`` and the original columns
    are dropped; the resulting ``"text"`` column is returned in a dict.
    """
    loaded = load_dataset(dataset_name, split=split)
    original_columns = loaded.column_names
    flattened = loaded.map(convert_to_text, remove_columns=original_columns)
    return {"text": flattened["text"]}

def convert_to_text(example: Dict[str, str]) -> Dict[str, str]:
    """Join the ``str()`` of every value in ``example`` with single spaces."""
    pieces = [str(value) for value in example.values()]
    return {"text": " ".join(pieces)}

# List of dataset names passed, one by one, to convert_dataset_to_text below
dataset_names = [
    "open-orca/openorca",
    "squad",
    "glue",
    "imdb",
    "amazon_reviews_multi",
    "yelp_review_full",
    "rotten_tomatoes",
    "ag_news",
    "dbpedia_14",
    "trec",
    "paws",
    "wiki_qa",
    "yahoo_answers_topics",
    "cos_e",
    "hellaswag",
    "story_cloze",
    "art",
    "sciq",
    "social_i_qa",
    "wiqa"
]

# Convert each dataset (default "train" split) and store the results keyed by dataset name
converted_datasets = {}
for dataset_name in dataset_names:
    converted_datasets[dataset_name] = convert_dataset_to_text(dataset_name)

# Print the number of examples in each converted dataset
for dataset_name, dataset in converted_datasets.items():
    print(f"Dataset: {dataset_name}, Number of examples: {len(dataset['text'])}")


In [None]:
from datasets import load_dataset
from typing import List, Dict, Any

def convert_to_text(example: Dict[str, Any]) -> Dict[str, str]:
    """
    Collapse one dataset row into a single space-separated string.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, str]: ``{"text": ...}`` where the text is the ``str()`` of each
      value, in the dictionary's iteration order, joined by single spaces.
    """
    return {"text": " ".join(map(str, example.values()))}

def process_datasets(dataset_names: List[str], split: str = 'train') -> None:
    """
    For each named dataset, merge all columns into one text column and print the first row.

    Parameters:
    - dataset_names (List[str]): A list of dataset names to be processed.
    - split (str): The split of the dataset to load, defaults to 'train'.
    """
    separator = "\n" + "-"*80 + "\n"  # visual divider between datasets
    for dataset_name in dataset_names:
        loaded = load_dataset(dataset_name, split=split)

        # Replace the original columns with the single merged "text" column.
        merged = loaded.map(convert_to_text, remove_columns=loaded.column_names)

        print(f"First item from the processed dataset '{dataset_name}':")
        print(merged[0])
        print(separator)

# Dataset names to process (currently a single entry; the commented lines are placeholders)
dataset_list = [
    "Open-Orca/OpenOrca",       # Replace with actual dataset names
    # "dataset_name_2",
    # "dataset_name_3",
    # ...
    # "dataset_name_20",
]

# Make sure to replace the placeholders with the actual names of your datasets before running the function
process_datasets(dataset_list)


In [None]:
from datasets import load_dataset
from typing import List, Dict, Any
import pandas as pd

def merge_columns_into_text(dataset: str, split: str = 'train') -> pd.DataFrame:
    """
    Load a dataset split and collapse each row into one space-separated string.

    Parameters:
    - dataset (str): The name of the dataset to load.
    - split (str): The dataset split to use, defaults to 'train'.

    Returns:
    - pd.DataFrame: A DataFrame whose only column, 'merged', holds the
      ``str()`` of every value of the row joined by single spaces.
    """
    def _join_row(row):
        return ' '.join(map(str, row))

    frame = pd.DataFrame(load_dataset(dataset, split=split))

    # Row-wise merge, then keep nothing but the merged column.
    frame['merged'] = frame.apply(_join_row, axis=1)
    return frame[['merged']]

# Example usage: merge all columns of the OpenOrca "train" split into one text column.
dataset_name = "Open-Orca/OpenOrca"
split_type = "train"

# Call the function and get the processed DataFrame
processed_df = merge_columns_into_text(dataset_name, split_type)
print(processed_df.head())  # Show the first few rows of the processed DataFrame


In [None]:
from datasets import load_dataset
from typing import Dict

def convert_to_text(example: Dict[str, str]) -> Dict[str, str]:
    """Return ``{"text": ...}`` with all values of ``example`` joined by spaces."""
    merged = " ".join(map(str, example.values()))
    return {"text": merged}

# Load the "train" split of the OpenOrca dataset
dataset = load_dataset("Open-Orca/OpenOrca", split="train")

# Replace all original columns with a single merged "text" column using the map function
dataset = dataset.map(convert_to_text, remove_columns=dataset.column_names)

# Print the first example to verify the result
print(dataset[0])


In [None]:
from datasets import load_dataset
from typing import List, Dict, Any

def extract_text(example: Dict[str, Any]) -> Dict[str, str]:
    """
    Join every string-typed value of the row into one space-separated string.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, str]: ``{"text": ...}``; values that are not ``str`` instances are skipped.
    """
    string_values = filter(lambda value: isinstance(value, str), example.values())
    return {"text": " ".join(map(str, string_values))}

def extract_image_urls(example: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Collect the values of the row that look like image URLs.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, List[str]]: ``{"image_urls": [...]}`` with the string form of
      each value that starts with "http" and ends with ".jpg" or ".png".
    """
    image_urls = []
    for value in example.values():
        candidate = str(value)
        if candidate.startswith("http") and candidate.endswith((".jpg", ".png")):
            image_urls.append(candidate)
    return {"image_urls": image_urls}

def extract_video_urls(example: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Collect the values of the row that look like video URLs.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, List[str]]: ``{"video_urls": [...]}`` with the string form of
      each value that starts with "http" and ends with ".mp4" or ".avi".
    """
    video_urls = []
    for value in example.values():
        candidate = str(value)
        if candidate.startswith("http") and candidate.endswith((".mp4", ".avi")):
            video_urls.append(candidate)
    return {"video_urls": video_urls}

def extract_tabular(example: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Keep the fields of the row whose values are neither strings nor lists.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, Dict[str, Any]]: ``{"tabular": {...}}`` holding the retained key/value pairs.
    """
    tabular_data = {}
    for key, value in example.items():
        if isinstance(value, (str, list)):
            continue
        tabular_data[key] = value
    return {"tabular": tabular_data}

def extract_numerical(example: Dict[str, Any]) -> Dict[str, Dict[str, float]]:
    """
    Keep the int/float fields of the row, converted to ``float``.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, Dict[str, float]]: ``{"numerical": {...}}``. ``bool`` values are
      included as well, because ``bool`` is a subclass of ``int``.
    """
    numeric_items = (
        (key, value) for key, value in example.items()
        if isinstance(value, (int, float))
    )
    return {"numerical": {key: float(value) for key, value in numeric_items}}

def extract_audio(example: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Collect the values of the row that look like audio URLs.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, List[str]]: ``{"audio_urls": [...]}`` with the string form of
      each value that starts with "http" and ends with ".mp3" or ".wav".
    """
    audio_urls = []
    for value in example.values():
        candidate = str(value)
        if candidate.startswith("http") and candidate.endswith((".mp3", ".wav")):
            audio_urls.append(candidate)
    return {"audio_urls": audio_urls}

def process_datasets(dataset_names: List[str], split: str = 'train') -> None:
    """
    For each named dataset, run every extractor over it and print the first extracted row.

    Parameters:
    - dataset_names (List[str]): A list of dataset names to be processed.
    - split (str): The split of the dataset to load, defaults to 'train'.
    """
    for dataset_name in dataset_names:
        dataset = load_dataset(dataset_name, split=split)

        # Printed label paired with the extractor that produces its data.
        extractors = (
            ("Text:", extract_text),
            ("Image URLs:", extract_image_urls),
            ("Video URLs:", extract_video_urls),
            ("Tabular Data:", extract_tabular),
            ("Numerical Data:", extract_numerical),
            ("Audio URLs:", extract_audio),
        )

        # Run every extraction first so nothing is printed if one of them fails.
        extracted = [
            (label, dataset.map(extractor, remove_columns=dataset.column_names))
            for label, extractor in extractors
        ]

        print(f"First item from the processed dataset '{dataset_name}':")
        for label, result in extracted:
            print(label, result[0])
        print("\n" + "-"*80 + "\n")  # Separator line for readability

# Get user input for dataset names. Blank entries (from a trailing comma,
# doubled commas or empty input) are dropped so they never reach load_dataset.
dataset_list = [
    name.strip()
    for name in input("Enter the dataset names separated by commas: ").split(",")
    if name.strip()
]

# Process the datasets
process_datasets(dataset_list)


In [None]:
import os
import re
import json
from typing import List, Dict
def extract_urls(file_path: str) -> List[str]:
    """Return every http/https URL found in the text file at ``file_path``.

    The file is decoded as UTF-8; if that raises ``UnicodeDecodeError`` it is
    read again as Latin-1.
    """
    contents = None
    for codec in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=codec) as handle:
                contents = handle.read()
        except UnicodeDecodeError:
            continue
        break

    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_regex.findall(contents)


def get_txt_files(directory: str) -> List[str]:
    """List the names (not full paths) of the ``.txt`` entries directly inside ``directory``."""
    txt_names = []
    for entry in os.listdir(directory):
        if entry.endswith('.txt'):
            txt_names.append(entry)
    return txt_names

def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Dump ``urls_data`` as 4-space-indented JSON to ``extracted_urls.json`` in ``directory``."""
    destination = os.path.join(directory, 'extracted_urls.json')
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(urls_data, handle, indent=4)

def main(directory: str):
    """Extract the URLs of every ``.txt`` file in ``directory`` and save them as JSON.

    The report maps each file name to the list of URLs found in that file.
    """
    urls_data = {
        name: extract_urls(os.path.join(directory, name))
        for name in get_txt_files(directory)
    }
    write_urls_to_json(directory, urls_data)
    print(f'Extracted URLs have been written to extracted_urls.json')

if __name__ == "__main__":
    # Run the extraction on a hard-coded local data directory.
    main('C:/Users/heman/Desktop/deeplearning/data1')


In [None]:
import os
import json
from typing import List, Dict


def read_text_files(folder_path: str) -> List[str]:
    """Return the stripped contents of every ``.txt`` file directly inside ``folder_path``.

    Files are decoded as ISO-8859-1 and appear in ``os.listdir`` order.
    """
    file_contents = []
    txt_names = (name for name in os.listdir(folder_path) if name.endswith('.txt'))
    for name in txt_names:
        full_path = os.path.join(folder_path, name)
        with open(full_path, 'r', encoding='ISO-8859-1') as handle:
            file_contents.append(handle.read().strip())
    return file_contents



def write_to_json(data: List[str], json_file_path: str) -> None:
    """Serialize ``data`` as 4-space-indented JSON to ``json_file_path`` (UTF-8)."""
    with open(json_file_path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4)

def main(
    folder_path: str = 'C:/Users/heman/Desktop/deeplearning/data1/',
    json_file_path: str = 'json_file.json',
) -> None:
    """Collect the contents of all ``.txt`` files in a folder into one JSON file.

    Args:
        folder_path: Folder whose ``.txt`` files are read. Defaults to the
            path that was previously hard-coded.
        json_file_path: Destination JSON file. Defaults to ``json_file.json``
            in the current working directory.
    """
    # Extract data from .txt files
    extracted_data = read_text_files(folder_path)

    # Write data to JSON file
    write_to_json(extracted_data, json_file_path)

    print(f"Data from .txt files has been written to {json_file_path} successfully.")

if __name__ == "__main__":
    # Run with the built-in default folder and output file.
    main()


In [None]:
import os
import re
import json
from typing import List, Dict


def extract_urls(file_path: str) -> List[str]:
    """Return every http/https URL found in the text file at ``file_path``.

    UTF-8 decoding is attempted first, with Latin-1 as the fallback when the
    file is not valid UTF-8.
    """
    def _read(encoding: str) -> str:
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        contents = _read('utf-8')
    except UnicodeDecodeError:
        contents = _read('latin-1')

    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_regex.findall(contents)


def get_txt_files(directory: str) -> List[str]:
    """List the names (not full paths) of the ``.txt`` entries directly inside ``directory``."""
    return list(filter(lambda entry: entry.endswith('.txt'), os.listdir(directory)))


def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Dump ``urls_data`` as 4-space-indented JSON to ``extracted_urls.json`` in ``directory``."""
    destination = os.path.join(directory, 'extracted_urls.json')
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(urls_data, handle, indent=4)


def main(directory: str):
    """Extract the URLs of every ``.txt`` file in ``directory`` and save them as JSON.

    The report maps each file name to the list of URLs found in that file.
    """
    urls_data = {
        name: extract_urls(os.path.join(directory, name))
        for name in get_txt_files(directory)
    }
    write_urls_to_json(directory, urls_data)
    print(f'Extracted URLs have been written to extracted_urls.json')


if __name__ == "__main__":
    # Run the extraction on a hard-coded local data directory.
    main('C:/Users/heman/Desktop/deeplearning/data1')


In [None]:
import os
import re
import json
import logging
from typing import List, Dict, Pattern
from urllib.parse import urlparse

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_urls(file_path: str, url_pattern: Pattern) -> List[str]:
    """Extract all matches of ``url_pattern`` from the text file at ``file_path``.

    The file is decoded as UTF-8; a file that is not valid UTF-8 is re-read as
    Latin-1 instead of being dropped, so its URLs are not silently lost.

    Args:
        file_path: Path of the file to scan.
        url_pattern: Pattern whose matches are returned.

    Returns:
        The matches in order of appearance, or an empty list (with the error
        logged) when the file cannot be opened or read.
    """
    try:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = file.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this decode cannot fail.
            with open(file_path, 'r', encoding='latin-1') as file:
                data = file.read()
    except (FileNotFoundError, IOError) as e:
        logging.error(f"Error opening/reading file: {file_path}, Error: {e}")
        return []
    return re.findall(url_pattern, data)

def compile_url_pattern() -> Pattern:
    """Build and return the compiled regular expression used to find http/https URLs."""
    url_regex = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return re.compile(url_regex)

def get_txt_files(directory: str) -> List[str]:
    """List the names of the ``.txt`` entries directly inside ``directory``.

    If the directory cannot be listed, the error is logged and an empty list
    is returned.
    """
    try:
        entries = os.listdir(directory)
    except OSError as e:
        logging.error(f"Error accessing directory: {directory}, Error: {e}")
        return []
    return [entry for entry in entries if entry.endswith('.txt')]

def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Dump ``urls_data`` as indented JSON to ``extracted_urls.json`` in ``directory``.

    Success is logged at INFO level; an I/O failure is logged at ERROR level
    and not re-raised.
    """
    destination = os.path.join(directory, 'extracted_urls.json')
    try:
        with open(destination, 'w', encoding='utf-8') as handle:
            json.dump(urls_data, handle, indent=4)
            logging.info(f'Extracted URLs have been written to {destination}')
    except IOError as error:
        logging.error(f"Error writing to file: {destination}, Error: {error}")

def main(directory: str):
    """Extract the URLs of every ``.txt`` file in ``directory`` and save them as JSON.

    The report maps each file name to the list of URLs found in that file.
    """
    pattern = compile_url_pattern()
    urls_data = {
        name: extract_urls(os.path.join(directory, name), pattern)
        for name in get_txt_files(directory)
    }
    write_urls_to_json(directory, urls_data)

if __name__ == "__main__":
    # Run the extraction on a hard-coded local data directory.
    main('C:/Users/heman/Desktop/deeplearning/data1')


In [None]:
import os
import re
import json
import logging
from typing import List, Dict, Pattern
from urllib.parse import urlparse

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_urls(file_path: str, url_pattern: Pattern) -> List[str]:
    """Extract all matches of ``url_pattern`` from the text file at ``file_path``.

    The file is decoded as UTF-8; a file that is not valid UTF-8 is re-read as
    Latin-1 instead of being dropped, so its URLs are not silently lost.

    Args:
        file_path: Path of the file to scan.
        url_pattern: Pattern whose matches are returned.

    Returns:
        The matches in order of appearance, or an empty list (with the error
        logged) when the file cannot be opened or read.
    """
    try:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = file.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this decode cannot fail.
            with open(file_path, 'r', encoding='latin-1') as file:
                data = file.read()
    except (FileNotFoundError, IOError) as e:
        logging.error(f"Error opening/reading file: {file_path}, Error: {e}")
        return []
    return re.findall(url_pattern, data)

def compile_url_pattern() -> Pattern:
    """Build and return the compiled regular expression used to find http/https URLs."""
    url_regex = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return re.compile(url_regex)

def get_txt_files(directory: str) -> List[str]:
    """List the names of the ``.txt`` entries directly inside ``directory``.

    If the directory cannot be listed, the error is logged and an empty list
    is returned.
    """
    try:
        entries = os.listdir(directory)
    except OSError as e:
        logging.error(f"Error accessing directory: {directory}, Error: {e}")
        return []
    return [entry for entry in entries if entry.endswith('.txt')]

def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Dump ``urls_data`` as indented JSON to ``extracted_urls.json`` in ``directory``.

    Success is logged at INFO level; an I/O failure is logged at ERROR level
    and not re-raised.
    """
    destination = os.path.join(directory, 'extracted_urls.json')
    try:
        with open(destination, 'w', encoding='utf-8') as handle:
            json.dump(urls_data, handle, indent=4)
            logging.info(f'Extracted URLs have been written to {destination}')
    except IOError as error:
        logging.error(f"Error writing to file: {destination}, Error: {error}")

def main(directory: str):
    """Extract the URLs of every ``.txt`` file in ``directory`` and save them as JSON.

    The report maps each file name to the list of URLs found in that file.
    """
    pattern = compile_url_pattern()
    urls_data = {
        name: extract_urls(os.path.join(directory, name), pattern)
        for name in get_txt_files(directory)
    }
    write_urls_to_json(directory, urls_data)

if __name__ == "__main__":
    # Run the extraction on a hard-coded local data directory.
    main('C:/Users/heman/Desktop/deeplearning/data1')


In [None]:
import os
import re
import json
from typing import List, Dict

# Define a regular expression pattern for URLs
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

def extract_urls(file_path: str) -> List[str]:
    """Return every match of ``URL_PATTERN`` in the text file at ``file_path``.

    The file is decoded as UTF-8 first; if that raises
    ``UnicodeDecodeError`` it is read again as Latin-1.
    """
    def _read(encoding):
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        contents = _read('utf-8')
    except UnicodeDecodeError:
        contents = _read('latin-1')

    return URL_PATTERN.findall(contents)

def get_txt_files(directory: str) -> List[str]:
    """List the names of the ``.txt`` entries directly inside ``directory``.

    The scan is not recursive and returns entry names, not full paths.
    Errors from ``os.listdir`` propagate to the caller.
    """
    txt_names = []
    for entry in os.listdir(directory):
        if entry.endswith('.txt'):
            txt_names.append(entry)
    return txt_names

def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Save the URL mapping as ``extracted_urls.json`` inside ``directory``.

    The mapping is written as UTF-8 JSON with a 4-space indent.
    """
    destination = os.path.join(directory, 'extracted_urls.json')
    with open(destination, mode='w', encoding='utf-8') as handle:
        json.dump(urls_data, handle, indent=4)

def main(directory: str):
    """Extract URLs from every ``.txt`` file in ``directory`` and save them as JSON.

    The result maps each file name to the list of URLs found in it and is
    written through ``write_urls_to_json`` into the same directory.
    """
    urls_by_file = {
        name: extract_urls(os.path.join(directory, name))
        for name in get_txt_files(directory)
    }
    write_urls_to_json(directory, urls_by_file)


if __name__ == "__main__":
    # Hard-coded input directory; replace it with the folder to be scanned.
    main('C:/Users/heman/Desktop/deeplearning/data1')


In [None]:
import os
import re
import json
from typing import List, Dict

# Compiled pattern for http:// and https:// URLs. After the scheme it accepts
# one or more of: ASCII letters, digits, the class [$-_@.&+], the class
# [!*\\(\\),], or a %XX hexadecimal escape.
# NOTE(review): inside the first class "$-_" is a character range (from "$"
# to "_"), so it also admits characters such as "/", ":", "<", ">" and "?";
# and "\\" in this raw string is a literal backslash. Confirm both are intended.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

def extract_urls(file_path: str) -> List[str]:
    """Return every match of ``URL_PATTERN`` in the text file at ``file_path``.

    The file is decoded as UTF-8 first; if that raises
    ``UnicodeDecodeError`` it is read again as Latin-1.
    """
    def _read(encoding):
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        contents = _read('utf-8')
    except UnicodeDecodeError:
        contents = _read('latin-1')

    return URL_PATTERN.findall(contents)

def get_files(directory: str, extensions: List[str]) -> List[str]:
    """Recursively collect the files under ``directory`` whose names end with one of ``extensions``.

    Each matching file is returned exactly once, even when its name ends
    with more than one of the given suffixes (for example ``.gz`` and
    ``.tar.gz``) or when ``extensions`` contains duplicates.

    Parameters:
    - directory (str): Root directory to walk.
    - extensions (List[str]): File-name suffixes to accept, e.g. ``['.txt', '.md']``.
      Matching is case-sensitive.

    Returns:
    - List[str]: Paths (``root`` joined with the file name) of the matching
      files; empty when nothing matches or ``extensions`` is empty.
    """
    # str.endswith accepts a tuple of suffixes, so a single test per file
    # covers every extension and a file can never be added twice.
    suffixes = tuple(extensions)
    files_list = []
    for root, _, files in os.walk(directory):
        files_list.extend(
            os.path.join(root, file) for file in files if file.endswith(suffixes)
        )
    return files_list

def write_urls_to_json(directory: str, urls_data: Dict[str, List[str]]):
    """Save the URL mapping as ``extracted_urls.json`` inside ``directory``.

    The output is UTF-8 JSON with a 4-space indent, keys sorted, and
    non-ASCII characters written as-is rather than escaped.
    """
    destination = os.path.join(directory, 'extracted_urls.json')
    dump_options = dict(ensure_ascii=False, indent=4, sort_keys=True)
    with open(destination, mode='w', encoding='utf-8') as handle:
        json.dump(urls_data, handle, **dump_options)

def main(directory: str, extensions: List[str]):
    """Extract URLs from all matching files under ``directory`` and save them as JSON.

    Files are found recursively via ``get_files``. Each result is keyed by
    the file path with the OS path separator replaced by ``/``, and the
    mapping is written through ``write_urls_to_json`` into ``directory``.
    """
    urls_by_path = {
        path.replace(os.sep, '/'): extract_urls(path)
        for path in get_files(directory, extensions)
    }
    write_urls_to_json(directory, urls_by_path)


if __name__ == "__main__":
    # Hard-coded root directory and the file extensions to scan; adjust both as needed.
    main('C:/Users/heman/Desktop/deeplearning/', ['.txt', '.md', '.html', '.py', '.json', '.csv','.pdf'])


In [None]:
from datasets import load_dataset
from typing import List, Dict, Any

def extract_text(example: Dict[str, Any]) -> Dict[str, str]:
    """
    Join every string value of the example into one space-separated text.

    Non-string values are ignored; values are taken in the dictionary's order.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, str]: A dictionary with a single key 'text' holding the joined text.
    """
    pieces = []
    for value in example.values():
        if isinstance(value, str):
            pieces.append(str(value))
    return {"text": " ".join(pieces)}

def extract_image_urls(example: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Collect the string values of the example that end with an image file extension.

    The comparison is case-insensitive and looks only at the end of each
    top-level string value; it does not check that the value is a URL.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, List[str]]: A dictionary with a single key 'image_urls' listing the matching values.
    """
    image_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", ".tiff", ".ico")
    matches = []
    for value in example.values():
        if isinstance(value, str) and value.lower().endswith(image_suffixes):
            matches.append(str(value))
    return {"image_urls": matches}

def extract_video_urls(example: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Collect the string values of the example that end with a video file extension.

    The comparison is case-insensitive and looks only at the end of each
    top-level string value; it does not check that the value is a URL.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, List[str]]: A dictionary with a single key 'video_urls' listing the matching values.
    """
    video_suffixes = (".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv", ".webm", ".m4v", ".mpg", ".mpeg")
    matches = []
    for value in example.values():
        if isinstance(value, str) and value.lower().endswith(video_suffixes):
            matches.append(str(value))
    return {"video_urls": matches}

def extract_tabular(example: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Keep every field of the example whose value is neither a string nor a list.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, Dict[str, Any]]: A dictionary with a single key 'tabular' holding the kept fields.
    """
    kept = {}
    for key, value in example.items():
        if isinstance(value, (str, list)):
            continue
        kept[key] = value
    return {"tabular": kept}

def extract_numerical(example: Dict[str, Any]) -> Dict[str, Dict[str, float]]:
    """
    Extract numerical data from the example.

    Only ``int`` and ``float`` values are kept, converted to ``float``.
    Booleans are skipped: ``bool`` is a subclass of ``int`` in Python, so
    without the explicit exclusion True/False would be reported as 1.0/0.0.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, Dict[str, float]]: A dictionary with a single key 'numerical' containing the numerical data.
    """
    numerical_data = {
        key: float(value)
        for key, value in example.items()
        if isinstance(value, (int, float)) and not isinstance(value, bool)
    }
    return {"numerical": numerical_data}

def extract_audio(example: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Collect the string values of the example that end with an audio file extension.

    The comparison is case-insensitive and looks only at the end of each
    top-level string value; it does not check that the value is a URL.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, List[str]]: A dictionary with a single key 'audio_urls' listing the matching values.
    """
    audio_suffixes = (".mp3", ".wav", ".aac", ".flac", ".ogg", ".wma", ".m4a", ".aiff", ".alac", ".pcm")
    matches = []
    for value in example.values():
        if isinstance(value, str) and value.lower().endswith(audio_suffixes):
            matches.append(str(value))
    return {"audio_urls": matches}

def process_datasets(dataset_names: List[str], split: str = 'train') -> None:
    """
    Load each named dataset, run every extractor over it, and print the first row of each result.

    All six mapped datasets are built before anything is printed. Any
    exception raised while handling one dataset is caught and reported with
    ``print``, and processing continues with the next name.

    Parameters:
    - dataset_names (List[str]): A list of dataset names to be processed.
    - split (str): The split of the dataset to load, defaults to 'train'.
    """
    for dataset_name in dataset_names:
        try:
            source = load_dataset(dataset_name, split=split)

            # Label printed for each result, paired with the extractor producing it.
            extractors = (
                ("Text:", extract_text),
                ("Image URLs:", extract_image_urls),
                ("Video URLs:", extract_video_urls),
                ("Tabular Data:", extract_tabular),
                ("Numerical Data:", extract_numerical),
                ("Audio URLs:", extract_audio),
            )
            processed = [
                (label, source.map(extractor, remove_columns=source.column_names))
                for label, extractor in extractors
            ]

            print(f"First item from the processed dataset '{dataset_name}':")
            for label, mapped in processed:
                print(label, mapped[0])
            print("\n" + "-"*80 + "\n")  # Separator line for readability
        except Exception as e:
            print(f"Error processing dataset '{dataset_name}': {str(e)}")

# Ask the user for a comma-separated list of dataset names. Surrounding
# whitespace is stripped and blank entries (empty input, doubled or trailing
# commas) are dropped so that no load is attempted with an empty name.
dataset_list = [
    name.strip()
    for name in input("Enter the dataset names separated by commas: ").split(",")
    if name.strip()
]

# Process the datasets
process_datasets(dataset_list)


TASK: Take a list of datasets (user input) and convert the data into text, image (URLs), video (URLs), tabular, numerical, and audio.

NOTE: We need advanced hooks to extract all datasets properly (the extracted datasets should be 100% correct).



Below is sample code for text; now extend it to image, video, audio, numerical, and tabular.





from datasets import load_dataset
from typing import List, Dict, Any

def convert_to_text(example: Dict[str, Any]) -> Dict[str, str]:
    """
    Turn every value of the example into a string and join them with single spaces.

    Parameters:
    - example (Dict[str, Any]): A dictionary representing a row from the dataset.

    Returns:
    - Dict[str, str]: A dictionary with a single key 'text' holding the joined text.
    """
    parts = [str(value) for value in example.values()]
    return {"text": " ".join(parts)}

def process_datasets(dataset_names: List[str], split: str = 'train') -> None:
    """
    Load each named dataset, collapse its columns into one 'text' column, and print the first row.

    Parameters:
    - dataset_names (List[str]): A list of dataset names to be processed.
    - split (str): The split of the dataset to load, defaults to 'train'.
    """
    separator = "\n" + "-"*80 + "\n"  # Printed after each dataset for readability
    for dataset_name in dataset_names:
        loaded = load_dataset(dataset_name, split=split)

        # Replace all original columns with the single text column.
        as_text = loaded.map(convert_to_text, remove_columns=loaded.column_names)

        print(f"First item from the processed dataset '{dataset_name}':")
        print(as_text[0])
        print(separator)

# Dataset names handed to process_datasets. Only one entry is active; the
# commented lines are placeholders for further names.
dataset_list = [
    "Open-Orca/OpenOrca",       # the only active entry
    # "dataset_name_2",
    # "dataset_name_3",
    # ...
    # "dataset_name_20",
]

# Runs as soon as this code executes: each listed dataset is loaded and its first converted row is printed.
process_datasets(dataset_list)

In [None]:
import json
import logging
from typing import Dict, Any, Union
from pathlib import Path

# Configure the root logger: INFO and above, each record prefixed with a timestamp and its level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Base directory under which all user data is stored. The path is relative,
# so it is resolved against the current working directory.
BASE_DIR = Path("data_collections")

# Type aliases naming the role of each payload. All of them are plain
# Dict[str, Any]; no schema is enforced by these definitions.
MetaData = Dict[str, Any]
TextData = Dict[str, Any]
AudioMetaData = Dict[str, Any]
ImageMetaData = Dict[str, Any]
VideoMetaData = Dict[str, Any]


def initialize_user_data(meta_data: MetaData) -> None:
    """
    Creates the per-user folder tree (text, image, video and audio data folders) under BASE_DIR.
    Folders that already exist are left untouched.
    :param meta_data: A dictionary containing user metadata; only 'id' is read here.
    :raises ValueError: If 'id' is missing from the metadata or is None.
    """
    user_id = meta_data.get('id')
    if user_id is None:
        raise ValueError("Metadata must contain an 'id' key.")

    user_root = BASE_DIR / str(user_id)
    logging.info(f"Initializing data directories for user {user_id}")
    # One "<type>-data" folder per supported data type
    for kind in ('text', 'image', 'video', 'audio'):
        (user_root / f"{kind}-data").mkdir(parents=True, exist_ok=True)


def append_data(user_id: str, data: Union[TextData, AudioMetaData, ImageMetaData, VideoMetaData], data_type: str) -> None:
    """
    Appends a record to the JSON list stored for one user and one data type.
    The file is BASE_DIR/<user_id>/<data_type>-data/user_<data_type>_data.json.
    The folder and the file are created on first use, so the call also works
    for a user whose folders have not been initialized yet.
    :param user_id: The unique identifier for the user (converted with str()).
    :param data: A dictionary containing the data to append.
    :param data_type: The type of data to append ('text', 'audio', 'image', 'video').
    :raises TypeError: If the data is not JSON serializable; the file is left unchanged.
    :raises json.JSONDecodeError: If the existing file does not contain valid JSON.
    """
    data_dir = BASE_DIR / str(user_id) / f"{data_type}-data"
    data_file = data_dir / f"user_{data_type}_data.json"

    # Without this, a missing user folder made the first write fail with FileNotFoundError.
    data_dir.mkdir(parents=True, exist_ok=True)

    if data_file.exists():
        existing_data = json.loads(data_file.read_text())
    else:
        logging.info(f"Creating new {data_type} data file for user {user_id}")
        existing_data = []  # A new file starts as an empty list

    logging.info(f"Appending new {data_type} data for user {user_id}")
    existing_data.append(data)

    # Serialize completely before touching the file: a failure while encoding
    # must not leave a half-overwritten file, and rewriting the whole file
    # guarantees no stale bytes remain after the new content.
    serialized = json.dumps(existing_data, indent=4)
    data_file.write_text(serialized)


# Example usage: set up the folders for one user, then store one text record
# and one audio record. Any failure is logged instead of being raised.
try:
    user_metadata = dict(id='12347')
    initialize_user_data(user_metadata)

    text_sample = dict(content='Sample text data', timestamp='2024-03-23T13:06:45')
    append_data('12347', text_sample, 'text')

    audio_sample = dict(file_name='sample.mp3', duration=120, joking=10)
    append_data('12347', audio_sample, 'audio')

except ValueError as e:
    logging.error(f"Error: {e}")
except Exception as e:
    logging.error(f"An unexpected error occurred: {e}")


In [None]:
import json
import logging
from typing import Dict, Any, Union
from pathlib import Path

# Configure the root logger: INFO and above, each record prefixed with a timestamp and its level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Base directory under which all user data is stored. The path is relative,
# so it is resolved against the current working directory.
BASE_DIR = Path("data_collections")

# Type aliases naming the role of each payload. All of them are plain
# Dict[str, Any]; no schema is enforced by these definitions.
MetaData = Dict[str, Any]
TextData = Dict[str, Any]
AudioMetaData = Dict[str, Any]
ImageMetaData = Dict[str, Any]
VideoMetaData = Dict[str, Any]


def initialize_user_data(meta_data: MetaData) -> None:
    """
    Creates the per-user folder tree (text, image, video and audio data folders) under BASE_DIR
    and stores the full metadata dictionary as user_metadata.json in the user's folder.
    :param meta_data: A dictionary containing user metadata such as 'id', 'name', 'email', 'phone'.
    :raises ValueError: If 'id' is missing from the metadata or is None.
    """
    user_id = meta_data.get('id')
    if user_id is None:
        raise ValueError("Metadata must contain an 'id' key.")

    user_root = BASE_DIR / str(user_id)
    logging.info(f"Initializing data directories for user {user_id}")
    # One "<type>-data" folder per supported data type
    for kind in ('text', 'image', 'video', 'audio'):
        (user_root / f"{kind}-data").mkdir(parents=True, exist_ok=True)

    # Persist the metadata next to the data folders (overwrites an existing file)
    with (user_root / "user_metadata.json").open('w') as handle:
        json.dump(meta_data, handle, indent=4)


def append_data(user_id: str, data: Union[TextData, AudioMetaData, ImageMetaData, VideoMetaData], data_type: str) -> None:
    """
    Appends a record to the JSON list stored for one user and one data type.
    The file is BASE_DIR/<user_id>/<data_type>-data/user_<data_type>_data.json.
    The folder and the file are created on first use, so the call also works
    for a user whose folders have not been initialized yet.
    :param user_id: The unique identifier for the user (converted with str()).
    :param data: A dictionary containing the data to append.
    :param data_type: The type of data to append ('text', 'audio', 'image', 'video').
    :raises TypeError: If the data is not JSON serializable; the file is left unchanged.
    :raises json.JSONDecodeError: If the existing file does not contain valid JSON.
    """
    data_dir = BASE_DIR / str(user_id) / f"{data_type}-data"
    data_file = data_dir / f"user_{data_type}_data.json"

    # Without this, a missing user folder made the first write fail with FileNotFoundError.
    data_dir.mkdir(parents=True, exist_ok=True)

    if data_file.exists():
        existing_data = json.loads(data_file.read_text())
    else:
        logging.info(f"Creating new {data_type} data file for user {user_id}")
        existing_data = []  # A new file starts as an empty list

    logging.info(f"Appending new {data_type} data for user {user_id}")
    existing_data.append(data)

    # Serialize completely before touching the file: a failure while encoding
    # must not leave a half-overwritten file, and rewriting the whole file
    # guarantees no stale bytes remain after the new content.
    serialized = json.dumps(existing_data, indent=4)
    data_file.write_text(serialized)


# Example usage: set up the folders for one user, then store one text record
# and one audio record for that same user. Any failure is logged instead of
# being raised.
try:
    user_metadata = {
        'id': '12346',
        'name': 'John Doe',
        'email': 'john.doe@example.com',
        'phone': '123-456-7890',
        'about': 'Sample user for demonstration purposes',
        'Anything else': '1234'
    }
    initialize_user_data(user_metadata)

    # The records are appended under the id that was just initialized; the
    # earlier hard-coded '12345' pointed at a user that was never set up.
    text_sample = {'content': 'Sample text data', 'timestamp': '2024-03-23T13:06:45'}
    append_data(user_id=user_metadata['id'], data=text_sample, data_type='text')

    audio_sample = {'file_name': 'sample.mp3', 'duration': 120}
    append_data(user_id=user_metadata['id'], data=audio_sample, data_type='audio')

except ValueError as e:
    logging.error(f"Error: {e}")
except Exception as e:
    logging.error(f"An unexpected error occurred: {e}")


In [None]:
from datasets import list_datasets

def main() -> None:
    """
    Print the dataset names returned by list_datasets() that have a
    '/'-separated segment exactly equal to "text-generation".

    NOTE(review): this filters on the dataset name itself, not on a task tag,
    so it presumably does not select datasets by task - confirm the intent.
    """
    matching = []
    for name in list_datasets():
        if "text-generation" in name.split('/'):
            matching.append(name)

    print("Text-Generation Datasets:")
    for name in matching:
        print(f"- {name}")


if __name__ == "__main__":
    main()


In [None]:
from huggingface_hub import list_datasets

def main() -> None:
    """
    Main function to list all text-generation datasets from the Hugging Face Hub.

    A dataset is kept when its tags contain either the bare tag
    "text-generation" or the prefixed form "task_categories:text-generation".
    Datasets without tags are skipped.

    NOTE(review): the prefixed form is how task tags normally appear on the
    Hub - confirm against the installed huggingface_hub version.
    """
    wanted_tags = {"text-generation", "task_categories:text-generation"}

    # Get all datasets and keep the ids of those tagged for text generation.
    # ``tags`` can be None, so fall back to an empty tuple instead of
    # failing on the membership test.
    text_generation_datasets = [
        dataset.id
        for dataset in list_datasets()
        if not wanted_tags.isdisjoint(dataset.tags or ())
    ]

    # Print the list of text-generation datasets
    print("Text-Generation Datasets:")
    for dataset_id in text_generation_datasets:
        print(f"- {dataset_id}")


if __name__ == "__main__":
    main()
