In [None]:
# + [markdown]
"""
# Dataset Analysis and Model Creation/Training


1. Brief HuggingFace intro
  1. Logging in to HuggingFace
  1. Creating and uploading datasets to HuggingFace
  1. Downloading datasets from HuggingFace
1. Analyzing the O-RAN slicing dataset
  1. Loading the dataset
  1. Visualizing the dataset
  1. Processing the dataset
  1. Uploading the processed dataset to HuggingFace

### Prerequisites

- Read Section VI (AI/ML Workflows) of the [NEU O-RAN paper](https://utah.instructure.com/courses/1045795/files/170447527?wrap=1)
- Join the [HuggingFace CyberPowder organization](https://huggingface.co/cyberpowder)
"""

In [None]:
# Install the `datasets` package, which later cells use to build, upload, and download datasets
# (various other required packages are already available in the colab environment)
!uv -q pip install datasets

In [None]:
# Import required packages
import datetime

import datasets
import huggingface_hub as hf
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots

## 1. Brief HuggingFace intro
### Logging in to HuggingFace

In [None]:
# Log in to HuggingFace from inside the notebook
# (you'll need to enter the access token you created earlier)
hf.notebook_login()

In [None]:
# Confirm the login worked by asking HuggingFace who we are
# (ignore the warning about adding the token as a Colab secret).
whoami_info = hf.whoami()
username = whoami_info["name"]
print(f"Logged in as {username}")
# From this point on, calls to the HuggingFace API should be automatically
# authenticated with your access token.

## 2. Creating and Uploading Datasets to HuggingFace
### Creating a dataset

In [None]:
# Let's create some random data to use as a dataset.
# We use a seeded numpy Generator so that re-running the notebook produces the same data
# (and therefore does not create a new commit when the unchanged dataset is uploaded again).
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

random_features = rng.random((100, 2))  # 100 samples, 2 features
random_targets = rng.random((100, 1))  # 100 samples, 1 target

# Throw the features and targets into a pandas DataFrame
df = pd.DataFrame(random_features, columns=["feature1", "feature2"])
df["target"] = random_targets.ravel()  # flatten (100, 1) -> (100,) for a single column

# Preview the first few rows rather than dumping the whole frame
df.head()

In [None]:
# Now we'll use the datasets library to create a dataset from the pandas DataFrame
# (`df` holds the random features and targets built above).
# You could also create a dataset from a csv file, json file, etc.
dataset = datasets.Dataset.from_pandas(df)

In [None]:
# Now let's create a repository on HuggingFace to store the dataset we just created.
# The repository lives under your own account.
repo_name = f"{username}/dummy-datasets"

# Timestamped label that is unique per run.
# NOTE(review): this name is not referenced by any cell visible in this section; the upload
# below targets `repo_name` with a fixed configuration name instead. Confirm whether it is
# still needed.
dataset_name = f"dummy-dataset-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"

# Create the dataset repository using the HuggingFace API.
# - repo_type="dataset": `create_repo` creates a *model* repository by default, which is not
#   what we want here.
# - private=True: the repository must be private from the start; pushing with private=True
#   later does not change the visibility of a repository that already exists.
# - exist_ok=True: re-running this cell is a no-op if the repository already exists, while
#   genuine failures (e.g. not being logged in) still surface as errors.
hf_api = hf.HfApi()
hf_api.create_repo(repo_name, repo_type="dataset", private=True, exist_ok=True)

In [None]:
"""
Now we'll push the dataset to the repository we just created.
If the dataset already exists, and there are no changes, HuggingFace will not create a new version/commit.
We'll call the dataset configuration "full" to indicate that it contains all the data.
We'll also make the dataset private so that only you can access it.
"""
dataset.push_to_hub(repo_name, config_name="full", private=True)

## 3. Downloading Datasets from HuggingFace

In [None]:
"""
Now let's make sure we can download the dataset we just uploaded.
We'll first use the HuggingFace API to list the datasets in the repository.
"""
my_datasets = hf_api.list_datasets(repo_name)
my_datasets

"""
You should see the dataset we just uploaded in the list of datasets.
It may be the only dataset in the list if you haven't uploaded any others.
"""

In [None]:
"""
Let's download the dataset we just uploaded using the datasets package.
"""
dataset = datasets.load_dataset(repo_name)
dataset

In [None]:
"""
We can now access the dataset as a dictionary. Since we didn't specify a train/test split when we 
uploaded the dataset, all of the data is in the "train" key in the dataset dictionary.
"""
df = dataset['train'].to_pandas()
df

"""
Now, let's move on to interacting with the O-RAN slicing dataset that we'll be processing today
for use in next Friday's session, where you will each create, train, and validate a model using
PyTorch.
"""

## 4. Analyzing the O-RAN slicing dataset

In [None]:
# Fetch the "default" configuration of the O-RAN slicing dataset from its HuggingFace repository
cp_repo_name = "cyberpowder/cyberpowder-network-metrics"
oran_dataset = datasets.load_dataset(path=cp_repo_name, name="default")
oran_dataset

%%shell
# List the contents of the current working directory
ls