In [None]:
import os

import pyarrow.parquet as pq
from datasets import concatenate_datasets
from datasets import load_dataset

# Destination of the combined Parquet file: directory and file name.
OUTPUT_DIR = "../data/"
OUTPUT_FILE = "combined_data.parquet"

# Create the destination directory up front; exist_ok=True makes this a no-op
# when the directory is already there.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")

# Load the dataset. Called without a split argument, so `ds` is used below as a
# mapping from split name to dataset. Any failure is reported on stdout and then
# re-raised so execution stops here instead of continuing without data.
try:
    ds = load_dataset("Nan-Do/code-search-net-python")
    print("Dataset loaded successfully")
    print(f"Dataset structure: {ds}")
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")
    raise


# Combine all partitions into a single dataset.
# `concatenate_datasets` is a module-level function of the `datasets` package,
# not a method on a dataset object, so every split is passed to it in one call.
# `ds.values()` yields the splits in the same order as iterating `ds` does.
combined_dataset = concatenate_datasets(list(ds.values()))

# Save the combined dataset as a single Parquet file
output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)

# Announce the target before writing so the message also appears when the
# write itself fails.
print(f"Attempting to save data to: {output_path}")
try:
    # Only the write is guarded, so the "Error saving to Parquet" message is
    # never attached to a failure that happened somewhere else.
    combined_dataset.to_parquet(output_path)
except Exception as e:
    print(f"Error saving to Parquet: {e}")
    raise
else:
    # Verify that the file was created
    if os.path.exists(output_path):
        print(f"File successfully created at: {output_path}")
        print(f"File size: {os.path.getsize(output_path)} bytes")
    else:
        print(f"Error: File was not created at {output_path}")

print(f"Total number of rows: {len(combined_dataset)}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the Parquet file back for a quick exploratory look.
df = pd.read_parquet('../data/combined_data.parquet')

# Column to plot. 'column_name' is a placeholder: replace it with the name of a
# column that exists in `df` (a numeric one, for a meaningful histogram).
HIST_COLUMN = 'column_name'

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
df.info()

# Print explicitly: a bare expression in the middle of a cell or script is
# evaluated and then discarded without producing any output.
print(df.head())
print(df.describe())

# Fail with an actionable message rather than a bare KeyError on the placeholder.
if HIST_COLUMN not in df.columns:
    raise KeyError(
        f"Column {HIST_COLUMN!r} not found; available columns: {list(df.columns)}"
    )

df[HIST_COLUMN].hist()
plt.title(f'Histogram of {HIST_COLUMN}')
plt.show()