In [1]:
import os

import pyarrow.parquet as pq
from datasets import concatenate_datasets, load_dataset

# Destination of the exported Parquet file: a directory plus a file name.
OUTPUT_DIR = "../data/"
OUTPUT_FILE = "combined_data.parquet"

# Create the target directory up front; exist_ok=True makes this a no-op when
# the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")

# Load the dataset by its identifier. A failure is reported and then re-raised
# so execution stops here instead of continuing without data.
try:
    ds = load_dataset("Nan-Do/code-search-net-python")
    print("Dataset loaded successfully")
    print(f"Dataset structure: {ds}")
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")
    raise


# Combine all partitions into a single dataset.
# concatenate_datasets is a module-level function of the `datasets` package,
# not a method on Dataset, so it is called once with every split of `ds`
# (this also covers the single-split case without special handling).
combined_dataset = concatenate_datasets(list(ds.values()))

# Save the combined dataset as a single Parquet file
output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
try:
    # Announce the target before writing so the log reads in order and the
    # path is visible even when the write itself fails.
    print(f"Attempting to save data to: {output_path}")
    combined_dataset.to_parquet(output_path)
except Exception as e:
    print(f"Error saving to Parquet: {e}")
    raise

# Verify that the file was created. This runs outside the try block so that a
# failure while inspecting the file is not reported as a Parquet write error.
if os.path.exists(output_path):
    print(f"File successfully created at: {output_path}")
    print(f"File size: {os.path.getsize(output_path)} bytes")
else:
    print(f"Error: File was not created at {output_path}")

print(f"Total number of rows: {len(combined_dataset)}")

  from .autonotebook import tqdm as notebook_tqdm


Output directory: ../data/
Dataset loaded successfully
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary'],
        num_rows: 455243
    })
})


Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████| 456/456 [00:05<00:00, 77.98ba/s]

Attempting to save data to: ../data/combined_data.parquet
File successfully created at: ../data/combined_data.parquet
File size: 598461591 bytes
Total number of rows: 455243





In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_parquet('../data/combined_data.parquet')

# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

# Only the last expression of a cell is displayed automatically, so the preview
# is printed explicitly instead of being silently discarded mid-cell.
print(df.head())

# 'column_name' was a template placeholder and is not a column of this frame.
# Every column is object dtype, so a numeric quantity is derived to summarise
# and plot: the length of each entry in the 'code' column.
# NOTE(review): assumes 'code' holds source text as strings, so the length is a
# character count — confirm against the data.
code_length = df['code'].str.len()
print(code_length.describe())

fig, ax = plt.subplots()
code_length.hist(ax=ax, bins=50)
ax.set_title('Histogram of code length')
ax.set_xlabel('Length of code (characters)')
ax.set_ylabel('Number of rows')
plt.show()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455243 entries, 0 to 455242
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   repo              455243 non-null  object
 1   path              455243 non-null  object
 2   func_name         455243 non-null  object
 3   original_string   455243 non-null  object
 4   language          455243 non-null  object
 5   code              455243 non-null  object
 6   code_tokens       455243 non-null  object
 7   docstring         455243 non-null  object
 8   docstring_tokens  455243 non-null  object
 9   sha               455243 non-null  object
 10  url               455243 non-null  object
 11  partition         455243 non-null  object
 12  summary           455243 non-null  object
dtypes: object(13)
memory usage: 45.2+ MB
None
