In [None]:
import pandas as pd
import pyarrow.parquet as pq

## Convert the JSON Lines file to a Parquet file

In [None]:
json_file_path = "yt_metadata_en.jsonl"

chunksize = 1_000_000
parquet_file_path = "yt_metadata_en.parquet"

# Stream the JSON Lines file in chunks so the whole dataset never has to be
# held in memory at once.
# Every chunk is written with the same engine (fastparquet, the one that
# supports `append=True`) and without the pandas index, so the first chunk and
# the appended chunks share one schema. The index is not needed later: the
# cells below rebuild it with `ignore_index=True`.
for i, chunk in enumerate(pd.read_json(json_file_path, lines=True, chunksize=chunksize)):
    print(f"Processing chunk {i}")
    # The first chunk creates (or overwrites) the file; later chunks are appended.
    chunk.to_parquet(parquet_file_path, engine="fastparquet", index=False, append=i > 0)

## Create dataset subsets

Create smaller subsets of the full dataset so that the data is easier to handle.

### Sport keyword dataset

Create a subset of the dataset by keeping only the videos whose tags or title contain a sport keyword (e.g. "football", "soccer", "basketball", "tennis", etc.), in order to identify sports-related videos.

In [None]:
def create_subset_parquet(parquet_file_path, filter_function, chunksize=1_000_000):
    """Build an in-memory subset of a Parquet file, reading it batch by batch.

    Each batch is converted to a pandas DataFrame, its ``description`` column
    is dropped, and only the rows for which ``filter_function(row)`` is truthy
    are kept. Progress (rows kept so far and their approximate memory usage)
    is printed after every batch.

    Parameters
    ----------
    parquet_file_path : str or path-like
        Parquet file to read.
    filter_function : callable
        Called with one row (a ``pandas.Series``) at a time; the row is kept
        when the return value is truthy.
    chunksize : int, default 1_000_000
        Number of rows per batch, passed to ``iter_batches`` as ``batch_size``.

    Returns
    -------
    pandas.DataFrame
        All kept rows with a fresh 0..n-1 index. An empty DataFrame is
        returned when the file yields no non-empty batch.
    """
    pq_metadata = pq.ParquetFile(parquet_file_path)

    # Collect the filtered batches and concatenate once at the end; calling
    # pd.concat inside the loop re-copies everything kept so far on each batch.
    kept_frames = []
    kept_rows = 0
    kept_bytes = 0

    for batch in pq_metadata.iter_batches(batch_size=chunksize):
        temp_df = batch.to_pandas().drop(columns=['description'])
        if temp_df.empty:
            # A row-wise apply on an empty frame does not produce a boolean mask.
            continue

        keep_mask = temp_df.apply(filter_function, axis=1).astype(bool)
        temp_df = temp_df[keep_mask]
        kept_frames.append(temp_df)

        # Running totals stand in for the shape / memory of the final frame
        # (column data only, so the memory figure is approximate).
        kept_rows += len(temp_df)
        kept_bytes += temp_df.memory_usage(index=False, deep=True).sum()
        print(f"Current size of filtered_df: {(kept_rows, temp_df.shape[1])}")
        print(f"Memory usage of filtered_df: {kept_bytes / (1024 ** 2):.2f} MB")

    if not kept_frames:
        return pd.DataFrame()
    return pd.concat(kept_frames, ignore_index=True)

In [None]:
# Keywords searched for (as lowercase substrings) in a video's tags.
_SPORT_TAG_KEYWORDS = (
    'sport', 'football', 'soccer', 'fifa', 'nba', 'olympic', 'golf', 'tennis',
    'cricket', 'formula1', 'f1', 'basketball', 'nascar', 'nfl', 'world cup',
    'eurocup', 'superbowl',
)

# Keywords searched for (as lowercase substrings) in a video's title.
# NOTE(review): 'nba' is matched in tags only, exactly as in the original
# lists - confirm that leaving it out for titles is intentional.
_SPORT_TITLE_KEYWORDS = (
    'sport', 'football', 'soccer', 'fifa', 'olympic', 'golf', 'tennis',
    'cricket', 'formula1', 'f1', 'basketball', 'nascar', 'nfl', 'world cup',
    'eurocup', 'superbowl',
)


def filter_keyword_function(row):
    """Return True when the row's tags or title contain a sport keyword.

    Matching is a case-insensitive substring test. A ``tags`` or ``title``
    value that is not a string (for example None or NaN for a missing value)
    is treated as containing no keyword instead of raising.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'tags'`` and ``'title'``.

    Returns
    -------
    bool
    """
    tags = row['tags']
    title = row['title']

    # Lowercase once per field rather than once per keyword.
    tags_text = tags.lower() if isinstance(tags, str) else ''
    title_text = title.lower() if isinstance(title, str) else ''

    return any(keyword in tags_text for keyword in _SPORT_TAG_KEYWORDS) or any(
        keyword in title_text for keyword in _SPORT_TITLE_KEYWORDS)

In [None]:
parquet_file_path = "yt_metadata_en.parquet"

pq_metadata = pq.ParquetFile(parquet_file_path)

# Collect the filtered batches and concatenate once at the end; calling
# pd.concat inside the loop re-copies everything kept so far on each batch.
keyword_frames = []
keyword_rows = 0
keyword_bytes = 0

# Read the file batch by batch, drop the 'description' column and keep only
# the rows accepted by filter_keyword_function (defined in the cell above).
for batch in pq_metadata.iter_batches(batch_size=1_000_000):
    temp_df = batch.to_pandas().drop(columns=['description'])
    temp_df = temp_df[temp_df.apply(filter_keyword_function, axis=1)]
    keyword_frames.append(temp_df)

    # Running totals stand in for the shape / memory of the final frame
    # (column data only, so the memory figure is approximate).
    keyword_rows += len(temp_df)
    keyword_bytes += temp_df.memory_usage(index=False, deep=True).sum()
    print(f"Current size of filtered_df: {(keyword_rows, temp_df.shape[1])}")
    print(f"Memory usage of filtered_df: {keyword_bytes / (1024 ** 2):.2f} MB")

# Keyword-based subset with a fresh 0..n-1 index (empty if no batch was read).
filtered_df = pd.concat(keyword_frames, ignore_index=True) if keyword_frames else pd.DataFrame()

In [None]:
# Save the keyword-based subset (tags + title match, 'description' column dropped).
# NOTE(review): the file name contains the typo "withoud"; it is left unchanged
# here in case other code already reads the file under this name.
filtered_df.to_parquet("filtered_sport_metadata(by_tags_and_title)_withoud_description.parquet", engine="fastparquet")

# Save a second copy without the videos whose category contains "Gaming".
# The column is named 'categories' (the same column the Sports-category filter
# below reads); `na=False` keeps rows with a missing category and guarantees a
# boolean mask that can be negated with `~`.
is_gaming = filtered_df['categories'].str.contains('Gaming', regex=False, na=False)
filtered_df_without_gaming = filtered_df[~is_gaming]

filtered_df_without_gaming.to_parquet(
    "filtered_sport_metadata(by_tags_and_title)_without_gaming_category.parquet", engine="fastparquet")

### Sport category dataset

Create a subset by keeping only the videos whose `categories` column contains "Sports".

In [None]:
def filter_category_function(row):
    """Return True when the row's ``categories`` value contains "Sports".

    The value is read from the ``'categories'`` key, the same column the
    Sports-category cell of this notebook filters on. A missing value
    (None or a float such as NaN) is treated as "not Sports" instead of
    raising.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'categories'``.

    Returns
    -------
    bool
    """
    categories = row['categories']
    if categories is None or isinstance(categories, float):
        return False
    return "Sports" in categories

In [None]:
parquet_file_path = "yt_metadata_en.parquet"

pq_metadata = pq.ParquetFile(parquet_file_path)

# Collect the filtered batches and concatenate once at the end; calling
# pd.concat inside the loop re-copies everything kept so far on each batch.
sport_frames = []
sport_rows = 0
sport_bytes = 0

# Read the file batch by batch, drop the 'description' column and keep only
# the rows whose 'categories' value contains 'Sports'.
for batch in pq_metadata.iter_batches(batch_size=1_000_000):
    temp_df = batch.to_pandas().drop(columns=['description'])

    # astype(bool) keeps the mask boolean even when the batch has no rows.
    is_sport = temp_df['categories'].apply(lambda x: 'Sports' in x).astype(bool)
    temp_df = temp_df[is_sport]
    sport_frames.append(temp_df)

    # Running totals stand in for the shape / memory of the final frame
    # (column data only, so the memory figure is approximate).
    sport_rows += len(temp_df)
    sport_bytes += temp_df.memory_usage(index=False, deep=True).sum()
    print(f"Current size of filtered_df: {(sport_rows, temp_df.shape[1])}")
    print(f"Memory usage of filtered_df: {sport_bytes / (1024 ** 2):.2f} MB")

# Sports-category subset with a fresh 0..n-1 index (empty if no batch was read).
filtered_sport_df = pd.concat(sport_frames, ignore_index=True) if sport_frames else pd.DataFrame()

In [None]:
# Persist the Sports-category subset (already stripped of the 'description'
# column) as a Parquet file written with the fastparquet engine.
sport_category_output_path = "filtered_sport_category_without_description_column_metadata.parquet"
filtered_sport_df.to_parquet(sport_category_output_path, engine="fastparquet")