In [1]:
"""
This script reads the very large yt_metadata_en JSONLines file and splits it into smaller CSV files in order to make it more manageable.
Each CSV file contains a chunk of the data, with a specified number of records per chunk.
The script uses pandas to read the JSONL file, process the data, and save each chunk as a CSV file.
"""

import pandas as pd
import json
import os

from preprocessing import map_column_to_week

In [1]:
# Set up variables
file_path = 'yt_metadata_en.jsonl'
chunk_size = 12_924_794
columns = ['channel_id', 'upload_date', 'title', 'description']
output_folder = 'chunks'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Function to process each chunk and save it as a CSV
def process_and_save_chunk(file_path, start_line, end_line, chunk_index, columns):
    processed_data = []
    with open(file_path, 'r') as file:
        # Skip to the start line
        for _ in range(start_line):
            file.readline()

        # Process the chunk
        for i in range(start_line, end_line):
            line = file.readline()
            if not line:
                break  # End of file
            try:
                record = json.loads(line)
                processed_data.append({col: record[col] for col in columns})
            except json.JSONDecodeError:
                print(f"Skipping malformed line at index {i}")

    # Convert the chunk to a DataFrame
    df_chunk = pd.DataFrame(processed_data)

    # Write to CSV and clear the DataFrame from memory
    output_path = os.path.join(output_folder, f'chunk_{chunk_index}.csv')
    df_chunk.to_csv(output_path, index=False)
    print(f"Saved chunk {chunk_index} to {output_path}")
    del df_chunk  # Free up memory

# Loop through the file and process each chunk
chunk_index = 0
start_line = 60_000_001

end_line = start_line + chunk_size
process_and_save_chunk(file_path, start_line, end_line, 6, columns) 


Saved chunk 6 to chunks/chunk_6.csv


In [9]:
# Test on chunk 0

# Load chunk
print("Loading chunk 0")
chunk = pd.read_csv('../../data/chunks/chunk_0.csv')

# Display unique values for upload_date
print("Unique values for upload_date:")
print(chunk['upload_date'].unique(50))

Loading chunk 0


In [6]:
# Chunks processing

def process_chunk(file_path, chunk_index):
    """
    Process chunks to get first and last week

    Parameters:
    file_path (str): Path to the file
    """
    # Load chunk
    print(f"Loading chunk {chunk_index}")
    chunk = pd.read_csv(file_path)

    print(f"Processing chunk {chunk_index}")
    print(f"\n \Head: {chunk.head(20)}")
    print(f"\n \Tail: {chunk.tail(20)}")

    # Change column name
    chunk.rename(columns={'channel': 'channel_id'}, inplace=True)

    # Convert date to week
    chunk['upload_date'] = pd.to_datetime(chunk['upload_date'])
    chunk = map_column_to_week(chunk, 'upload_date')

    # Set index to (week, channel)
    chunk.set_index(['week', 'channel'], inplace=True)

    print(f"\n *Head: {chunk.head(20)}")
    print(f"\n *Tail: {chunk.tail(20)}")

    # Get first and last week
    first_week = chunk['week'].min()
    last_week = chunk['week'].max()

    # Write first and last week to file name
    output_path = file_path.replace('.csv', f'metadata_{first_week}_{last_week}.csv')
    chunk.to_csv(output_path, index=False)
    print(f"Saved chunk {chunk_index} to {output_path}")

In [7]:
# 1st chunk test
process_chunk('../../data/chunks/chunk_0.csv', 0)

Loading chunk 0
Processing chunk 0

 \Head:                   channel_id          upload_date  \
0   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-28 00:00:00   
1   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-28 00:00:00   
2   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-28 00:00:00   
3   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-28 00:00:00   
4   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-28 00:00:00   
5   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-27 00:00:00   
6   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-27 00:00:00   
7   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-27 00:00:00   
8   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-27 00:00:00   
9   UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-27 00:00:00   
10  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-27 00:00:00   
11  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-26 00:00:00   
12  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-26 00:00:00   
13  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-26 00:00:00   
14  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-26 00:00:00   
15  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-26 00:00:00   
16  UCzWrhkg9eK5I8Bm3HfV-unA  2016-09-25 00:00:00   
17

ValueError: time data "Natsu (E.N.D) vs Gray Devil Slayer - Fairy Tail Final Season「AMV」- Mirror



➱ Anime:Fairy Tail Final Season




▶Music: NEFFEX - Mirror


▶NEFFEX:
Spotify: http://bit.ly/NEFFEX_Spotify SoundCloud: http://bit.ly/NEFFEX_SC Facebook: http://bit.ly/NEFFEX_FB Instagram: http://bit.ly/NEFFEX_Insta Twitter: http://bit.ly/NEFFEX_Twitter YouTube: http://bit.ly/NEFFEX_YouTube




➱ Like, Comment & Subscribe!! Thanks for watching! ツ

👉‍ Important All rights reserved to the authors, if you own some material from this video and do not want it displayed here, send an email to me and I will immediately remove it.


"Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for "fair use" for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use."" doesn't match format "%Y-%m-%d %H:%M:%S", at position 4876. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.