In [None]:
import os
import json
import polars as pl
import polars.selectors as cs
from pathlib import Path

## Convert raw JSON files to .parquet

In [4]:
# Presumably the input folder and output file for the JSON -> Parquet conversion.
# NOTE(review): neither constant is referenced by any cell visible in this
# notebook -- confirm they are still needed or wire them into the conversion code.
INPUT_DIR = "data/raw_youtube_comments"
OUTPUT_FILE = "all_comments.parquet"

In [5]:
def flatten_json(d, parent_key="", sep="_"):
    """Collapse a nested dict into one level, joining key paths with ``sep``.

    Nested dicts are walked recursively and every leaf is stored under the
    path of keys leading to it (prefixed by ``parent_key`` when one is given).
    Any value that is not a dict -- lists included -- is copied through as is.
    An empty nested dict contributes no keys to the result.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Descend into the sub-dict, carrying the accumulated path as the prefix.
        flat.update(flatten_json(value, path, sep=sep))
    return flat

In [None]:
# # Let's say you have multiple JSON files in a folder
# data_folder = Path("data/raw_youtube_comments_relevance")
# rows = []

# for file in data_folder.glob("*.json"):
#     with open(file, "r", encoding="utf-8") as f:
#         raw = json.load(f)
#         for entry in raw.get("items", []):  # your "response" field
#             rows.append(flatten_json(entry))

# # Create Polars DataFrame
# df = pl.DataFrame(rows)

# # Save to parquet
# df.write_parquet("comments.parquet")

## Data by Relevance

In [6]:
# Load the comment threads stored under the "relevance" folder into a polars
# DataFrame and preview its first rows.
df = pl.read_parquet("data/clean/relevance/comments.parquet")
df.head()

kind,etag,id,snippet_channelId,snippet_videoId,snippet_topLevelComment_kind,snippet_topLevelComment_etag,snippet_topLevelComment_id,snippet_topLevelComment_snippet_channelId,snippet_topLevelComment_snippet_videoId,snippet_topLevelComment_snippet_textDisplay,snippet_topLevelComment_snippet_textOriginal,snippet_topLevelComment_snippet_authorDisplayName,snippet_topLevelComment_snippet_authorProfileImageUrl,snippet_topLevelComment_snippet_authorChannelUrl,snippet_topLevelComment_snippet_authorChannelId_value,snippet_topLevelComment_snippet_canRate,snippet_topLevelComment_snippet_viewerRating,snippet_topLevelComment_snippet_likeCount,snippet_topLevelComment_snippet_publishedAt,snippet_topLevelComment_snippet_updatedAt,snippet_canReply,snippet_totalReplyCount,snippet_isPublic,replies_comments
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,i64,str,str,bool,i64,bool,list[struct[4]]
"""youtube#commentThread""","""5X4c9JpM8u0seA5iAaQ8P9Z30UE""","""Ugx_LrD9EQxPmvmHmNB4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""OFySluPd6L6HxyxATAhAZLZIu88""","""Ugx_LrD9EQxPmvmHmNB4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""Virgo""","""Virgo""","""@BilalRazzaq-r4p""","""https://yt3.ggpht.com/ytc/AIdr…","""http://www.youtube.com/@BilalR…","""UCbyVaZZErCfW8kV0vPUMUqQ""",True,"""none""",0,"""2025-10-02T10:12:32Z""","""2025-10-02T10:12:32Z""",True,0,True,
"""youtube#commentThread""","""03_v_OV1dSXBxKOHHVoh9ddg5VY""","""UgwL0V8QLNB_bmXMEYx4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""BOyJO_vmuZ5Y20upJQHQVXlNi6Y""","""UgwL0V8QLNB_bmXMEYx4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""Azgona""","""Azgona""","""@lindaselimi1723""","""https://yt3.ggpht.com/ytc/AIdr…","""http://www.youtube.com/@lindas…","""UCBwnn2MNaIW1rWr1J5XMoRg""",True,"""none""",0,"""2025-09-26T12:05:50Z""","""2025-09-26T12:05:50Z""",True,0,True,
"""youtube#commentThread""","""zijVJD5RGsUJFjt_v-8uXlf2e2E""","""Ugx9WwEYettdsDcZ7MV4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""s8On9I5xCwCWXhDdqBPBl2kELoY""","""Ugx9WwEYettdsDcZ7MV4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""Justin <a href=""https://www.yo…","""Justin 2:31 hoy no puedo tengo…","""@JuliethandreaAsprilla""","""https://yt3.ggpht.com/_gSgbkBK…","""http://www.youtube.com/@Juliet…","""UC7_7zuBcOWGw4NzkWynlbXg""",True,"""none""",0,"""2025-09-26T15:38:02Z""","""2025-09-26T15:38:02Z""",True,0,True,
"""youtube#commentThread""","""12t3pbgH0bzDzU7UMs7NfLeNhdA""","""UgzWEHA_DyX6lhxXGRt4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""_hw2ZYC0sL_4Bl5YcO5psTFeOEU""","""UgzWEHA_DyX6lhxXGRt4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""Is this before he got diddled?""","""Is this before he got diddled?""","""@Dr3w_4_u""","""https://yt3.ggpht.com/ytc/AIdr…","""http://www.youtube.com/@Dr3w_4…","""UCJ49JtaIp3vCVjKl1QtfSXg""",True,"""none""",0,"""2025-09-29T23:22:35Z""","""2025-09-29T23:22:35Z""",True,0,True,
"""youtube#commentThread""","""2Bs4BWOg2QNNGKEDsZkWlTAtl6Q""","""UgzeBP3fOh430MO3Ohx4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""uSqWR9fQEzeZkvrq9Ulqyd_I4fU""","""UgzeBP3fOh430MO3Ohx4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""ロジック😮""","""ロジック😮""","""@user-zd8wp8bl3v""","""https://yt3.ggpht.com/ytc/AIdr…","""http://www.youtube.com/@user-z…","""UC6vJc-nIXA7NXzQbIftId_g""",True,"""none""",0,"""2025-09-27T19:18:18Z""","""2025-09-27T19:18:18Z""",True,0,True,


In [7]:
# Dimensions (rows, columns) of the relevance dataset.
df.shape

(1086, 25)

## Data by Chronological Order

In [14]:
# Load the comment threads stored under the "chronological" folder and preview them.
# This rebinds `df`: from this cell on it no longer holds the relevance dataset.
df = pl.read_parquet("data/clean/chronological/comments.parquet")
df.head()

kind,etag,id,snippet_channelId,snippet_videoId,snippet_topLevelComment_kind,snippet_topLevelComment_etag,snippet_topLevelComment_id,snippet_topLevelComment_snippet_channelId,snippet_topLevelComment_snippet_videoId,snippet_topLevelComment_snippet_textDisplay,snippet_topLevelComment_snippet_textOriginal,snippet_topLevelComment_snippet_authorDisplayName,snippet_topLevelComment_snippet_authorProfileImageUrl,snippet_topLevelComment_snippet_authorChannelUrl,snippet_topLevelComment_snippet_authorChannelId_value,snippet_topLevelComment_snippet_canRate,snippet_topLevelComment_snippet_viewerRating,snippet_topLevelComment_snippet_likeCount,snippet_topLevelComment_snippet_publishedAt,snippet_topLevelComment_snippet_updatedAt,snippet_canReply,snippet_totalReplyCount,snippet_isPublic,replies_comments
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,i64,str,str,bool,i64,bool,list[struct[4]]
"""youtube#commentThread""","""uQA-qS9CXMbmpJnBFzzYVKNAsvM""","""UgxJOtvRgP-TK-DDSbh4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""ftCX1u_cMcFYWvy1sForf9HYYlo""","""UgxJOtvRgP-TK-DDSbh4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""Good reason holy grain""","""Good reason holy grain""","""@nishapatel-m8r""","""https://yt3.ggpht.com/ytc/AIdr…","""http://www.youtube.com/@nishap…","""UC1pcc2njrSbt9Dxvv7jCZcQ""",True,"""none""",1,"""2025-09-24T17:11:11Z""","""2025-09-24T17:11:11Z""",True,0,True,
"""youtube#commentThread""","""4hudTL3Z5iN4pgKkkQbIulOJ5EM""","""UgxtC2RpNWCNRupMHvl4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""rTwa0LUA1izDUGc0PKXTIu-GiJg""","""UgxtC2RpNWCNRupMHvl4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""anyone in 2025 ....... lets ga…","""anyone in 2025 ....... lets ga…","""@moloantoa8655""","""https://yt3.ggpht.com/sCWKiUE8…","""http://www.youtube.com/@moloan…","""UCHLFXF2C8LdrjPhH3Vi4dyA""",True,"""none""",56,"""2025-09-24T16:47:44Z""","""2025-09-24T16:47:44Z""",True,8,True,"[{""youtube#comment"",""aaOFfL5gP8hlrICa-lZ1PtGajTc"",""UgxtC2RpNWCNRupMHvl4AaABAg.ANT03nbSg30ANTQ5hnoIcR"",{""UCHkj014U2CQ2Nv0UZeYpE_A"",""kffacxfA7G4"",""hehe"",""hehe"",""UgxtC2RpNWCNRupMHvl4AaABAg"",""@Toliii_"",""https://yt3.ggpht.com/zOobxWViS3ZZQ4T5dmDK3bkleUQZGHV4sPgxanVnhUeqtcmYMBVXtIlzRVNRR_3Sqqu-Px0IRc4=s48-c-k-c0x00ffffff-no-rj"",""http://www.youtube.com/@Toliii_"",{""UCmiE9AN4VP-mbZLe4YZIUeA""},true,""none"",2,""2025-09-24T20:35:12Z"",""2025-09-24T20:35:12Z""}}, {""youtube#comment"",""RApwV-DBVk3KyXkgj5WdBT7wVg8"",""UgxtC2RpNWCNRupMHvl4AaABAg.ANT03nbSg30ANTc5AYS6rZ"",{""UCHkj014U2CQ2Nv0UZeYpE_A"",""kffacxfA7G4"",""Here bro, just for nostalgia"",""Here bro, just for nostalgia"",""UgxtC2RpNWCNRupMHvl4AaABAg"",""@AssSlayer-v4q"",""https://yt3.ggpht.com/fJvVUbGK5uNPv-jG7cX4v0N_-VNNUFzOSqeMOBPuAgqtqY30T9Pj2DaYq3-tCJIsemU5_BeC-w=s48-c-k-c0x00ffffff-no-rj"",""http://www.youtube.com/@AssSlayer-v4q"",{""UCBELkmM6LYQ02rNyibcTrTA""},true,""none"",0,""2025-09-24T22:28:43Z"",""2025-09-24T22:28:43Z""}}, … 
{""youtube#comment"",""z2aiV_-BciedbhIBylMPA4XAMaY"",""UgxtC2RpNWCNRupMHvl4AaABAg.ANT03nbSg30ANWq7skb7fa"",{""UCHkj014U2CQ2Nv0UZeYpE_A"",""kffacxfA7G4"",""Hey"",""Hey"",""UgxtC2RpNWCNRupMHvl4AaABAg"",""@Eliotxpresso"",""https://yt3.ggpht.com/cXSFxB5g6oLy2imLdKik8qZ6UpUFVKiXus1JCoE8wxja8tWVEl0Q6Dn6qUTSd9cruBPYBS0U=s48-c-k-c0x00ffffff-no-rj"",""http://www.youtube.com/@Eliotxpresso"",{""UC3v-Xu6dpdHX0kZva3NnJXA""},true,""none"",1,""2025-09-26T04:29:08Z"",""2025-09-26T04:29:08Z""}}]"
"""youtube#commentThread""","""Mh2HvwFyqTHuY3lmXdtWdqTXbSs""","""UgxKAugIlYmsTGIUgxt4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""VUb_AWjx-fRjqJH9VA1DiG7tWhc""","""UgxKAugIlYmsTGIUgxt4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""After so many years people r s…","""After so many years people r s…","""@Meckzy_on60fps""","""https://yt3.ggpht.com/7OthEhNG…","""http://www.youtube.com/@Meckzy…","""UCfdxsCfK0qJGZBc-GBOX1UA""",True,"""none""",1,"""2025-09-24T16:37:23Z""","""2025-09-24T16:37:23Z""",True,0,True,
"""youtube#commentThread""","""zWwNW4kE0lNRZOOqWWdZLfhx_4o""","""UgyTyY11iDQ0--9qH_54AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""DpsifoeTtomPgd94ik0z42wxOUo""","""UgyTyY11iDQ0--9qH_54AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""You&#39;re a legend if you&#39…","""You're a legend if you're list…","""@MyFlower_PARKJimin""","""https://yt3.ggpht.com/-yfnrXAJ…","""http://www.youtube.com/@MyFlow…","""UCdGwvLMmAb0WeybPerBJaZw""",True,"""none""",255,"""2025-09-24T16:36:24Z""","""2025-09-24T16:36:24Z""",True,8,True,"[{""youtube#comment"",""dOuOZdi928gW9vhZPnplL_8tJZE"",""UgyTyY11iDQ0--9qH_54AaABAg.ANSzlicp7b1ANTZh8Hv_uK"",{""UCHkj014U2CQ2Nv0UZeYpE_A"",""kffacxfA7G4"",""You&#39;re going to break the world record for the oldest virgin💪"",""You're going to break the world record for the oldest virgin💪"",""UgyTyY11iDQ0--9qH_54AaABAg"",""@therridethr4463"",""https://yt3.ggpht.com/ytc/AIdro_knUhSruL3XYpUntnZnDgbe0HJlAHpsH9rFZYpXGCOK0IzNS94q7eGlq8R0YGI-SbSa0g=s48-c-k-c0x00ffffff-no-rj"",""http://www.youtube.com/@therridethr4463"",{""UC6FD8MyJMqLcLi0plLWjZXA""},true,""none"",0,""2025-09-24T21:59:05Z"",""2025-09-24T21:59:05Z""}}, {""youtube#comment"",""_PjNkOa757KBZe4Xnb-rVkRsdZ0"",""UgyTyY11iDQ0--9qH_54AaABAg.ANSzlicp7b1ANWtkxYvSB0"",{""UCHkj014U2CQ2Nv0UZeYpE_A"",""kffacxfA7G4"",""Me😢"",""Me😢"",""UgyTyY11iDQ0--9qH_54AaABAg"",""@MeseretKena-n8z"",""https://yt3.ggpht.com/2izwlO-QR4BHy1i4pQgPmxW3hM1gumjJ86A6C5HUQzDT32PezHK6tEOKYRyOKT1SIydlXnPKNKc=s48-c-k-c0x00ffffff-no-rj"",""http://www.youtube.com/@MeseretKena-n8z"",{""UC76FbqPM6W8Q6szUQA8Y6HA""},true,""none"",4,""2025-09-26T05:00:50Z"",""2025-09-26T05:00:50Z""}}, … {""youtube#comment"",""xnCWwY_mxRILp8EziEUuiHul6JQ"",""UgyTyY11iDQ0--9qH_54AaABAg.ANSzlicp7b1ANXs6T4C9Gt"",{""UCHkj014U2CQ2Nv0UZeYpE_A"",""kffacxfA7G4"",""i was 10 year old in 2010 bro"",""i was 10 year old in 2010 
bro"",""UgyTyY11iDQ0--9qH_54AaABAg"",""@ThuyLeThiThu-v5l"",""https://yt3.ggpht.com/jk2x-C4B2iIPC9XDHajpYYqAcVh2k_iCNmo4_rFNsmSYqYkKQ-qBM5L2h_bnJXu0_malQXkjIQ=s48-c-k-c0x00ffffff-no-rj"",""http://www.youtube.com/@ThuyLeThiThu-v5l"",{""UCU16bY5pEVqYtM7yOJUoKGA""},true,""none"",5,""2025-09-26T14:05:40Z"",""2025-09-26T14:05:40Z""}}]"
"""youtube#commentThread""","""U7baMib9pe2lkg_Ls0Ylnzu6KoY""","""UgwujpmaPaL9W-RnFjN4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""youtube#comment""","""fgzPU6soxIbzkyDzNcU7S6H1sKU""","""UgwujpmaPaL9W-RnFjN4AaABAg""","""UCHkj014U2CQ2Nv0UZeYpE_A""","""kffacxfA7G4""","""2050 anyone?""","""2050 anyone?""","""@tuyamarino""","""https://yt3.ggpht.com/ytc/AIdr…","""http://www.youtube.com/@tuyama…","""UC3PwcVCdUGjmfELhGU7ZiPg""",True,"""none""",0,"""2025-09-24T16:27:29Z""","""2025-09-24T16:27:29Z""",True,0,True,


In [15]:
# Dimensions (rows, columns) of the chronological dataset.
df.shape

(2000, 25)

In [22]:
# Names of the numeric columns. Selected through the selectors API (`cs`)
# because `pl.NUMERIC_DTYPES` is deprecated in recent Polars releases.
num_cols = df.select(cs.numeric()).columns

# Basic statistics per numeric column, returned as a single row with one
# `<column>_<stat>` field per combination. Fields are grouped by statistic
# (all means first, then all medians, ...), matching the original layout.
stat_names = ("mean", "median", "std", "min", "max")
num_stats = df.select([
    getattr(pl.col(col), stat)().alias(f"{col}_{stat}")
    for stat in stat_names
    for col in num_cols
])

print(num_stats)


shape: (1, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ snippet_t ┆ snippet_t ┆ snippet_t ┆ snippet_t ┆ … ┆ snippet_t ┆ snippet_t ┆ snippet_t ┆ snippet_ │
│ opLevelCo ┆ otalReply ┆ opLevelCo ┆ otalReply ┆   ┆ opLevelCo ┆ otalReply ┆ opLevelCo ┆ totalRep │
│ mment_sni ┆ Count_mea ┆ mment_sni ┆ Count_med ┆   ┆ mment_sni ┆ Count_min ┆ mment_sni ┆ lyCount_ │
│ ppe…      ┆ n         ┆ ppe…      ┆ ian       ┆   ┆ ppe…      ┆ ---       ┆ ppe…      ┆ max      │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ i64       ┆ ---       ┆ ---      │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ i64       ┆           ┆ i64       ┆ i64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2.2325    ┆ 0.2455    ┆ 0.0       ┆ 0.0       ┆ … ┆ 0         ┆ 0         ┆ 836       ┆ 106      │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────

  num_cols = df.select(pl.col(pl.NUMERIC_DTYPES)).columns


In [26]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp312-cp312-win_amd64.whl.metadata (681 bytes)
Collecting fsspec (from fastparquet)
  Using cached fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-win_amd64.whl (673 kB)
   ---------------------------------------- 0.0/673.3 kB ? eta -:--:--
   ------------------------------- -------- 524.3/673.3 kB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 673.3/673.3 kB 3.3 MB/s eta 0:00:00
Downloading cramjam-2.11.0-cp312-cp312-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------------ --------------- 1.0/1.7 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 5.5 MB/s eta 0:00:00
Using cached fsspec-2025.9.0-py3-none-any.whl (199 kB)
Installing collected packages: fsspec,


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import pandas as pd

# Re-load the chronological dataset as a pandas DataFrame. This rebinds `df`,
# replacing the polars frame used by the earlier cells.
# NOTE(review): in the pandas preview `replies_comments` looks empty even for
# threads where the polars preview shows replies -- verify that the fastparquet
# engine reads this nested list[struct] column correctly.
df = pd.read_parquet("data/clean/chronological/comments.parquet", engine="fastparquet")
# Sanity check that `df` is now a pandas object.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [28]:
# Preview the first rows of the pandas frame.
df.head()

Unnamed: 0,kind,etag,id,snippet_channelId,snippet_videoId,snippet_topLevelComment_kind,snippet_topLevelComment_etag,snippet_topLevelComment_id,snippet_topLevelComment_snippet_channelId,snippet_topLevelComment_snippet_videoId,...,snippet_topLevelComment_snippet_authorChannelId_value,snippet_topLevelComment_snippet_canRate,snippet_topLevelComment_snippet_viewerRating,snippet_topLevelComment_snippet_likeCount,snippet_topLevelComment_snippet_publishedAt,snippet_topLevelComment_snippet_updatedAt,snippet_canReply,snippet_totalReplyCount,snippet_isPublic,replies_comments
0,youtube#commentThread,uQA-qS9CXMbmpJnBFzzYVKNAsvM,UgxJOtvRgP-TK-DDSbh4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,youtube#comment,ftCX1u_cMcFYWvy1sForf9HYYlo,UgxJOtvRgP-TK-DDSbh4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,...,UC1pcc2njrSbt9Dxvv7jCZcQ,True,none,1,2025-09-24T17:11:11Z,2025-09-24T17:11:11Z,True,0,True,
1,youtube#commentThread,4hudTL3Z5iN4pgKkkQbIulOJ5EM,UgxtC2RpNWCNRupMHvl4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,youtube#comment,rTwa0LUA1izDUGc0PKXTIu-GiJg,UgxtC2RpNWCNRupMHvl4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,...,UCHLFXF2C8LdrjPhH3Vi4dyA,True,none,56,2025-09-24T16:47:44Z,2025-09-24T16:47:44Z,True,8,True,
2,youtube#commentThread,Mh2HvwFyqTHuY3lmXdtWdqTXbSs,UgxKAugIlYmsTGIUgxt4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,youtube#comment,VUb_AWjx-fRjqJH9VA1DiG7tWhc,UgxKAugIlYmsTGIUgxt4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,...,UCfdxsCfK0qJGZBc-GBOX1UA,True,none,1,2025-09-24T16:37:23Z,2025-09-24T16:37:23Z,True,0,True,
3,youtube#commentThread,zWwNW4kE0lNRZOOqWWdZLfhx_4o,UgyTyY11iDQ0--9qH_54AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,youtube#comment,DpsifoeTtomPgd94ik0z42wxOUo,UgyTyY11iDQ0--9qH_54AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,...,UCdGwvLMmAb0WeybPerBJaZw,True,none,255,2025-09-24T16:36:24Z,2025-09-24T16:36:24Z,True,8,True,
4,youtube#commentThread,U7baMib9pe2lkg_Ls0Ylnzu6KoY,UgwujpmaPaL9W-RnFjN4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,youtube#comment,fgzPU6soxIbzkyDzNcU7S6H1sKU,UgwujpmaPaL9W-RnFjN4AaABAg,UCHkj014U2CQ2Nv0UZeYpE_A,kffacxfA7G4,...,UC3PwcVCdUGjmfELhGU7ZiPg,True,none,0,2025-09-24T16:27:29Z,2025-09-24T16:27:29Z,True,0,True,


In [33]:
# Print every field of a single record (positional row 200) to inspect one full thread.
print(df.iloc[200])

kind                                                                                 youtube#commentThread
etag                                                                           slTACm3H5zAcKOM3rJFoA4EiJpM
id                                                                              Ugy1Nx_Ybue33_skAHJ4AaABAg
snippet_channelId                                                                 UCHkj014U2CQ2Nv0UZeYpE_A
snippet_videoId                                                                                kffacxfA7G4
snippet_topLevelComment_kind                                                               youtube#comment
snippet_topLevelComment_etag                                                   uGLb9G_rzohAanD50R7LvwPa_Lg
snippet_topLevelComment_id                                                      Ugy1Nx_Ybue33_skAHJ4AaABAg
snippet_topLevelComment_snippet_channelId                                         UCHkj014U2CQ2Nv0UZeYpE_A
snippet_topLevelComment_snippet_video

In [29]:
# Statistical summary of every column (numeric and non-numeric)
print(df.describe(include="all"))

# Null values per column
print(df.isnull().sum())

# Number of fully duplicated rows
print(df.duplicated().sum())

# Count of unique values per column
print(df.nunique())

                         kind                         etag  \
count                    2000                         2000   
unique                      1                         2000   
top     youtube#commentThread  uQA-qS9CXMbmpJnBFzzYVKNAsvM   
freq                     2000                            1   
mean                      NaN                          NaN   
std                       NaN                          NaN   
min                       NaN                          NaN   
25%                       NaN                          NaN   
50%                       NaN                          NaN   
75%                       NaN                          NaN   
max                       NaN                          NaN   

                                id         snippet_channelId snippet_videoId  \
count                         2000                      2000            2000   
unique                        2000                         1               1   
top     UgxJOtv

In [30]:
# Matriz de correlación
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.show()

ModuleNotFoundError: No module named 'seaborn'