# Explore the NYT (New York Times) comments dataset

In [1]:
# Load the watermark extension and print the Python/IPython versions, the
# versions of the listed packages, and the active conda environment, so the
# environment used for this run is recorded alongside the results.
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.26.0

numpy    : 1.26.0
pandas   : 2.2.2
polars   : 1.4.1
mlxtend  : 0.23.1
omegaconf: not installed

conda environment: n/a



In [2]:
# Built-in library
from pathlib import Path
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

### Load Data

In [3]:
# Path of the comments dataset, relative to this notebook.
fp: str = "../data/all_comments.parquet"

# Read the parquet file into a Polars DataFrame and report its dimensions.
df: pl.DataFrame = pl.read_parquet(source=fp)
print(f"{df.shape = }")

# Preview the first two rows.
df.head(n=2)

df.shape = (2176364, 34)


approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,inReplyTo,newDesk,parentID,parentUserDisplayName,permID,picURL,printPage,recommendations,recommendedFlag,replyCount,reportAbuseFlag,sectionName,sharing,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
i64,str,f64,str,f64,f64,str,str,f64,f64,i64,f64,str,f64,str,str,str,f64,f64,null,f64,null,str,i64,str,f64,f64,str,i64,str,f64,str,str,str
1517529462,"""5a7258e410f40f00018bed7d""",835.0,"""The snake-filled heads comment made me think of Medusa. <br/><br/>I hope he loses, so retrograde.""",25791250.0,25791250.0,"""<br/>""","""comment""",1517500000.0,1.0,0,0.0,"""OpEd""",0.0,,"""25791250""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",23.0,5.0,,0.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1517529462,"""Jennie""",79172841.0,"""WA""",,
1517529428,"""5a7258e410f40f00018bed7d""",835.0,"""She-devil reporting for duty!""",25795675.0,25795675.0,"""<br/>""","""comment""",1517500000.0,1.0,0,0.0,"""OpEd""",0.0,,"""25795675""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",23.0,2.0,,0.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1517529428,"""Nice White Lady""",66376882.0,"""Seattle""",,


In [4]:
# Draw a seeded (repeatable) sample of five rows where editorsSelection is 1.
(
    df.filter(pl.col("editorsSelection") == 1)
    .sample(n=5, seed=123)
)

approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,inReplyTo,newDesk,parentID,parentUserDisplayName,permID,picURL,printPage,recommendations,recommendedFlag,replyCount,reportAbuseFlag,sectionName,sharing,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
i64,str,f64,str,f64,f64,str,str,f64,f64,i64,f64,str,f64,str,str,str,f64,f64,null,f64,null,str,i64,str,f64,f64,str,i64,str,f64,str,str,str
1522081525,"""5ab8421847de81a9012170ca""",1487.0,"""""a theory with little supporting evidence"" is not a theory at all. A theory is a comprehensive set of facts and reliable observations; these made-up, concocted lies are not that at all. <br/><br/>Mr. Trump surrounds himself with people who show loyalty to him in all his machinations and delusions. It is no wonder that an attorney such as this would propound the same foolishness that Trump believes. Yet it still says something that an attorney such as this either can not or will not work for this president. <br/><br/>I guess even he has standards.""",26486089.0,26486089.0,"""<br/>""","""comment""",1522100000.0,1.0,1,0.0,"""Washington""",0.0,,"""26486089""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",1.0,41.0,,0.0,,"""Politics""",0,"""approved""",1.0,0.0,"""News""",1522082502,"""bkane8""",67036940.0,"""Altadena, CA""",,
1521825381,"""5ab4ff2447de81a901215ff1""",1635.0,"""Anyone who is surprised by Trump's last minute ""reversal"" has not been paying attention - this is what he does; it's a planned negotiating tactic. Congress needs to call his bluff and override his veto, or we can expect this to keep happening.""",26454083.0,26454083.0,"""<br/>""","""comment""",1521800000.0,1.0,1,0.0,"""Washington""",0.0,,"""26454083""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",1.0,75.0,,1.0,,"""Politics""",0,"""approved""",1.0,0.0,"""News""",1521825382,"""JAS""",63074421.0,"""NYC""",,
1490690758,"""58da0ef47c459f24986d6e3b""",728.0,"""Hmmm, so you plan is to squeeze the younger, healthier people harder, the ones who are essentially carrying the older, sicker and often wealthier older people on the backs of their own premiums and policies?<br/><br/>Share the cost.<br/>Younger people vote too!<br/>End the mandate that unfairly penalized healthy people for spending money to stay fit, healthy and out of the medical-industrial complex that makes this cost-shift from young to old, from working to ... resting, from financially independent to the wealthier ill possible.<br/><br/>Why do you hate the healthy people of America? Shouldn't those who need our help be looking to US, not for our hard-earned dollars but for how we live our lives to avoid consuming massive amounts of medical care?? You could learn a lot from a healthy person.""",21952165.0,21952165.0,"""<br/>""","""comment""",1490700000.0,1.0,1,0.0,"""OpEd""",0.0,,"""21952165""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",27.0,8.0,,56.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1490717732,"""Midway""",73158615.0,"""Midwest""",,
1489154952,"""58c2620e7c459f247a912960""",790.0,"""My greatest hope is Trumpcare is so bad we will finally throw off our shackles and adopt a simple, fair and less expensive solution: Medicare for all. Full stop.<br/><br/>Health care financing should never have been a jobs program for intermediaries who provide nothing for health save higher costs.""",21757311.0,21757311.0,"""<br/>""","""comment""",1489100000.0,1.0,1,0.0,"""OpEd""",0.0,,"""21757311""","""https://s3.amazonaws.com/pimage.timespeople.nytimes.com/2388/489/cropped-2388489.jpg?0.10247170923292104""",31.0,573.0,,0.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1489154952,"""Doug Mc""",2388489.0,"""<br/>""",,
1493813923,"""590984ea7c459f24986dd662""",1593.0,"""What you are witnessing Mr. Douthat is the near to last throes of Conservatives worldwide, who cannot abide the world commingling as one. They can't stand the thought of their nations becoming truly cosmopolitan with folks from around the globe living together. What was once the American experiment has branched out to in compass more countries. The globe's borders are disappearing and that frightens conservatives worldwide, who want their areas to be a single religion, a single race and a single type of people. The country club is letting everyone participate now and the old conservative guard doesn't like it. How are they to feel superior.""",22351867.0,22351867.0,"""<br/>""","""comment""",1493800000.0,1.0,1,0.0,"""OpEd""",0.0,,"""22351867""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",0.0,75.0,,7.0,,"""Unknown""",0,"""approved""",1.0,0.0,"""Op-Ed""",1493813924,"""David Gifford""",53905383.0,"""Rehoboth beach, DE 19971""",,
