## Data Extraction

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.26.0

numpy    : 1.26.0
pandas   : 2.2.2
polars   : 1.4.1
mlxtend  : 0.23.1
omegaconf: not installed

conda environment: n/a



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import glob


fp: str = "../data/NYC-comment-data/articles/*.csv"
article_columns: list[str] = [
    "articleID",
    "articleWordCount",
    "byline",
    "documentType",
    "headline",
    "keywords",
    "multimedia",
    "newDesk",
    "printPage",
    "pubDate",
    "sectionName",
    "snippet",
    "source",
    "typeOfMaterial",
    "webURL",
]
all_files: list[str] = glob.glob(pathname=fp)
articles_df: pl.DataFrame = pl.DataFrame()

for f in all_files:
    df: pl.DataFrame = pl.read_csv(f).select(article_columns)
    articles_df = pl.concat([articles_df, df], how="vertical")

articles_df.write_parquet("../data/all_articles.parquet", use_pyarrow=True)
print(f"{articles_df.shape = }")

articles_df.shape = (9335, 15)


In [5]:
articles_df.head(2)

articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
str,i64,str,str,str,str,i64,str,i64,str,str,str,str,str,str
"""58927e0495d0e0392607e1b3""",1129,"""By KEN BELSON""","""article""","""N.F.L. vs. Politics Has Been Battle All Season Long""","""['Football', 'Super Bowl', 'National Football League', 'New England Patriots', 'Goodell, Roger', 'Lady Gaga', 'Immigration and Emigration', 'Trump, Donald J']""",1,"""Sports""",12,"""2017-02-02 00:26:16""","""Pro Football""","""Despite the national tumult over immigration spilling over, Commissioner Roger Goodell said, “I’m singularly focused on the Super Bowl right now.”""","""The New York Times""","""News""","""https://www.nytimes.com/2017/02/01/sports/super-bowl-politics-trump-nfl.html"""
"""5893033d95d0e0392607e2d6""",3082,"""By UNKNOWN""","""article""","""Voice. Vice. Veracity.""","""['Television', 'Home Box Office', 'Girls (TV Program)', 'Dunham, Lena', 'Mamet, Zosia', 'Kirke, Jemima', 'Williams, Allison (1988- )', 'Women and Girls', 'Writing and Writers']""",1,"""Arts&Leisure""",1,"""2017-02-02 10:00:24""","""Television""","""Our critics look at the impact of the HBO show on television as it enters its sixth and final season.""","""The New York Times""","""News""","""https://www.nytimes.com/2017/02/02/arts/television/girls-season-six.html"""


In [36]:
fp: str = "../data/NYC-comment-data/comments/*.csv"
comments_schema: dict[str, Any] = {
    "approveDate": pl.Int64,
    "articleID": pl.String,
    "articleWordCount": pl.Float64,
    "commentBody": pl.String,
    "commentID": pl.Float64,
    "commentSequence": pl.Float64,
    "commentTitle": pl.String,
    "commentType": pl.String,
    "createDate": pl.Int64,
    "depth": pl.Float64,
    "editorsSelection": pl.String,
    "inReplyTo": pl.Float64,
    "newDesk": pl.String,
    "parentID": pl.Float64,
    "parentUserDisplayName": pl.String,
    "permID": pl.String,
    "picURL": pl.String,
    "printPage": pl.Float64,
    "recommendations": pl.Int64,
    "recommendedFlag": pl.String,
    "replyCount": pl.Int64,
    "reportAbuseFlag": pl.String,
    "sectionName": pl.String,
    "sharing": pl.Int64,
    "status": pl.String,
    "timespeople": pl.Int64,
    "trusted": pl.Int64,
    "typeOfMaterial": pl.String,
    "updateDate": pl.Int64,
    "userDisplayName": pl.String,
    "userID": pl.Float64,
    "userLocation": pl.String,
    "userTitle": pl.String,
    "userURL": pl.String,
}
all_files: list[str] = glob.glob(pathname=fp)
comments_df: pl.DataFrame = pl.DataFrame(schema=comments_schema)

for f in all_files:
    df: pl.DataFrame = pl.read_csv(f, ignore_errors=True, schema=comments_schema)
    comments_df = pl.concat([comments_df, df], how="vertical")

comments_df.write_parquet("../data/all_comments.parquet", use_pyarrow=True)
print(f"{comments_df.shape = }")

comments_df.shape = (2176364, 34)


In [6]:
fp: str = "../data/NYC-comment-data/comments/CommentsApril2017.csv"
comments_df: pl.DataFrame = pl.read_csv(fp)
print(f"{comments_df.shape = }")

comments_df.head()

comments_df.shape = (243832, 34)


approveDate,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,parentID,parentUserDisplayName,permID,picURL,recommendations,recommendedFlag,replyCount,reportAbuseFlag,sharing,status,timespeople,trusted,updateDate,userDisplayName,userID,userLocation,userTitle,userURL,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
i64,str,f64,i64,str,str,f64,i64,bool,f64,str,str,str,f64,str,f64,str,i64,str,f64,f64,i64,str,i64,str,str,str,i64,str,str,str,f64,i64,str
1491245186,"""This project makes me happy to be a 30+ year Times subscriber... continue to innovate across all platforms, please.""",22022598.0,22022598,"""<br/>""","""comment""",1491200000.0,1,False,0.0,,"""22022598""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",2.0,,0.0,,0,"""approved""",1.0,0.0,1491245186,"""Rob Gayle""",46006296,"""Riverside, CA""",,,0,"""58def1347c459f24986d7c80""","""Unknown""","""Insider""",716.0,2,"""News"""
1491188619,"""Stunning photos and reportage. Infuriating that the Trump admistration's draconian reinstatement of the global gag order will prevent men and women from receiving appropriate family planning advice, so obviously desperately needed.""",22017350.0,22017350,"""n/a""","""comment""",1491200000.0,1,False,0.0,,"""22017350""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",1.0,,0.0,,0,"""approved""",1.0,0.0,1491188619,"""Susan A.""",29202761,"""<br/>""",,,0,"""58def1347c459f24986d7c80""","""Unknown""","""Insider""",716.0,2,"""News"""
1491188617,"""Brilliant work from conception to execution. I've never seen anything like it. As a paper of record it is important to leave a record of these and other peoples for whom no one else speaks. Please keep humanizing what are otherwise bland statistics. It matters. Time and again their stories sounded like mine, could have been mine. We're about to see hordes of people far away experiencing utter devastation. Empathy goes away unless there are names and stories, and pictures. Links to helping at organizations, resources.... """,22017334.0,22017334,"""<br/>""","""comment""",1491200000.0,1,False,0.0,,"""22017334""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",3.0,,0.0,,0,"""approved""",1.0,0.0,1491188617,"""Meta""",63944806,"""Raleigh NC""",,,0,"""58def1347c459f24986d7c80""","""Unknown""","""Insider""",716.0,2,"""News"""
1491167820,"""NYT reporters should provide a contributor's link to a crowdfunding project or aid organization focused on the specific subject of news stories like this. Not for every sad story, but for the biggest and most obvious human problems on earth. <br/><br/>That should be a regular feature, with a dedicated click-button always located in the same familiar spot. Only 36% of America has truly hardened their hearts toward those suffering abroad. """,22015913.0,22015913,"""<br/>""","""comment""",1491200000.0,1,False,0.0,,"""22015913""","""https://graphics8.nytimes.com/images/apps/timespeople/none.png""",7.0,,2.0,,0,"""approved""",1.0,0.0,1491167820,"""Tom Wyrick""",1266184,"""Missouri, USA""",,,0,"""58def1347c459f24986d7c80""","""Unknown""","""Insider""",716.0,2,"""News"""
1491167815,"""Could only have been done in print. Stunning. """,22015466.0,22015466,"""<br/>""","""comment""",1491100000.0,1,False,0.0,,"""22015466""","""http://profile.ak.fbcdn.net/hprofile-ak-snc4/hs439.snc4/48586_720248669_1867_q.jpg""",5.0,,0.0,,0,"""approved""",1.0,0.0,1491167815,"""Joe Sharkey""",61121360,"""Tucson, Arizona""",,,0,"""58def1347c459f24986d7c80""","""Unknown""","""Insider""",716.0,2,"""News"""


In [None]:
import datetime

timestamp = 1491167820
date_time = datetime.datetime.fromtimestamp(timestamp)
formatted_date_time = date_time.strftime("%Y-%m-%d %H:%M")
print(formatted_date_time)