# Basic Data Analysis
This notebook performs basic statistics and analysis on collected text data.
* Determines how recent the articles are (earliest and latest published/modified dates)
* Word count statistics

In [14]:
import json
import os
import re

In [5]:
# Enumerate everything stored under the PetMD data directory
folder = 'data-collect/petmd/'
files = os.listdir(folder)
# Keep only entries without a dot in their name (treated as category folders)
folders = list(filter(lambda entry: '.' not in entry, files))
folders

['behavior',
 'adultdog',
 'disease-illness-injury',
 'care',
 'puppy',
 'poisoning',
 'seniordog',
 'breed',
 'symptoms',
 'medication',
 'nutrition',
 'allergies']

In [39]:
def collect_dates(folder):
    '''
    Collect the distinct published and modified dates found in one folder.

    Every file inside 'data-collect/petmd/<folder>' is parsed as JSON, and the
    'datePublished' and 'dateModified' values are read from the last element
    of the parsed data.

    @params folder: folder name (str)
    returns two lists of dates: published and modified dates, each
    de-duplicated and sorted in ascending order of the raw values
    (NOTE: if the dates are strings this is a lexicographic order, which is
    not necessarily chronological)

    raises OSError if the folder or a file cannot be read, ValueError
    (json.JSONDecodeError) if a file is not valid JSON, and
    KeyError / IndexError / TypeError if the parsed data does not have the
    expected layout
    '''
    directory = 'data-collect/petmd'
    folder_path = f'{directory}/{folder}'

    # sets de-duplicate the dates as they are collected
    published_dates = set()
    modified_dates = set()

    for file in os.listdir(folder_path):
        # the context manager closes the handle even when parsing fails,
        # so no file descriptors are leaked across many files
        with open(f'{folder_path}/{file}', 'r') as file_obj:
            json_data = json.load(file_obj)

        # the date fields are read from the last element of the JSON data
        metadata = json_data[-1]
        published_dates.add(metadata['datePublished'])
        modified_dates.add(metadata['dateModified'])

    return sorted(published_dates), sorted(modified_dates)

In [43]:
# collect dates across all category folders
pb_dates = []  # published dates
md_dates = []  # modified dates

for folder in folders:
    try:
        pb_dt, md_dt = collect_dates(folder)
    except (OSError, ValueError, KeyError, IndexError, TypeError) as err:
        # Best-effort: a folder that cannot be read or whose JSON lacks the
        # expected date fields is skipped as a whole, but reported so the
        # gap in the results is visible. Other exceptions (for example
        # KeyboardInterrupt) are no longer swallowed.
        print(f'Skipping folder {folder!r}: {type(err).__name__}: {err}')
        continue

    pb_dates = pb_dates + pb_dt
    md_dates = md_dates + md_dt

# the same date can appear in several folders, so de-duplicate again
unique_pb_dates = set(pb_dates)
unique_md_dates = set(md_dates)

# ascending order of the raw date values
final_pb_dates = sorted(unique_pb_dates)
final_md_dates = sorted(unique_md_dates)

In [52]:
# earliest published date: Jul 2008
# latest published date: Nov 2023

# take the last four characters of every published date as its year
years = list(map(lambda date_str: date_str[-4:], final_pb_dates))
print(f'Earliest published year is: {min(years)}')  # which is 2008

# final_pb_dates

Earliest published year is: 2008


In [55]:
# earliest modified date: Apr 2015
# latest modified date: Nov 2023

# the last four characters of each date are used as its year
# (assumes every date string ends with a four-digit year - TODO confirm)
years = [dt[-4:] for dt in final_md_dates]
# label fixed: this cell reports modified dates, not published dates
print(f'Earliest modified year is: {min(years)}')  # which is 2015

# final_md_dates

Earliest published year is: 2015
