In [None]:
import re
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import clean
from clean import try_or_none
# Load the pickled frame and order its rows by the 'id' column in one step.
# NOTE(review): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
df = pd.read_pickle("../data/cdis_with_schemas.pkl").sort_values(by='id')

In [None]:
@try_or_none
def get_refs(row):
    """Return ``row["content"]["about"]["references_list"]``.

    NOTE(review): error handling is delegated to the ``try_or_none``
    decorator from the ``clean`` module; presumably a failed lookup yields
    ``None`` -- confirm against that helper.
    """
    about_section = row["content"]["about"]
    return about_section["references_list"]

@try_or_none
def get_ref_original(row):
    """Return the value stored under the 'Original/Primary Reference' key.

    NOTE(review): error handling is delegated to the ``try_or_none``
    decorator from the ``clean`` module; presumably a failed lookup yields
    ``None`` -- confirm against that helper.
    """
    original_reference = row['Original/Primary Reference']
    return original_reference

@try_or_none
def get_text(row):
    """Return the 'text' entry of the first element of ``row``.

    NOTE(review): error handling is delegated to the ``try_or_none``
    decorator from the ``clean`` module; presumably a failed lookup yields
    ``None`` -- confirm against that helper.
    """
    first_entry = row[0]
    return first_entry['text']

@try_or_none
def get_href(row):
    """Return the 'href' entry of the first element of ``row``.

    NOTE(review): error handling is delegated to the ``try_or_none``
    decorator from the ``clean`` module; presumably a failed lookup yields
    ``None`` -- confirm against that helper.
    """
    first_entry = row[0]
    return first_entry['href']

@try_or_none
def get_year_from_str(s: str):
    """Extract the first year between 1900 and 2023 that appears in ``s``.

    A candidate is a four-digit number that is not preceded or followed by
    another digit, so longer digit runs are not mistaken for years.

    Returns the year as an ``int``, or ``None`` when ``s`` contains no such
    number.
    """
    # 19xx | 2000-2019 | 2020-2023, with digit look-arounds on both sides.
    year_pattern = r'(?<!\d)(19\d{2}|20[01]\d|202[0-3])(?!\d)'
    found = re.search(year_pattern, s)
    return int(found.group()) if found else None

In [None]:
# Clean the list-valued columns of df through the project's `clean` module
# and rebind df to the result.
# NOTE(review): which columns are touched and how they are normalised is
# decided inside clean.clean_list_valued_strings, which is not visible
# here -- confirm there.
df = clean.clean_list_valued_strings(df)

# add feature_names
def get_feature_names_list(schema):
    """Collect the English label of every entry in ``schema``.

    Args:
        schema: Expected to be a list of mapping-like entries. Any value
            that is not a ``list`` is treated as "no schema".

    Returns:
        A list with one item per schema entry: the entry's ``"label_en"``
        value when that key is present, otherwise the string ``"unknown"``.
        An empty list is returned when ``schema`` is not a list.
    """
    if not isinstance(schema, list):
        return []

    labels = []
    for field in schema:
        if "label_en" in field:
            labels.append(field["label_en"])
        else:
            labels.append("unknown")
    return labels


# Feature labels for every row, taken from its input schema.
df["feature_names"] = df["input_schema"].apply(get_feature_names_list)

# Reference list for every row; the extractor needs the whole row (axis=1).
df["refs"] = df.apply(get_refs, axis=1)

# Each column below is derived from a column created earlier, so the order
# of this pipeline matters: (new column, source column, extractor).
reference_pipeline = (
    ("ref_original", "refs", get_ref_original),
    ("ref_href", "ref_original", get_href),
    ("ref_text", "ref_original", get_text),
    ("ref_year", "ref_text", get_year_from_str),
)
for target_col, source_col, extractor in reference_pipeline:
    df[target_col] = df[source_col].apply(extractor)

# Persist the enriched frame as both a pickle and a CSV file.
df.to_pickle("../data/cdis_with_schemas_cleaned.pkl")
df.to_csv("../data/cdis_with_schemas_cleaned.csv")

In [None]:
# Preview the first rows of the summary columns with every row and column
# shown, then export those same columns to CSV.
cols = ['id', 'full_title_en', 'short_description_en', 'ref_text', 'ref_href', 'ref_year']

# NOTE(review): max_colwidth=0 is kept from the original; pandas documents
# None as the "no limit" value -- confirm 0 behaves as intended on the
# installed version.
display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 0,
)
with pd.option_context(*display_options):
    display(df[cols].head(5))
    # Kept for debugging -- show the reference(s) with the earliest year:
    # display(df[df['ref_year'].min() == df['ref_year']][['ref_text', 'ref_href']])

# The index is left out of the exported file.
df[cols].to_csv('../data/main_auto.csv', index=False)