In [1]:
import pandas as pd
from zipfile import ZipFile
import pyarrow.parquet as pq
import gzip
import json
import pyarrow as pa
import os

In [2]:
# Switches DATA_DIR (next cell) between the "gdrive/..." path and the local "data/" directory.
# NOTE(review): presumably True means running on Google Colab with Drive mounted — confirm.
COLAB = False

In [3]:
# Root directory for every input and output file of this notebook.
DATA_DIR = "gdrive/MyDrive/Work/qjn/" if COLAB else "data/"
# Zip archive holding the raw gzipped CSV tables.
FULL_DATASET_PATH = DATA_DIR + "cjp_tables_old.zip"

# The full article text is first written to one temporary parquet file, which is
# then split into train/dev/test files and deleted.
FULL_TEXT_PATH = DATA_DIR + "newsarticles_article.parquet"
FULL_TEXT_TRAIN_PATH = DATA_DIR + "newsarticles_article_train.parquet"
FULL_TEXT_DEV_PATH = DATA_DIR + "newsarticles_article_dev.parquet"
FULL_TEXT_TEST_PATH = DATA_DIR + "newsarticles_article_test.parquet"

# The user labels can be split directly into new files.
USER_LABELS_TRAIN_PATH = DATA_DIR + "newsarticles_usercoding_train.csv"
USER_LABELS_DEV_PATH = DATA_DIR + "newsarticles_usercoding_dev.csv"
USER_LABELS_TEST_PATH = DATA_DIR + "newsarticles_usercoding_test.csv"    

# These don't need to be split.
DATA_DICT_PATH = DATA_DIR + "data_dict.txt"
GEOCODED_PATH = DATA_DIR + "newsarticles_trainedlocation.parquet"

# Read Data

In [4]:
# List the archive contents, hiding lines that mention MACOSX.
!unzip -l {FULL_DATASET_PATH} | grep -v MACOSX

Archive:  data/cjp_tables_old.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  02-19-2021 08:33   cjp_tables/
  7860721  02-11-2020 00:34   cjp_tables/newsarticles_trainedlocation.csv.gz
 20361082  02-11-2020 00:34   cjp_tables/newsarticles_trainedcategoryrelevance.csv.gz
   702194  02-11-2020 00:34   cjp_tables/newsarticles_usercoding_categories.csv.gz
     6148  03-23-2023 20:22   cjp_tables/.DS_Store
  3391051  02-11-2020 00:34   cjp_tables/newsarticles_usercoding.csv.gz
      670  02-11-2020 00:30   cjp_tables/newsarticles_newssource.csv.gz
    14661  02-11-2020 00:34   cjp_tables/column_names.txt
   331916  02-11-2020 00:34   cjp_tables/newsarticles_trainedsentiment.csv.gz
 15028273  02-11-2020 00:34   cjp_tables/newsarticles_trainedcoding.csv.gz
     1170  02-11-2020 00:34   cjp_tables/newsarticles_category.csv.gz
    15152  02-11-2020 00:34   cjp_tables/newsarticles_trainedsentimententities.csv.gz
1353923030  02-11-2020 00:34   cjp_tables/news

## Extract

In [5]:
with ZipFile(FULL_DATASET_PATH, 'r') as zf:
    # Copy the data dictionary out of the archive as plain text. It is decoded
    # as UTF-8, so write it back as UTF-8 instead of the platform default.
    with zf.open("cjp_tables/column_names.txt", "r") as zzf:
        data_dict = zzf.read().decode()
    with open(DATA_DICT_PATH, "w", encoding="utf-8") as tf:
        tf.write(data_dict)

    # Stream the article table in chunks into a single parquet file, keeping
    # only the columns needed downstream.
    with zf.open("cjp_tables/newsarticles_article.csv.gz", "r") as zzf:
        with gzip.open(zzf) as zzzf:
            article_data_chunks = pd.read_csv(zzzf,
                    names=['id','feedname','url','orig_html','title','bodytext',
                            'relevant','created','last_modified','news_source_id', 'author'],
                        true_values=['t'], false_values=['f'],
                        iterator=True, chunksize=1000)
            writer = None
            try:
                for chunk in article_data_chunks:
                    chunk = chunk.filter(['id','title','bodytext','relevant'])
                    table = pa.Table.from_pandas(chunk)
                    if writer is None:
                        # The schema is only known once the first chunk is read.
                        writer = pq.ParquetWriter(FULL_TEXT_PATH, table.schema)
                    writer.write_table(table)
            finally:
                # Close the writer even if a chunk fails, and tolerate the
                # case where no chunk was read (writer never created).
                if writer is not None:
                    writer.close()

In [6]:
with ZipFile(FULL_DATASET_PATH, 'r') as zf:
    # Geocoded locations: read the whole table and store it as parquet.
    with zf.open("cjp_tables/newsarticles_trainedlocation.csv.gz", "r") as zzf:
        with gzip.open(zzf) as zzzf:
            geocodes = pd.read_csv(zzzf,
                    names=['id','text','latitude','longitude','coding_id',
                            'confidence','neighborhood','is_best'],
                    true_values=['t'],
                    false_values=['f'])
    geocodes.to_parquet(GEOCODED_PATH)

    # User codings: keep only relevant rows that carry at least one location.
    with zf.open("cjp_tables/newsarticles_usercoding.csv.gz", "r") as zzf:
        with gzip.open(zzf) as zzzf:
            loc_data = pd.read_csv(zzzf,
                        names=['id','date','relevant','article_id','user_id','locations','sentiment'],
                        dtype={'locations':'str'},
                    true_values=['t'],
                    false_values=['f'])
    # A missing `locations` value is NaN, and NaN != '[]' is True, so exclude
    # it explicitly; otherwise json.loads fails on it when the column is parsed.
    has_locations = loc_data['locations'].notna() & (loc_data['locations'] != '[]')
    mask = has_locations & loc_data['relevant']
    loc_data = loc_data[mask]


# Preprocess

In [7]:
# Load the temporary full-text parquet, drop articles without body text, and
# turn newlines and non-breaking spaces in the body into plain spaces.
article_data = (
    pd.read_parquet(FULL_TEXT_PATH)
    .dropna(subset='bodytext')
    .assign(bodytext=lambda frame: (frame['bodytext']
                                    .str.replace('\n', ' ')
                                    .str.replace(u'\xa0', u' ')))
)

## Break out locations

In [8]:
# Keep only the label columns and parse the JSON-encoded `locations` string
# of each row into a Python object stored in `location`.
loc_data = (
    loc_data
    .filter(['article_id','user_id','locations'])
    .assign(location=lambda frame: frame['locations'].apply(json.loads))
)

In [9]:
# One row per tagged location, then spread each location dict into columns
# (renamed with a loc_ prefix) next to the original row.
exploded = loc_data.explode('location', ignore_index=True)
span_column_names = {'start':'loc_start','end':'loc_end', 'text':'loc_text'}
exploded_locs = exploded['location'].apply(pd.Series).rename(columns=span_column_names)
exploded = pd.concat([exploded, exploded_locs], axis=1)

### Fix Indexes

The character offsets stored with the location labels do not line up with the article bodytext; the cause is unknown.
They appear to be consistently off by 9 characters (at least for the first 20 rows checked).

In [10]:
# Attach each article's text to its location rows and build a copy of the
# label text cleaned the same way as bodytext, so the two can be compared.
verify = pd.merge(exploded, article_data, how='left', left_on='article_id', right_on='id')
verify = verify.assign(
    locclean=verify['loc_text'].str.replace('\n',' ').str.replace(u'\xa0', u' ')
)

In [11]:
# Slice the labelled character range out of each article body.
verify['extracted'] = verify.apply(
    lambda row: row['bodytext'][row['loc_start']:row['loc_end']],
    axis=1,
)

In [12]:
# True where the sliced body text equals the cleaned label text.
correct = verify['extracted'].eq(verify['locclean'])

In [13]:
# Share of location rows whose offsets reproduce the label text.
aligned_fraction = correct.mean()
print("Pct aligned {:.1%}".format(aligned_fraction))

Pct aligned 0.0%


In [14]:
# Shift both span boundaries left by the observed offset.
# Note: this mutates `verify` in place, so re-running the cell shifts again.
OFFSET = 9
for column in ('loc_start', 'loc_end'):
    verify[column] -= OFFSET

In [15]:
# Re-slice the article body with the shifted offsets.
verify['extracted'] = verify.apply(
    lambda row: row['bodytext'][row['loc_start']:row['loc_end']],
    axis=1,
)

In [16]:
# Recompute which rows now match the cleaned label text.
correct = verify['extracted'].eq(verify['locclean'])

In [17]:
# Share of rows aligned after the first shift.
aligned_fraction = correct.mean()
print("Pct aligned {:.1%}".format(aligned_fraction))

Pct aligned 91.7%


In [18]:
# Rows still misaligned after the first shift get a further shift, applied on
# top of the previous one. Only those rows are touched.
OFFSET = 8
still_misaligned = ~correct
for column in ('loc_start', 'loc_end'):
    verify.loc[still_misaligned, column] -= OFFSET

In [19]:
# Re-slice once more with the second round of shifted offsets.
verify['extracted'] = verify.apply(
    lambda row: row['bodytext'][row['loc_start']:row['loc_end']],
    axis=1,
)

In [20]:
# Final alignment mask; later cells use it to drop rows that never matched.
correct = verify['extracted'].eq(verify['locclean'])

In [21]:
# Report the aligned share and the number of rows that still do not match.
misaligned_rows = (~correct).sum()
print("Pct aligned {:.1%}. Misaligned {} rows.".format(correct.mean(), misaligned_rows))

Pct aligned 99.7%. Misaligned 33 rows.


Another .3% to fix ...

In [22]:
# Inspect the remaining mismatches: sliced text vs. cleaned label text.
verify.loc[~correct, ['extracted','locclean']]

Unnamed: 0,extracted,locclean
672,,"JARRATT, Va."
1068,,#### Related Stories
1259,in the South Side Grand Crossing neighbo,South Side Grand Crossing neighborhood.
1324,cago&ref=,
1552,you're in,
1674,e was in,
1675,e was in,
1763,n,
1786,e was in,
1914,ting on Chicago's South,Chicago's South Side.


We give up on the remaining rows: it is unclear what is going wrong here. Some are blank.
Some seem to have double spaces, but I can't fix that earlier or it messes up
the other alignments. These rows are simply dropped FOR TRAINING.

In [23]:
# The corrected offsets are copied back by index label, so both frames must
# have exactly the same index. Index.equals also handles a length mismatch
# (e.g. if the merge duplicated rows), where an element-wise == would raise a
# ValueError instead of failing the assertion.
assert verify.index.equals(exploded.index), "verify and exploded must share the same row index"
exploded['loc_start'] = verify['loc_start']
exploded['loc_end'] = verify['loc_end']

### Finish locations

In [24]:
# Keep only the label columns, then drop exact duplicate tags and every row
# whose offsets could not be aligned with the article text.
exploded = exploded.filter(['article_id','user_id','loc_start','loc_end','loc_text'])
dups = exploded.duplicated()
print(f"Dropping {dups.sum()} duplicated location tags.")
# Take an explicit copy: loc_data is modified in place afterwards, and writing
# into a filtered slice of `exploded` triggers pandas' SettingWithCopyWarning.
loc_data = exploded.loc[~dups & correct].copy()

Dropping 4 duplicated location tags.


### Strip Whitespace

spaCy will drop any NER entity that begins or ends with whitespace.
So we strip the whitespace here and adjust the start/end offsets to match.

In [25]:
# Measure how much whitespace pads each label on either side, move the
# offsets inwards by that amount, then store the stripped text.
raw_length = loc_data['loc_text'].str.len()
start_clean = loc_data['loc_text'].str.lstrip()
end_clean = loc_data['loc_text'].str.rstrip()
leading_ws = raw_length - start_clean.str.len()
trailing_ws = raw_length - end_clean.str.len()
loc_data.loc[:, 'loc_start'] += leading_ws
loc_data.loc[:, 'loc_end'] -= trailing_ws
loc_data.loc[:, 'loc_text'] = loc_data['loc_text'].str.strip()

### Extend to token edges

Some of the loc text is missing a leading or trailing character in the token.
It needs to be token-aligned for spacy. I'll extend the start-end to the next
whitespace on either side.

In [26]:
# Bring each article's body text alongside its location rows.
loc_data = pd.merge(
    loc_data,
    article_data[['id','bodytext']],
    how='left',
    left_on='article_id',
    right_on='id',
)

In [27]:
# Find previous white space
prev_ws = loc_data.apply(lambda row: row.bodytext.rfind(' ', 0, row.loc_start), axis=1).rename('prev_ws')
# Find next white space
next_ws = loc_data.apply(lambda row: row.bodytext.find(' ', row.loc_end), axis=1).rename('next_ws')
# Check if we already started just after a white space.
extend_prev = (prev_ws != -1) & (prev_ws != loc_data.loc_start - 1)
# Check if we already ended on a white space.
extend_next = (next_ws != -1) & (next_ws != loc_data.loc_end)
# Conditionally extend indexes first.
loc_data['loc_start_ext'] = prev_ws + 1
loc_data['loc_end_ext'] = next_ws
# Re-slice all strings
context = loc_data.apply(lambda row: row.bodytext[row.loc_start_ext:row.loc_end_ext], axis=1)
loc_data['loc_text_ext'] = context

In [28]:
# Selectively overwrite start/end/text
loc_data.loc[extend_prev | extend_next, 'loc_text'] = loc_data.loc[extend_prev | extend_next, 'loc_text_ext']
loc_data.loc[extend_prev, 'loc_start'] = loc_data.loc[extend_prev, 'loc_start_ext']
loc_data.loc[extend_next, 'loc_end'] = loc_data.loc[extend_next, 'loc_end_ext']
loc_data = loc_data.drop(columns=['loc_start_ext','loc_end_ext','loc_text_ext'])

In [29]:
# The expansion has caused some spans to overlap. spaCy can't handle
# overlapping NER entities, so overlapping spans need to be merged.

def merge_spans(row):
    """Merge overlapping spans into a sorted list of non-overlapping spans.

    Parameters
    ----------
    row : iterable of dict
        Spans with integer 'start' and 'end' keys, in any order.

    Returns
    -------
    list of dict
        New {'start', 'end'} dicts sorted by start. Spans that strictly
        overlap (previous end > next start) are fused into one; spans that
        merely touch (previous end == next start) are kept separate. Empty
        input yields an empty list. Input dicts are never mutated or reused.
    """
    merged = []

    for span in sorted(row, key=lambda x: x['start']):
        next_start, next_end = span['start'], span['end']

        if merged and merged[-1]['end'] > next_start:  # Overlapping intervals
            merged[-1]['end'] = max(merged[-1]['end'], next_end)
        else:
            merged.append({'start': next_start, 'end': next_end})

    return merged

def group_spans(block):
    """Collect the paired 'loc_start' / 'loc_end' values of `block` as a list of span dicts."""
    collected = []
    for begin, finish in zip(block['loc_start'], block['loc_end']):
        collected.append({'start': begin, 'end': finish})
    return collected

# Per article: gather its spans, merge the overlapping ones, then go back to
# one row per span with 'start' / 'end' columns.
spans_by_article = loc_data.groupby('article_id').apply(group_spans, include_groups=False)
merged_by_article = spans_by_article.apply(merge_spans)
spans = merged_by_article.explode().apply(pd.Series).reset_index()

In [30]:
# XXX: this process drops user_id and id from labeled loc data.
# Re-attach the article text, slice the label text for each merged span, and
# go back to the loc_* column names without the helper columns.
loc_data = (
    spans
    .merge(article_data[['id','bodytext']], left_on='article_id', right_on='id', how='left')
    .assign(loc_text=lambda frame: frame.apply(lambda r: r.bodytext[r.start:r.end], axis=1))
    .rename(columns={'start':'loc_start','end':'loc_end'})
    .drop(columns=['bodytext','id'])
)

# Split

## Split Articles

In [31]:
def split_train_dev_test(data, train_path, dev_path, test_path,
                         train_frac=.8, random_state=22225):
    """Randomly split `data` into train/dev/test and write each part to parquet.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Its index must be unique, because the splits are
        derived from index-label differences.
    train_path, dev_path, test_path : str
        Output parquet paths for the three splits.
    train_frac : float, default 0.8
        Fraction of rows sampled into the training split. The remaining rows
        are divided evenly between dev and test.
    random_state : int, default 22225
        Seed used for both sampling steps, so the split is reproducible.

    Raises
    ------
    ValueError
        If `data` has duplicate index labels.
    """
    if not data.index.is_unique:
        # With duplicate labels the index difference drops every row that
        # shares a label with a sampled row, silently corrupting the splits.
        raise ValueError("split_train_dev_test requires a unique index")
    train = data.sample(frac=train_frac, random_state=random_state)
    dev_test = data.loc[data.index.difference(train.index)]
    dev = dev_test.sample(frac=.5, random_state=random_state)
    test = dev_test.loc[dev_test.index.difference(dev.index)]
    train.to_parquet(train_path)
    dev.to_parquet(dev_path)
    test.to_parquet(test_path)

In [32]:
# Write the train/dev/test article files first; only then delete the temporary
# full-text parquet and release the in-memory frame. After this cell, the
# earlier cell that reads FULL_TEXT_PATH cannot be re-run until the file is
# extracted again.
split_train_dev_test(article_data, FULL_TEXT_TRAIN_PATH, FULL_TEXT_DEV_PATH, FULL_TEXT_TEST_PATH)
os.remove(FULL_TEXT_PATH)
del article_data

## Split User Codings

Need to observe article splits so we don't leak training data.

In [33]:
# Only the article ids of each split are needed to route the labels.
article_data_train, article_data_dev, article_data_test = (
    pd.read_parquet(split_path, columns=['id'])
    for split_path in (FULL_TEXT_TRAIN_PATH, FULL_TEXT_DEV_PATH, FULL_TEXT_TEST_PATH)
)

In [34]:
# Each label row goes to the split that contains its article.
loc_data_train, loc_data_dev, loc_data_test = (
    loc_data[loc_data['article_id'].isin(split_articles['id'])]
    for split_articles in (article_data_train, article_data_dev, article_data_test)
)

In [35]:
# Write each label split to its CSV file, without the index column.
label_outputs = (
    (loc_data_train, USER_LABELS_TRAIN_PATH),
    (loc_data_dev, USER_LABELS_DEV_PATH),
    (loc_data_test, USER_LABELS_TEST_PATH),
)
for labels, csv_path in label_outputs:
    labels.to_csv(csv_path, index=False)