In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [2]:
quote_text = pd.read_csv('GNI88.csv')
quote_text.head(3)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,Legacy Quote Tag,...,Source Religion,Legacy Source Tag,Constituent Group,Media Name,Media Medium,Journalist Name,Constituent Author,Article Issues,Custom Group,Media Group
0,2018-01-10,3759306,North Korea makes deals and threats,7023849,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Neutral,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
1,2018-01-10,3759306,North Korea makes deals and threats,7023842,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,
2,2018-01-10,3759306,North Korea makes deals and threats,7023839,News,Neutral,OLD Proliferation,OLD State Level Nuclear Programs,Negative,,...,Unknown,Unknown,None (Legacy Other),Washington Post,,"Fifield, Anna",,,,


In [3]:
quote_text.columns

Index(['artdate', 'Article ID', 'Headline', 'Quote ID', 'Article Status',
       'Article Position', 'Messages', 'Submessages', 'Quote Position',
       'Legacy Quote Tag', 'On/Off Message', 'QText', 'Source Name',
       'Source Type', 'Source Party Affiliation', 'Source Ethnicity',
       'Source Nationality', 'Source Gender', 'Source Religion',
       'Legacy Source Tag', 'Constituent Group', 'Media Name', 'Media Medium',
       'Journalist Name', 'Constituent Author', 'Article Issues',
       'Custom Group', 'Media Group'],
      dtype='object')

In [4]:
article_data = pd.read_json("gni88.json", lines=True)
article_data.head()

Unnamed: 0,Article ID,Headline,Content,Media Name,Author,Published Date
0,3777409,Modernized Tu-160 to boost Russia's long-range...,Modernized Tu-160 to boost Russia's long-range...,Defense News,"Bodner, Matthew",2018-02-15 00:00:00+00
1,3764250,Give Trump more nuclear weapons and more ways ...,Give Trump more nuclear weapons and more ways ...,CNN,"Collina, Tom Z.",2018-02-02 00:00:00+00
2,3764251,Trump wants to build smaller atomic weapons; R...,Trump wants to build smaller atomic weapons; R...,Los Angeles Times,"Cloud, David",2018-02-03 00:00:00+00
3,3764252,Egypt and Israel Secretly Allied In Sinai Battle,Egypt and Israel Secretly Allied In Sinai Batt...,The New York Times,"Kirkpatrick, David D.",2018-02-04 00:00:00+00
4,3764253,Listen: Top Armed Services Dem worried about r...,Listen: Top Armed Services Dem worried about r...,The Hill,"Simendinger, Alexis",2018-02-03 00:00:00+00


# remove irrelevant text from articles (prev code)

In [13]:
#Patterns at start of article
allarticle_header_regex = "Media: .*\nAuthor: (?:.*\n){1,10}Date: .*\n\n"

politico_share_regex = '.*\n{1,20}Follow Us\n'
politico_date_regex = '^.*\nBy.*\n\d\d/\d\d/\d\d\d\d \d\d:\d\d (?:AM|PM) EDT'

#Patterns at end of article
dow_regex = 'License this article from Dow Jones Reprint Service'

#Patterns in article
#search for line with only all caps and punctuation
fox_bold_regex = "\n[A-Z ',.-]+\n"

In [14]:
def regex_trim(rx_list,column, df=article_data, replace_value=""):
    '''Takes a list of regex patterns, and joins the patterns with an OR (|) separator. 
    Searches the specified column/df for the pattern and replaces it with value specified (default value-nothing)'''
    df[column] = df[column].replace(to_replace="|".join(rx_list), value=replace_value, regex=True)
    return df

In [15]:
remove_patterns = [allarticle_header_regex, fox_bold_regex, 
                   dow_regex, 
                   politico_date_regex]

df = regex_trim(remove_patterns, "Content")

In [16]:
df

Unnamed: 0,Article ID,Headline,Content,Media Name,Author,Published Date,First_10%_Content_Split
0,3777409,Modernized Tu-160 to boost Russia's long-range...,Modernized Tu-160 to boost Russia's long-range...,Defense News,"Bodner, Matthew",2018-02-15 00:00:00+00,"[Modernized, Tu-160, to, boost, Russia's, long..."
1,3764250,Give Trump more nuclear weapons and more ways ...,Give Trump more nuclear weapons and more ways ...,CNN,"Collina, Tom Z.",2018-02-02 00:00:00+00,"[Give, Trump, more, nuclear, weapons, and, mor..."
2,3764251,Trump wants to build smaller atomic weapons; R...,Trump wants to build smaller atomic weapons; R...,Los Angeles Times,"Cloud, David",2018-02-03 00:00:00+00,"[Trump, wants, to, build, smaller, atomic, wea..."
3,3764252,Egypt and Israel Secretly Allied In Sinai Battle,Egypt and Israel Secretly Allied In Sinai Batt...,The New York Times,"Kirkpatrick, David D.",2018-02-04 00:00:00+00,"[Egypt, and, Israel, Secretly, Allied, In, Sin..."
4,3764253,Listen: Top Armed Services Dem worried about r...,Listen: Top Armed Services Dem worried about r...,The Hill,"Simendinger, Alexis",2018-02-03 00:00:00+00,"[Listen:, Top, Armed, Services, Dem, worried, ..."
...,...,...,...,...,...,...,...
69990,3783411,The Pyongyang Olympics,The Pyongyang Olympics\r\n\r\nMedia: The Wall ...,The Wall Street Journal,"No By-Line,",2018-02-12 00:00:00+00,"[The, Pyongyang, Olympics, Media:, The, Wall, ..."
69991,3783412,"US Air Force requests $156.3 billion in FY19, ...","US Air Force requests $156.3 billion in FY19, ...",Defense News,"Mehta, Aaron",2018-02-12 00:00:00+00,"[US, Air, Force, requests, $156.3, billion, in..."
69992,3783413,Mike Pence says the US is ready to negotiate w...,Mike Pence says the US is ready to negotiate w...,Vox,"Ward, Alex",2018-02-12 00:00:00+00,"[Mike, Pence, says, the, US, is, ready, to, ne..."
69993,3783414,Chinese Sub Commanders May Get,Chinese Sub Commanders May Get\r\n\r\nMedia: D...,Defense One,"Kania, Elsa",2018-02-12 00:00:00+00,"[Chinese, Sub, Commanders, May, Get, Media:, D..."


In [17]:
article_data = df

# Extract first 10% of article content

In [18]:
article_data['First_10%_Content_Split'] = article_data['Content'].apply(lambda x: x.split()[:int(0.1*len(x))])

In [19]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
article_data['First_10%_Content'] = article_data['First_10%_Content_Split'].apply(lambda x: TreebankWordDetokenizer().detokenize(x))

# Check first 10% content

Check if length is reasonable:

In [20]:
article_data['First_10%_Content_Split']

0        [Modernized, Tu-160, to, boost, Russia's, long...
1        [Give, Trump, more, nuclear, weapons, and, mor...
2        [Trump, wants, to, build, smaller, atomic, wea...
3        [Egypt, and, Israel, Secretly, Allied, In, Sin...
4        [Listen:, Top, Armed, Services, Dem, worried, ...
                               ...                        
69990    [The, Pyongyang, Olympics, Media:, The, Wall, ...
69991    [US, Air, Force, requests, $156.3, billion, in...
69992    [Mike, Pence, says, the, US, is, ready, to, ne...
69993    [Chinese, Sub, Commanders, May, Get, Media:, D...
69994    [Trump, Proposes, 10%, Bump, for, the, Pentago...
Name: First_10%_Content_Split, Length: 69995, dtype: object

In [21]:
min_length = article_data['First_10%_Content_Split'].apply(lambda x: len(x))

In [22]:
length_table = pd.DataFrame(data = min_length)
small_length_table = length_table.loc[length_table['First_10%_Content_Split'] < 10]
small_length_table

Unnamed: 0,First_10%_Content_Split
52556,8
53635,8
53910,8
54236,7
55202,9
...,...
64959,9
65082,6
65683,7
68119,9


In [23]:
article_data['Content_Split'] = article_data['Content'].apply(lambda x: x.split())

In [24]:
min_length_content = article_data['Content_Split'].apply(lambda x: len(x))

In [25]:
length_table_Content = pd.DataFrame(data = min_length_content)
small_length_table_Content = length_table_Content.loc[length_table_Content['Content_Split'] < 10]

In [26]:
small_length_table_Content.value_counts()

Content_Split
0                253
dtype: int64

There are 253 in the df with no "Content"

In [27]:
article_data['Content'].value_counts

<bound method IndexOpsMixin.value_counts of 0        Modernized Tu-160 to boost Russia's long-range...
1        Give Trump more nuclear weapons and more ways ...
2        Trump wants to build smaller atomic weapons; R...
3        Egypt and Israel Secretly Allied In Sinai Batt...
4        Listen: Top Armed Services Dem worried about r...
                               ...                        
69990    The Pyongyang Olympics\r\n\r\nMedia: The Wall ...
69991    US Air Force requests $156.3 billion in FY19, ...
69992    Mike Pence says the US is ready to negotiate w...
69993    Chinese Sub Commanders May Get\r\n\r\nMedia: D...
69994    Trump Proposes 10% Bump for the Pentagon - the...
Name: Content, Length: 69995, dtype: object>

In [28]:
article_data.iloc[small_length_table_Content.index, :].head()

Unnamed: 0,Article ID,Headline,Content,Media Name,Author,Published Date,First_10%_Content_Split,First_10%_Content,Content_Split
58653,7714048,,,Task and Purpose,Idrees Ali and Phil Stewart,2020-04-16 19:53:38+00,[],,[]
58654,7714049,,,Task and Purpose,,2020-04-16 17:47:58+00,[],,[]
58655,7714050,,,Task and Purpose,Jim Thompson,2020-04-16 15:41:51+00,[],,[]
58656,7714051,,,Task and Purpose,Haley Britzky,2020-04-16 14:38:28+00,[],,[]
58657,7714052,,,Task and Purpose,Jared Keller,2020-04-16 14:38:02+00,[],,[]


It seems like many of the rows of the dataframe do not have anything in the Content column, or only contain the headers/title of article.

Will remove those rows for feature selection analysis.

In [29]:
article_data = article_data.drop(index = small_length_table_Content.index)

In [30]:
merged_df = quote_text.merge(article_data[['Article ID', 'Content', 'Author', 'Published Date', 'First_10%_Content']], on = 'Article ID')

# Improve First Line Extraction

In [38]:
import re

In [96]:
def first_sentence2(string):
    no_acronyms = re.sub(r'(?<!\w)([A-Z])\.', r'\1', string)
    pattern = r"^([^.!?]+)"
    print(re.findall(pattern, no_acronyms))

^ previous first line func: getting everything before first punctuation (not including abbreviations)

Common pattern in content - media, byline, date is before actual content (example below)
Regex for only taking content after the 2nd new line

In [220]:
print(merged_df['Content'][0])

North Korea makes deals and threats

Media: The Washington Post
Byline: Anna Fifield
Date: 10 January 2018

SEOUL - North Korea's representatives assured the South Korean government Tuesday that the country's "cutting-edge" nuclear weapons are aimed only at the United States, not at its neighbors, as they struck a deal to send athletes to next month's Winter Olympics and to reopen a military hotline.

The sobering words underscored how, despite the rare agreement with the South, Pyongyang continues to assert its right to fend off the United States with nuclear arms.

Nevertheless, South Korea achieved its immediate goal of bringing North Korean athletes to compete in what Seoul has dubbed the "peace games." South Korean officials portrayed this agreement as a first step in a significant improvement in bilateral relations. The question, analysts said, is whether the North will pursue this opening with any sincerity.

South Korea signaled that it was willing to suspend some o

In [221]:
import regex as re

In [222]:
def first_sentence3(content):
    content = ''.join(content.split('\r\n\r\n')[2:])
    no_acronyms = re.sub(r'(?<!\w)([A-Z])\.', r'\1', content)
    pattern = r"^([^.!?]+)"
    print(re.findall(pattern, no_acronyms))


In [224]:
first_sentence3(merged_df['Content'][0])

['SEOUL - North Korea\'s representatives assured the South Korean government Tuesday that the country\'s "cutting-edge" nuclear weapons are aimed only at the United States, not at its neighbors, as they struck a deal to send athletes to next month\'s Winter Olympics and to reopen a military hotline']
