In [4]:
import ast
import json
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd

In [5]:
# Load the two stage-1 CSVs: the leaks corpus and the news corpus.
leaks = pd.read_csv(filepath_or_buffer='processed_leaks_1.csv')
news = pd.read_csv(filepath_or_buffer='processed_news_1.csv')

In [6]:
# Schema check for the leaks frame: per-column dtype and non-null count.
leaks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         44 non-null     object
 1   terrorism    44 non-null     bool  
 2   security     44 non-null     bool  
 3   espionage    44 non-null     bool  
 4   communalism  44 non-null     bool  
 5   keyword      44 non-null     bool  
 6   response     43 non-null     object
dtypes: bool(5), object(2)
memory usage: 1.0+ KB


In [7]:
# Preview the first five leaks rows.
leaks.head(5)

Unnamed: 0,Text,terrorism,security,espionage,communalism,keyword,response
0,Pristina Airport – Possible administrative irr...,False,False,False,False,False,Here is the metadata of the text in the reques...
1,"""An interoffice memorandum providing an “outst...",False,True,False,False,True,Here is the metadata of the text in the reques...
2,"""Description..This is a Secret US National Sec...",False,True,False,False,True,Here is the metadata of the text in JSON forma...
3,"""Tokyo's Climate Change Officials to Continue ...",False,False,False,False,False,"{\n ""metadata"": [\n {\n ""..."
4,"""Description..This is a secret US National Sec...",False,True,False,False,True,Here is the metadata of the text in the reques...


In [8]:
# Schema check for the news frame: per-column dtype and non-null count.
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         1509 non-null   object
 1   terrorism    1509 non-null   bool  
 2   security     1509 non-null   bool  
 3   espionage    1509 non-null   bool  
 4   communalism  1509 non-null   bool  
 5   keyword      1509 non-null   bool  
 6   response     1509 non-null   object
dtypes: bool(5), object(2)
memory usage: 31.1+ KB


In [9]:
# Preview the first five news rows.
news.head(5)

Unnamed: 0,Text,terrorism,security,espionage,communalism,keyword,response
0,Starbucks violated federal labor law when it i...,False,False,False,False,False,Here is the metadata of the given text in the ...
1,The first suspect to plead guilty in Singapore...,False,False,False,False,False,Here is the metadata of the text in JSON forma...
2,Meta has been fined a record-breaking €1.2 bil...,False,False,False,False,False,Here is the metadata of the given text in JSON...
3,SINGAPORE: A 45-year-old man linked to Singapo...,False,False,False,False,False,Here is the metadata of the given text in the ...
4,The Department of Education imposed a record $...,False,True,False,False,True,Here is the metadata of the text in the reques...


In [10]:
# Show the leaks rows whose `response` is missing.
leaks.loc[leaks['response'].isnull()]

Unnamed: 0,Text,terrorism,security,espionage,communalism,keyword,response
17,Finance Assistant 1 claimed he had not noticed...,False,False,False,False,False,


In [11]:
# Function to remove anything that is not within curly braces {}
def extract_within_curly_braces(text):
    """Return the span from the first '{' through the last '}' of ``text``.

    Non-string inputs are converted with ``str()`` first. An empty string is
    returned when no '{' is followed by a '}'.
    """
    raw = text if isinstance(text, str) else str(text)

    # Widest brace-delimited span: earliest opener, latest closer.
    opening = raw.find("{")
    closing = raw.rfind("}")
    if opening == -1 or closing < opening:
        return ""
    return raw[opening:closing + 1]

# Overwrite `response` in both frames with only its brace-delimited part;
# rows without any {...} span (including the missing response) become "".
leaks['response'] = leaks['response'].apply(extract_within_curly_braces)
news['response'] = news['response'].apply(extract_within_curly_braces)

In [12]:
# Spot-check the first cleaned response from each frame.
print(leaks['response'].iloc[0])
print(news['response'].iloc[0])

{
    "metadata": {
        "earliest_date": "2003",
        "terrorism": false,
        "cyber_security": false,
        "espionage": true,
        "communalism": false,
        "countries": ["Kosovo", "Slovenia"]
    }
}
{
    "metadata": [
        {
            "earliest_date": null,
            "terrorism": false,
            "cyber_security": false,
            "espionage": false,
            "communalism": false,
            "countries": []
        }
    ]
}


In [14]:
def extract_date_components(metadata):
    """Pull the most specific date found in ``metadata`` apart into components.

    Formats are tried from most to least specific: ``YYYY-MM-DD``,
    ``MM/DD/YYYY``, ``YYYY-MM`` and finally a bare ``YYYY``. Within one format
    every candidate in the text is tried in order, so a candidate that is not
    a real calendar date (e.g. ``2023-02-30``) does not hide a valid one that
    appears later.

    Args:
        metadata: Text to scan (typically the JSON-like model response).

    Returns:
        tuple: ``(year, month, day)``. Components that the matched format does
        not carry are ``None``; all three are ``None`` when ``metadata`` is not
        a string or contains no recognisable date.
    """
    if not isinstance(metadata, str):
        return None, None, None  # Nothing to scan (e.g. NaN / None cells)

    # (regex, strptime format, whether the format carries a day-of-month)
    dated_formats = [
        (r"\b\d{4}-\d{2}-\d{2}\b", "%Y-%m-%d", True),   # YYYY-MM-DD
        (r"\b\d{2}/\d{2}/\d{4}\b", "%m/%d/%Y", True),   # MM/DD/YYYY
        (r"\b\d{4}-\d{2}\b", "%Y-%m", False),           # YYYY-MM
    ]

    for pattern, fmt, has_day in dated_formats:
        for candidate in re.finditer(pattern, metadata):
            try:
                dt = datetime.strptime(candidate.group(0), fmt)
            except ValueError:
                # Right shape but not a real date; try the next candidate.
                continue
            return dt.year, dt.month, (dt.day if has_day else None)

    # Fall back to the first standalone four-digit number as a year.
    year_only = re.search(r"\b\d{4}\b", metadata)
    if year_only:
        return int(year_only.group(0)), None, None
    return None, None, None

In [15]:
def extract_countries_list(input_string):
    """
    Extracts the list of countries from the first occurrence of the "countries" key in the input string.

    The list may span several lines (pretty-printed JSON). It is parsed as
    data only: JSON first, then a Python literal as a fallback for
    single-quoted strings or trailing commas. The text is never executed.

    Args:
        input_string (str): The input string (potential JSON-like content).

    Returns:
        list: The extracted list of countries, or None if the input is not a
        string, no "countries" list is found, or the list cannot be parsed.
    """
    if not isinstance(input_string, str):
        return None  # e.g. NaN / None cells; nothing to search

    # Capture from '[' to the first ']' after the key. DOTALL lets the list
    # run across line breaks.
    found = re.search(r'"countries":\s*(\[.*?\])', input_string, re.DOTALL)
    if found is None:
        return None
    list_content = found.group(1)

    # SECURITY: this text is model-generated, so it must not go through
    # eval(); both parsers below only build literals.
    try:
        return json.loads(list_content)
    except ValueError:
        # Not strict JSON; fall through to the Python-literal parser.
        try:
            return ast.literal_eval(list_content)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            print(f"Error parsing list content: {e}")
            return None

In [16]:
# Each apply() yields one (year, month, day) tuple per row; zip(*...) transposes
# those rows into three parallel sequences, one per new column.
leaks['year'], leaks['month'], leaks['day'] = zip(
    *leaks['response'].apply(extract_date_components)
)

news['year'], news['month'], news['day'] = zip(
    *news['response'].apply(extract_date_components)
)

# Country lists parsed from the same response text (None where nothing was extracted).
leaks['countries'] = leaks['response'].apply(extract_countries_list)
news['countries'] = news['response'].apply(extract_countries_list)

In [19]:
# Missing date parts are replaced by 0 so the columns can be cast to plain ints;
# a 0 in year/month/day therefore means "not available".
leaks['year'] = leaks['year'].fillna(0).astype(int)
leaks['month'] = leaks['month'].fillna(0).astype(int)
leaks['day'] = leaks['day'].fillna(0).astype(int)
news['year'] = news['year'].fillna(0).astype(int)
news['month'] = news['month'].fillna(0).astype(int)
news['day'] = news['day'].fillna(0).astype(int)

In [20]:
# Lower-case the `Text` column name in both frames (modifies them in place).
leaks.rename({'Text': 'text'}, axis='columns', inplace=True)
news.rename({'Text': 'text'}, axis='columns', inplace=True)

In [21]:
# Display the news frame with the derived year/month/day and countries columns.
news

Unnamed: 0,text,terrorism,security,espionage,communalism,keyword,response,year,month,day,countries
0,Starbucks violated federal labor law when it i...,False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",0,0,0,[]
1,The first suspect to plead guilty in Singapore...,False,False,False,False,False,"{\n ""metadata"": {\n ""earliest_date"":...",2019,4,2,"[Singapore, Cambodia, China]"
2,Meta has been fined a record-breaking €1.2 bil...,False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",0,0,0,"[European Union, United States]"
3,SINGAPORE: A 45-year-old man linked to Singapo...,False,False,False,False,False,"{\n ""metadata"": {\n ""earliest_date"":...",2019,4,30,[Singapore]
4,The Department of Education imposed a record $...,False,True,False,False,True,"{\n ""metadata"": [\n {\n ""earliest_dat...",2022,0,0,"[United States, Virginia]"
...,...,...,...,...,...,...,...,...,...,...,...
1504,NOT A SURPRISE FOR INDUSTRY FOLLOWERS..It is h...,False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",0,0,0,[Malaysia]
1505,"Ahead of the Tokyo 2020 Olympics, Japan prohib...",False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",2019,0,0,"[Japan, Australia, Singapore]"
1506,At a court hearing on Monday in the city of Ki...,False,False,False,False,False,"{\n ""metadata"": [\n ""earliest_date"":...",2015,7,1,[Germany]
1507,"Ray Mann, managing director of the West Wales ...",False,False,False,False,False,"{\n ""metadata"": [\n ""earliest_date"":...",2020,0,0,[United Kingdom]


In [22]:
# Re-check the leaks schema now that the derived columns exist.
leaks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         44 non-null     object
 1   terrorism    44 non-null     bool  
 2   security     44 non-null     bool  
 3   espionage    44 non-null     bool  
 4   communalism  44 non-null     bool  
 5   keyword      44 non-null     bool  
 6   response     44 non-null     object
 7   year         44 non-null     int32 
 8   month        44 non-null     int32 
 9   day          44 non-null     int32 
 10  countries    40 non-null     object
dtypes: bool(5), int32(3), object(3)
memory usage: 1.9+ KB


In [56]:
# Re-check the news schema now that the derived columns exist.
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         1509 non-null   object
 1   terrorism    1509 non-null   bool  
 2   security     1509 non-null   bool  
 3   espionage    1509 non-null   bool  
 4   communalism  1509 non-null   bool  
 5   keyword      1509 non-null   bool  
 6   response     1509 non-null   object
 7   year         1509 non-null   int32 
 8   month        1509 non-null   int32 
 9   day          1509 non-null   int32 
 10  countries    1420 non-null   object
dtypes: bool(5), int32(3), object(3)
memory usage: 60.5+ KB


In [25]:
# Final look at the news frame before it is written to disk.
news

Unnamed: 0,text,terrorism,security,espionage,communalism,keyword,response,year,month,day,countries
0,Starbucks violated federal labor law when it i...,False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",0,0,0,[]
1,The first suspect to plead guilty in Singapore...,False,False,False,False,False,"{\n ""metadata"": {\n ""earliest_date"":...",2019,4,2,"[Singapore, Cambodia, China]"
2,Meta has been fined a record-breaking €1.2 bil...,False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",0,0,0,"[European Union, United States]"
3,SINGAPORE: A 45-year-old man linked to Singapo...,False,False,False,False,False,"{\n ""metadata"": {\n ""earliest_date"":...",2019,4,30,[Singapore]
4,The Department of Education imposed a record $...,False,True,False,False,True,"{\n ""metadata"": [\n {\n ""earliest_dat...",2022,0,0,"[United States, Virginia]"
...,...,...,...,...,...,...,...,...,...,...,...
1504,NOT A SURPRISE FOR INDUSTRY FOLLOWERS..It is h...,False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",0,0,0,[Malaysia]
1505,"Ahead of the Tokyo 2020 Olympics, Japan prohib...",False,False,False,False,False,"{\n ""metadata"": [\n {\n ""...",2019,0,0,"[Japan, Australia, Singapore]"
1506,At a court hearing on Monday in the city of Ki...,False,False,False,False,False,"{\n ""metadata"": [\n ""earliest_date"":...",2015,7,1,[Germany]
1507,"Ray Mann, managing director of the West Wales ...",False,False,False,False,False,"{\n ""metadata"": [\n ""earliest_date"":...",2020,0,0,[United Kingdom]


In [27]:
# Persist both enriched frames as the stage-2 CSVs, without the row index.
news.to_csv(path_or_buf='processed_news_2.csv', index=False)
leaks.to_csv(path_or_buf='processed_leaks_2.csv', index=False)