In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize BigQuery client
client = bigquery.Client()


In [2]:
# Define your query
query = """
SELECT *
FROM `GDELT.actor_event_covid`
"""

# Execute the query and load the data into a pandas DataFrame
df = client.query(query).to_dataframe()

I0000 00:00:1724273238.722575    5394 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [3]:
df.head()

Unnamed: 0,GLOBALEVENTID,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventBaseCode,EventCode,NumMentions,SOURCEURL
0,794352508,20181013,VOTER,UNITED STATES,2,23,23,8,https://lacrossetribune.com/news/state-and-reg...
1,1061670505,20220906,JOE BIDEN,GOVERNOR,2,25,25,8,https://kesq.com/news/2022/09/05/we-need-more-...
2,638542634,20170320,CONGRESS,UNITED STATES,17,171,1712,8,http://www.informationclearinghouse.info/46688...
3,455223846,20150805,ADMINISTRATION,CONGRESS,15,154,154,20,http://time.com/3984453/defense-contractors-lo...
4,882385542,20191026,COLLEGE,NEW YORK,8,84,841,20,https://www.wohfradio.com/national-news/felici...


In [4]:
# Convert SQLDATE to datetime to extract month and day
df['SQLDATE'] = pd.to_datetime(df['SQLDATE'], format='%Y%m%d')

# Calculate the time feature
df['time'] = (30 * df['SQLDATE'].dt.month + df['SQLDATE'].dt.day) / 365

In [6]:
df.head()

Unnamed: 0,GLOBALEVENTID,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventBaseCode,EventCode,NumMentions,SOURCEURL,time
0,794352508,2018-10-13,VOTER,UNITED STATES,2,23,23,8,https://lacrossetribune.com/news/state-and-reg...,0.857534
1,1061670505,2022-09-06,JOE BIDEN,GOVERNOR,2,25,25,8,https://kesq.com/news/2022/09/05/we-need-more-...,0.756164
2,638542634,2017-03-20,CONGRESS,UNITED STATES,17,171,1712,8,http://www.informationclearinghouse.info/46688...,0.30137
3,455223846,2015-08-05,ADMINISTRATION,CONGRESS,15,154,154,20,http://time.com/3984453/defense-contractors-lo...,0.671233
4,882385542,2019-10-26,COLLEGE,NEW YORK,8,84,841,20,https://www.wohfradio.com/national-news/felici...,0.893151


In [8]:
# Count the frequency of each actor
actor_counts = pd.concat([df['Actor1Name'], df['Actor2Name']]).value_counts()

# Filter actors based on the minimum number of occurrences
important_actors = actor_counts.index
print("actor vector: {}".format(important_actors))

actor vector: Index(['UNITED STATES', 'SCHOOL', 'POLICE', 'PRESIDENT', 'COMMUNITY',
       'STUDENT', 'BUSINESS', 'COMPANY', 'ATTORNEY', 'JUDGE', 'AMERICAN',
       'GOVERNMENT', 'UNIVERSITY', 'SENATE', 'PRISON', 'MEDIA', 'WASHINGTON',
       'GOVERNOR', 'HOSPITAL', 'COLLEGE', 'NEW YORK', 'VOTER', 'EMPLOYEE',
       'TEXAS', 'CRIMINAL', 'RESIDENTS', 'AUTHORITIES', 'CALIFORNIA',
       'CONGRESS', 'THE WHITE HOUSE', 'FLORIDA', 'LAWMAKER', 'ADMINISTRATION',
       'SENATOR', 'WORKER', 'MAYOR', 'CHINA', 'MILITARY', 'INDUSTRY',
       'PROSECUTOR', 'PROTESTER', 'JOE BIDEN', 'AFRICA', 'LAWYER', 'OHIO',
       'UNITED KINGDOM', 'MICHIGAN', 'CHICAGO', 'SUPREME COURT', 'IOWA',
       'COMPANIES', 'VIRGINIA', 'WEBSITE', 'DOCTOR', 'THE US',
       'HEALTH OFFICIAL'],
      dtype='object')


In [None]:
# Function to filter and multi-hot encode actors
def multi_hot_encode_actors(row, important_actors):
    actor_vector = np.zeros(len(important_actors), dtype=int)
    actor_vector[important_actors.get_loc(row['Actor1Name'])] = 1
    actor_vector[important_actors.get_loc(row['Actor2Name'])] = 1
    return actor_vector

# Apply multi-hot encoding for actors
df['actor_vector'] = df.apply(multi_hot_encode_actors, important_actors=important_actors, axis=1)