In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize BigQuery client
client = bigquery.Client()


In [2]:
# Define your query
query = """
SELECT *
FROM `GDELT.actor_event_covid`
"""

# Execute the query and load the data into a pandas DataFrame
df = client.query(query).to_dataframe()

I0000 00:00:1724297611.377224   57141 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [3]:
df.head()

Unnamed: 0,GLOBALEVENTID,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventBaseCode,EventCode,NumMentions,SOURCEURL
0,971034931,20210223,UNITED STATES,UNITED STATES,1,11,11,8,https://www.wbrc.com/2021/02/23/why-we-cant-ma...
1,1121877444,20230820,COMPANY,UNITED STATES,3,31,31,8,https://www.nola.com/news/politics/louisianas-...
2,823935681,20190215,JUDGE,PRISON,12,128,128,10,https://www.independent.ie/world-news/north-am...
3,664533358,20170606,UNITED STATES,LAWMAKER,4,41,41,4,http://www.virginiamn.com/news/local/judge-app...
4,542628712,20160521,UNITED STATES,HOSPITAL,7,73,73,4,http://wqad.com/2016/05/21/two-airlifted-to-ho...


In [4]:
# Convert SQLDATE to datetime to extract month and day
df['SQLDATE'] = pd.to_datetime(df['SQLDATE'], format='%Y%m%d')

# Calculate the time feature
df['time'] = (30 * df['SQLDATE'].dt.month + df['SQLDATE'].dt.day) / 365

In [5]:
df.head()

Unnamed: 0,GLOBALEVENTID,SQLDATE,Actor1Name,Actor2Name,EventRootCode,EventBaseCode,EventCode,NumMentions,SOURCEURL,time
0,971034931,2021-02-23,UNITED STATES,UNITED STATES,1,11,11,8,https://www.wbrc.com/2021/02/23/why-we-cant-ma...,0.227397
1,1121877444,2023-08-20,COMPANY,UNITED STATES,3,31,31,8,https://www.nola.com/news/politics/louisianas-...,0.712329
2,823935681,2019-02-15,JUDGE,PRISON,12,128,128,10,https://www.independent.ie/world-news/north-am...,0.205479
3,664533358,2017-06-06,UNITED STATES,LAWMAKER,4,41,41,4,http://www.virginiamn.com/news/local/judge-app...,0.509589
4,542628712,2016-05-21,UNITED STATES,HOSPITAL,7,73,73,4,http://wqad.com/2016/05/21/two-airlifted-to-ho...,0.468493


In [6]:
# Count the frequency of each actor
actor_counts = pd.concat([df['Actor1Name'], df['Actor2Name']]).value_counts()

# Filter actors based on the minimum number of occurrences
important_actors = actor_counts.index
print("actor vector: {}".format(important_actors))

actor vector: Index(['UNITED STATES', 'SCHOOL', 'POLICE', 'PRESIDENT', 'COMMUNITY',
       'STUDENT', 'BUSINESS', 'COMPANY', 'ATTORNEY', 'JUDGE', 'AMERICAN',
       'GOVERNMENT', 'UNIVERSITY', 'SENATE', 'PRISON', 'MEDIA', 'WASHINGTON',
       'GOVERNOR', 'HOSPITAL', 'COLLEGE', 'NEW YORK', 'VOTER', 'EMPLOYEE',
       'TEXAS', 'CRIMINAL', 'RESIDENTS', 'AUTHORITIES', 'CALIFORNIA',
       'CONGRESS', 'THE WHITE HOUSE', 'FLORIDA', 'LAWMAKER', 'ADMINISTRATION',
       'SENATOR', 'WORKER', 'MAYOR', 'CHINA', 'MILITARY', 'INDUSTRY',
       'PROSECUTOR', 'PROTESTER', 'JOE BIDEN', 'AFRICA', 'LAWYER', 'OHIO',
       'UNITED KINGDOM', 'MICHIGAN', 'CHICAGO', 'SUPREME COURT', 'IOWA',
       'COMPANIES', 'VIRGINIA', 'WEBSITE', 'DOCTOR', 'THE US',
       'HEALTH OFFICIAL'],
      dtype='object')


In [7]:
len(important_actors)

56

In [8]:
# Function to filter and multi-hot encode actors
def multi_hot_encode_actors(row, important_actors):
    actor_vector = np.zeros(len(important_actors), dtype=int)
    actor_vector[important_actors.get_loc(row['Actor1Name'])] = 1
    actor_vector[important_actors.get_loc(row['Actor2Name'])] = 1
    return actor_vector

# Apply multi-hot encoding for actors
df['actor_vector'] = df.apply(multi_hot_encode_actors, important_actors=important_actors, axis=1)


In [9]:
df['actor_vector'].head()

0    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
3    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: actor_vector, dtype: object

In [10]:
# Combine unique codes from all three columns
all_unique_codes = sorted(set(df['EventRootCode'].unique()) )# |
                          #set(df['EventBaseCode'].unique()) |
                          #set(df['EventCode'].unique()))

# Create a mapping of codes to indices
code_to_index = {code: idx for idx, code in enumerate(all_unique_codes)}

In [11]:
all_unique_codes

['--',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']

In [12]:
len(code_to_index)

21

In [13]:
# Function to encode a single row
def encode_row(row, code_to_index):
    # Create a vector of zeros with length equal to the number of unique codes
    vector = np.zeros(len(code_to_index), dtype=int)
    
    # Set the index for each code in the vector to 1
    for col in ['EventRootCode', 'EventBaseCode', 'EventCode']:
        code = row[col]
        if code in code_to_index:
            vector[code_to_index[code]] = 1
        
    return vector

# Apply the encoding to each row in the dataframe
df['event_vector'] = df.apply(encode_row, code_to_index=code_to_index, axis=1)

In [14]:
df['event_vector'].head()

0    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
3    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
Name: event_vector, dtype: object

In [15]:
df['event_vector'].size

24015904

In [16]:
df['actor_vector'].size

24015904

In [17]:
df['time'].size

24015904

In [18]:
# Convert the list of vectors to a matrix
df['combined_vector'] = df.apply(lambda row: np.concatenate([[row['time']] ,row['actor_vector'], row['event_vector']]), axis=1)

In [19]:
df['combined_vector'].head()

0    [0.2273972602739726, 1.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.7123287671232876, 1.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.2054794520547945, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.5095890410958904, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.4684931506849315, 1.0, 0.0, 0.0, 0.0, 0.0, ...
Name: combined_vector, dtype: object

In [20]:
df_combined=df['combined_vector'].copy()

In [21]:
del df

In [22]:
# Define column names for actor_vector and event_vector
actor_columns = [f'actor_{actor}' for actor in important_actors]
event_columns = [f'event_{code}' for code in all_unique_codes]

# Combine the column names
combined_columns = ['time']+actor_columns + event_columns
# Convert the numpy array to a DataFrame
combined_df = pd.DataFrame(data=np.vstack(df_combined.values), columns=combined_columns)

In [23]:
combined_df.head()

Unnamed: 0,time,actor_UNITED STATES,actor_SCHOOL,actor_POLICE,actor_PRESIDENT,actor_COMMUNITY,actor_STUDENT,actor_BUSINESS,actor_COMPANY,actor_ATTORNEY,...,event_11,event_12,event_13,event_14,event_15,event_16,event_17,event_18,event_19,event_20
0,0.227397,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.712329,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.205479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.509589,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.468493,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
del df_combined

In [25]:
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()

# Define the table ID
table_id = "factoreddatathon2014.GDELT.training_set_kmeans"

# Chunk size
chunk_size = 1000000  # Adjust based on your memory limits

# Upload in chunks
for i in range(0, len(combined_df), chunk_size):
    chunk = combined_df.iloc[i:i + chunk_size]
    job = client.load_table_from_∫dataframe(chunk, table_id)
    job.result()  # Wait for the job to complete

    print(f"Loaded chunk {i // chunk_size + 1} into {table_id}")


Loaded chunk 1 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 2 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 3 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 4 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 5 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 6 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 7 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 8 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 9 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 10 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 11 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 12 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 13 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 14 into factoreddatathon2014.GDELT.training_set_kmeans
Loaded chunk 15 into factoreddatathon2014.GDELT.training_