## Setup

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType
import glob

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("GDELT Analysis") \
    .getOrCreate()

## Reading and Dataprep

In [8]:
gkg_files = glob.glob('/home/tiago/factored-datathon-2024-seed42/files/**/*export.CSV', recursive=True)
csv_files = [file for file in gkg_files if file.lower().endswith('.csv')]
events_df = spark.read.csv(csv_files, sep='\t', header=True, inferSchema=True)
cameo_df = spark.read.csv('/home/tiago/factored-datathon-2024-seed42/files/cameo.csv', sep='\t', header=True, inferSchema=True)
cameo_df = cameo_df.withColumnRenamed("EventCode", "EventCode_join")
merged_df = events_df.join(cameo_df, events_df['EventCode'] == cameo_df['EventCode_join'], "left")
gdelt_df = merged_df.drop("EventCode_join")

# Create year and month columns for easier analysis
gdelt_df = gdelt_df.withColumn("GoldsteinScale", col("GoldsteinScale").cast(FloatType())) \
                   .withColumn("NumMentions", col("NumMentions").cast(IntegerType())) \
                   .withColumn("NumSources", col("NumSources").cast(IntegerType())) \
                   .withColumn("NumArticles", col("NumArticles").cast(IntegerType()))\
                   .withColumn("AvgTone", col("NumArticles").cast(FloatType()))

# Show the first few rows
gdelt_df.show(5)


                                                                                

+-------------+--------+---------+----+------------+----------+----------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+----------+----------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+-----------+---------+-------------+-------------+---------+--------------+-----------+----------+-----------+-------+--------------+------------------+---------------------+------------------+-------------+--------------+-------------------+--------------+------------------+---------------------+------------------+-------------+--------------+-------------------+--------------+------------------+---------------------+------------------+-------------+--------------+-------------------+---------+--------------------+--------------------+
|GLOBALEVENTID| SQLDATE|MonthYear|Year|FractionDate|Actor1Code|Actor1Name

## Loading data with sanction codes

http://data.gdeltproject.org/documentation/CAMEO.Manual.1.1b3.pdf

In [14]:
from pyspark.sql.functions import col

# Filter cameo_df based on event description containing "sanction"
sanctions_df = gdelt_df.filter(col("EventDescription").like("%sanction%"))

# Show the filtered DataFrame
sanctions_df.show(5)

+-------------+--------+---------+----+------------+----------+-------------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+----------+----------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+-----------+---------+-------------+-------------+---------+--------------+-----------+----------+-----------+-------+--------------+--------------------+---------------------+------------------+-------------+--------------+-------------------+--------------+--------------------+---------------------+------------------+-------------+--------------+-------------------+--------------+--------------------+---------------------+------------------+-------------+--------------+-------------------+---------+--------------------+--------------------+
|GLOBALEVENTID| SQLDATE|MonthYear|Year|FractionDate|Actor1Code| 

## Geographic Distribution of Sanction-Related Events

In [17]:
# Count the number of sanction events by country, excluding None values
sanctions_by_country = (
    sanctions_df
    .filter(sanctions_df.ActionGeo_CountryCode.isNotNull())
    .groupBy('ActionGeo_CountryCode')
    .count()
    .orderBy('count', ascending=False)
)

sanctions_by_country.show()




+---------------------+------+
|ActionGeo_CountryCode| count|
+---------------------+------+
|                   US|149295|
|                   UK| 21191|
|                   IS| 19847|
|                   IN| 17570|
|                   RS| 15185|
|                   CH| 13330|
|                   AS| 10721|
|                   CA|  9956|
|                   PK|  6484|
|                   UP|  6421|
|                   NI|  5882|
|                   IR|  5612|
|                   FR|  4911|
|                   EI|  4359|
|                   GM|  4157|
|                   IT|  2888|
|                   SF|  2696|
|                   SP|  2690|
|                   MX|  2652|
|                   VE|  2576|
+---------------------+------+
only showing top 20 rows



                                                                                

## Temporal Trend of Sanction-Related Events

In [18]:
# Count the number of sanction events per year
sanctions_by_year = sanctions_df.groupBy('Year').count().orderBy('Year')

sanctions_by_year.show()



+----+------+
|Year| count|
+----+------+
|2014|    78|
|2023|  6871|
|2024|391518|
+----+------+



                                                                                

## Sentiment Analysis Related to Sanctions

In [20]:
from pyspark.sql.functions import avg

# Group by Actor1CountryCode and calculate average tone for sanction-related events
sanction_sentiment_by_country = sanctions_df.groupBy('Actor1CountryCode').agg(avg('AvgTone').alias('AvgTone')).orderBy('AvgTone', ascending=False)

sanction_sentiment_by_country.show()




+-----------------+------------------+
|Actor1CountryCode|           AvgTone|
+-----------------+------------------+
|              DOM| 36.98620689655172|
|              VAT|24.095238095238095|
|              BHR|23.878260869565217|
|              SLB|            23.125|
|              GMB|              22.0|
|              URY|18.731182795698924|
|              GRC|  17.6103781882146|
|              AUS| 17.33591211511682|
|              THA| 16.00769230769231|
|              ARE|15.710374639769453|
|              AFG|14.350940017905103|
|             NULL|14.260659694288012|
|              TUN|14.215277777777779|
|              NOR|14.089316987740805|
|              USA|14.000254390784947|
|              COL| 13.57948717948718|
|              BRA|13.429602888086643|
|              SEN|13.013071895424837|
|              CAN|12.630941836433422|
|              ISR|12.398988682879846|
+-----------------+------------------+
only showing top 20 rows



                                                                                

## Identifying Key Actors in Sanction-Related Events

In [21]:
# Count the number of occurrences of each actor in sanction-related events
actors_df = sanctions_df.groupBy('Actor1Name').count().orderBy('count', ascending=False)

# Remove null values from actors_df
actors_df = actors_df.na.drop()

actors_df.show()



+--------------+-----+
|    Actor1Name|count|
+--------------+-----+
| UNITED STATES|41977|
|UNITED KINGDOM| 9043|
|    GOVERNMENT| 8290|
|        ISRAEL| 6171|
| SUPREME COURT| 6025|
|         JUDGE| 5692|
|        RUSSIA| 5600|
|         CHINA| 5091|
|        THE US| 4683|
|        SCHOOL| 4052|
|     PRESIDENT| 4040|
|        CANADA| 3757|
|     AUSTRALIA| 3718|
|        POLICE| 3367|
|         MEDIA| 2985|
|       COMPANY| 2794|
|       FLORIDA| 2793|
|    WASHINGTON| 2790|
|ADMINISTRATION| 2779|
|       RUSSIAN| 2568|
+--------------+-----+
only showing top 20 rows



                                                                                

##  Sentiment Analysis Over Time for Specific Countries

In [22]:
from pyspark.sql import functions as F

# Filter for a specific country (e.g., 'RUS' for Russia)
country_df = sanctions_df.filter(F.col('Actor1CountryCode') == 'RUS')

# Group by year and calculate average tone
sentiment_over_time = country_df.groupBy('Year').agg(F.avg('AvgTone').alias('AvgTone')).orderBy('Year')

sentiment_over_time.show()




+----+-----------------+
|Year|          AvgTone|
+----+-----------------+
|2023|           8.0625|
|2024|9.895254756177565|
+----+-----------------+



                                                                                

## Co-occurrence Analysis of Actors


In [23]:
# Count the co-occurrence of Actor1 and Actor2
co_occurrence_df = sanctions_df.groupBy('Actor1Name', 'Actor2Name').count().orderBy('count', ascending=False)

# Remove cases where one of the columns is null or equal to another
co_occurrence_df = co_occurrence_df.filter((col("Actor1Name").isNotNull()) & (col("Actor2Name").isNotNull()) & (col("Actor1Name") != col("Actor2Name")))

co_occurrence_df.show()



+-------------+-------------+-----+
|   Actor1Name|   Actor2Name|count|
+-------------+-------------+-----+
|        JUDGE|UNITED STATES| 1060|
|       RUSSIA|      UKRAINE|  826|
|UNITED STATES|       SCHOOL|  744|
|SUPREME COURT|UNITED STATES|  714|
|      UKRAINE|       RUSSIA|  663|
|       SCHOOL|UNITED STATES|  519|
|UNITED STATES|SUPREME COURT|  498|
|UNITED STATES|       RUSSIA|  492|
|UNITED STATES|       ISRAEL|  437|
|UNITED STATES|        JUDGE|  431|
|      FLORIDA|UNITED STATES|  425|
|    PRESIDENT|UNITED STATES|  424|
|     NEW YORK|UNITED STATES|  407|
|UNITED STATES|    PRESIDENT|  399|
|UNITED STATES|         IRAN|  389|
|       ISRAEL|  PALESTINIAN|  378|
|UNITED STATES|        MEDIA|  371|
|UNITED STATES|        CHINA|  344|
|UNITED STATES|     BUSINESS|  341|
|UNITED STATES|      STUDENT|  334|
+-------------+-------------+-----+
only showing top 20 rows



                                                                                

## Temporal Analysis of Sanction Events by QuadClass


In [24]:
# Group by year and QuadClass to count events
quadclass_over_time = sanctions_df.groupBy('Year', 'QuadClass').count().orderBy('Year')

quadclass_over_time.show()



+----+---------+------+
|Year|QuadClass| count|
+----+---------+------+
|2014|        4|    68|
|2014|        2|    10|
|2023|        2|  1316|
|2023|        4|  5317|
|2023|        3|   229|
|2023|        1|     9|
|2024|        3| 13130|
|2024|        2| 66077|
|2024|        1|   639|
|2024|        4|311672|
+----+---------+------+



                                                                                

## Impact Analysis: Sanctions and Media Attention

In [25]:
# Group by EventCode (representing different sanction types) and calculate average number of articles
media_attention_df = sanctions_df.groupBy('EventCode').agg(avg('NumArticles').alias('AvgArticles')).orderBy('AvgArticles', ascending=False)

# Show the results
media_attention_df.show()




+---------+------------------+
|EventCode|       AvgArticles|
+---------+------------------+
|     0351|21.297794117647058|
|     0251|14.976744186046512|
|     1241|14.155177386789731|
|     1054|13.281879194630873|
|      172|13.270516366560324|
|     1244|12.986342943854325|
|      081|11.927395450737805|
|      163|10.369068759033658|
|      085| 9.005068518866153|
|     1312|  8.33035548110925|
|     0254| 7.979757085020243|
|      132| 6.004166666666666|
|     1051|               5.5|
+---------+------------------+



                                                                                

## Analyzing the Effect of Sanctions on International Relations

In [26]:
# Group by Actor1CountryCode and calculate the average GoldsteinScale for sanctions
relation_effect_df = sanctions_df.groupBy('Actor1CountryCode').agg(avg('GoldsteinScale').alias('AvgGoldsteinScale')).orderBy('AvgGoldsteinScale', ascending=False)

# Show the results
relation_effect_df.show(10)




+-----------------+-------------------+
|Actor1CountryCode|  AvgGoldsteinScale|
+-----------------+-------------------+
|              LIE|                5.0|
|              EAF|                5.0|
|              GNQ|                3.5|
|              URY| 1.3118279569892473|
|              MAC|                1.0|
|              GNB|                1.0|
|              CRB|                1.0|
|              MWI| 0.6674311907466398|
|              LUX| 0.6510638277581398|
|              WAF|0.05999999589203669|
+-----------------+-------------------+
only showing top 10 rows



                                                                                