In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
/kaggle/input/gtd-01-data-cleaning-and-aggregation/bridge_event_attack.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_impact.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_location.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_weapon_type.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_ransom_kidnap.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/__results__.html
/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_target_type.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/bridge_event_weapon.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_flags.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/bridge_event_target.csv
/kaggle/input/gtd-01-data-cleaning-and-aggregation/__notebook__.ipynb
/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_incident.c

# Global Terrorism Database (GTD)
## Cleaning and Validation of Modeled Tables

**Project:** Global Terrorism Trends & Operational Dynamics (1970â€“2020)

### Objective
This notebook focuses on cleaning, validating, and finalizing
the normalized GTD tables produced in the data modeling stage.

The goal is to ensure that all fact, dimension, and bridge tables are:
- Consistent
- Free of critical missing values
- Ready for analytical use in Tableau

### Scope
- Load modeled fact, dimension, and bridge tables
- Validate record counts and relationships
- Handle missing and inconsistent values
- Standardize column naming and formats
- Export final analysis-ready datasets


In [8]:
import os
os.listdir("/kaggle/input")


['gtd-01-data-cleaning-and-aggregation']

In [9]:
os.listdir("/kaggle/input/gtd-01-data-cleaning-and-aggregation")


['bridge_event_attack.csv',
 'fact_impact.csv',
 'dim_location.csv',
 'dim_weapon_type.csv',
 'fact_ransom_kidnap.csv',
 '__results__.html',
 'dim_target_type.csv',
 'bridge_event_weapon.csv',
 'fact_flags.csv',
 'bridge_event_target.csv',
 '__notebook__.ipynb',
 'fact_incident.csv',
 '__output__.json',
 'dim_attack_type.csv',
 'custom.css']

## 1. Loading Modeled Tables

In this section, the normalized tables generated in the previous
notebook are loaded for cleaning and validation.

These tables include:
- Fact tables (incident-level measures)
- Dimension tables (descriptive attributes)
- Bridge tables (many-to-many relationships)


## 2. Initial Structure and Consistency Checks

This section verifies that all modeled tables:
- Load correctly
- Have expected dimensions
- Contain the required key columns

These checks ensure structural integrity before cleaning.


## 3. Fact Table Cleaning

The fact table is cleaned to ensure:
- One row per incident
- Valid values for key flags and indicators
- No critical missing values in analytical fields

This table acts as the central reference for all analysis.


## Fact Table 1 

In [10]:
fact_incident = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_incident.csv")

print(fact_incident.shape)
fact_incident.head()


(209706, 10)


Unnamed: 0,eventid,iyear,imonth,iday,success,suicide,doubtterr,multiple,extended,location_id
0,197000000001,1970,7,2,1,0,0,0.0,0,1
1,197000000002,1970,0,0,1,0,0,0.0,0,2
2,197001000001,1970,1,0,1,0,0,0.0,0,3
3,197001000002,1970,1,0,1,0,0,0.0,0,4
4,197001000003,1970,1,0,1,0,-9,0.0,0,5


In [11]:
fact_incident.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   eventid      209706 non-null  int64  
 1   iyear        209706 non-null  int64  
 2   imonth       209706 non-null  int64  
 3   iday         209706 non-null  int64  
 4   success      209706 non-null  int64  
 5   suicide      209706 non-null  int64  
 6   doubtterr    209706 non-null  int64  
 7   multiple     209705 non-null  float64
 8   extended     209706 non-null  int64  
 9   location_id  209706 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 16.0 MB


In [12]:
fact_incident['imonth'] = fact_incident['imonth'].replace(0, pd.NA)

In [13]:
fact_incident['iday'] = fact_incident['iday'].replace(0, pd.NA)

In [14]:
fact_incident[['imonth', 'iday']].describe()


Unnamed: 0,imonth,iday
count,209686,208815
unique,12,31
top,5,15
freq,19651,7464


In [15]:
flag_cols = ['success', 'suicide', 'doubtterr', 'multiple', 'extended']

fact_incident[flag_cols] = fact_incident[flag_cols].apply(pd.to_numeric)


In [16]:
fact_incident.isnull().sum()

eventid          0
iyear            0
imonth          20
iday           891
success          0
suicide          0
doubtterr        0
multiple         1
extended         0
location_id      0
dtype: int64

In [17]:
fact_incident.to_csv(
    "/kaggle/working/fact_incident_cleaned.csv",
    index=False
)


## Fact Table 2 

In [18]:
fact_impact = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_impact.csv")

In [19]:
print(fact_impact.shape)
fact_impact.head()

(209706, 5)


Unnamed: 0,eventid,nkill,nwound,property,propextent
0,197000000001,1.0,0.0,0,
1,197000000002,0.0,0.0,0,
2,197001000001,1.0,0.0,0,
3,197001000002,,,1,
4,197001000003,,,1,


In [20]:
fact_impact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   eventid     209706 non-null  int64  
 1   nkill       197179 non-null  float64
 2   nwound      189770 non-null  float64
 3   property    209706 non-null  int64  
 4   propextent  73464 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 8.0 MB


In [21]:
num_cols = ['nkill', 'nwound', 'property', 'propextent']

fact_impact[num_cols] = fact_impact[num_cols].apply(pd.to_numeric, errors='coerce')


In [22]:
fact_impact[num_cols] = fact_impact[num_cols].where(fact_impact[num_cols] >= 0)


In [23]:
fact_impact.isnull().sum()


eventid            0
nkill          12527
nwound         19936
property       28048
propextent    136242
dtype: int64

In [24]:
fact_impact.to_csv(
    "/kaggle/working/fact_impact_cleaned.csv",
    index=False
)


## Fact Table 3

In [25]:
fact_ransom_kidnap = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_ransom_kidnap.csv")


In [26]:
fact_ransom_kidnap.shape

(209706, 6)

In [27]:
fact_ransom_kidnap.head()

Unnamed: 0,eventid,ishostkid,nhostkid,ransom,ransomamt,ransompaid
0,197000000001,0.0,,0.0,,
1,197000000002,1.0,1.0,1.0,800000.0,
2,197001000001,0.0,,0.0,,
3,197001000002,0.0,,0.0,,
4,197001000003,0.0,,0.0,,


In [28]:
fact_ransom_kidnap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   eventid     209706 non-null  int64  
 1   ishostkid   209528 non-null  float64
 2   nhostkid    16667 non-null   float64
 3   ransom      80466 non-null   float64
 4   ransomamt   1533 non-null    float64
 5   ransompaid  951 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 9.6 MB


In [29]:
num_cols = ['ishostkid', 'nhostkid', 'ransom', 'ransomamt', 'ransompaid']

fact_ransom_kidnap[num_cols] = fact_ransom_kidnap[num_cols].apply(
    pd.to_numeric, errors='coerce'
)


In [30]:
num_cols = ['ishostkid', 'nhostkid', 'ransom', 'ransomamt', 'ransompaid']

fact_ransom_kidnap[num_cols] = fact_ransom_kidnap[num_cols].apply(
    pd.to_numeric, errors='coerce'
)


In [31]:
fact_ransom_kidnap.isnull().sum()


eventid            0
ishostkid        178
nhostkid      193039
ransom        129240
ransomamt     208173
ransompaid    208755
dtype: int64

In [32]:
fact_ransom_kidnap.to_csv(
    "/kaggle/working/fact_ransom_kidnap_cleaned.csv",
    index=False
)


## Fact Table 4

In [33]:
fact_flags = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/fact_flags.csv")


In [34]:
fact_flags.head()

Unnamed: 0,eventid,success,suicide,doubtterr,multiple,extended
0,197000000001,1,0,0,0.0,0
1,197000000002,1,0,0,0.0,0
2,197001000001,1,0,0,0.0,0
3,197001000002,1,0,0,0.0,0
4,197001000003,1,0,-9,0.0,0


In [35]:
fact_flags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209706 entries, 0 to 209705
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   eventid    209706 non-null  int64  
 1   success    209706 non-null  int64  
 2   suicide    209706 non-null  int64  
 3   doubtterr  209706 non-null  int64  
 4   multiple   209705 non-null  float64
 5   extended   209706 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 9.6 MB


In [36]:
flag_cols = ['success', 'suicide', 'doubtterr', 'multiple', 'extended']

fact_flags[flag_cols] = fact_flags[flag_cols].apply(
    pd.to_numeric, errors='coerce'
)


In [37]:
fact_flags.isnull().sum()


eventid      0
success      0
suicide      0
doubtterr    0
multiple     1
extended     0
dtype: int64

In [38]:
fact_flags.to_csv(
    "/kaggle/working/fact_flags_cleaned.csv",
    index=False
)


## 4. Dimension Table Cleaning

Dimension tables are cleaned to ensure:
- No duplicate keys
- Clear and consistent categorical labels
- Compatibility with fact table joins

This improves readability and reduces ambiguity during analysis.


## Dimension Table 1 

In [39]:
 dim_location = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_location.csv")

In [40]:
print(dim_location.shape)
dim_location.head()

(204, 5)


Unnamed: 0,location_id,country,country_txt,region,region_txt
0,1,58,Dominican Republic,2,Central America & Caribbean
1,2,130,Mexico,1,North America
2,3,160,Philippines,5,Southeast Asia
3,4,78,Greece,8,Western Europe
4,5,101,Japan,4,East Asia


In [41]:
dim_location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   location_id  204 non-null    int64 
 1   country      204 non-null    int64 
 2   country_txt  204 non-null    object
 3   region       204 non-null    int64 
 4   region_txt   204 non-null    object
dtypes: int64(3), object(2)
memory usage: 8.1+ KB


In [42]:
text_cols = ['country_txt', 'region_txt']

for col in text_cols:
    dim_location[col] = dim_location[col].str.strip().str.title()

In [43]:
dim_location.isnull().sum()

location_id    0
country        0
country_txt    0
region         0
region_txt     0
dtype: int64

In [44]:
dim_location.to_csv(
    "/kaggle/working/dim_location_cleaned.csv",
    index=False
)


## Dimension Table 2

In [45]:
dim_attack_type = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_attack_type.csv")

In [46]:
dim_attack_type.shape

(9, 2)

In [47]:
dim_attack_type.head()

Unnamed: 0,attacktype_id,attacktype_txt
0,1.0,Assassination
1,6.0,Hostage Taking (Kidnapping)
2,3.0,Bombing/Explosion
3,7.0,Facility/Infrastructure Attack
4,2.0,Armed Assault


In [48]:
dim_attack_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   attacktype_id   9 non-null      float64
 1   attacktype_txt  9 non-null      object 
dtypes: float64(1), object(1)
memory usage: 276.0+ bytes


In [49]:
dim_attack_type['attacktype_txt'] = (
    dim_attack_type['attacktype_txt']
    .str.strip()
    .str.title()
)

In [50]:
dim_attack_type.to_csv(
    "/kaggle/working/dim_attack_type_cleaned.csv",
    index=False
)

## Dimension Table 3

In [51]:
 dim_location = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_location.csv")

In [52]:
dim_target_type = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_target_type.csv")

In [53]:
dim_target_type.shape

(22, 2)

In [54]:
dim_target_type.head()

Unnamed: 0,targettype_id,targettype_txt
0,14.0,Private Citizens & Property
1,7.0,Government (Diplomatic)
2,10.0,Journalists & Media
3,3.0,Police
4,21.0,Utilities


In [55]:
dim_target_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   targettype_id   22 non-null     float64
 1   targettype_txt  22 non-null     object 
dtypes: float64(1), object(1)
memory usage: 484.0+ bytes


In [56]:
dim_target_type['targettype_txt'] = (
    dim_target_type['targettype_txt']
    .str.strip()
    .str.title()
)

In [57]:
dim_target_type.to_csv(
    "/kaggle/working/dim_target_type_cleaned.csv",
    index=False
)

## Dimension Table 4

In [58]:
dim_weapon_type = pd.read_csv("/kaggle/input/gtd-01-data-cleaning-and-aggregation/dim_weapon_type.csv")

In [59]:
dim_weapon_type.shape

(12, 2)

In [60]:
dim_weapon_type.head()

Unnamed: 0,weapontype_id,weapontype_txt
0,13.0,Unknown
1,6.0,Explosives
2,8.0,Incendiary
3,5.0,Firearms
4,2.0,Chemical


In [61]:
dim_weapon_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   weapontype_id   12 non-null     float64
 1   weapontype_txt  12 non-null     object 
dtypes: float64(1), object(1)
memory usage: 324.0+ bytes


In [62]:
dim_weapon_type['weapontype_txt'] = (
    dim_weapon_type['weapontype_txt']
    .str.strip()
    .str.title()
)

In [63]:
dim_weapon_type.to_csv(
    "/kaggle/working/dim_weapon_type_cleaned.csv",
    index=False
)

## 5. Bridge Tables

Bridge tables are confirmed and saved



In [64]:
base_path = "/kaggle/input/gtd-01-data-cleaning-and-aggregation"

In [65]:
bridge_event_attack = pd.read_csv(f"{base_path}/bridge_event_attack.csv")

In [66]:
bridge_event_target = pd.read_csv(f"{base_path}/bridge_event_target.csv")

In [67]:
bridge_event_weapon = pd.read_csv(f"{base_path}/bridge_event_weapon.csv")

In [68]:
bridge_event_attack.to_csv(
    "/kaggle/working/bridge_event_attack_cleaned.csv",
    index=False
)

In [69]:
bridge_event_target.to_csv(
    "/kaggle/working/bridge_event_target_cleaned.csv",
    index=False
)


In [70]:
bridge_event_weapon.to_csv(
    "/kaggle/working/bridge_event_weapon_cleaned.csv",
    index=False
)


## 9. Summary and Next Steps

This notebook completed the cleaning and validation
of the GTD data model.

All fact, dimension, and bridge tables are now:
- Clean
- Consistent
- Analysis-ready

The next stage of the project focuses on:
- Tableau dashboard development
- Insight generation
- Documentation and interpretation
