In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import scipy.stats as stats
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Display settings: do not truncate cell contents and do not hide columns
# when DataFrames are rendered.
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the source Excel workbook into a DataFrame.
df = pd.read_excel('../../data/globalterrorismdb_0522dist.xlsx')

In [None]:
# Preview the first rows to check that the workbook loaded as expected.
df.head()

In [None]:
def get_boolean_like_columns(df):
    """Return the labels of the columns that look boolean.

    A column qualifies when it has ``bool`` dtype, or when it is numeric,
    holds at least one non-missing value, and all of its non-missing values
    are 0 or 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list
        Column labels: ``bool``-dtype columns first, then the numeric 0/1
        columns, each group in the frame's column order.
    """
    boolean_columns = list(df.select_dtypes(include=['bool']).columns)
    for col in df.select_dtypes(include=['number']).columns:
        observed = set(df[col].dropna().unique())
        # An all-missing column gives an empty set, which is a subset of
        # anything; without the emptiness check it would be reported as
        # boolean even though there is no value to judge it by.
        if observed and observed.issubset({0, 1}):
            boolean_columns.append(col)

    return boolean_columns

# Collect the boolean-like columns of the loaded data
boolean_columns = get_boolean_like_columns(df)
print("Colonnes booléennes probables :", boolean_columns)

In [None]:
# Keep only the boolean-like columns, treat missing flags as "absent" (0) and
# cast to bool: mlxtend's apriori expects a one-hot boolean DataFrame and
# warns about (and is slower on) 0/1 numeric input.
# A new frame is built instead of calling fillna(inplace=True) on a column
# selection, which avoids pandas' SettingWithCopyWarning.
df_bool = df[boolean_columns].fillna(0).astype(bool)

# Frequent itemsets that occur in at least 10% of the rows.
supports = apriori(df_bool, min_support=0.1, use_colnames=True)

# Ten itemsets with the highest support.
supports.sort_values(by="support", ascending=False).head(10)



In [None]:
# Record how many items each frequent itemset contains.
supports["len"] = supports["itemsets"].map(len)
# Ten most frequent pairs (itemsets made of exactly two items).
supports.loc[supports["len"].eq(2)].sort_values("support", ascending=False).head(10)

In [None]:
# Derive association rules from the frequent itemsets, keeping rules whose
# confidence is at least 0.5.
rules = association_rules(supports, metric="confidence", min_threshold=0.5)

# Keep only frequent, reliable and positively associated rules:
# lift > 1 and leverage > 0 both mean the items co-occur more often than
# they would if they were independent.
filtered_rules = rules[
    (rules["support"] >= 0.05) &
    (rules["confidence"] >= 0.7) &
    (rules["lift"] > 1) &
    (rules["leverage"] > 0) &
    (rules["conviction"] > 1.2)
]

# Rank the remaining rules by decreasing lift.
filtered_rules = filtered_rules.sort_values(by="lift", ascending=False)

filtered_rules.head(10)

From the rules above, we can see that the lift is close to 1.0, which means the items are nearly independent of each other. We will therefore use our preprocessed data to see whether it yields more informative frequent itemsets.

In [None]:
# Load the preprocessed dataset and preview its first rows.
df2 = pd.read_csv('../../data/db_preprocessed.csv')
df2.head()

In [None]:

# Collect the boolean-like columns of the preprocessed data
boolean_columns = get_boolean_like_columns(df2)
print("Colonnes booléennes probables :", boolean_columns)

In [None]:
# Keep only the boolean-like columns, treat missing flags as "absent" (0) and
# cast to bool, the input type mlxtend's apriori expects.
# A new frame is built instead of calling fillna(inplace=True) on a column
# selection, which avoids pandas' SettingWithCopyWarning.
df_bool = df2[boolean_columns].fillna(0).astype(bool)

# Drop the columns with "_unknown" in their name (e.g. weapon_unknown).
df_bool = df_bool.drop(columns=df_bool.filter(regex='_unknown').columns)


In [None]:
# Frequent itemsets that occur in at least 5% of the rows, labelled with
# column names rather than column indices.
supports = apriori(df_bool, min_support=0.05, use_colnames=True)


# Ten itemsets with the highest support.
supports.sort_values(by="support", ascending=False).head(10)


In [None]:
# Record how many items each frequent itemset contains.
supports["len"] = supports["itemsets"].map(len)
# Show full cell contents so that long itemsets are not cut off.
pd.set_option("display.max_colwidth", None)
# Ten most frequent pairs (itemsets made of exactly two items).
supports.loc[supports["len"].eq(2)].sort_values("support", ascending=False).head(10)

In [None]:
# Derive association rules from the frequent itemsets, keeping rules whose
# lift is at least 0.5 (the filter below then requires lift > 1).
rules = association_rules(supports, metric="lift", min_threshold=0.5)

# Keep only frequent, reliable and positively associated rules:
# lift > 1 and leverage > 0 both mean the items co-occur more often than
# they would if they were independent.
filtered_rules = rules[
    (rules["support"] >= 0.05) &
    (rules["confidence"] >= 0.7) &
    (rules["lift"] > 1) &
    (rules["leverage"] > 0) &
    (rules["conviction"] > 1.2)
]

# Rank the remaining rules by decreasing lift.
filtered_rules = filtered_rules.sort_values(by="lift", ascending=False)

filtered_rules.head(10)

# Pattern analysis
The rules discussed below were obtained on the previous data (`db_2021_preprocessed.csv`).

- Rule 5392
    - Antecedents : (doubtterr, crit2, crit1, attack_armed assault, success)
    - Consequents : (weapon_firearms, target_military)

- Support: 5.99%, a moderate proportion.
- Confidence: 94.29%, indicating that almost all incidents meeting these criteria involve firearms and are directed at military targets.
- Lift: 10.98, a very strong relationship.
- Context: This suggests that these criteria (crit1, crit2, doubtterr) are strong indicators of armed incidents targeting military targets.
------------
- Rule 5434
    - Antecedents : (success, attack_armed assault, target_military)
    - Consequents : (weapon_firearms, doubtterr, crit2, crit1)

- Support: 5.99%, same as rule 5392.
- Confidence: 79.62%, slightly lower than the previous rule.
- Lift: 10.85, still very high.
- Context: Successful incidents against military targets in armed attacks are strongly associated with the use of firearms and the mentioned criteria.
-------------
- Rule 4568
    - Antecedents : (crit1, doubtterr, attack_armed assault, crit2)
    - Consequents : (weapon_firearms, target_military)

- Support: 6.69%, slightly more frequent than the other rules.
- Confidence: 92.73%, very high.
- Lift: about 10.8, very high.
- Context: These criteria (including crit1, crit2, doubtterr) increase the likelihood that firearms will be used against military targets.
-------------

The criteria crit1, crit2, and doubtterr play a central role in defining armed incidents against military targets.

Successful incidents involving firearms have a very high probability of involving military targets.

The very strong relationships (high lift, high conviction) suggest that these patterns are significant in the context of terrorism.


In [None]:
# Load the 2021 preprocessed dataset and preview its first rows.
df_looking_group_type= pd.read_csv('../../data/processed/db_2021_preprocessed.csv')
df_looking_group_type.head()

In [None]:
# One-hot encode the group name: get_dummies replaces `gname` with one
# `gname_<value>` indicator column per distinct value.
# The guard keeps the cell safe to re-run: after the first run `gname` no
# longer exists, and an unguarded call would raise a KeyError.
if 'gname' in df_looking_group_type.columns:
    df_looking_group_type = pd.get_dummies(df_looking_group_type, columns=['gname'])
df_looking_group_type.head()

In [None]:
# Identify the boolean-like columns of the frame and report them.
df_looking_group_type_boolean_col = get_boolean_like_columns(df_looking_group_type)
print("Colonnes booléennes probables :", df_looking_group_type_boolean_col)
# Restrict the frame to those columns and preview the result.
df_looking_group_type_boolean = df_looking_group_type[df_looking_group_type_boolean_col]
df_looking_group_type_boolean.head()


In [None]:
# Treat missing flags as "absent" (0) and cast to bool, the input type
# mlxtend's apriori expects. Rebinding the name (instead of calling
# fillna(inplace=True) on a column selection) avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_looking_group_type_boolean = df_looking_group_type_boolean.fillna(0).astype(bool)

# Frequent itemsets that occur in at least 0.5% of the rows.
# NOTE(review): the threshold is presumably this low so that the sparse
# one-hot group columns can still appear - confirm.
supports = apriori(df_looking_group_type_boolean, min_support=0.005, use_colnames=True)


In [None]:
# Derive rules whose support is at least 5%.
# NOTE(review): this discards every rule rarer than 5%, which may remove
# most group-specific rules - confirm the threshold is intended.
rules = association_rules(supports, metric="support", min_threshold=0.05)

# Rank the rules computed just above by decreasing lift. They must come from
# `rules`: a `filtered_rules` left over from an earlier analysis belongs to
# a different dataset and would make the selection below meaningless.
filtered_rules = rules.sort_values(by="lift", ascending=False)

# Keep the rules that go from a group indicator (a column whose name contains
# "gname") to an attack-related column (name contains "attack").
filtered_rules_subset = filtered_rules.loc[
    filtered_rules['antecedents'].apply(lambda items: any('gname' in str(item) for item in items)) &
    filtered_rules['consequents'].apply(lambda items: any('attack' in str(item) for item in items))
]

filtered_rules_subset.head(10)