# Exploratory Data Analysis (EDA)
This notebook contains the code for visualizing the data and gaining insights from it.

In [None]:
# Required imports
import json
import warnings

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.ticker import FuncFormatter
from shapely.geometry import Point
from tqdm import tqdm
from wordcloud import WordCloud
# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation notices from the plotting/geo libraries; consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Read the attack records and preview them.
# Only the shape and the first rows are shown so the cell output stays small
# regardless of how large the CSV is.
df = pd.read_csv("../Data/Cluster_data.csv")
print(f"Loaded {df.shape[0]:,} rows x {df.shape[1]} columns")
df.head()

In [None]:
# Number of missing entries in each column
df.isna().sum()

The NaN values in `_source.geoip.continent_code` and `_source.hostGeoip.continent_code` actually correspond to the North America (NA) continent: pandas reads the literal code `NA` as a missing value by default. This is easily confirmed by examining the countries of the rows with NaN values, which are all in North America. Let us correct the values.

In [None]:
# Restore the 'NA' continent code in both the origin and the destination column
for continent_col in ('_source.geoip.continent_code',
                      '_source.hostGeoip.continent_code'):
    df[continent_col] = df[continent_col].fillna('NA')

## General EDA

### Attacks by Continent/Country of Origin

In [None]:
# BAR CHART: CONTINENT OF ORIGIN
# Count the number of attacks originating from each continent
attack_counts = df['_source.geoip.continent_code'].value_counts()
# Leave Antarctica ('AN') out of the chart: it accounts for just 94 attacks.
# errors='ignore' keeps the cell runnable when the data has no 'AN' rows.
attack_counts = attack_counts.drop(['AN'], errors='ignore')

# One colour per bar, taken from the seaborn "rocket" palette
colors = sns.color_palette("rocket", len(attack_counts))

# Draw on an explicit Axes so every later call targets this figure
fig, ax = plt.subplots(figsize=(12, 6))
attack_counts.plot(kind='bar', ax=ax, color=colors, width=0.7)

# Horizontal grid lines make the bar heights easier to compare
ax.grid(axis='y', linestyle='--', alpha=0.7)

ax.set_title('Cyberattacks by Continent of Origin', fontsize=14)
ax.set_xlabel('Continent', fontsize=12)
ax.set_ylabel('Number of Attacks', fontsize=12)

# Show the y axis in millions. A formatter is computed from each tick's value,
# so labels stay correct even if matplotlib relocates the ticks; fixed labels
# set with set_yticklabels are only matched to ticks by position.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, _pos: '{:.1f}M'.format(y / 1e6))
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Light grey plot background
ax.set_facecolor('#f0f0f0')

plt.show()

In [None]:
# BAR CHART: COUNTRY OF ORIGIN
# Count the attacks per origin country and keep the ten largest
attack_counts = df['_source.geoip.country_name'].value_counts().head(10)

# One colour per bar, taken from the seaborn "rocket" palette
colors = sns.color_palette("rocket", len(attack_counts))

# Draw on an explicit Axes so every later call targets this figure
fig, ax = plt.subplots(figsize=(12, 6))
attack_counts.plot(kind='bar', ax=ax, color=colors, width=0.7)

# Horizontal grid lines make the bar heights easier to compare
ax.grid(axis='y', linestyle='--', alpha=0.7)

ax.set_title('Cyberattacks by Country of Origin', fontsize=14)
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('Number of Attacks', fontsize=12)

# Show the y axis in thousands. A formatter is computed from each tick's
# value, so labels stay correct even if matplotlib relocates the ticks; fixed
# labels set with set_yticklabels are only matched to ticks by position.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, _pos: '{:.0f}K'.format(y / 1e3))
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Light grey plot background
ax.set_facecolor('#f0f0f0')

plt.show()


Observe that Asia is the continent with the highest number of cyberattacks, possibly due to weak policies and regulations in the field. Nevertheless, notice that the USA is also among the top countries of origin!

In [None]:
# Load the CSV that backs the geographical chart
map_csv_path = '../Data/Map_origin_attacks.csv'
df_map = pd.read_csv(map_csv_path)

# Function to extract latitude and longitude using our data format
def extract_lat_lon(row):
    """Parse a JSON-encoded centroid into a ``(lat, lon)`` pair.

    Parameters
    ----------
    row : str
        JSON object text holding ``"lat"`` and ``"lon"`` keys,
        e.g. ``'{"lat": 40.4, "lon": -3.7}'``.

    Returns
    -------
    tuple
        ``(lat, lon)`` exactly as decoded from the JSON, or ``(None, None)``
        when the value cannot be used: malformed JSON, a missing key, a
        non-string input such as NaN/None, or JSON that is not an object.
    """
    try:
        coordinates = json.loads(row)
        return coordinates['lat'], coordinates['lon']
    except (ValueError, KeyError, TypeError):
        # ValueError: malformed JSON; KeyError: 'lat'/'lon' absent.
        # TypeError: the input is not text (e.g. a NaN float for an empty
        # cell) or the decoded JSON is not a dict. Without it a single bad
        # row would abort the whole .apply().
        return None, None

In [None]:
# MAP CHART
# Convert 'Count' to float after stripping the comma separators.
# Casting to str first keeps the cell re-runnable: once the column is numeric,
# using the .str accessor directly would raise on a second execution.
df_map['Count'] = (
    df_map['Count'].astype(str).str.replace(',', '', regex=False).astype(float)
)

# Extract latitude and longitude from the JSON text in 'Geo Centroid'
df_map[['lat', 'lon']] = df_map['Geo Centroid'].apply(extract_lat_lon).apply(pd.Series)

# Convert the DataFrame to a GeoDataFrame (shapely points are ordered x=lon, y=lat)
geometry = [Point(lon, lat) for lon, lat in zip(df_map['lon'], df_map['lat'])]
gdf = gpd.GeoDataFrame(df_map, geometry=geometry)

# Load the Natural Earth dataset as the base map.
# NOTE(review): gpd.datasets was deprecated in geopandas 0.14 and removed in
# 1.0; on newer versions point read_file at a local Natural Earth file instead.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the 2D world map (country outlines only)
world.boundary.plot(ax=ax, linewidth=1)

# Choose the colour and marker size of every point from its 'Count' value
colors = []
sizes = []

for count in gdf['Count']:
    if 1 <= count < 2e6:
        colors.append('yellow')
        sizes.append(10)
    elif 2e6 <= count < 2.5e6:
        colors.append('orange')
        sizes.append(60)
    elif 2.5e6 <= count < 3.5e6:
        colors.append('darkorange')
        sizes.append(100)
    elif 3.5e6 <= count <= 5e6:
        colors.append('red')
        sizes.append(200)
    else:
        # Anything outside the bins above (including NaN) is drawn in grey
        colors.append('gray')
        sizes.append(10)

# Plot the data points with variable marker size and colour.
# The legend is built by hand below; GeoDataFrame.plot's own legend only
# applies when a `column` is given, so it is not requested here.
gdf.plot(ax=ax, markersize=sizes, color=colors, alpha=0.5)

# Custom legend. The first label reads 'Under 2M' because the yellow bin
# starts at a count of 1, not at 1M.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='yellow', markersize=4, label='Under 2M'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='orange', markersize=5, label='2M to 2.5M'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='darkorange', markersize=6, label='2.5M to 3.5M'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=7, label='3.5M to 5M'),
]

# Add the legend to the plot
ax.legend(handles=legend_elements, loc='best', title='Attack Count Range')

# Set the title
plt.title("Origin of attacks Geographic map")

# Show the map
plt.show()


### Attacks by Continent/Country of Destination

In [None]:
# BAR CHART: CONTINENT OF DESTINATION
# Count the number of attacks received by each continent
# (hostGeoip describes the attacked host, not the attacker)
attack_counts = df['_source.hostGeoip.continent_code'].value_counts()

# One colour per bar, taken from the seaborn "rocket" palette
colors = sns.color_palette("rocket", len(attack_counts))

# Draw on an explicit Axes so every later call targets this figure
fig, ax = plt.subplots(figsize=(12, 6))
attack_counts.plot(kind='bar', ax=ax, color=colors, width=0.7)

# Horizontal grid lines make the bar heights easier to compare
ax.grid(axis='y', linestyle='--', alpha=0.7)

# The title names the destination, matching the column that is plotted
ax.set_title('Cyberattacks by Continent of Destination', fontsize=14)
ax.set_xlabel('Continent', fontsize=12)
ax.set_ylabel('Number of Attacks', fontsize=12)

# Show the y axis in millions. A formatter is computed from each tick's value,
# so labels stay correct even if matplotlib relocates the ticks; fixed labels
# set with set_yticklabels are only matched to ticks by position.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, _pos: '{:.1f}M'.format(y / 1e6))
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Light grey plot background
ax.set_facecolor('#f0f0f0')

plt.show()

In [None]:
# BAR CHART: COUNTRY OF DESTINATION
# Count the attacks received per country and keep the ten largest
# (hostGeoip describes the attacked host, not the attacker)
attack_counts = df['_source.hostGeoip.country_name'].value_counts().head(10)

# One colour per bar, taken from the seaborn "rocket" palette
colors = sns.color_palette("rocket", len(attack_counts))

# Draw on an explicit Axes so every later call targets this figure
fig, ax = plt.subplots(figsize=(12, 6))
attack_counts.plot(kind='bar', ax=ax, color=colors, width=0.7)

# Horizontal grid lines make the bar heights easier to compare
ax.grid(axis='y', linestyle='--', alpha=0.7)

# The title names the destination, matching the column that is plotted
ax.set_title('Cyberattacks by Country of Destination', fontsize=14)
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('Number of Attacks', fontsize=12)

# Show the y axis in thousands. A formatter is computed from each tick's
# value, so labels stay correct even if matplotlib relocates the ticks; fixed
# labels set with set_yticklabels are only matched to ticks by position.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, _pos: '{:.0f}K'.format(y / 1e3))
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Light grey plot background
ax.set_facecolor('#f0f0f0')

plt.show()


While Asia is the continent most affected by cyberattacks, when we examine the data by country, it becomes evident that the USA is the most affected country.

We have seen that Asia is the continent with the highest number of cyberattacks. Are these attacks targeted at the USA?

In [None]:
# Attacks to USA
# Keep only the rows whose destination host is in the United States.
# reset_index returns a fresh frame, so nothing is mutated in place on the
# filtered selection.
df_USA = df[df['_source.hostGeoip.country_name'] == 'United States'].reset_index(drop=True)

# Count the number of those attacks coming from each continent of origin
attack_counts = df_USA['_source.geoip.continent_code'].value_counts()

# One colour per bar, taken from the seaborn "rocket" palette
colors = sns.color_palette("rocket", len(attack_counts))

# Draw on an explicit Axes so every later call targets this figure
fig, ax = plt.subplots(figsize=(12, 6))
attack_counts.plot(kind='bar', ax=ax, color=colors, width=0.7)

# Horizontal grid lines make the bar heights easier to compare
ax.grid(axis='y', linestyle='--', alpha=0.7)

ax.set_title('Cyberattacks by Continent of Origin', fontsize=14)
ax.set_xlabel('Continent', fontsize=12)
ax.set_ylabel('Number of Attacks', fontsize=12)

# Show the y axis in thousands. A formatter is computed from each tick's
# value, so labels stay correct even if matplotlib relocates the ticks; fixed
# labels set with set_yticklabels are only matched to ticks by position.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda y, _pos: '{:.0f}K'.format(y / 1e3))
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Light grey plot background
ax.set_facecolor('#f0f0f0')

plt.show()