# Data Leak Site Analytics using Ransomwatch & Plotly
## Jupyterthon 2024

*This notebook downloads data from RansomWatch and leverages Plotly to create dashboards.*

#### Notebook Outline
1. Download the data 
2. Prepare the dataframe
3. Create a custom theme
4. Scatter Plot
5. Density Heatmap
6. Bar Graph
7. DLS Comparison 

### Step 1: Download data & create input widget

In [None]:
import json, requests, datetime
import pandas as pd
from ipywidgets import IntText

url = 'https://raw.githubusercontent.com/joshhighet/ransomwatch/main/posts.json'

# Send a GET request to the URL. The timeout keeps the cell from hanging
# forever on a stalled connection. A network error is reported rather than
# raised so that a previously saved posts.json can still be loaded below.
try:
    response = requests.get(url, timeout=30)
except requests.RequestException as err:
    response = None
    print('Failed to download file:', err)

if response is not None:
    if response.status_code == 200:
        # Save the file to the current working directory. The encoding is
        # explicit so that non-ASCII characters in post titles do not depend
        # on the platform's default encoding.
        with open('posts.json', 'w', encoding='utf-8') as f:
            f.write(response.text)
    else:
        print('Failed to download file:', response.status_code)

# Load the JSON file: either the fresh download or, if the download failed,
# whatever copy is already present in the working directory.
try:
    with open('posts.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError as err:
    raise RuntimeError(
        'posts.json could not be downloaded and no local copy exists'
    ) from err

# Widget holding the look-back window (in days) that later cells read
number_input = IntText(value=365, description='Enter the number of past days to filter by:', style={'description_width': 'initial'})

display(number_input)

### Step 2: Set up the pandas dataframes for Plotly

In [None]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

# Look-back window (in days) chosen in the input widget
days_filter = number_input.value

# Reference "now" for the window; datetime.now() without a tz is naive local time
now = datetime.datetime.now()

# Keep only the posts discovered fewer than `days_filter` whole days ago
filtered_data = [
    post
    for post in data
    if (now - datetime.datetime.fromisoformat(post['discovered'])).days < days_filter
]

# Pull out the two fields that the plots need
group_names = [post['group_name'] for post in filtered_data]
timestamps = [post['discovered'] for post in filtered_data]

# Assemble the dataframe and parse the timestamp strings into datetimes
df = pd.DataFrame({'group_name': group_names, 'timestamp': timestamps})
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Count postings per (group, timestamp) pair, largest counts first
pair_counts = df.groupby(['group_name', 'timestamp']).size()
df_sorted = pair_counts.reset_index(name='count').sort_values(by='count', ascending=False)

# One marker per (group, timestamp) pair, coloured by group
fig = px.scatter(
    df_sorted,
    x='timestamp',
    y='group_name',
    color='group_name',
    title='Posting Frequency by group',
    color_continuous_scale='dense',
)
fig.show()

### Prepare Custom Plotly Theme 

In [None]:
# Options handed to fig.show(); they control the image produced by the
# figure toolbar's download button.
config = dict(
    toImageButtonOptions=dict(
        format='jpeg',  # one of png, svg, jpeg, webp
        height=600,
        width=840,
        scale=3,  # Multiply title/legend/axis/canvas sizes by this factor
    ),
)

# Template applied by default to figures created from here on
pio.templates.default = "plotly_dark"

def apply_custom_theme(fig):
    """Apply the notebook's shared styling to ``fig`` and display it.

    Sets the font to Roboto at size 18 and the paper background to
    ``#000032``, clears both axis titles, then shows the figure using the
    module-level ``config``.

    Args:
        fig: Figure object exposing ``update_layout`` and ``show``.
    """
    theme_layout = dict(
        font=dict(family='Roboto', size=18),
        paper_bgcolor="#000032",
        yaxis_title=None,
        xaxis_title=None,
    )
    fig.update_layout(**theme_layout)
    fig.show(config=config)
# Printed once the cell has run, i.e. after the theme helper has been defined
print("[*] custom theme prepared")

### Visualization Option 1: Scatter Plot with custom theme 

In [None]:
# Scatter of postings over time per group, rendered through the theme helper
# (which also displays the figure)
fig2 = px.scatter(
    df_sorted,
    x='timestamp',
    y='group_name',
    color='group_name',
    title='Posting Frequency by group',
    color_continuous_scale='dense',
)
apply_custom_theme(fig2)

### Visualization Option 2: Density Heatmap

In [None]:
# Count postings per (group, timestamp) pair
df_resorted = df.groupby(['group_name', 'timestamp']).size().reset_index(name='count')

# Keep only the 25 groups with the highest total number of postings
group_totals = df_resorted.groupby('group_name')['count'].sum()
top_25_groups = group_totals.nlargest(25).index
in_top_25 = df_resorted['group_name'].isin(top_25_groups)
df_resorted = df_resorted[in_top_25]

# Density heatmap of posting counts over time, one row per group
fig1 = px.density_heatmap(
    df_resorted,
    x='timestamp',
    y='group_name',
    z='count',
    title='Posting Frequency by Group (Top 25)',
    color_continuous_scale='Thermal',
)

# Linear tick mode on the y axis, intended to give every group row its own label
fig1.update_layout(yaxis={'tickmode': 'linear'})

apply_custom_theme(fig1)

### Visualization Option 3: Bar Graph

In [None]:
# Total postings per group, smallest first.
# NOTE: this rebinds `df_sorted` to a frame with only 'group_name' and 'count'
# columns, so cells that plot df_sorted against 'timestamp' need Step 2 re-run first.
group_sizes = df.groupby('group_name').size()
df_sorted = group_sizes.reset_index(name='count').sort_values(by='count', ascending=True)

# Keep only the 25 groups with the most postings
per_group_total = df_sorted.groupby('group_name')['count'].sum()
top_25_groups = per_group_total.nlargest(25).index
df_sorted = df_sorted[df_sorted['group_name'].isin(top_25_groups)]

# Bar per group, coloured by its posting count
fig4 = px.bar(
    df_sorted,
    x='group_name',
    y='count',
    color='count',
    title='Top 25 Data Leak Sites',
    color_continuous_scale='Reds',
)

# Place a tick at every group name
fig4.update_xaxes(tickvals=df_sorted['group_name'])

apply_custom_theme(fig4)

## Comparing Multiple Sites (Posting by Month)

In [None]:
from ipywidgets import Textarea

# Settings for the free-text box in which the groups to compare are listed
textarea_options = dict(
    value='',  # Start empty
    placeholder='Enter ransomware group names, separated by new lines.',  # Hint shown while empty
    description='Groups:',  # Label next to the box
    disabled=False,  # Keep the box editable
    layout=dict(width='400px', height='100px'),  # Size of the box
)
text_area_1 = Textarea(**textarea_options)

display(text_area_1)

In [None]:
import pandas as pd
import plotly.express as px

# Monthly posting counts for the groups listed in the text area.
# NOTE: this cell rebinds `filtered_data` and `df`; cells that expect the
# Step 2 dataframe (with a 'timestamp' column) need Step 2 re-run afterwards.

# One group name per line; surrounding whitespace and blank lines are dropped
group_names_input = text_area_1.value.split('\n')
group_names_input = [name.strip() for name in group_names_input if name.strip()]
# Set for O(1) membership tests while scanning every post
selected_groups = set(group_names_input)

cutoff_date = now - datetime.timedelta(days=days_filter)

# Keep posts from the selected groups that were discovered after the cutoff
filtered_data = [
    post
    for post in data
    if post['group_name'] in selected_groups
    and datetime.datetime.fromisoformat(post['discovered']) > cutoff_date
]

# Convert the filtered data into a Pandas dataframe
df = pd.DataFrame(filtered_data)

# Ensure the dataframe is not empty and has the expected columns
if not df.empty and 'discovered' in df.columns and 'group_name' in df.columns:
    # Parse 'discovered' and use it as the index so the data can be resampled
    df['discovered'] = pd.to_datetime(df['discovered'])
    df.set_index('discovered', inplace=True)

    # NOTE(review): pandas >= 2.2 prefers the alias 'ME' and warns on 'M';
    # 'M' is kept so the cell still runs on older pandas.
    if df['group_name'].nunique() > 1:
        # For multiple groups, group by 'group_name' before resampling
        df_resampled = df.groupby('group_name').resample('M').size()
        df_resampled = df_resampled.reset_index(name='event_count')
    else:
        # For a single group, resample directly and add the group name back
        df_resampled = df.resample('M').size().reset_index(name='event_count')
        single_group_name = df['group_name'].iloc[0]
        df_resampled['group_name'] = single_group_name

    # Ensure 'month' column is correctly named for both cases
    df_resampled.rename(columns={'discovered': 'month'}, inplace=True)

    # resample('M') labels each bin with the LAST day of the month, while the
    # axis ticks below (dtick="M1") fall on the first of each month. Left
    # as-is, every bar sits next to the following month's tick, so move the
    # labels to the first day of their month.
    df_resampled['month'] = df_resampled['month'].dt.to_period('M').dt.to_timestamp()

    # Plotting
    fig = px.bar(df_resampled, x='month', y='event_count', color='group_name',
                 title='Monthly Data Leak Site Postings by Ransomware Group',
                 labels={'event_count': 'Number of Posts', 'month': 'Month'},
                 barmode='group')

    # Update layout for better x-axis date formatting.
    # NOTE: apply_custom_theme() sets both axis titles to None afterwards,
    # so the titles given here are not displayed.
    fig.update_layout(xaxis=dict(tickformat='%Y-%m', dtick="M1"),
                      xaxis_title='Month',
                      yaxis_title='Number of Posts')

    apply_custom_theme(fig)
else:
    print("No data available for the specified ransomware groups or incorrect data structure.")


## Comparing Multiple Sites (Posting by Week)

In [None]:
import pandas as pd
import plotly.express as px

# Weekly posting counts for the groups listed in the text area.
# NOTE: this cell rebinds `filtered_data` and `df`; cells that expect the
# Step 2 dataframe (with a 'timestamp' column) need Step 2 re-run afterwards.

# One group name per line; surrounding whitespace and blank lines are dropped
group_names_input = text_area_1.value.split('\n')
group_names_input = [name.strip() for name in group_names_input if name.strip()]
# Set for O(1) membership tests while scanning every post
selected_groups = set(group_names_input)

cutoff_date = now - datetime.timedelta(days=days_filter)

# Keep posts from the selected groups that were discovered after the cutoff
filtered_data = [
    post
    for post in data
    if post['group_name'] in selected_groups
    and datetime.datetime.fromisoformat(post['discovered']) > cutoff_date
]

# Convert the filtered data into a Pandas dataframe
df = pd.DataFrame(filtered_data)

# Ensure the dataframe is not empty and has the expected columns
if not df.empty and 'discovered' in df.columns and 'group_name' in df.columns:
    # Parse 'discovered' and use it as the index so the data can be resampled
    df['discovered'] = pd.to_datetime(df['discovered'])
    df.set_index('discovered', inplace=True)

    # Determine if there are multiple groups or a single group
    if df['group_name'].nunique() > 1:
        # Multiple groups: group by 'group_name', then resample by week, and count events
        df_resampled = df.groupby('group_name').resample('W').size().reset_index(name='event_count')
    else:
        # Single group: simply resample by week and count events
        df_resampled = df.resample('W').size().reset_index(name='event_count')
        # Add the group name column back if only one group is present
        df_resampled['group_name'] = df['group_name'].iloc[0]

    # After reset_index the bin labels live in a 'discovered' column; name it 'week'
    df_resampled.rename(columns={'discovered': 'week'}, inplace=True)

    # Plotting
    fig = px.bar(df_resampled, x='week', y='event_count', color='group_name',
                 title='Weekly Data Leak Site Postings by Ransomware Group',
                 labels={'event_count': 'Number of Posts', 'week': 'Week'},
                 barmode='group')

    # With pandas' defaults, resample('W') labels each bin by its right edge,
    # i.e. the date on which the week ENDS. The axis title therefore says
    # "Week Ending" rather than "Week Starting".
    # NOTE: apply_custom_theme() sets both axis titles to None afterwards,
    # so the titles given here are not displayed.
    fig.update_layout(
        xaxis=dict(
            tickformat='%Y-%m-%d',  # Keep the date format
            tickmode='auto',  # Let Plotly decide the best tick mode
            # Alternatively, you can specify 'tickvals' with a list of specific dates you want to mark as ticks.
        ),
        xaxis_title='Week Ending',
        yaxis_title='Number of Posts'
    )

    apply_custom_theme(fig)
else:
    print("No data available for the specified ransomware groups or incorrect data structure.")
