In [None]:
import pandas as pd

# Read the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/data.csv')

# Preview the first five rows (five is also head()'s default).
df.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes (info() prints; it returns None).
df.info()
# Last expression of the cell, so the summary-statistics table is rendered as output.
df.describe() 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: number of rows per value of the 'Family' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='Family', ax=ax)
ax.set_title('Count of Each Ransomware Family')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: number of rows per value of the 'Threats' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='Threats', ax=ax)
ax.set_title('Count of Each Threat')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: number of rows per value of the 'Prediction' column
# (the column later used as the classification target).
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='Prediction', ax=ax)
ax.set_title('Count of Each Prediction')
plt.show()

In [None]:
# Keep only rows with a non-negative 'Time' and renumber the index 0..n-1.
# reset_index is chained (returning a new frame) instead of using inplace=True,
# so the filtered frame is never mutated in place.
df = df[df['Time'] >= 0].reset_index(drop=True)

# (rows, columns) remaining after the filter.
df.shape

In [None]:
# Drop every row whose 'ExpAddress' equals the string '1'.
# NOTE(review): the comparison is against the *string* '1'; this only removes rows
# if the column was read as text — confirm the column dtype.
df = df.loc[df['ExpAddress'].ne('1')]

# (rows, columns) remaining after the filter.
df.shape

Splitting the data 70/30: 70% goes to the unlabeled pool and 30% goes to the labeled pool. Within the labeled pool, 20% is reserved for validation and the remaining 80% is used for labeled training. This means the machine learning I am trying to achieve with this specific dataset is semi-supervised learning. 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fix the column-name typo. Reassign instead of using inplace=True so the
# frame is not mutated in place.
df = df.rename(columns={'Protcol': 'Protocol'})

# Drop ID-like columns that won't be used in the model
df_cleaned = df.drop(columns=['SeddAddress', 'ExpAddress', 'IPaddress'])

# Encode the text target as integer class codes. `le` is kept so the codes can
# be mapped back to the original labels (le.classes_ / le.inverse_transform).
le = LabelEncoder()
y_full = le.fit_transform(df_cleaned['Prediction'])

# drop the original text 'Prediction' column so the target is not a feature
features_df = df_cleaned.drop(columns=['Prediction'])

# define columns to one-hot encode
# NOTE(review): 'Family' and 'Threats' are used as features here; if either is
# derived from (or determines) 'Prediction', this leaks the label — confirm.
categorical_features = ['Protocol', 'Flag', 'Family', 'Threats']

# create the encoded features DataFrame (drop_first=True drops one dummy
# column per categorical feature)
X_full = pd.get_dummies(features_df, columns=categorical_features, drop_first=True)

# separate 30% of the data to be the 'labeled' pool; the split is stratified
# so both parts keep the class proportions of y_full
X_labeled_pool, X_unlabeled, y_labeled_pool, y_unlabeled = train_test_split(
    X_full, y_full,
    test_size=0.70,  # 70% of data becomes the 'unlabeled' set
    random_state=42,
    stratify=y_full
)

# split the labeled pool into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(
    X_labeled_pool, y_labeled_pool,
    test_size=0.20,  # 20% of the labeled pool becomes the validation set
    random_state=42,
    stratify=y_labeled_pool
)

# --- Verification ---
# Every row must land in exactly one of the three sets.
assert len(X_train) + len(X_val) + len(X_unlabeled) == len(X_full)

print("--- Data Shapes ---")
print(f"Labeled Training Set (X_train): {X_train.shape}")
print(f"Validation Set (X_val):       {X_val.shape}")
print(f"Unlabeled Set (X_unlabeled):  {X_unlabeled.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline: a 100-tree random forest fitted on the labeled training split only.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the baseline on the held-out validation split.
predictions = baseline_model.predict(X_val)
accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_val, predictions))

In [None]:
import pandas as pd

# Pair each training column with the importance the baseline forest assigned to it.
feature_names = X_train.columns
importances = baseline_model.feature_importances_

# One row per feature, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Last expression of the cell, so the table is rendered as output.
feature_importance_df

As we can see in the table above, the importance scores show that there is no single dominant feature, which suggests there is little to no data leakage. Clusters (normally an important feature to check when looking for data leakage) do not carry much importance either, which suggests that the model's accuracy is genuine and that it is not relying on any shortcuts. 

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# get predicted probabilities for unlabeled data using baseline
print("Making predictions on unlabeled data...")
unlabeled_probabilities = baseline_model.predict_proba(X_unlabeled)

# find the model's confidence in its top prediction for each sample
confidence_scores = np.max(unlabeled_probabilities, axis=1)

# set a confidence threshold
confidence_threshold = 0.98

# filter for high-confidence predictions
high_confidence_mask = confidence_scores > confidence_threshold
X_pseudo_labeled = X_unlabeled[high_confidence_mask]

# Derive the pseudo-labels from the probabilities already computed instead of
# calling predict() again: the forest's prediction is the class with the
# highest averaged probability, so the result is the same. This also keeps the
# cell working when no sample passes the threshold (predict() rejects empty input).
top_class_index = np.argmax(unlabeled_probabilities, axis=1)
pseudo_labels = baseline_model.classes_[top_class_index][high_confidence_mask]

print(f"\nFound {len(X_pseudo_labeled)} samples with >{confidence_threshold*100}% confidence.")

# combine the original training set with the new pseudo-labeled set.
# pd.concat keeps the column names and per-column dtypes; np.vstack would turn
# the frames into a bare array, so the new model would be fitted without feature
# names and then asked to predict on a DataFrame (X_val) that has them.
X_combined_train = pd.concat([X_train, X_pseudo_labeled], ignore_index=True)
y_combined_train = np.hstack([y_train, pseudo_labels])

# train a new model on the larger, combined dataset
# (same hyperparameters and seed as the baseline, so only the data differs)
print("\nTraining new model on combined data...")
semi_supervised_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
semi_supervised_model.fit(X_combined_train, y_combined_train)
print("Training complete.")

# evaluate the new model on the same validation set
new_predictions = semi_supervised_model.predict(X_val)
new_accuracy = accuracy_score(y_val, new_predictions)

# compare results
print(f"\nOriginal Baseline Model Accuracy: {accuracy * 100:.2f}%")
print(f"New Semi-Supervised Model Accuracy: {new_accuracy * 100:.2f}%")