In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

def load_and_preprocess_data(file_path):
    # Load dataset
    df = pd.read_csv(file_path)

    # Drop irrelevant columns
    df_cleaned = df.drop(columns=["ID", "Unnamed: 22"], errors="ignore")

    # Encode categorical features
    if "Machine" in df_cleaned.columns:
        le = LabelEncoder()
        df_cleaned["Machine"] = le.fit_transform(df_cleaned["Machine"].astype(str))

    # Handle missing values
    df_cleaned.fillna(df_cleaned.mean(), inplace=True)

    # Reduce memory usage
    for col in df_cleaned.select_dtypes(include=["float64"]):
        df_cleaned[col] = df_cleaned[col].astype("float32")
    for col in df_cleaned.select_dtypes(include=["int64"]):
        df_cleaned[col] = df_cleaned[col].astype("int32")

    return df_cleaned

def create_model():
    return RandomForestClassifier(n_estimators=100, random_state=42)

def save_model(model, filename="antivirus_model.pkl"):
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

if __name__ == "__main__":
    # Example usage
    file_path = "Training_Set.csv"  # Change this to your dataset path
    df = load_and_preprocess_data(file_path)

    # Split data
    X = df.drop(columns=["label"])
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create model
    model = create_model()

    # Train model (Uncomment below if you want to train now)
    model.fit(X_train, y_train)
    # save_model(model)

    print("Model created. Train it using model.fit(X_train, y_train)")


Model created. Train it using model.fit(X_train, y_train)


In [5]:
save_model(model)

Model saved as antivirus_model.pkl


In [7]:
import joblib
import pandas as pd

# Load the trained model
model = joblib.load("antivirus_model.pkl")

# Example file features (replace with real extracted features)
new_file_data = {
    "SizeOfCode": 47616,
    "SizeOfInitializedData": 21504,
    "AddressOfEntryPoint": 49600,
    "ImageBase": 4194304,
    "Subsystem": 2,
    "DllCharacteristics": 32768,
    "SizeOfStackReserve": 1048576,
    "SizeOfHeapReserve": 1048576,
    "NumberOfRvaAndSizes": 16,
    "SectionsMeanEntropy": 3.233288256,
    "SectionsMinEntropy": 0,
    "SectionsMaxEntropy": 6.454907966,
    "ImportsNbDLL": 11,
    "ImportsNb": 114,
    "ExportsNb": 0,
    "ResourcesNb": 16,
    "ResourcesMeanEntropy": 3.479949179,
    "ResourcesMinEntropy": 1.174874633,
    "ResourcesMaxEntropy": 5.156196358,
    "VersionInformationSize": 15
}

# Convert to DataFrame
new_file_df = pd.DataFrame([new_file_data])

# Predict if the file is malware (1) or safe (0)
prediction = model.predict(new_file_df)

if prediction[0] == 1:
    print("⚠️ The file is a VIRUS!")
else:
    print("✅ The file is SAFE.")


✅ The file is SAFE.


In [8]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score

# Load testing dataset
test_df = pd.read_csv("Testing_Set.csv")

# Define important features
important_features = [
    "SizeOfCode", "SizeOfInitializedData", "AddressOfEntryPoint", "ImageBase",
    "Subsystem", "DllCharacteristics", "SizeOfStackReserve", "SizeOfHeapReserve",
    "NumberOfRvaAndSizes", "SectionsMeanEntropy", "SectionsMinEntropy", "SectionsMaxEntropy",
    "ImportsNbDLL", "ImportsNb", "ExportsNb", "ResourcesNb", "ResourcesMeanEntropy",
    "ResourcesMinEntropy", "ResourcesMaxEntropy", "VersionInformationSize"
]

# Prepare test data
X_test = test_df[important_features]
y_test = test_df["label"]

# Load trained model
model = joblib.load("antivirus_model.pkl")  # Ensure this file exists

# Predict on test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy * 100:.2f}%")


✅ Model Accuracy: 96.02%
