In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Baseline: train a random forest on every feature and rank the features by
# importance. Later cells read `features`, `scaler`, `X_train`, `X_test`,
# `y_train` and `y_test` from this cell.
# ---------------------------------------------------------------------------

# NOTE(review): the path is an empty string here; point it at the CSV file
# before running this cell.
data_path = ''

# Read the CSV, turn +/-inf into NaN, then drop every row containing a NaN.
data = pd.read_csv(data_path).replace([np.inf, -np.inf], np.nan).dropna()

# Target is the 'label' column; every remaining column is a feature.
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features: statistics are fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score the held-out rows.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}", "Classification Report:", report, sep="\n")

# Pair each importance with its column name and order from most to least
# important.
feature_importances = model.feature_importances_
features = pd.Series(feature_importances, index=X.columns).sort_values(ascending=False)

print("Feature Importances:", features, sep="\n")


In [None]:
# Retrain the forest on only the top-N most important features and record the
# test accuracy for each N as (top_n, accuracy) tuples.
results = []

# range(21, 22) evaluates a single subset size: N = 21.
for top_n in range(21, 22):
    # `features` is sorted by descending importance, so the first `top_n`
    # index labels are the most important columns. If `top_n` exceeds the
    # number of columns, the slice silently returns all of them.
    top_features = features.index[:top_n]

    # Use a scaler that is local to this experiment. Refitting the shared
    # `scaler` here would leave it fitted on a column subset, so it would no
    # longer match the full-feature `model` trained in the baseline cell.
    subset_scaler = StandardScaler()
    X_train_reduced = subset_scaler.fit_transform(X_train[top_features])
    X_test_reduced = subset_scaler.transform(X_test[top_features])

    # Fresh model with the same hyper-parameters and seed as the baseline.
    model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Evaluate on the held-out rows.
    y_pred_reduced = model_reduced.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

    results.append((top_n, accuracy_reduced))

    # Progress line after each model run.
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of every subset size evaluated above.
print("\nFinal List of Accuracies for Each Feature Count:")
for n_features, subset_accuracy in results:
    print(f"Top {n_features} Features: Accuracy = {subset_accuracy:.4f}")



In [None]:
# Retrain the forest on only the top-N most important features and record the
# test accuracy for each N as (top_n, accuracy) tuples.
results = []

# range(15, 20) evaluates subset sizes N = 15, 16, 17, 18 and 19.
for top_n in range(15, 20):
    # `features` is sorted by descending importance, so the first `top_n`
    # index labels are the most important columns. If `top_n` exceeds the
    # number of columns, the slice silently returns all of them.
    top_features = features.index[:top_n]

    # Use a scaler that is local to this experiment. Refitting the shared
    # `scaler` here would leave it fitted on a column subset, so it would no
    # longer match the full-feature `model` trained in the baseline cell.
    subset_scaler = StandardScaler()
    X_train_reduced = subset_scaler.fit_transform(X_train[top_features])
    X_test_reduced = subset_scaler.transform(X_test[top_features])

    # Fresh model with the same hyper-parameters and seed as the baseline.
    model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Evaluate on the held-out rows.
    y_pred_reduced = model_reduced.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

    results.append((top_n, accuracy_reduced))

    # Progress line after each model run.
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of every subset size evaluated above.
print("\nFinal List of Accuracies for Each Feature Count:")
for n_features, subset_accuracy in results:
    print(f"Top {n_features} Features: Accuracy = {subset_accuracy:.4f}")

In [None]:

# Retrain the forest on only the top-N most important features and record the
# test accuracy for each N as (top_n, accuracy) tuples.
results = []

# range(20, 25) evaluates subset sizes N = 20, 21, 22, 23 and 24.
for top_n in range(20, 25):
    # `features` is sorted by descending importance, so the first `top_n`
    # index labels are the most important columns. If `top_n` exceeds the
    # number of columns, the slice silently returns all of them.
    top_features = features.index[:top_n]

    # Use a scaler that is local to this experiment. Refitting the shared
    # `scaler` here would leave it fitted on a column subset, so it would no
    # longer match the full-feature `model` trained in the baseline cell.
    subset_scaler = StandardScaler()
    X_train_reduced = subset_scaler.fit_transform(X_train[top_features])
    X_test_reduced = subset_scaler.transform(X_test[top_features])

    # Fresh model with the same hyper-parameters and seed as the baseline.
    model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Evaluate on the held-out rows.
    y_pred_reduced = model_reduced.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

    results.append((top_n, accuracy_reduced))

    # Progress line after each model run.
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of every subset size evaluated above.
print("\nFinal List of Accuracies for Each Feature Count:")
for n_features, subset_accuracy in results:
    print(f"Top {n_features} Features: Accuracy = {subset_accuracy:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Baseline pipeline (this cell repeats the load / split / scale / train /
# report steps of the notebook's first cell and rebinds the same globals:
# `data`, `X`, `y`, the train/test splits, `scaler`, `model`, `features`).
# ---------------------------------------------------------------------------

# NOTE(review): the path is an empty string here; point it at the CSV file
# before running this cell.
data_path = ''

# Read the CSV, turn +/-inf into NaN, then drop every row containing a NaN.
data = pd.read_csv(data_path).replace([np.inf, -np.inf], np.nan).dropna()

# Target is the 'label' column; every remaining column is a feature.
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features: statistics are fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score the held-out rows.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}", "Classification Report:", report, sep="\n")

# Pair each importance with its column name and order from most to least
# important.
feature_importances = model.feature_importances_
features = pd.Series(feature_importances, index=X.columns).sort_values(ascending=False)

print("Feature Importances:", features, sep="\n")
