# In [None]:
# Step 1: Understand Data
import pandas as pd

# Load the raw employee dataset and print a quick structural overview.
data = pd.read_csv("Employee Hopping.csv")
print(data.head())
print(data.shape)
print(data.columns)
print(data.dtypes)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
# Class balance of the target.
print(data['Target_Column'].value_counts())  # Replace 'Target_Column' with the actual target column name

# Step 2: Extract X and y
# Replace 'Target_Column' with the actual target column name.
X = data.drop(columns=['Target_Column'])
y = data['Target_Column']

# Step 3: Feature Engineering
# One-hot encode the categorical features. Without an explicit `columns`
# argument, get_dummies encodes every object/string/category column, so no
# hand-maintained column list is needed (the previous placeholder list held a
# literal `...`, which is not a valid column label and made the call fail).
X = pd.get_dummies(X)

# Step 4: Check shape of X and y
print(X.shape)
print(y.shape)

# Step 5: Model Development
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest too: with a fixed split but an unseeded model, the reported
# metrics and feature importances would still change from run to run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Step 6: Testing
from sklearn.metrics import accuracy_score, classification_report

# Score the random forest's predictions on the held-out test rows.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print("Accuracy Score:", rf_accuracy)
print("Classification Report:\n", rf_report)

# Step 7: Feature Importance Value
# Pair every encoded feature with its importance from the fitted forest and
# lay the result out as a two-column table (Feature, Importance).
importances = pd.Series(rf_classifier.feature_importances_, index=X.columns)
feature_importance = importances.rename_axis('Feature').reset_index(name='Importance')
print(feature_importance)


# Step 8: Visualize RF Decision Tree using graphviz
from sklearn.tree import export_graphviz
import graphviz

# Choose one tree from the forest (e.g., first tree)
tree_to_visualize = 0
# class_names must list one name per class in ascending class order. Taking
# them from the fitted model keeps the node labels correct for any target
# encoding and any number of classes, instead of assuming a binary
# Stay/Leave target. Feature names are passed as a plain list of strings
# because that form is the most widely accepted across scikit-learn versions
# (NOTE(review): some releases are believed to reject a pandas Index here).
tree_dot_data = export_graphviz(
    rf_classifier.estimators_[tree_to_visualize],
    out_file=None,
    feature_names=[str(name) for name in X.columns],
    class_names=[str(label) for label in rf_classifier.classes_],
    filled=True, rounded=True, special_characters=True,
)

graph = graphviz.Source(tree_dot_data)
graph.render("RandomForestTree")  # Save the visualization as a file

# Step 9: Fit RF models with a range of tree numbers and print Out-Of-Bag error
range_trees = [15, 20, 30, 40, 50, 100, 150, 200, 300, 400]
oob_errors = []

# Use a dedicated, not-yet-fitted forest for the sweep. Re-using the already
# fitted rf_classifier fails: with warm_start=True scikit-learn refuses an
# n_estimators smaller than the number of trees already grown (the default
# forest has 100, the sweep starts at 15). A separate model also leaves
# rf_classifier untouched for the later comparison.
# warm_start=True keeps the trees from the previous iteration and only grows
# the missing ones, so range_trees must stay in ascending order.
oob_forest = RandomForestClassifier(warm_start=True, oob_score=True, random_state=42)

for n_trees in range_trees:
    oob_forest.set_params(n_estimators=n_trees)
    oob_forest.fit(X_train, y_train)
    # OOB error is the complement of the out-of-bag accuracy estimate.
    oob_error = 1 - oob_forest.oob_score_
    oob_errors.append(oob_error)

    print(f"Number of Trees: {n_trees}, Out-Of-Bag Error: {oob_error}")

# Step 10: Plot OOB error for each tree
import matplotlib.pyplot as plt

# Draw the OOB error curve on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(range_trees, oob_errors, marker='o')
ax.set_xlabel("Number of Trees")
ax.set_ylabel("Out-Of-Bag Error")
ax.set_title("Out-Of-Bag Error vs. Number of Trees")
plt.show()


# In [None]:
# Step 11: Compare with Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Create Decision Tree Classifier
# A fixed seed makes the tree (and therefore the RF-vs-DT comparison)
# reproducible; an unseeded tree can differ between runs.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)

# Visualize Decision Tree
# class_names must list one name per class in ascending class order, so they
# are taken from the fitted model rather than assuming a binary Stay/Leave
# target. Feature names are passed as a plain list of strings because that
# form is the most widely accepted across scikit-learn versions.
dt_dot_data = tree.export_graphviz(
    dt_classifier,
    out_file=None,
    feature_names=[str(name) for name in X.columns],
    class_names=[str(label) for label in dt_classifier.classes_],
    filled=True, rounded=True, special_characters=True,
)

dt_graph = graphviz.Source(dt_dot_data)
dt_graph.render("DecisionTree")  # Save the visualization as a file

# Report the decision tree's metrics on the held-out test rows.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Accuracy Score:", dt_accuracy)
print("Decision Tree Classification Report:\n", dt_report)

# Side-by-side accuracy of both models on the same test split.
rf_accuracy_for_comparison = accuracy_score(y_test, y_pred)
print("\nComparison between RF and DT models:")
print("RF Accuracy Score:", rf_accuracy_for_comparison)
print("DT Accuracy Score:", dt_accuracy)
