# Step 1: Load the Data from a Single Cell

In [None]:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Parse the raw survey CSV text into a pandas DataFrame.
# NOTE(review): `survey_results_public` is not defined in the visible cells;
# it must already hold the CSV contents as a string -- confirm where it is set.
csv_buffer = io.StringIO(survey_results_public)
df = pd.read_csv(csv_buffer)

# Preview the leading rows to get a feel for the columns and their values
df.head()


# Step 2: Data Preprocessing

In [None]:
# Select relevant columns. The explicit .copy() gives df_cleaned its own data,
# so the column assignments below never write into a slice of `df` (the
# pattern that triggers pandas' SettingWithCopyWarning).
selected_columns = ['YearsCode', 'Age', 'ConvertedCompYearly', 'RemoteWork', 'DevType', 'Employment']
df_cleaned = df[selected_columns].copy()

# The classifier needs numeric inputs, so coerce the numeric features BEFORE
# dropping missing rows; anything that cannot be interpreted becomes NaN and is
# removed by dropna() below instead of crashing model.fit() later.
# NOTE(review): the text labels handled here ('Less than 1 year',
# 'More than 50 years', bracketed ages such as '25-34 years old') are assumed
# from the Stack Overflow survey format -- confirm against the actual data.
years_code = df_cleaned['YearsCode'].replace({'Less than 1 year': '0', 'More than 50 years': '51'})
df_cleaned['YearsCode'] = pd.to_numeric(years_code, errors='coerce')

# Age: keep values that are already numeric; for text labels fall back to the
# first integer that appears in the label (e.g. '25-34 years old' -> 25).
age_numeric = pd.to_numeric(df_cleaned['Age'], errors='coerce')
age_first_number = pd.to_numeric(
    df_cleaned['Age'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)
df_cleaned['Age'] = age_numeric.fillna(age_first_number)

df_cleaned['ConvertedCompYearly'] = pd.to_numeric(df_cleaned['ConvertedCompYearly'], errors='coerce')

# Drop rows with missing (or uninterpretable) values
df_cleaned = df_cleaned.dropna()

# Encoding the 'RemoteWork' column (the prediction target).
# le_remote is kept so predictions can be mapped back to the original labels.
le_remote = LabelEncoder()
df_cleaned['RemoteWorkEncoded'] = le_remote.fit_transform(df_cleaned['RemoteWork'])

# Encoding the 'DevType' column (developer type)
# This column contains multiple categories, so we simplify it into a binary
# flag: 1 when the text contains 'Developer' (case-sensitive), otherwise 0.
df_cleaned['DevTypeEncoded'] = (
    df_cleaned['DevType'].astype(str).str.contains('Developer', regex=False).astype(int)
)

# Encoding 'Employment' (full-time, part-time, etc.)
le_employment = LabelEncoder()
df_cleaned['EmploymentEncoded'] = le_employment.fit_transform(df_cleaned['Employment'])

# Feature selection: We will use 'YearsCode', 'Age', 'ConvertedCompYearly', 'DevTypeEncoded', and 'EmploymentEncoded' as the features.
X = df_cleaned[['YearsCode', 'Age', 'ConvertedCompYearly', 'DevTypeEncoded', 'EmploymentEncoded']]
y = df_cleaned['RemoteWorkEncoded']


# Step 3: Train-Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the (rows, columns) shape of the training and test feature sets
(X_train.shape, X_test.shape)


# Step 4: Model Training

In [None]:
# Build the Random Forest classifier with a fixed seed and fit it on the
# training split (fit() returns the fitted estimator itself)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted (encoded) RemoteWork labels for the held-out test features
y_pred = model.predict(X_test)


# Step 5: Model Evaluation

In [None]:
# Use the encoder's full class list so the report and the matrix always cover
# every RemoteWork category (even one missing from the test split) and show the
# original category names instead of opaque integer codes.
class_names = [str(name) for name in le_remote.classes_]
class_ids = list(range(len(class_names)))

# Print classification report for detailed metrics.
# zero_division=0 reports 0 (without a warning) for a class that has no
# predicted or true samples in the test split.
report = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
)
print("Classification Report:\n", report)

# Print accuracy score
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix: rows are the actual classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


# Step 6: Prediction

In [None]:
# One example developer, described with the same feature columns (and in the
# same order) that the model was trained on.
example_developer = {
    'YearsCode': 5,                # 5 years of coding experience
    'Age': 30,                     # 30 years old
    'ConvertedCompYearly': 95000,  # yearly compensation of 95,000
    'DevTypeEncoded': 1,           # 1 = DevType text contains 'Developer'
    'EmploymentEncoded': 0,        # integer code assigned by le_employment; see le_employment.classes_
}
sample = pd.DataFrame([example_developer])

# Ask the trained model for the encoded RemoteWork class of this developer
prediction = model.predict(sample)

# Map the encoded class back to its original RemoteWork label
predicted_work_preference = le_remote.inverse_transform(prediction)

print("Predicted Remote Work Preference:", predicted_work_preference[0])
