In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Path to the dataset on Kaggle
data_path = "/kaggle/input/the-impacts-of-working-remotely-and-in-an-office/the-impacts-of-working-remotely-and-in-an-office.csv"

# Load the dataset
data = pd.read_csv(data_path)

# Explore the data: first rows, column dtypes / non-null counts, summary statistics.
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would emit an extra "None" line.
print(data.head())
data.info()
print(data.describe())

# Columns used by the models: the predictors and the target.
features = ['age', 'gender', 'occupation', 'time_bp', 'time_dp', 'like_hw', 'dislike_hw', 'prod_inc', 'relaxed']
target = 'prefer'

# Handle missing values (if any).
# Only rows with a missing value in a column that is actually used (predictors or
# target) are dropped; a NaN in an unrelated column no longer discards the row.
# The result is rebound to `data` instead of mutating the frame in place.
data = data.dropna(subset=features + [target])

# Select the relevant columns
X = data[features]
y = data[target]  # Target column

# Convert categorical variables to numeric indicator columns
# (drop_first=True omits the first level of each categorical column).
X = pd.get_dummies(X, drop_first=True)

# Split the data into train (70%) and test (30%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline Random Forest: default hyperparameters, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Acurácia do Random Forest inicial: {accuracy_rf * 100:.2f}%")

# Hyperparameter search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by accuracy.
# n_jobs=-1 runs the candidate fits in parallel; verbose=2 logs each fit.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
).fit(X_train, y_train)
print(f"Melhores hiperparâmetros: {grid_search.best_params_}")

# Evaluate the estimator selected by the search on the held-out test split
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f"Acurácia do Random Forest otimizado: {accuracy_best_rf * 100:.2f}%")

# Trying other algorithms on the same train/test split

# Logistic Regression (iteration cap raised to 1000)
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"Acurácia da Regressão Logística: {accuracy_logistic * 100:.2f}%")

# Support Vector Machine with a linear kernel
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Acurácia do SVM: {accuracy_svm * 100:.2f}%")

# Visualizations

# Count of each preference, split by gender.
# countplot returns the Axes it drew on, so the title is set on that object.
ax = sns.countplot(data=data, x='prefer', hue='gender')
ax.set_title('Preferência por Gênero (Home x Office)')
plt.show()

# Count of each preference, split by occupation, on a wider 10x5 figure
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='prefer', hue='occupation', ax=ax)
ax.set_title('Preferência por Ocupação (Home x Office)')
# Tilt the x tick labels by 45 degrees
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# like_hw against prod_inc, with points coloured by the stated preference
ax = sns.scatterplot(data=data, x='like_hw', y='prod_inc', hue='prefer', palette='viridis')
ax.set(
    title='Relação entre Preferência, Produtividade e Satisfação com o Home Office',
    xlabel='Satisfação com o Home Office',
    ylabel='Aumento de Produtividade',
)
plt.show()

# Confusion matrix for the best model.
# The class order is read from the fitted estimator and passed explicitly to
# confusion_matrix, and the same sequence labels both heatmap axes. Every row and
# column is therefore named after the class it actually counts; hard-coded names
# would be wrong whenever the label order or the number of classes differs.
class_labels = list(best_rf_model.classes_)
cm = confusion_matrix(y_test, y_pred_best_rf, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de Confusão do Random Forest Otimizado')
plt.ylabel('Classe Real')      # rows: true class
plt.xlabel('Classe Predita')   # columns: predicted class
plt.show()

# Text classification report for the tuned Random Forest on the test split
report_best_rf = classification_report(y_test, y_pred_best_rf)
print("\nRelatório de Classificação para o Random Forest Otimizado:")
print(report_best_rf)

# Compare accuracy across the models
model_names = ['Random Forest', 'Logistic Regression', 'SVM']
accuracies = [accuracy_best_rf, accuracy_logistic, accuracy_svm]

# The stored accuracies are fractions (they are multiplied by 100 when printed
# earlier), while the y-axis label states percent. Convert before plotting so
# the bar heights agree with the '(%)' unit.
accuracies_pct = [acc * 100 for acc in accuracies]

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies_pct, palette='viridis')
plt.title('Comparação de Acurácia entre Modelos')
plt.ylabel('Acurácia (%)')
plt.show()
