# Exercise : 1

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Load the dataset
data = pd.read_csv('HR_comma_sep.csv')

# Inspect the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print(data.head())
data.info()
print(data.describe())

# Separate the target from the predictors BEFORE any feature engineering.
# 'left' is a numeric column, so selecting numeric dtypes from the full frame
# would place the target itself inside X and leak the answer to the model.
y = data['left']  # Assuming 'left' is the target variable indicating if the employee left
predictors = data.drop(columns=['left'])

# Numeric predictors that will be rescaled
numeric_features = predictors.select_dtypes(include=['int64', 'float64'])
numeric_columns = numeric_features.columns

# Convert categorical features into dummy/indicator variables
categorical_features = predictors.select_dtypes(include=['object'])
dummies = pd.get_dummies(categorical_features, drop_first=True)

# Combine the (still unscaled) numeric features with the dummies
final_data = pd.concat([numeric_features, dummies], axis=1)
X = final_data

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numeric features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no statistics from the held-out data influence the preprocessing.
# The original row index is kept so the features stay aligned with y_train/y_test.
scaler = StandardScaler()
X_train = pd.concat(
    [
        pd.DataFrame(
            scaler.fit_transform(X_train[numeric_columns]),
            columns=numeric_columns,
            index=X_train.index,
        ),
        X_train[dummies.columns],
    ],
    axis=1,
)
X_test = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_test[numeric_columns]),
            columns=numeric_columns,
            index=X_test.index,
        ),
        X_test[dummies.columns],
    ],
    axis=1,
)

# Create and train the logistic regression model
model = LogisticRegression(solver='liblinear', max_iter=200)  # Using 'liblinear' for small datasets
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Confusion matrix, precision, and recall (positive class is left == 1)
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Confusion Matrix:")
print(cm)
print(f"Precision: {precision}")
print(f"Recall: {recall}")

   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2                   157   
1                0.80             0.86               5                   262   
2                0.11             0.88               7                   272   
3                0.72             0.87               5                   223   
4                0.37             0.52               2                   159   

   time_spend_company  Work_accident  left  promotion_last_5years  sales  \
0                   3              0     1                      0  sales   
1                   6              0     1                      0  sales   
2                   4              0     1                      0  sales   
3                   5              0     1                      0  sales   
4                   3              0     1                      0  sales   

   salary  
0     low  
1  medium  
2  medium  
3     low  
4 

# Exercise : 2

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('HR_comma_sep.csv')

# Inspect the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print(data.head())
data.info()
print(data.describe())

# Separate the target from the predictors BEFORE any feature engineering.
# 'left' is a numeric column, so selecting numeric dtypes from the full frame
# would place the target itself inside X.
y = data['left']  # Assuming 'left' is the target variable indicating if the employee left
predictors = data.drop(columns=['left'])

# Numeric predictors that will be rescaled
numeric_features = predictors.select_dtypes(include=['int64', 'float64'])
numeric_columns = numeric_features.columns

# Convert categorical features into dummy/indicator variables
categorical_features = predictors.select_dtypes(include=['object'])
dummies = pd.get_dummies(categorical_features, drop_first=True)

# Combine the (still unscaled) numeric features with the dummies
final_data = pd.concat([numeric_features, dummies], axis=1)
X = final_data

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numeric features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no statistics from the held-out data influence the preprocessing.
# The original row index is kept so the features stay aligned with y_train/y_test.
scaler = StandardScaler()
X_train = pd.concat(
    [
        pd.DataFrame(
            scaler.fit_transform(X_train[numeric_columns]),
            columns=numeric_columns,
            index=X_train.index,
        ),
        X_train[dummies.columns],
    ],
    axis=1,
)
X_test = pd.concat(
    [
        pd.DataFrame(
            scaler.transform(X_test[numeric_columns]),
            columns=numeric_columns,
            index=X_test.index,
        ),
        X_test[dummies.columns],
    ],
    axis=1,
)

# Display shapes of the resulting datasets
print(f'Train features shape: {X_train.shape}')
print(f'Test features shape: {X_test.shape}')
print(f'Train target shape: {y_train.shape}')
print(f'Test target shape: {y_test.shape}')

   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2                   157   
1                0.80             0.86               5                   262   
2                0.11             0.88               7                   272   
3                0.72             0.87               5                   223   
4                0.37             0.52               2                   159   

   time_spend_company  Work_accident  left  promotion_last_5years  sales  \
0                   3              0     1                      0  sales   
1                   6              0     1                      0  sales   
2                   4              0     1                      0  sales   
3                   5              0     1                      0  sales   
4                   3              0     1                      0  sales   

   salary  
0     low  
1  medium  
2  medium  
3     low  
4 