In [7]:
# Import the modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [8]:
# Load the cleaned adult-income dataset. The target column is forced to `str`
# so its 0/1 values are kept as string class labels instead of being inferred
# as a numeric dtype.
csv_path = "Resources/adult_cleaned.csv"
column_dtypes = {'Income over 50k? 0=no 1=yes': str}
adult_cleaned_df = pd.read_csv(csv_path, dtype=column_dtypes)

# Summarize the columns, non-null counts, and dtypes of the loaded frame.
adult_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30718 entries, 0 to 30717
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   age                          30718 non-null  int64 
 1   workclass                    30718 non-null  object
 2   education                    30718 non-null  object
 3   marital.status               30718 non-null  object
 4   occupation                   30718 non-null  object
 5   relationship                 30718 non-null  object
 6   race                         30718 non-null  object
 7   sex                          30718 non-null  object
 8   hours.per.week               30718 non-null  int64 
 9   Income over 50k? 0=no 1=yes  30718 non-null  object
dtypes: int64(2), object(8)
memory usage: 2.3+ MB


In [9]:
# Preview the dataset as the cell's rich output. For a frame this large the
# rendered table is abbreviated: only the leading and trailing rows are shown,
# with an elided "..." row in between.
adult_cleaned_df

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,hours.per.week,Income over 50k? 0=no 1=yes
0,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,18,0
1,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,40,0
2,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,40,0
3,34,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,45,0
4,38,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,40,0
...,...,...,...,...,...,...,...,...,...,...
30713,22,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,40,0
30714,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,0
30715,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,1
30716,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,0


In [10]:
# --- Features / target -------------------------------------------------------
# The income flag is the label; every other column is a predictor. Object
# (categorical) predictors are expanded into one-hot indicator columns.
target_column = 'Income over 50k? 0=no 1=yes'
y = adult_cleaned_df[target_column]
X = pd.get_dummies(adult_cleaned_df.drop(columns=[target_column]))

# --- Train / test split ------------------------------------------------------
# Fixed random_state keeps the partition reproducible across runs.
# NOTE(review): the split is not stratified on y; consider `stratify=y` if the
# class proportions in train and test need to match -- confirm before changing,
# since it would alter the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# --- Scaling -----------------------------------------------------------------
# The scaler is fitted on the training rows only, then the same fitted
# transformation is applied to both partitions so no test-set statistics are
# used when fitting.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# --- Model -------------------------------------------------------------------
# Logistic regression (lbfgs solver, up to 500 iterations) fitted on the
# scaled training data.
classifier = LogisticRegression(solver='lbfgs', random_state=1, max_iter=500)
classifier.fit(X_train, y_train)

# --- Evaluation --------------------------------------------------------------
# Predict labels for the held-out rows and report the fraction predicted
# correctly.
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model accuracy: {accuracy:.2f}")


Model accuracy: 0.83


In [11]:
# Build the confusion matrix from the true test labels and the model's
# predictions, then print it under a heading.
cm = confusion_matrix(y_test, predictions)
print('Confusion Matrix:', cm, sep='\n')

Confusion Matrix:
[[5276  476]
 [ 829 1099]]


In [12]:
# Build the per-class precision / recall / f1 summary for the test set and
# print it under a heading.
report_text = classification_report(y_test, predictions)
print('Classification Report:', report_text, sep='\n')

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      5752
           1       0.70      0.57      0.63      1928

    accuracy                           0.83      7680
   macro avg       0.78      0.74      0.76      7680
weighted avg       0.82      0.83      0.82      7680

