In [1]:
from pickle import dump, load

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Read the cleaned income-evaluation CSV into a DataFrame and display it
income_csv_path = "../Data/income_evaluation_cleaned.csv"
df_income = pd.read_csv(income_csv_path)
df_income

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
30157,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K
30158,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K
30159,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K
30160,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K


In [2]:
# Transform category columns
def changeWorkclass(workclass):
    """Return the integer code for a workclass label.

    The code is the position of the label in ``known_labels`` (0 for
    "Private" up to 6 for "Without-pay"). Any other value yields 999.
    """
    known_labels = (
        "Private",
        "Self-emp-not-inc",
        "Local-gov",
        "State-gov",
        "Self-emp-inc",
        "Federal-gov",
        "Without-pay",
    )
    for code, label in enumerate(known_labels):
        if workclass == label:
            return code
    # Unrecognised label
    return 999
    
def changeEducation(education):
    """Return the integer code for an education label.

    The code is the position of the label in ``known_labels`` (0 for
    "Preschool" up to 15 for "Doctorate"). Any other value yields 999.
    """
    known_labels = (
        "Preschool",
        "1st-4th",
        "5th-6th",
        "7th-8th",
        "9th",
        "10th",
        "11th",
        "12th",
        "HS-grad",
        "Some-college",
        "Assoc-voc",
        "Assoc-acdm",
        "Bachelors",
        "Masters",
        "Prof-school",
        "Doctorate",
    )
    for code, label in enumerate(known_labels):
        if education == label:
            return code
    # Unrecognised label
    return 999

def changeMarital(marital):
    """Return the integer code for a marital-status label.

    The code is the position of the label in ``known_labels`` (0 for
    "Divorced" up to 6 for "Widowed"). Any other value yields 999.
    """
    known_labels = (
        "Divorced",
        "Married-AF-spouse",
        "Married-civ-spouse",
        "Married-spouse-absent",
        "Never-married",
        "Separated",
        "Widowed",
    )
    for code, label in enumerate(known_labels):
        if marital == label:
            return code
    # Unrecognised label
    return 999

def changeOccupation(occupation):
    """Return the integer code for an occupation label.

    The code is the position of the label in ``known_labels`` (0 for
    "Adm-clerical" up to 13 for "Transport-moving"). Any other value
    yields 999.
    """
    known_labels = (
        "Adm-clerical",
        "Armed-Forces",
        "Craft-repair",
        "Exec-managerial",
        "Farming-fishing",
        "Handlers-cleaners",
        "Machine-op-inspct",
        "Other-service",
        "Priv-house-serv",
        "Prof-specialty",
        "Protective-serv",
        "Sales",
        "Tech-support",
        "Transport-moving",
    )
    for code, label in enumerate(known_labels):
        if occupation == label:
            return code
    # Unrecognised label
    return 999
    
def changeRelationship(relationship):
    """Return the integer code for a relationship label.

    The code is the position of the label in ``known_labels`` (0 for
    "Husband" up to 5 for "Other-relative"). Any other value yields 999.
    """
    known_labels = (
        "Husband",
        "Wife",
        "Not-in-family",
        "Own-child",
        "Unmarried",
        "Other-relative",
    )
    for code, label in enumerate(known_labels):
        if relationship == label:
            return code
    # Unrecognised label
    return 999

def changeRace(race):
    """Return the integer code for a race label.

    The code is the position of the label in ``known_labels`` (0 for
    "White" up to 4 for "Other"). Any other value yields 999.
    """
    known_labels = (
        "White",
        "Black",
        "Asian-Pac-Islander",
        "Amer-Indian-Eskimo",
        "Other",
    )
    for code, label in enumerate(known_labels):
        if race == label:
            return code
    # Unrecognised label
    return 999
    
def changeSex(sex):
    """Return 0 for "Female", 1 for "Male", and 999 for any other value."""
    known_labels = ("Female", "Male")
    for code, label in enumerate(known_labels):
        if sex == label:
            return code
    # Unrecognised label
    return 999

def changeCountry(country):
    """Return the integer code for a native-country label.

    The code is the position of the label in ``known_labels`` (0 for
    "United-States" up to 40 for "Holand-Netherlands"). Any other value
    yields 999. Labels are compared exactly as spelled below.
    """
    known_labels = (
        "United-States",
        "Mexico",
        "Philippines",
        "Germany",
        "Puerto-Rico",
        "Canada",
        "El-Salvador",
        "India",
        "Cuba",
        "England",
        "Jamaica",
        "South",
        "China",
        "Italy",
        "Dominican-Republic",
        "Vietnam",
        "Guatemala",
        "Japan",
        "Poland",
        "Columbia",
        "Haiti",
        "Taiwan",
        "Iran",
        "Portugal",
        "Nicaragua",
        "Peru",
        "Greece",
        "France",
        "Ecuador",
        "Ireland",
        "Hong",
        "Cambodia",
        "Trinadad&Tobago",
        "Laos",
        "Thailand",
        "Yugoslavia",
        "Outlying-US(Guam-USVI-etc)",
        "Hungary",
        "Honduras",
        "Scotland",
        "Holand-Netherlands",
    )
    for code, label in enumerate(known_labels):
        if country == label:
            return code
    # Unrecognised label
    return 999

In [3]:
# Encode every categorical column of df_income with its change* function.
category_encoders = {
    "workclass": changeWorkclass,
    "education": changeEducation,
    "marital-status": changeMarital,
    "occupation": changeOccupation,
    "relationship": changeRelationship,
    "race": changeRace,
    "sex": changeSex,
    "native-country": changeCountry,
}
for column, encoder in category_encoders.items():
    # The columns are overwritten in place. Skip a column that is already
    # numeric so that re-running this cell does not map the integer codes to
    # the 999 "unknown" code.
    if not pd.api.types.is_numeric_dtype(df_income[column]):
        df_income[column] = df_income[column].apply(encoder)
df_income.head(5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,3,12,4,0,2,0,1,40,0,<=50K
1,50,1,12,2,3,0,0,1,13,0,<=50K
2,38,0,8,0,5,2,0,1,40,0,<=50K
3,53,0,6,2,5,0,1,1,40,0,<=50K
4,28,0,12,2,9,1,1,0,40,8,<=50K


In [4]:
# Print the distinct codes of each encoded column; a 999 among them would mean
# a label that the change* functions did not recognise.
encoded_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
for column in encoded_columns:
    print(f"{column}: {df_income[column].unique()}")

workclass: [3 1 0 5 2 4 6]
education: [12  8  6 13  4  9 11  3 15 10 14  2  5  0  7  1]
marital-status: [4 2 0 3 5 1 6]
occupation: [ 0  3  5  9  7 11 13  4  6 12  2 10  1  8]
relationship: [2 0 1 3 4 5]
race: [0 1 2 3 4]
sex: [1 0]
native-country: [ 0  8 10  7  1  4 38  9  5  3 22  2 18 19 31 34 28 33 21 20 23 14  6 27
 16 13 12 11 17 35 25 36 39 32 26 24 15 30 29 37 40]


In [5]:
# Separate the features from the target ('income') as arrays
target_column = 'income'
X = df_income.drop(columns=target_column).values
y = df_income[target_column].values

# Hold out a test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Define the scaling function
# The scaler is fitted on the training features only.
# X_scaler is the value returned by scaler.fit().
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale the training features and report the resulting shape
X_train_scaled = X_scaler.transform(X_train)
print(X_train_scaled.shape)

(22621, 10)


In [6]:
# Fit the label encoder on the training labels and convert them to integer
# class indices in one step. LabelEncoder is imported in the first cell with
# the other imports.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

### Define the model
# Logistic regression trained on the scaled training features.
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train_encoded)

LogisticRegression(random_state=1)

In [7]:
# Save the model to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server.
# The `with` block closes (and flushes) the file as soon as the dump is done.
with open('../pkl/logisticregression_new.pkl', 'wb') as model_file:
    dump(model, model_file)

# Save the scaling function to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server.
with open('../pkl/scaler_new.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [8]:
###---------- run to test the model ---------
# Income labels, indexed by the integer class the model predicts.
# NOTE(review): assumes this order matches the LabelEncoder fitted during
# training -- confirm against le.classes_.
predict_labels = ['<=50K','>50K']

# Load the model. Only unpickle files written by this notebook: pickle can
# execute arbitrary code while loading.
# NOTE: this rebinds the name of the imported LogisticRegression class to the
# loaded model instance. The name is kept because the prediction cell calls
# LogisticRegression.predict(); the class must be re-imported before the
# training cell can be run again.
with open('../pkl/logisticregression_new.pkl', 'rb') as model_file:
    LogisticRegression = load(model_file)

# Load the scaler.
with open('../pkl/scaler_new.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)

# Input data: one raw (not yet encoded) row, with the features in the same
# order as the training columns.
input_row1 = [[39, "State-gov", "Bachelors", "Never-married", "Adm-clerical", "Not-in-family", "White", "Male", 40, "United-States"]] # <=50K
# input_row2 = [[28, "Private", "Bachelors", "Married-civ-spouse", "Prof-specialty", "Wife", "Black", "Female", 40, "Cuba"]]  # <=50K


input_row1_df = pd.DataFrame(
    input_row1,
    columns=[
        "age",
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "hours-per-week",
        "native-country",
    ],
)
input_row1_df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States


In [9]:
# Encode every categorical column of the input row with the same change*
# functions that were applied to the training data.
category_encoders = {
    "workclass": changeWorkclass,
    "education": changeEducation,
    "marital-status": changeMarital,
    "occupation": changeOccupation,
    "relationship": changeRelationship,
    "race": changeRace,
    "sex": changeSex,
    "native-country": changeCountry,
}
for column, encoder in category_encoders.items():
    # The columns are overwritten in place. Skip a column that is already
    # numeric so that re-running this cell does not map the integer codes to
    # the 999 "unknown" code.
    if not pd.api.types.is_numeric_dtype(input_row1_df[column]):
        input_row1_df[column] = input_row1_df[column].apply(encoder)
input_row1_df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,39,3,12,4,0,2,0,1,40,0


In [10]:
# Scale the encoded input row with the loaded scaler. The scaler was fitted on
# a plain array (no column names), so pass an array here as well;
# scikit-learn warns when a DataFrame with feature names is given to an
# estimator that was fitted without them.
input_row1_scaled = scaler.transform(input_row1_df.to_numpy())
print(f"printing input row1 scaled: {input_row1_scaled}")
# Predict the class index with the loaded model (bound to the name
# LogisticRegression) and map it to its income label.
predict = LogisticRegression.predict(input_row1_scaled)
print(f'Prediction 1 is: {predict_labels[predict[0]]}')

printing input row1 scaled: [[ 0.04328778  1.8201091   1.13053162  0.94606574 -1.47727866  0.28238756
  -0.34809437  0.69683916 -0.07632394 -0.21991236]]
Prediction 1 is: <=50K
