In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from pickle import dump, load

# Load the dataset
df_income = pd.read_csv("../Data/income_evaluation_cleaned.csv")

In [2]:
income_cat = ['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [3]:
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_income[income_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(income_cat)

# Merge one-hot encoded features and drop the originals
df_income = df_income.merge(encode_df, left_index=True, right_index=True).drop(income_cat,1)
df_income

Unnamed: 0,age,hours-per-week,income,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,40,<=50K,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50,13,<=50K,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,38,40,<=50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,53,40,<=50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,28,40,<=50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,27,38,<=50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30158,40,40,>50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30159,58,40,<=50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30160,22,20,<=50K,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
# Split our preprocessed data into our features and target arrays
X = df_income.drop(columns='income').values
y = df_income['income'].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Define the scaling function 
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Transform the training set
X_train_scaled = X_scaler.transform(X_train)


# Define the label encoder and fit it to the training set
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_train)

# Transform the labels of the training set
y_train_encoded = le.transform(y_train)

### Define the model
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train_encoded)

LogisticRegression(random_state=1)

In [5]:
# Save the model to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
dump(model, open('logisticregression.pkl', 'wb'))

# Save the scaling function to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
dump(scaler, open('scaler.pkl', 'wb'))

In [29]:
###---------- run to test the model ---------
# Define prediction labels.
predict_labels = ['<=50K','>50K']

# Load the model.
LogisticRegression = load(open('logisticregression.pkl', 'rb'))

# Load the scaler.
scaler = load(open('scaler.pkl', 'rb'))

# InputData
input_row1 = [[39, "State-gov", "Bachelors", "Never-married", "Adm-clerical", "Not-in-family", "White", "Male", 40, "United-States"]] # <=50K
# input_row2 = [[28, "Private", "Bachelors", "Married-civ-spouse", "Prof-specialty", "Wife", "Black", "Female", 40, "Cuba"]]  # <=50K


input_row1_df = pd.DataFrame(input_row1, columns=["age", "workclass", "education", "marital-status", "occupation", "relationship",  "race", "sex", "hours-per-week", "native-country"]
)
input_row1_df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States


In [31]:
# Encode user input
income_cat = ['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(input_row1_df[income_cat]))

# # Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(income_cat)

# # Merge one-hot encoded features and drop the originals
input_row1_df = input_row1_df.merge(encode_df, left_index=True, right_index=True).drop(income_cat,1)

In [34]:
input_row1_df

Unnamed: 0,age,hours-per-week,workclass_State-gov,education_Bachelors,marital-status_Never-married,occupation_Adm-clerical,relationship_Not-in-family,race_White,sex_Male,native-country_United-States
0,39,40,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
# 2. Transform each input using the scaler function.
input_row1_scaled = scaler.transform(input_row1_df)
print(f"printing input row1 scaled: {input_row1_scaled}")
# 3. Make a prediction for each input.
predict = LogisticRegression.predict(input_row1_scaled)
print(f'Prediction 1 is: {predict_labels[predict[0]]}')

ValueError: X has 10 features, but StandardScaler is expecting 100 features as input.