In [18]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
import requests
import io
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
# Module-level mean imputer for numeric features.
# NOTE(review): no code visible in this notebook references imputer_num (each
# pipeline builds its own SimpleImputer) — confirm it is still needed.
imputer_num = SimpleImputer(missing_values=np.nan, strategy='mean')
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
# Module-level most-frequent-value imputer; read by the cat_imputer helper.
imputer_cat = SimpleImputer(strategy="most_frequent")
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from flytekit import task, workflow
from sklearn.preprocessing import LabelEncoder


In [19]:
# Source CSV. The URL must be the raw version of the file on GitHub, not the
# HTML viewer page.
url = "https://github.com/smadarab/flytelab/raw/main/census.csv"
# Bound the request so an unreachable host cannot hang the kernel forever.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of handing an error page to the CSV parser.
response.raise_for_status()
download = response.content
df = pd.read_csv(io.StringIO(download.decode('utf-8')),sep=',')
print("df is created",df.columns)
# Rows with missing values are deliberately kept; the pipelines defined later
# in the notebook impute them.
print(df.columns)
# `train` is an alias of `df` (same object, not a copy).
train = df

df is created Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')
Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')


In [20]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from joblib import dump, load


In [21]:
# Module-level one-hot encoder that ignores categories unseen at fit time.
# NOTE(review): the one_hot_encode helpers in this notebook each bind their own
# local `ohe`, so this global does not appear to be used — confirm.
ohe = OneHotEncoder(handle_unknown = 'ignore')
# Numeric features fed to the imputer + min-max scaler pipeline.
num_cols = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loos',
    'hour-per-week',
]

# Categorical features fed to the imputer + one-hot encoder pipeline.
cat_cols = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Features that additionally get a log1p-transformed copy.
log_transform_cols = [
    'capital-loos',
    'capital-gain',
]
def get_cat_cols(X):
    """Return the columns of ``X`` named in the module-level ``cat_cols`` list."""
    categorical_part = X[cat_cols]
    return categorical_part
def get_num_cols(X):
    """Return the columns of ``X`` named in the module-level ``num_cols`` list."""
    numeric_part = X[num_cols]
    return numeric_part
def get_log_transform_cols(X):
    """Return the columns of ``X`` named in the module-level ``log_transform_cols`` list."""
    log_part = X[log_transform_cols]
    return log_part
def get_dummies(X):
    """Dummy-encode ``X`` with ``pandas.get_dummies``.

    Prints the type of the incoming object, wraps it in a DataFrame and
    returns the dummy-encoded frame.
    """
    print('\n \n',type(X))
    frame = pd.DataFrame(X)
    return pd.get_dummies(frame)
def one_hot_encode(X):
    """Fit a new one-hot encoder on ``X``, save it, and return the dense encoding.

    A fresh ``OneHotEncoder(handle_unknown='ignore')`` is fitted on every call
    and written to ``onehot.joblib`` in the working directory, so the saved
    file always reflects the most recent input. The encoded array is printed
    before being returned.
    """
    print("one_hot_encode")
    frame = pd.DataFrame(X)
    encoder = OneHotEncoder(handle_unknown = 'ignore')
    encoder.fit(frame)
    # Persist the fitted encoder so it can be reloaded later.
    dump(encoder, 'onehot.joblib')
    encoded = encoder.transform(frame).toarray()
    print('\n \n',encoded)
    return encoded
def label_encode(X):
    """Label-encode every column of ``X`` independently.

    The previous body referenced an undefined name (``cols``), called
    ``fit_transform`` on the whole 2-D input, and returned nothing. This
    version fits a separate ``LabelEncoder`` per column and returns the result.

    Parameters
    ----------
    X : array-like or DataFrame
        Tabular data; it is wrapped in a DataFrame before encoding.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and column labels as ``pd.DataFrame(X)``,
        each column replaced by its integer label codes.
    """
    print("label_encode")
    df = pd.DataFrame(X)
    encoded = pd.DataFrame(index=df.index)
    for column in df.columns:
        # One encoder per column: label codes are only meaningful within a column.
        encoded[column] = LabelEncoder().fit_transform(df[column])
    return encoded
def cat_imputer(X):
    """Print the shape of ``X`` and return it imputed by the module-level ``imputer_cat``.

    The imputer is re-fitted on ``X`` on every call (``fit_transform``).
    """
    print(X.shape)
    imputed = imputer_cat.fit_transform(X)
    return imputed

# Log branch: select the log-transform columns, mean-impute, apply log1p.
log_transform_pipeline = Pipeline(steps=[
    ('get_log_transform_cols',
     FunctionTransformer(func=get_log_transform_cols, validate=False)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('log_transform', FunctionTransformer(func=np.log1p)),
])

# Numeric branch: select the numeric columns, mean-impute, min-max scale.
num_cols_pipeline = Pipeline(steps=[
    ('get_num_cols',
     FunctionTransformer(func=get_num_cols, validate=False)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('min_max_scaler', MinMaxScaler()),
])

# Categorical branch: select the categorical columns, impute with the most
# frequent value, then one-hot encode through the one_hot_encode helper.
cat_cols_pipeline = Pipeline(steps=[
    ('get_cat_cols',
     FunctionTransformer(func=get_cat_cols, validate=False)),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('one_hot_encode',
     FunctionTransformer(func=one_hot_encode, validate=False)),
])

# Concatenate the three branches side by side: log, numeric, categorical.
steps_ = FeatureUnion(transformer_list=[
    ('log_transform', log_transform_pipeline),
    ('num_cols', num_cols_pipeline),
    ('cat_cols', cat_cols_pipeline),
])


In [22]:
# Binary target: 0 for '<=50K', 1 for '>50K'; any other value maps to NaN.
y = train['income'].map({'<=50K': 0, '>50K': 1})

# Single-step pipeline around the feature union, fitted on the training frame.
full_pipeline = Pipeline(steps=[('steps_', steps_)])
X = full_pipeline.fit_transform(train)

one_hot_encode

 
 [[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [28]:
from git import Repo
import  git

# Locate the enclosing git repository from the notebook's working directory
# instead of a machine-specific absolute path, so this cell works on any checkout.
repo = Repo('.', search_parent_directories=True)

In [29]:
# Display the Repo object to confirm which repository was opened.
repo

<git.repo.base.Repo '/Users/apple/flytelab/.git'>

In [35]:
import subprocess
# Stage everything under the current directory, then commit it. Only the
# return code of the last call is displayed; the first is discarded.
# NOTE(review): `git add .` also stages generated artifacts (the recorded output
# shows onehot.joblib being committed) and the commit message is not
# descriptive — consider staging explicit paths and using a meaningful message.
subprocess.call(["git", "add","."])
subprocess.call(["git", "commit","-m","kfbjebfe"])

[my_project 55b638b] kfbjebfe
 3 files changed, 567 insertions(+), 4 deletions(-)
 create mode 100644 projects/my_project/my_project/onehot.joblib
 create mode 100644 projects/my_project/my_project/renamed.ipynb


0

In [36]:
# Push the current branch to its configured remote; the displayed value is
# git's exit code (0 on success).
subprocess.call(["git", "push"])

To https://github.com/smadarab/flytelab
   9e21e82..55b638b  my_project -> my_project


0

In [86]:
# Fixed seed so the train/test split and the boosted ensemble are reproducible
# when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

model = AdaBoostClassifier(n_estimators=300, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
# Replace any remaining NaN in the training features and labels with 0.
# NOTE(review): for the labels this turns rows whose income did not map to 0/1
# into class 0 — confirm that is intended rather than dropping those rows.
X_train = np.nan_to_num(X_train)
y_train = np.nan_to_num(y_train)
print("X_train dimensiona",X_train)
model.fit(X_train, y_train)

X_train dimensiona [[ 0.          0.          0.52054795 ...  1.          0.
   0.        ]
 [ 0.          0.          0.32876712 ...  1.          0.
   0.        ]
 [ 0.          0.          0.36986301 ...  1.          0.
   0.        ]
 ...
 [ 0.          0.          0.10958904 ...  1.          0.
   0.        ]
 [ 0.         10.23383392  0.4109589  ...  1.          0.
   0.        ]
 [ 0.          0.          0.1369863  ...  1.          0.
   0.        ]]


AdaBoostClassifier(n_estimators=300)

# Test part: transform a single hand-built record and predict its income class

In [87]:
# Numeric features fed to the imputer + min-max scaler pipeline.
num_cols = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loos',
    'hour-per-week',
]

# Categorical features fed to the imputer + one-hot encoder pipeline.
cat_cols = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Features that additionally get a log1p-transformed copy.
log_transform_cols = [
    'capital-loos',
    'capital-gain',
]
def get_cat_cols(X):
    """Return the columns of ``X`` named in the module-level ``cat_cols`` list."""
    categorical_part = X[cat_cols]
    return categorical_part
def get_num_cols(X):
    """Return the columns of ``X`` named in the module-level ``num_cols`` list."""
    numeric_part = X[num_cols]
    return numeric_part
def get_log_transform_cols(X):
    """Return the columns of ``X`` named in the module-level ``log_transform_cols`` list."""
    log_part = X[log_transform_cols]
    return log_part
def get_dummies(X):
    """Dummy-encode ``X`` with ``pandas.get_dummies``.

    Prints the type of the incoming object, wraps it in a DataFrame and
    returns the dummy-encoded frame.
    """
    print('\n \n',type(X))
    frame = pd.DataFrame(X)
    return pd.get_dummies(frame)
def one_hot_encode(X):
    """Encode ``X`` with the one-hot encoder saved during training.

    Loads the fitted encoder from ``onehot.joblib`` (the file the training
    version of this helper writes), transforms ``X`` to a dense array, prints
    it and returns it.

    Parameters
    ----------
    X : array-like or DataFrame
        Categorical data, wrapped in a DataFrame before encoding.

    Returns
    -------
    numpy.ndarray
        Dense one-hot encoding of ``X``.
    """
    print("one_hot_encode")
    # The training cell dumps its encoder as 'onehot.joblib'; loading any other
    # file would apply an encoder that was not fitted on the training data.
    # joblib.load unpickles the file — only load artifacts you produced yourself.
    ohe = load('onehot.joblib')
    y = ohe.transform(pd.DataFrame(X)).toarray()
    print('\n \n',y)
    return y
def label_encode(X):
    """Label-encode every column of ``X`` independently.

    The previous body referenced an undefined name (``cols``), called
    ``fit_transform`` on the whole 2-D input, and returned nothing. This
    version fits a separate ``LabelEncoder`` per column and returns the result.

    Parameters
    ----------
    X : array-like or DataFrame
        Tabular data; it is wrapped in a DataFrame before encoding.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and column labels as ``pd.DataFrame(X)``,
        each column replaced by its integer label codes.
    """
    print("label_encode")
    df = pd.DataFrame(X)
    encoded = pd.DataFrame(index=df.index)
    for column in df.columns:
        # One encoder per column: label codes are only meaningful within a column.
        encoded[column] = LabelEncoder().fit_transform(df[column])
    return encoded
def cat_imputer(X):
    """Print the shape of ``X`` and return it imputed by the module-level ``imputer_cat``.

    The imputer is re-fitted on ``X`` on every call (``fit_transform``).
    """
    print(X.shape)
    imputed = imputer_cat.fit_transform(X)
    return imputed

# Log branch: select the log-transform columns, mean-impute, apply log1p.
log_transform_pipeline = Pipeline(steps=[
    ('get_log_transform_cols',
     FunctionTransformer(func=get_log_transform_cols, validate=False)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('log_transform', FunctionTransformer(func=np.log1p)),
])

# Numeric branch: select the numeric columns, mean-impute, min-max scale.
num_cols_pipeline = Pipeline(steps=[
    ('get_num_cols',
     FunctionTransformer(func=get_num_cols, validate=False)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('min_max_scaler', MinMaxScaler()),
])

# Categorical branch: select the categorical columns, impute with the most
# frequent value, then one-hot encode through the one_hot_encode helper.
cat_cols_pipeline = Pipeline(steps=[
    ('get_cat_cols',
     FunctionTransformer(func=get_cat_cols, validate=False)),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('one_hot_encode',
     FunctionTransformer(func=one_hot_encode, validate=False)),
])

# Concatenate the three branches side by side: log, numeric, categorical.
steps_ = FeatureUnion(transformer_list=[
    ('log_transform', log_transform_pipeline),
    ('num_cols', num_cols_pipeline),
    ('cat_cols', cat_cols_pipeline),
])

In [88]:
# Field values for the single example record that is scored below.
# Numeric fields:
age =  50
education_num =  13.0
capital_gain =  0.0
capital_loos = 0.0
hour_per_week = 13.0
# Categorical fields:
workclass = "Self-emp-not-inc"
marital_status = "Married-civ-spouse"
occupation = "Exec-managerial"
relationship = "Husband"
race = "White"
sex = "Male"
native_country ="United-States"
# Fields that are not in num_cols / cat_cols / log_transform_cols, so the
# feature pipelines do not select them:
final_weight = 77516
education="Bachelors"

In [89]:
# Map the example's field values onto the column names used by the census frame.
dict_val = {
    'age': age,
    'workclass': workclass,
    'final-weight': final_weight,
    'education': education,
    'education-num': education_num,
    'marital-status': marital_status,
    'occupation': occupation,
    'relationship': relationship,
    'race': race,
    'sex': sex,
    'capital-gain': capital_gain,
    'capital-loos': capital_loos,
    'hour-per-week': hour_per_week,
    'native-country': native_country,
}
       

In [90]:
# Build a one-row DataFrame (index 0) from the example record.
# NOTE(review): this rebinds X_train, which earlier held the training feature
# array, to a single inference row — a distinct name would avoid confusion.
X_train = pd.DataFrame(dict_val,index=[0])

In [91]:
# Display the one-row example frame.
X_train

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country
0,50,Self-emp-not-inc,77516,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States


In [92]:
# Rebuild the single-step pipeline around the feature union and run it on the
# one-row example frame.
# NOTE(review): fit_transform re-fits the imputers and the MinMaxScaler on this
# single row, so the numeric features are not scaled with the training data's
# statistics — presumably the pipeline fitted on the training frame should be
# reused with transform() instead; confirm.
full_pipeline = Pipeline([('steps_', steps_)])
X = full_pipeline.fit_transform(X_train)

one_hot_encode

 
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]




In [94]:
# Predict the income class for the transformed example row.
# NOTE(review): this rebinds y, which earlier held the training labels.
y = model.predict(X)

In [95]:
# Display the predicted class (0 encodes '<=50K', 1 encodes '>50K' per the
# label mapping used for training).
y

array([0.])