In [1]:
import pandas as pd

In [2]:
from aux_functions import generate_language_features

_ = generate_language_features(pd.DataFrame(["Eirik", "Eirika"], columns=["Name"]))
_.head(5)

Unnamed: 0,Name,final_letter_is_vowel,syllable_count,last_1_letter,last_2_letter,last_3_letter
0,EIRIK,0,1,K,IK,RIK
1,EIRIKA,1,2,A,KA,IKA


In [3]:
from sklearn.preprocessing import OneHotEncoder

def get_one_hot_encoder(df, 
                categorical_columns = ['last_1_letter',	'last_2_letter', 'last_3_letter']
                ):
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)    
    encoder.fit(df[categorical_columns])
    print(encoder)
    
    return encoder

encoder = get_one_hot_encoder(_)

OneHotEncoder(handle_unknown='ignore', sparse=False)


In [4]:

print("Names goes in, this comes out:")
nuff = generate_language_features(pd.DataFrame(["Eirik", "Eirika"], columns=["Name"]))#.drop(['Name'],axis=1)
nuff.head()

ohe = encoder.transform(nuff[['last_1_letter',	'last_2_letter', 'last_3_letter']])
print("encoder transformed")
print(ohe)
encoder.get_feature_names_out()

ohe_df = pd.DataFrame(ohe, columns=encoder.get_feature_names_out())
ohe_df

#tt = nuff.append(ohe_df, axis=0)
tt = pd.concat([nuff, ohe_df], axis=1)
tt

Names goes in, this comes out:


Unnamed: 0,Name,final_letter_is_vowel,syllable_count,last_1_letter,last_2_letter,last_3_letter
0,EIRIK,0,1,K,IK,RIK
1,EIRIKA,1,2,A,KA,IKA


encoder transformed
[[0. 1. 1. 0. 0. 1.]
 [1. 0. 0. 1. 1. 0.]]


array(['last_1_letter_A', 'last_1_letter_K', 'last_2_letter_IK',
       'last_2_letter_KA', 'last_3_letter_IKA', 'last_3_letter_RIK'],
      dtype=object)

Unnamed: 0,last_1_letter_A,last_1_letter_K,last_2_letter_IK,last_2_letter_KA,last_3_letter_IKA,last_3_letter_RIK
0,0.0,1.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,1.0,0.0


Unnamed: 0,Name,final_letter_is_vowel,syllable_count,last_1_letter,last_2_letter,last_3_letter,last_1_letter_A,last_1_letter_K,last_2_letter_IK,last_2_letter_KA,last_3_letter_IKA,last_3_letter_RIK
0,EIRIK,0,1,K,IK,RIK,0.0,1.0,1.0,0.0,0.0,1.0
1,EIRIKA,1,2,A,KA,IKA,1.0,0.0,0.0,1.0,1.0,0.0


In [5]:
# Lets make one for all names i ntk
from ntk import Ntk

def load_gendered_names():
    ntk = Ntk()

    m = [n for n in ntk.gutter if n not in ntk.jenter]
    f = [n for n in ntk.jenter if n not in ntk.gutter]

    names = m + f
    genders = ['Male' for n in m] + ["Female" for n in f]

    names_df = pd.DataFrame([(k, v) for k,v in zip(names, genders)], columns=['Name', 'Gender'])
    return names_df


df = load_gendered_names()
df.head(2)
len(df)


Unnamed: 0,Name,Gender
0,Sakir,Male
1,Jef,Male


5492

In [6]:
names_features_df = generate_language_features(load_gendered_names())
names_features_df.head(3)
len(names_features_df)

Unnamed: 0,Name,Gender,final_letter_is_vowel,syllable_count,last_1_letter,last_2_letter,last_3_letter
0,SAKIR,Male,0,1,R,IR,KIR
1,JEF,Male,0,1,F,EF,JEF
2,HOLGER,Male,0,2,R,ER,GER


5492

In [7]:
ohe = get_one_hot_encoder(names_features_df, 
        categorical_columns=[n for n in names_features_df.columns if n.startswith("last_")])

OneHotEncoder(handle_unknown='ignore', sparse=False)


In [8]:
cat_features = names_features_df[[n for n in names_features_df.columns if n.startswith("last_")]]
encoded_features = ohe.transform(cat_features)
#encoded_features


ohe_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out())

#ohe_df

non_categorical = names_features_df[[n for n in names_features_df.columns if not n.startswith("last_")]]

df_transformed = pd.concat([non_categorical, ohe_df], axis=1)
df_transformed.head(3)
len(df_transformed)


Unnamed: 0,Name,Gender,final_letter_is_vowel,syllable_count,last_1_letter_*,last_1_letter_A,last_1_letter_B,last_1_letter_C,last_1_letter_D,last_1_letter_E,last_1_letter_F,last_1_letter_G,last_1_letter_H,last_1_letter_I,last_1_letter_J,...,last_3_letter_ØGG,last_3_letter_ØRG,last_3_letter_ØRK,last_3_letter_ØRN,last_3_letter_ØRT,last_3_letter_ØVE,last_3_letter_ØYA,last_3_letter_ÚLI,last_3_letter_ÚNA,last_3_letter_ÚNI,last_3_letter_ÚSI,last_3_letter_ÚST,last_3_letter_ÝR,last_3_letter_ÞÓR,last_3_letter_ﾘRN
0,SAKIR,Male,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JEF,Male,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HOLGER,Male,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


5492

In [9]:
y = df_transformed['Gender']
X = df_transformed.drop(columns=['Gender', 'Name'])


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=99)

In [11]:
X_train.shape, type(X_train)
y_train.shape, type(y_train)

X_train

((4119, 1865), pandas.core.frame.DataFrame)

((4119,), pandas.core.series.Series)

Unnamed: 0,final_letter_is_vowel,syllable_count,last_1_letter_*,last_1_letter_A,last_1_letter_B,last_1_letter_C,last_1_letter_D,last_1_letter_E,last_1_letter_F,last_1_letter_G,last_1_letter_H,last_1_letter_I,last_1_letter_J,last_1_letter_K,last_1_letter_L,...,last_3_letter_ØGG,last_3_letter_ØRG,last_3_letter_ØRK,last_3_letter_ØRN,last_3_letter_ØRT,last_3_letter_ØVE,last_3_letter_ØYA,last_3_letter_ÚLI,last_3_letter_ÚNA,last_3_letter_ÚNI,last_3_letter_ÚSI,last_3_letter_ÚST,last_3_letter_ÝR,last_3_letter_ÞÓR,last_3_letter_ﾘRN
4643,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1934,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3275,1,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3300,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5283,1,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1737,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3240,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5305,1,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#%pip install flaml

#sklearn.__version__

In [22]:
# Initialize an AutoML instance
from flaml import AutoML
automl = AutoML()

automl_settings = {
    "time_budget": 50,#43200, #10801,  # in seconds (This will train for 10 minutes) 3600=1h
    "metric": 'macro_f1',
    "task": 'classification',
    "log_file_name": "nlp.log",
    'n_jobs': -1#,
#    'eval_method': 'cv',
#    'n_splits': 5
}

# perhaps flaml also can nest models, as ensambles...
# ensemble – boolean or dict | default=False. Whether to perform ensemble after search. 
# Can be a dict with keys ‘passthrough’ and ‘final_estimator’ to specify the passthrough 
# and final_estimator in the stacker.


# Fit the models
automl.fit(X_train=X_train, y_train=y_train, verbose=4, **automl_settings)

[flaml.automl: 12-02 12:59:03] {1485} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 12-02 12:59:03] {1489} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 12-02 12:59:03] {1540} INFO - Minimizing error metric: 1-macro_f1
INFO:flaml.automl:Minimizing error metric: 1-macro_f1
[flaml.automl: 12-02 12:59:03] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']
INFO:flaml.automl:List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 12-02 12:59:03] {1826} INFO - iteration 0, current learner lgbm
INFO:flaml.automl:iteration 0, current learner lgbm
[flaml.tune.tune: 12-02 12:59:03] {403} INFO - trial 1 config: {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0}
INFO:flaml.tu

In [23]:
# Time to find best model (in seconds)
automl.time_to_find_best_model

# Inspect the best model
automl.model

# look at estimator
automl.model.estimator


flaml_pred = automl.predict(X_val)
# Inspect the confusion matrix
#automl.model.score(X_val, y_val)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, flaml_pred)

from sklearn.metrics import accuracy_score

acc = str(round(accuracy_score(y_val, flaml_pred)*100, 2)).replace(".", "_")
acc

from sklearn.metrics import classification_report
print(classification_report(y_val, flaml_pred, target_names=['0','1']))


24.745869636535645

<flaml.model.ExtraTreesEstimator at 0x10e600100>

ExtraTreesClassifier(criterion='entropy', max_features=0.6603699418332465,
                     max_leaf_nodes=395, n_estimators=4, n_jobs=-1)

array([[585,  89],
       [104, 595]])

'85_94'

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       674
           1       0.87      0.85      0.86       699

    accuracy                           0.86      1373
   macro avg       0.86      0.86      0.86      1373
weighted avg       0.86      0.86      0.86      1373



In [14]:
type(automl.model.estimator)

sklearn.ensemble._forest.ExtraTreesClassifier

In [24]:

time_used = automl_settings.get('time_budget', 'unknown_time')
model_name = automl.model.estimator.__class__.__name__

file_name = f'../../data/models/automl_{model_name}_{time_used}_s_acs_{acc}.pkl'
'''pickle and save the automl object'''
import pickle
with open(file_name, 'wb') as f:
    pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

'''load pickled automl object'''
with open(file_name, 'rb') as f:
    automl = pickle.load(f)

automl.predict_proba(X_val)[:,1]

'pickle and save the automl object'

'load pickled automl object'

array([0.05077413, 1.        , 1.        , ..., 0.79444444, 0.75      ,
       0.        ])

In [16]:
#feature1 = generate_language_features(pd.DataFrame(["Eirik"], columns=["Name"]))

# her trenger jeg features slik mine egne funksjoner har definert det:
# 1 gen features
# 2 ohe på kategorisk varabler
# drop navn

#automl.predict_proba(feature1)

In [17]:
from sklearn.base import TransformerMixin, BaseEstimator
class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        print("Debug transform")
        print(X)
        print(pd.DataFrame(X).head())
        print(X.shape)
        return X

    def fit(self, X, y=None, **fit_params):
        return self

In [18]:
import pandas as pd
from sklearn.pipeline import Pipeline

class DataframeFunctionTransformer():
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

def drop_cols(input_df, cols=['Name', 'Gender']):
    # for col in cols:
    #     if col in input_df:
    #         input_df.drop()
    input_df.drop(labels = cols, axis=1, errors='ignore', inplace=True)
    # input_df["text"] = input_df["text"].map(lambda t: t.upper())
    return input_df



from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("one_hot", OneHotEncoder(handle_unknown="ignore"), ["last_1_letter", "last_2_letter", "last_3_letter"]),
    ],
    remainder='passthrough'
)

import lightgbm as lgb


# this pipeline has a single step
pipeline = Pipeline(
    verbose=True,
    steps=[
    ("generate_features", DataframeFunctionTransformer(generate_language_features)),
   ("drop_cols", DataframeFunctionTransformer(drop_cols)),
   # ('dbg', Debug()),
    ("ohe_cat_variables", preprocessor)
    # ,
    # ("model", lgb.LGBMClassifier(colsample_bytree=0.5205191706285026, learning_rate=1.0,
    #            max_bin=511, min_child_samples=2, n_estimators=42, num_leaves=10,
    #            reg_alpha=0.0922808777931273, reg_lambda=0.0009765625,
    #            verbose=-1))
])



# # sample dataframe
# df = pd.DataFrame({
#    # "id":[1,2,3,4],
#     "Name":["Eirik","Bar","BAz","quux"]
# })

# apply the pipeline to the input dataframe
_ = pipeline.fit_transform(X_train.copy())
print("What?")
_.shape



print("Transform someting:")
pipeline[-1].get_feature_names_out()
print(pipeline.transform(pd.DataFrame({'Name': ['Eirik', 'Ingeridene']})))

pipeline


KeyError: 'Name'

In [None]:
file_name = f'data/_pickled2_pipe.pkl'
'''pickle and save the automl object'''
import pickle
with open(file_name, 'wb') as f:
    pickle.dump(pipeline, f, pickle.HIGHEST_PROTOCOL)

'''load pickled automl object'''
with open(file_name, 'rb') as f:
    pipeline = pickle.load(f)


# test
pd.DataFrame.sparse.from_spmatrix(
    pipeline.transform(pd.DataFrame({"Name": ["Eirik"]})), 
    columns=pipeline[2].get_feature_names_out()
    )

'pickle and save the automl object'

'load pickled automl object'

Unnamed: 0,one_hot__last_1_letter_A,one_hot__last_1_letter_B,one_hot__last_1_letter_C,one_hot__last_1_letter_D,one_hot__last_1_letter_E,one_hot__last_1_letter_F,one_hot__last_1_letter_G,one_hot__last_1_letter_H,one_hot__last_1_letter_I,one_hot__last_1_letter_J,one_hot__last_1_letter_K,one_hot__last_1_letter_L,one_hot__last_1_letter_M,one_hot__last_1_letter_N,one_hot__last_1_letter_O,...,one_hot__last_3_letter_ØRG,one_hot__last_3_letter_ØRK,one_hot__last_3_letter_ØRN,one_hot__last_3_letter_ØRT,one_hot__last_3_letter_ØVE,one_hot__last_3_letter_ØYA,one_hot__last_3_letter_ÚLI,one_hot__last_3_letter_ÚNA,one_hot__last_3_letter_ÚNI,one_hot__last_3_letter_ÚSI,one_hot__last_3_letter_ÚST,one_hot__last_3_letter_ÝR,one_hot__last_3_letter_ÞÓR,remainder__final_letter_is_vowel,remainder__syllable_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Load data
df = load_gendered_names()
y = df['Gender']
X = df.drop(columns=['Gender'])

df[X.duplicated()] # I get duplicates when the names and gender gets dropped. That is prob a problem.

# tmp enrich data using pipeline
X.shape
X_train_transformed = pipeline.transform(X)
X_train_transformed.shape

# Test
#test = pd.DataFrame.sparse.from_spmatrix(
#    X_train_transformed, 
#    columns=pipeline[2].get_feature_names_out()
#    )
#test['Gender'] = df['Gender']
#test['Name'] = df['Name']
#test
pipeline[2].get_feature_names_out()


Unnamed: 0,Name,Gender


(5505, 1)

(5505, 1652)

array(['one_hot__last_1_letter_A', 'one_hot__last_1_letter_B',
       'one_hot__last_1_letter_C', ..., 'one_hot__last_3_letter_ÞÓR',
       'remainder__final_letter_is_vowel', 'remainder__syllable_count'],
      dtype=object)

In [19]:

from sklearn.model_selection import train_test_split
# split data
#X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=99)
X_train, X_val, y_train, y_val = train_test_split(X_train_transformed, y, random_state=99)
X_train.shape
type(X_train)





NameError: name 'X_train_transformed' is not defined

In [20]:
X_train_df = pd.DataFrame.sparse.from_spmatrix(
   X_train, 
   columns=pipeline[2].get_feature_names_out()
   ).reindex()
#X_train_df
type(X_train_df)
X_train_df.shape
type(y_train)
y_train.shape

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [21]:


# what model should I choose?

# Initialize an AutoML instance
from flaml import AutoML
automl = AutoML()

automl_settings = {
    "time_budget": 2000,  # in seconds (This will train for 10 minutes) 3600=1h
    "metric": 'macro_f1',
    "task": 'classification',
    "log_file_name": "nlp.log",
    'n_jobs': -1
}

# Fit the models
automl.fit(X_train=X_train_df, y_train=y_train, verbose=1, **automl_settings)

NameError: name 'X_train_df' is not defined

In [144]:
# still hard. let's see if https://github.com/scikit-learn-contrib/sklearn-pandas 
# can make this easier..

#df = pd.DataFrame({
#    "id":[1,2,3,4],
#    "Name":["Eirik","Bar","BAz","quux"]
#})
#from sklearn_pandas import DataFrameMapper, gen_features
#
#
#class GetColumnsStartingWith:
#>    def __init__(self, start_str):
 #       self.pattern = start_str
#
#    def __call__(self, X:pd.DataFrame=None):
#        return [c for c in X.columns if c.startswith(self.pattern)]
#
#mapper = DataFrameMapper([
#     #gen_features(columns=["Name"], classes=[generate_language_features], )
#     ('Name', DataframeFunctionTransformer(generate_language_features), {''}) #,
#     #(GetColumnsStartingWith("last_"),  OneHotEncoder(handle_unknown="ignore")),
# ])


# df
# mapper.fit_transform(df)


# not easier, because the DataFrameMapper seems to always expect us to work col-by-col
# so the feature generation does not fit into this, and I end up with yet-another thing thats
# need to be pickled with the model..