We have 2 tables: *session* and *product*.
To build the model we:
- Filter the *product* table down to sessions with a known gender.
- Create a table of vectors with category features.
- Join it with features derived from the session time logs.
- Build a model based on a random forest (RF).

For predictions:
- Build features from the 2 tables and predict with the fitted model.

The input variable will be a "session_id" like 'u10003'. We retrieve the data from the 2 given tables and construct a vector of features.

In [1]:
from gender_project.model import *
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import classification_report

In [2]:
def get_category_sequence(array):
    """Flatten the trailing four items of every row into one string.

    For each row of ``array`` the last four items (``row[-4:]``) are joined
    with single spaces; the per-row strings are then joined with single
    spaces as well, in row order. An empty ``array`` yields ``""``.
    """
    row_chunks = []
    for row in array:
        row_chunks.append(" ".join(row[-4:]))
    return " ".join(row_chunks)

def create_table(connection):
    """Build the per-session feature table for sessions with a known gender.

    Reads the ``session`` and ``product`` tables through ``connection``,
    keeps the sessions whose ``gender`` is not missing and derives, for each
    of them, time features, the space-separated category sequence of its
    product rows, the number of views and the average time per view.

    Parameters
    ----------
    connection
        Database connection accepted by ``pandas.read_sql_table``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``session_id`` with the columns ``session_id``,
        ``day``, ``dayofweek``, ``start_hour``, ``categories``,
        ``number_of_views``, ``average_time_per_view`` and ``gender``.
        Sessions without any product row get an empty ``categories`` string,
        0 views and a missing (NaN) ``average_time_per_view``.
    """
    session_table = pd.read_sql_table("session", connection)
    product_table = pd.read_sql_table("product", connection)

    # list of sessions with a known gender
    sessions_with_gender = sorted(
        session_table.loc[session_table["gender"].notna(), "session_id"].unique()
    )
    # Restrict both tables to those sessions. The explicit copy makes the
    # session frame independent of the frame it was sliced from, so the
    # column assignments below never write into a slice.
    product_table = product_table[product_table["session_id"].isin(sessions_with_gender)]
    session_table = (
        session_table[session_table["session_id"].isin(sessions_with_gender)]
        .sort_values(by="session_id", axis=0)
        .copy()
    )

    # generate time features
    start_time = session_table["start_time"]
    session_table["day"] = start_time.dt.day
    session_table["dayofweek"] = start_time.dt.dayofweek
    session_table["start_hour"] = start_time.dt.hour
    # total_seconds() is the full length of the session; `.dt.seconds` is only
    # the seconds *component* of the timedelta and wraps around at one day.
    session_table["duration"] = (session_table["end_time"] - start_time).dt.total_seconds()

    # generate category features
    # One pass over the product rows (groupby) instead of one full scan per
    # session. groupby keeps the original row order inside every group, so
    # the sequence follows the order in which the rows were read.
    # get_category_sequence joins the last four items of each row —
    # presumably the category columns of the product table; TODO confirm.
    categories_by_session = {
        session_id: get_category_sequence(rows.values)
        for session_id, rows in product_table.groupby("session_id", sort=False)
    }
    session_table["categories"] = [
        categories_by_session.get(session_id, "")
        for session_id in session_table["session_id"]
    ]

    # generate additional features
    # each view contributes four category tokens to the sequence
    session_table["number_of_views"] = session_table["categories"].apply(
        lambda sequence: len(sequence.split()) // 4
    )
    # Zero views would produce inf on division; masking them yields NaN
    # instead, which downstream imputers can fill.
    views = session_table["number_of_views"]
    session_table["average_time_per_view"] = session_table["duration"] / views.where(views > 0)

    # columns to result
    result_columns = [
        "session_id",
        "day",
        "dayofweek",
        "start_hour",
        "categories",
        "number_of_views",
        "average_time_per_view",
        "gender",
    ]
    return session_table[result_columns]

# generate table

In [3]:
from  sqlalchemy import create_engine
print('connect to "ftp" table.')
# NOTE(review): the URL embeds credentials. They look like a public guest
# account of a demo server — confirm, and read the URL from the environment
# if this ever points at a private database.
engine = create_engine("mysql://guest:relational@relational.fit.cvut.cz:3306/ftp", echo=False)
# The context manager closes the connection as soon as both tables are read,
# instead of keeping it open for the lifetime of the kernel.
with engine.connect() as connection:
    print('create table from 2 tables(product, session)')
    table = create_table(connection)
# show the first five rows of the feature table
table.head()

connect to "ftp" table.
create table from 2 tables(product, session)


Unnamed: 0,session_id,day,dayofweek,start_hour,categories,number_of_views,average_time_per_view,gender
0,u10001,14,4,0,A00001 B00001 C00001 D00001,1,6.0,female
1,u10002,12,4,14,A00002 B00002 C00002 D24897,1,1.0,female
2,u10003,14,4,0,A00002 B00002 C00002 D00002 A00002 B00002 C000...,12,69.916667,female
3,u10004,14,4,0,A00002 B00006 C00015 D00030,1,2.0,female
4,u10005,14,4,0,A00002 B00002 C00003 D00033 A00002 B00002 C000...,3,65.666667,female


In [4]:
# Sanity check: number of missing values in every column of the feature table.
print(f'check on NA values.')
table.isnull().sum()

check on NA values.


session_id               0
day                      0
dayofweek                0
start_hour               0
categories               0
number_of_views          0
average_time_per_view    0
gender                   0
dtype: int64

In [5]:
# Turn the gender labels into integer class ids and print the mapping that
# the encoder learned for the two labels.
print(f'target encoding:')
le = LabelEncoder()
le.fit(table['gender'])
target = le.transform(table['gender']).astype(int)
pprint({label: le.transform([label])[0] for label in ('male', 'female')})

target encoding:
{'female': 0, 'male': 1}


# generate pipeline

In [6]:
# Column groups, one per preprocessing branch.
seq_features = ["categories"]
descrete_features = ["day", "dayofweek", "start_hour"]
qn_features = ["number_of_views"]
log_features = ["average_time_per_view"]

# Every branch starts with an imputer, so that unknown values in future
# data are filled in rather than passed on.

# Category sequence -> counts of 1- to 3-grams that occur in at least 5 rows.
# CategoryTransformer comes from gender_project.model; presumably it turns the
# imputed column into raw documents for the vectorizer — TODO confirm.
seq_vectorizer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("raw docs", CategoryTransformer()),
    ("vectorizer", CountVectorizer(min_df=5, ngram_range=(1, 3))),
])

# Calendar features -> one-hot columns; categories unseen at fit time are ignored.
descrete_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# View count -> mapped onto a normal distribution by quantiles, then standardised.
qn_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("quantile", QuantileTransformer(output_distribution="normal")),
    ("scaler", StandardScaler()),
])

# Average time per view -> natural logarithm, then standardised.
log_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(np.log)),
    ("scaler", StandardScaler()),
])

# Route every column group through its branch; branches run in parallel.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_sequence", seq_vectorizer, seq_features),
        ("descrete", descrete_transformer, descrete_features),
        ("quantile_normal", qn_transformer, qn_features),
        ("log", log_transformer, log_features),
    ],
    n_jobs=-1,
)

# Full model: preprocessing followed by a class-weighted random forest.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("cls", RandomForestClassifier(
        n_estimators=100,
        class_weight="balanced",
        random_state=0,
        n_jobs=-1,
    )),
])


In [7]:
# Smoke test: the pipeline must fit on the full feature table. The first
# column (session_id) and the last one (gender) are not features.
print('check pipeline')
model.fit(table.iloc[:, 1:-1], target)

check pipeline


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('cat_sequence',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('raw docs',
                                                                   <gender_project.model.CategoryTransformer object at 0x7f200fdd1d60>),
                                                                  ('vectorizer',
                                                                   CountVectorizer(min_df=5,
                                                                                   ngram_range=(1,
                                                                                                3)))]),
                                                  ['categories']),
        

In [8]:
from sklearn import set_config
# Display estimators as an HTML diagram in a Jupyter context, then render
# the pipeline by evaluating it as the last expression of the cell.
set_config(display="diagram")
model

# cross-validation model

In [9]:
print('split data on train and validation sets.')
# Hold out 20% of the sessions for validation, stratified by the target so
# both parts keep the same class balance; the seed fixes the split.
train, validation, train_y, validation_y = train_test_split(
    table.iloc[:, 1:-1],  # every column except session_id (first) and gender (last)
    target,
    test_size=0.2,
    stratify=target,
    random_state=15,
)

split data on train and validation sets.


In [10]:
# 5-fold cross-validated ROC AUC of the whole pipeline on the training part.
score = cross_val_score(
    model,
    train,
    train_y,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
print(f'cross val score : {score.mean():.4f}+-{score.std():.4f} on train set.')

cross val score : 0.8497+-0.0028 on train set.


In [11]:
# Refit on the training part only, then score the held-out validation set.
model.fit(train, train_y)
predictions = model.predict(validation)
# second probability column = probability of the class encoded as 1
predictions_proba = model.predict_proba(validation)[:, 1]

print(f'ROC_AUC score on validation set : {roc_auc_score(validation_y, predictions_proba):.4f}\n')
print('classification report:')
print(classification_report(validation_y, predictions))

ROC_AUC score on validation set : 0.8519

classification report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      2341
           1       0.80      0.58      0.67       659

    accuracy                           0.88      3000
   macro avg       0.85      0.77      0.80      3000
weighted avg       0.87      0.88      0.87      3000



# save model

In [12]:
import pickle

# Final model: refit on every labelled session (all columns except
# session_id and gender), then serialise the fitted pipeline to disk.
model.fit(table.iloc[:, 1:-1], target)

with open('A.pkl', 'wb') as file:
    pickle.dump(model, file, protocol=2)