In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
)



# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: replace missing values with the literal "missing"
# category, then one-hot encode. `fill_value` is only honoured when
# strategy="constant"; with "most_frequent" it was silently ignored.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        # Categories unseen during fit are ignored at transform time
        # instead of raising.
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Free-text columns: bag-of-words token counts.
text_transformer = Pipeline(steps=[("text_bow", CountVectorizer())])

# Only the 'Name' text feature is wired in; the numeric and categorical
# transformers above are defined but currently disabled.
preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numeric_transformer, ["Age"]),
        # The column is given as a bare string (not a list) so the
        # vectorizer receives a 1-D sequence of documents.
        ("text", text_transformer, "Name"),
        # ("cat", categorical_transformer, ["Cabin"]),
    ]
)

# Full model: feature extraction followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression()),
    ]
)

# A second, independent bag-of-words pipeline. It is not referenced by the
# code visible here; kept in case other cells use it.
textTransformer_0 = Pipeline(steps=[("text_bow", CountVectorizer())])


# Read the labelled data and separate the 'Survived' target from the features.
train = pd.read_csv('../data/train.csv')
features = train.drop(columns='Survived')
target = train['Survived']

# NOTE(review): test_size=0.9 holds out 90% of the rows and fits on the
# remaining 10% -- confirm this split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.9,
    random_state=42,
)

# Fit on the training portion, then score the held-out portion.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Column 1 of predict_proba is used as the score for the AUC
# (presumably the Survived == 1 class -- verify against pipeline.classes_).
class_probabilities = pipeline.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Collect the evaluation metrics; precision and recall are macro-averaged.
macro_averaged = {"precision": precision_score, "recall": recall_score}
results = {"accuracy": accuracy_score(y_test, y_pred)}
for metric_name, metric_fn in macro_averaged.items():
    results[metric_name] = metric_fn(y_test, y_pred, average="macro")
results["auc"] = roc_auc_score(y_test, y_pred_proba, average="macro")

print(results)

{'accuracy': 0.7780548628428927, 'precision': 0.768049980026326, 'recall': 0.7662297960909298, 'auc': 0.7909200299196724}


In [57]:
# Categorical columns: constant "missing" placeholder, then one-hot encoding.
catTransformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('cat_ohe', OneHotEncoder(handle_unknown='ignore')),
])
# Free-text columns: bag-of-words token counts.
text_transformer = Pipeline(steps=[
    ('text_bow', CountVectorizer()),
])
# Numeric columns: median imputation followed by standardization.
numTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Only the 'Name' text feature is active; the categorical and numeric
# transformers above are defined but currently disabled.
ct = ColumnTransformer(
    transformers=[
        # ('cat', catTransformer, ['Cabin']),
        # ('num', numTransformer, ['Age']),
        ('text1', text_transformer, 'Name'),
    ]
)

# NOTE(review): the final step is named 'RF' but the estimator is
# LogisticRegression. The name is kept so any lookups by step name
# continue to work -- consider renaming it.
pipeline = Pipeline(steps=[
    ('feature_engineer', ct),
    ('RF', LogisticRegression()),
])

# X_train / y_train / X_test / y_test are not defined in this cell; they must
# already exist in the session.
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
# accuracy_score expects (y_true, y_pred); ground truth goes first.
print(f"accuracy {accuracy_score(y_test, preds)}")


accuracy 0.7780548628428927
