In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('clean_df.csv', 
                        dtype={'has_children': str,
                              'direction_same': str})

In [3]:
df.columns

Index(['destination', 'passanger', 'weather', 'temperature', 'time', 'coupon',
       'expiration', 'gender', 'maritalStatus', 'has_children', 'education',
       'occupation', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20',
       'Restaurant20To50', 'direction_same', 'Y', 'rich', 'middleclass',
       'poor', 'age0', 'age21', 'age31', 'age41', 'age51', 'distance'],
      dtype='object')

In [4]:
coupons = df["coupon"].unique().tolist()

In [5]:
df_rest20 = df[df["coupon"]=="Restaurant(<20)"]

df_coffee = df[df["coupon"]== "Coffee House"]

df_bar = df[df["coupon"]== "Bar"]

df_takeaway = df[df["coupon"]=="Carry out & Take away"]

df_rest50 = df[df["coupon"]=="Restaurant(20-50)"]

In [6]:
print(len(df_rest20))
print(len(df_coffee))
print(len(df_bar))
print(len(df_takeaway))
print(len(df_rest50))

2653
3816
1913
2280
1417


In [7]:
df_bar["Y"].value_counts()/df_bar.shape[0]

0    0.588082
1    0.411918
Name: Y, dtype: float64

In [8]:
df_rest50["Y"].value_counts()/df_rest50.shape[0]

0    0.553987
1    0.446013
Name: Y, dtype: float64

In [9]:
df_coffee["Y"].value_counts()/df_coffee.shape[0]

0    0.503669
1    0.496331
Name: Y, dtype: float64

In [10]:
df_takeaway["Y"].value_counts()/df_takeaway.shape[0]

1    0.737719
0    0.262281
Name: Y, dtype: float64

In [11]:
df_rest20["Y"].value_counts()/df_rest20.shape[0]

1    0.709009
0    0.290991
Name: Y, dtype: float64

In [22]:
from sklearn.model_selection import train_test_split
RANDOM_SEED = 2021
x = df_bar.drop(columns=['Y'])
y = df_bar.Y
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=RANDOM_SEED, test_size=0.2)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_features_a = X_train.select_dtypes(['int64', 'float64']).columns
cat_features_a = X_train.select_dtypes(['object']).columns
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])
cat_transformer = OneHotEncoder()
preprocessor_a = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features_a),
    ('cat', cat_transformer, cat_features_a)
])
X_train = preprocessor_a.fit_transform(X_train)
X_test = preprocessor_a.transform(X_test)

from sklearn.utils import class_weight

from sklearn.linear_model import LogisticRegression
logreg_clf = LogisticRegression(solver='saga', max_iter=1000,
                               random_state=RANDOM_SEED)

best_est = logreg_clf.fit(X_test,y_test)


from sklearn import metrics

print("test_acc " + str(best_est.score(X_test, y_test)))
print("f1_score " + str(metrics.f1_score(y_test, best_est.predict(X_test))))

test_acc 0.7650130548302873
f1_score 0.6590909090909091


In [None]:
f1 = [0.7159685863874345,0.7401129943502824,0.6767015706806283,0.7587719298245614,0.7258485639686684]
np.mean(f1)

0.7234807290423151

In [None]:
"""
coffe 
test_acc 0.7159685863874345
f1_score 0.7071524966261808

rest_20
test_acc 0.7401129943502824
f1_score 0.823529411764706

bar
test_acc 0.6767015706806283
f1_score 0.7196367763904653

takeaway
test_acc 0.7587719298245614
f1_score 0.8604060913705585

rest_50
test_acc 0.7258485639686684
f1_score 0.5569620253164557
"""