In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA



In [2]:
# HYPER PARAMETERS


class CFG:
    emb_dim = 200


args = CFG

In [3]:
path = "../input/predict-meals/"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
submit = pd.read_csv(path + "sample_submission.csv")
train.head()

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,중식계,석식계
0,2016-02-01,월,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",1039.0,331.0
1,2016-02-02,화,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",867.0,560.0
2,2016-02-03,수,2601,56,180,111,0.0,모닝롤/베이글 우유/두유/주스 계란후라이 표고버섯죽/쌀밥 (쌀:국내산) 콩나물국...,"카레덮밥 (쌀,현미흑미:국내산) 팽이장국 치킨핑거 (닭고기:국내산) 쫄면야채무침 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개 황태양념구이 (황태:러시아산) 고기...",1017.0,573.0
3,2016-02-04,목,2601,104,220,355,0.0,"모닝롤/토마토샌드 우유/두유/주스 계란후라이 닭죽/쌀밥 (쌀,닭:국내산) 근대국...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 쇠고기무국 주꾸미볶음 부추전 시금치나물 ...","미니김밥*겨자장 (쌀,현미흑미:국내산) 우동 멕시칸샐러드 군고구마 무피클 포...",978.0,525.0
4,2016-02-05,금,2601,278,181,34,0.0,모닝롤/와플 우유/두유/주스 계란후라이 쇠고기죽/쌀밥 (쌀:국내산) 재첩국 방...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 떡국 돈육씨앗강정 (돼지고기:국내산) 우엉잡채...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...",925.0,330.0


In [4]:
df_all = pd.concat([train, test])

In [5]:
# Basic text preprocessing


def split_process(x, q):
    x_ = []
    x = x.split(" ")
    for i in x:
        if "(" in i and ":" in i and ")" in i:
            continue
        if "/" in i:
            x_.extend(i.split("/"))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove("")
    return x_

In [6]:
# Get all combinations for training w2v (train + test)

food_combinations = []
for i in ["조식메뉴", "중식메뉴", "석식메뉴"]:
    food_combinations += df_all[i].apply(lambda x: split_process(x, i)).to_list()

In [7]:
# Train or load w2v model

TRAIN_W2V = True
try:
    model = Word2Vec.load("food_embedding.model")
    print("Model loaded")
except:
    if TRAIN_W2V:
        print("Training w2v")
        model = Word2Vec(
            sentences=food_combinations,
            vector_size=args.emb_dim,
            window=7,
            min_count=0,
            workers=4,
            sg=0,
            epochs=5000,
        )
        model.save("food_embedding.model")
    else:
        print("Model loading failed. Do not train.")

Model loaded


In [8]:
def process_date(df):
    df["일자"] = pd.to_datetime(df["일자"], format="%Y-%m-%d")
    df["년"] = df["일자"].dt.year
    df["월"] = df["일자"].dt.month
    df["일"] = df["일자"].dt.day
    df["주"] = df["일자"].dt.week
    df["요일"] = df["일자"].dt.weekday
    df = df.drop("일자", axis=1)
    return df


def get_food_embedding(x):
    x_ = []
    x = x.split(" ")
    for i in x:
        if "(" in i and ":" in i and ")" in i:
            continue
        if "/" in i:
            x_.extend(i.split("/"))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove("")
    vec_ = np.zeros(args.emb_dim)
    for i in x_:
        vec = model.wv.get_vector(i)
        vec_ += vec
    vec_ /= len(x_)
    return vec_

In [9]:
# General preprocessing
train = process_date(train)
day_encoder = LabelEncoder()
train["요일"] = day_encoder.fit_transform(train["요일"])

  df["주"] = df["일자"].dt.week


In [10]:
# Get embedding
train['조식메뉴_embedding'] = train['조식메뉴'].apply(lambda x: get_food_embedding(x))
train['중식메뉴_embedding'] = train['중식메뉴'].apply(lambda x: get_food_embedding(x))
train['석식메뉴_embedding'] = train['석식메뉴'].apply(lambda x: get_food_embedding(x))

In [11]:
train[["현본사소속재택근무자수", "중식계", "석식계"]] = train[["현본사소속재택근무자수", "중식계", "석식계"]].astype(
    "int"
)
train["출근"] = train["본사정원수"] - (
    train["본사휴가자수"] + train["본사출장자수"] + train["현본사소속재택근무자수"]
)
train["휴가비율"] = train["본사휴가자수"] / train["본사정원수"]
train["출장비율"] = train["본사출장자수"] / train["본사정원수"]
train["야근비율"] = train["본사시간외근무명령서승인건수"] / train["출근"]
train["재택비율"] = train["현본사소속재택근무자수"] / train["본사정원수"]
train["식사가능자수"] = train["본사정원수"] - train["본사휴가자수"] - train["현본사소속재택근무자수"]

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   요일              1205 non-null   int64  
 1   본사정원수           1205 non-null   int64  
 2   본사휴가자수          1205 non-null   int64  
 3   본사출장자수          1205 non-null   int64  
 4   본사시간외근무명령서승인건수  1205 non-null   int64  
 5   현본사소속재택근무자수     1205 non-null   int64  
 6   조식메뉴            1205 non-null   object 
 7   중식메뉴            1205 non-null   object 
 8   석식메뉴            1205 non-null   object 
 9   중식계             1205 non-null   int64  
 10  석식계             1205 non-null   int64  
 11  년               1205 non-null   int64  
 12  월               1205 non-null   int64  
 13  일               1205 non-null   int64  
 14  주               1205 non-null   int64  
 15  조식메뉴_embedding  1205 non-null   object 
 16  중식메뉴_embedding  1205 non-null   object 
 17  석식메뉴_embedding  1205 non-null   o

In [13]:
y_lunch = train["중식계"]
y_dinner = train["석식계"]
train.drop(["조식메뉴", "중식메뉴", "석식메뉴", "중식계", "석식계"], axis=1, inplace=True)

In [14]:
X_common = train[
    [
        "년",
        "월",
        "일",
        "식사가능자수",
        "주",
        "요일",
        "출근",
        "휴가비율",
        "출장비율",
        "야근비율",
        "재택비율",
    ]
]

In [28]:
train.loc[:, '석식메뉴_embedding']

0       [-0.27641419001988005, 0.49010108198438374, 0....
1       [0.3575967748959859, -0.4345718224843343, 0.37...
2       [-0.40610839639391216, 0.1882472378867013, 1.0...
3       [0.5150311092535654, 1.2116996347904205, -0.00...
4       [0.18974069612366812, 0.6862473083393914, 0.97...
                              ...                        
1200    [0.27814774711926776, -0.26777855679392815, -0...
1201    [-0.16018237421909967, -0.028216694792111714, ...
1202    [0.532713749579021, -0.7373491334063667, 0.063...
1203    [-0.8323680758476257, 0.07871099561452866, -0....
1204    [-0.9803705215454102, -0.31528616944948834, -0...
Name: 석식메뉴_embedding, Length: 1205, dtype: object

In [35]:
len(train.iloc[:, 12].to_numpy().tolist())

1205

In [38]:
X_common

Unnamed: 0,년,월,일,식사가능자수,주,요일,출근,휴가비율,출장비율,야근비율,재택비율
0,2016,2,1,2551,5,0,2401,0.019223,0.057670,0.099125,0.000000
1,2016,2,2,2551,5,1,2378,0.019223,0.066513,0.134146,0.000000
2,2016,2,3,2545,5,2,2365,0.021530,0.069204,0.046934,0.000000
3,2016,2,4,2497,5,3,2277,0.039985,0.084583,0.155907,0.000000
4,2016,2,5,2323,5,4,2142,0.106882,0.069589,0.015873,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
1200,2021,1,20,2517,3,2,2319,0.025142,0.066376,0.001725,0.131076
1201,2021,1,21,2540,3,3,2309,0.030841,0.077439,0.200087,0.117667
1202,2021,1,22,2425,3,4,2177,0.085484,0.083138,0.000459,0.101576
1203,2021,1,25,2549,4,0,2396,0.035870,0.051291,0.257095,0.109621


In [19]:
emb_arr_lunch = np.array(train.iloc[:, 11].to_numpy().tolist())  # Ver 2
emb_arr_dinner = np.array(train.iloc[:, 12].to_numpy().tolist())  # Ver 2

X_train_lunch = np.concatenate((X_common.to_numpy(), emb_arr_lunch), axis=1)
X_train_dinner = np.concatenate((X_common.to_numpy(), emb_arr_dinner), axis=1)

In [20]:
from sklearn.linear_model import BayesianRidge

# 대회 규칙
# 평가산식 : MAE(Mean Absolute Error)
lunch_model = BayesianRidge(n_iter=300, verbose=True)
dinner_model = BayesianRidge(n_iter=300, verbose=True)

In [21]:
lunch_model.fit(X_train_lunch, y_lunch)

Convergence after  11  iterations


BayesianRidge(verbose=True)

In [22]:
dinner_model.fit(X_train_dinner, y_dinner)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# inference

In [None]:
# Apply general preprocessing
test = process_date(test)
test["요일"] = day_encoder.transform(test["요일"])
test["조식메뉴_embedding"] = test["조식메뉴"].apply(lambda x: get_food_embedding(x))
test["중식메뉴_embedding"] = test["중식메뉴"].apply(lambda x: get_food_embedding(x))
test["석식메뉴_embedding"] = test["석식메뉴"].apply(lambda x: get_food_embedding(x))
test.drop(['조식메뉴', '중식메뉴', '석식메뉴'], axis=1, inplace=True)

test["현본사소속재택근무자수"] = test["현본사소속재택근무자수"].astype("int")
test["식사가능자수"] = test["본사정원수"] - test["본사휴가자수"] - test["현본사소속재택근무자수"]
test["출근"] = test["본사정원수"] - (test["본사휴가자수"] + test["본사출장자수"] + test["현본사소속재택근무자수"])
test["휴가비율"] = test["본사휴가자수"] / test["본사정원수"]
test["출장비율"] = test["본사출장자수"] / test["본사정원수"]
test["야근비율"] = test["본사시간외근무명령서승인건수"] / test["출근"]
test["재택비율"] = test["현본사소속재택근무자수"] / test["본사정원수"]

In [None]:
X_test_common = test.iloc[:, :9]

In [None]:
# Get embedding
test_emb_arr_lunch = np.array(test.iloc[:, 10].to_numpy().tolist()) # Ver 2
test_emb_arr_dinner = np.array(test.iloc[:, 11].to_numpy().tolist()) # Ver 2
# Concat
test_lunch = np.concatenate((X_test_common.to_numpy(), test_emb_arr_lunch), axis=1)
test_dinner = np.concatenate((X_test_common.to_numpy(), test_emb_arr_dinner), axis=1)

In [None]:
# Inference

test_pred_lunch = lunch_model.predict(test_lunch)
test_pred_dinner = dinner_model.predict(test_dinner)

In [None]:
submit['중식계'] = test_pred_lunch
submit['석식계'] = test_pred_dinner

In [None]:
submit.to_csv("random_forest.csv", index=False)