In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

print(tf.__version__)

2.4.1


In [2]:
train = pd.read_csv('../data/train_data.csv')
test = pd.read_csv('../data/test_data.csv')

In [3]:
train.shape
test.shape

(21000, 13)

(9000, 12)

In [4]:
train.head()
test.head()

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary
0,0,1,44,愛知県,2,1,2,1,24,2.0,1.6,9.2,428.074887
1,1,2,31,奈良県,1,0,0,0,13,9.0,0.7,12.4,317.930517
2,2,2,36,山口県,1,0,0,2,14,4.0,0.4,16.9,357.350316
3,3,0,22,東京都,2,0,0,0,4,3.0,0.4,6.1,201.310911
4,4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,4.9,178.067475


Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime
0,0,3,39,鹿児島県,2,1,5,1,19,1.0,1.8,14.2
1,1,1,31,宮城県,1,0,0,4,0,0.0,0.5,18.6
2,2,0,20,愛知県,2,1,2,0,2,2.0,1.2,2.3
3,3,0,28,三重県,2,0,0,0,10,3.0,0.3,0.0
4,4,1,41,愛媛県,2,0,0,0,23,3.0,0.5,10.1


In [5]:
def create_features(df):
    def overtime_zeto2median(df):
    # 不自然に残業時間0の層があるため、欠損値的な扱いをして給与の中央値を入れる
        df_ex_zero = df[df["overtime"]>0][["overtime"]]
        df.loc[df["overtime"] <= 0, "overtime"] = df_ex_zero["overtime"].median()
        return df
    
    # 東京、大阪にフラグを立てる
    def live_in_city(df):
        df["isCity"] = df["area"].isin(["東京都", "大阪府"]).astype(int)
        if "isCity" not in cat_features:
            cat_features.append("isCity")
        return df

    # 性別と役職を紐づけた特徴量
    def sex_and_position(df):
        df["sex_and_position"] = df["sex"].astype(str) + "_" + df["position"].astype(str)
        if "sex_and_position" not in cat_features:
            cat_features.append("sex_and_position")
        return df

    # 年齢層でラベル化
    def age_layer(df):
        df["age_layer"] = 0
        df.loc[20>=df["age"], "age_layer"] = 0
        df.loc[(30>=df["age"])&(df["age"]>20), "age_layer"] = 1
        df.loc[df["age"]>30, "age_layer"] = 2
        if "age_layer" not in cat_features:
            cat_features.append("age_layer")
        return df

    # 年齢層ラベルと役職を紐づけた特徴量
    def agelayer_and_position(df):
        df["agelayer_and_position"] = df["age_layer"].astype(str) + "_" + df["position"].astype(str)
        if "agelayer_and_position" not in cat_features:
            cat_features.append("agelayer_and_position")
        return df

    # 教育と役職を紐づけた特徴量
    def education_and_position(df):
        df["education_and_position"] = df["education"].astype(str) + "_" + df["position"].astype(str)
        if "education_and_position" not in cat_features:
            cat_features.append("education_and_position")
        return df


    # 通勤時間と既婚or未婚、都市住み
    def adjust_commute(df):
        df.loc[((df["area"]=="東京都") | (df["area"]=="大阪府")) & (df["partner"]==1), "commute"] = \
            df[((df["area"]=="東京都") | (df["area"]=="大阪府")) & (df["partner"]==1)]["commute"].apply(lambda x : max(x/2 - 1, 0))
        df.loc[((df["area"]!="東京都") | (df["area"]!="大阪府")) & (df["partner"]==1), "commute"] = \
            df[((df["area"]!="東京都") | (df["area"]!="大阪府")) & (df["partner"]==1)]["commute"].apply(lambda x : max(x-1, 0))
        df.loc[((df["area"]=="東京都") | (df["area"]=="大阪府")) & (df["partner"]==0), "commute"] = \
            df[((df["area"]=="東京都") | (df["area"]=="大阪府")) & (df["partner"]==0)]["commute"].apply(lambda x : max(x-1, 0))
        return df

    
    # 残業時間/勤続年数
    def overtime_by_service_length(df):
        df["overtime_by_service_length"] = df["overtime"] / (df["service_length"]+1)
        if "overtime_by_service_length" not in num_features:
            num_features.append("overtime_by_service_length")
        return df

    # 残業時間/年齢
    def overtime_by_age(df):
        df["overtime_by_age"] = df["overtime"] / df["age"]
        if "overtime_by_age" not in num_features:
            num_features.append("overtime_by_age")
        return df

    # 勉強時間/勤続年数
    def study_time_by_service_length(df):
        df["study_time_by_service_length"] = df["study_time"] / (df["service_length"]+1)
        if "study_time_by_service_length" not in num_features:
            num_features.append("study_time_by_service_length")
        return df

    # 勉強時間/年齢
    def study_time_by_age(df):
        df["study_time_by_age"] = df["study_time"] / df["age"]
        if "study_time_by_age" not in num_features:
            num_features.append("study_time_by_age")
        return df

    # 年齢/勤続年数
    def age_by_service_length(df):
        df["age_by_service_length"] = df["age"] / (df["service_length"]+1)
        if "age_by_service_length" not in num_features:
            num_features.append("age_by_service_length")
        return df


    df = overtime_zeto2median(df)
    df = live_in_city(df)
    df = sex_and_position(df)
    df = education_and_position(df)
    df = adjust_commute(df)
    df = age_diff_service_length(df)
    df = age_layer(df)
    df = agelayer_and_position(df)
    df = overtime_by_service_length(df)
    df = overtime_by_age(df)
    df = study_time_by_service_length(df)
    df = study_time_by_age(df)
    df = age_by_service_length(df)
    return df

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# エンコーディング
def Apply_Encoding(df):
    def Label(df):
        df['area'] = LabelEncoder().fit_transform(df['area'])
        return df
    
    def One_hot(df):
        df = pd.get_dummies(df,columns=['area','sex_and_position','education_and_position','agelayer_and_position'], drop_first=True)
#             df[col] = OneHotEncoder(drop='first').fit_transform(df[col])
        return df
    
    df = Label(df)
    df = One_hot(df)
    return df

# 正規化



In [7]:
cat_features = ["position", "sex", "partner", "education", "area"]
num_features = ["age", "num_child", "service_length", "study_time", "commute", "overtime"]

In [8]:
target_col = 'salary'
target = train[target_col]
train.drop(columns=[target_col], inplace=True)

In [9]:
drop_cols = ['id']
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=drop_cols, inplace=True)

In [10]:
train_pre = create_features(train)
test_pre = create_features(test)

In [12]:
train_pre.head()
train_pre.columns

Unnamed: 0,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,...,sex_and_position,education_and_position,age_diff_service_length,age_layer,agelayer_and_position,overtime_by_service_length,overtime_by_age,study_time_by_service_length,study_time_by_age,age_by_service_length
0,1,44,愛知県,2,1,2,1,24,2.0,0.6,...,2_1,1_1,1.76,2,2_1,0.368,0.209091,0.08,0.045455,1.76
1,2,31,奈良県,1,0,0,0,13,9.0,0.7,...,1_2,0_2,2.214286,2,2_2,0.885714,0.4,0.642857,0.290323,2.214286
2,2,36,山口県,1,0,0,2,14,4.0,0.4,...,1_2,2_2,2.4,2,2_2,1.126667,0.469444,0.266667,0.111111,2.4
3,0,22,東京都,2,0,0,0,4,3.0,0.0,...,2_0,0_0,4.4,1,1_0,1.22,0.277273,0.6,0.136364,4.4
4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,...,2_0,1_0,4.166667,1,1_0,0.816667,0.196,0.5,0.12,4.166667


Index(['position', 'age', 'area', 'sex', 'partner', 'num_child', 'education',
       'service_length', 'study_time', 'commute', 'overtime', 'isCity',
       'sex_and_position', 'education_and_position', 'age_diff_service_length',
       'age_layer', 'agelayer_and_position', 'overtime_by_service_length',
       'overtime_by_age', 'study_time_by_service_length', 'study_time_by_age',
       'age_by_service_length'],
      dtype='object')

In [15]:
train_pre.to_csv('../data/train_pre.csv')
test_pre.to_csv('../data/test_pre.csv')

In [18]:
train_enc = Apply_Encoding(train_pre)
test_enc = Apply_Encoding(test_pre)

In [21]:
train_enc.head()
test_enc.head()

Unnamed: 0,position,age,sex,partner,num_child,education,service_length,study_time,commute,overtime,...,agelayer_and_position_0_2,agelayer_and_position_1_0,agelayer_and_position_1_1,agelayer_and_position_1_2,agelayer_and_position_1_3,agelayer_and_position_2_0,agelayer_and_position_2_1,agelayer_and_position_2_2,agelayer_and_position_2_3,agelayer_and_position_2_4
0,1,44,2,1,2,1,24,2.0,0.6,9.2,...,0,0,0,0,0,0,1,0,0,0
1,2,31,1,0,0,0,13,9.0,0.7,12.4,...,0,0,0,0,0,0,0,1,0,0
2,2,36,1,0,0,2,14,4.0,0.4,16.9,...,0,0,0,0,0,0,0,1,0,0
3,0,22,2,0,0,0,4,3.0,0.0,6.1,...,0,1,0,0,0,0,0,0,0,0
4,0,25,2,0,0,1,5,3.0,0.2,4.9,...,0,1,0,0,0,0,0,0,0,0


Unnamed: 0,position,age,sex,partner,num_child,education,service_length,study_time,commute,overtime,...,agelayer_and_position_0_2,agelayer_and_position_1_0,agelayer_and_position_1_1,agelayer_and_position_1_2,agelayer_and_position_1_3,agelayer_and_position_2_0,agelayer_and_position_2_1,agelayer_and_position_2_2,agelayer_and_position_2_3,agelayer_and_position_2_4
0,3,39,2,1,5,1,19,1.0,0.8,14.2,...,0,0,0,0,0,0,0,0,1,0
1,1,31,1,0,0,4,0,0.0,0.5,18.6,...,0,0,0,0,0,0,1,0,0,0
2,0,20,2,1,2,0,2,2.0,0.2,2.3,...,0,0,0,0,0,0,0,0,0,0
3,0,28,2,0,0,0,10,3.0,0.3,8.0,...,0,1,0,0,0,0,0,0,0,0
4,1,41,2,0,0,0,23,3.0,0.5,10.1,...,0,0,0,0,0,0,1,0,0,0


In [24]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
train_ss = ss.fit_transform(train_enc)
test_ss = ss.fit_transform(test_enc)

In [26]:
train_ss

array([[-0.18524204,  1.01423593,  1.00333891, ..., -0.52459545,
        -0.34208373, -0.24399964],
       [ 0.63131607, -0.1990181 , -0.9966722 , ...,  1.90623078,
        -0.34208373, -0.24399964],
       [ 0.63131607,  0.26761806, -0.9966722 , ...,  1.90623078,
        -0.34208373, -0.24399964],
       ...,
       [ 1.44787419,  0.26761806,  1.00333891, ..., -0.52459545,
         2.92326094, -0.24399964],
       [-1.00180016, -1.13229043,  1.00333891, ..., -0.52459545,
        -0.34208373, -0.24399964],
       [-0.18524204, -0.29234534, -0.9966722 , ..., -0.52459545,
        -0.34208373, -0.24399964]])

In [29]:
def mean_norm(df):
    return df.apply(lambda x: (x-x.mean())/ x.std(), axis=0)

train_norm = mean_norm(train_enc)
test_norm = mean_norm(test_enc)
train_norm.head()
test_norm.head()

Unnamed: 0,position,age,sex,partner,num_child,education,service_length,study_time,commute,overtime,...,agelayer_and_position_0_2,agelayer_and_position_1_0,agelayer_and_position_1_1,agelayer_and_position_1_2,agelayer_and_position_1_3,agelayer_and_position_2_0,agelayer_and_position_2_1,agelayer_and_position_2_2,agelayer_and_position_2_3,agelayer_and_position_2_4
0,-0.185238,1.014212,1.003315,1.00131,0.705723,-0.088298,1.093489,-0.551922,0.250411,-0.580611,...,-0.030875,-0.751145,-0.291813,-0.137017,-0.028463,-0.011953,2.738917,-0.524583,-0.342076,-0.243994
1,0.631301,-0.199013,-0.996648,-0.998644,-0.705252,-0.98407,0.065146,1.561014,0.52533,0.020443,...,-0.030875,-0.751145,-0.291813,-0.137017,-0.028463,-0.011953,-0.36509,1.906185,-0.342076,-0.243994
2,0.631301,0.267612,-0.996648,-0.998644,-0.705252,0.807475,0.158632,0.051774,-0.299425,0.865676,...,-0.030875,-0.751145,-0.291813,-0.137017,-0.028463,-0.011953,-0.36509,1.906185,-0.342076,-0.243994
3,-1.001776,-1.038938,1.003315,-0.998644,-0.705252,-0.98407,-0.776225,-0.250074,-1.399099,-1.162883,...,-0.030875,1.331237,-0.291813,-0.137017,-0.028463,-0.011953,-0.36509,-0.524583,-0.342076,-0.243994
4,-1.001776,-0.758963,1.003315,-0.998644,-0.705252,-0.088298,-0.682739,-0.250074,-0.849262,-1.388278,...,-0.030875,1.331237,-0.291813,-0.137017,-0.028463,-0.011953,-0.36509,-0.524583,-0.342076,-0.243994


Unnamed: 0,position,age,sex,partner,num_child,education,service_length,study_time,commute,overtime,...,agelayer_and_position_0_2,agelayer_and_position_1_0,agelayer_and_position_1_1,agelayer_and_position_1_2,agelayer_and_position_1_3,agelayer_and_position_2_0,agelayer_and_position_2_1,agelayer_and_position_2_2,agelayer_and_position_2_3,agelayer_and_position_2_4
0,1.499887,0.566331,1.022418,0.992638,2.844446,-0.080373,0.6411,-0.804253,0.826414,1.103082,...,-0.031637,-0.776575,-0.281986,-0.138329,-0.027898,-0.014908,-0.363402,-0.52882,3.003543,-0.229939
1,-0.155617,-0.181027,-0.977965,-1.007304,-0.712198,2.615369,-1.134887,-1.108165,-0.004509,1.968836,...,-0.031637,-0.776575,-0.281986,-0.138329,-0.027898,-0.014908,2.75147,-0.52882,-0.332903,-0.229939
2,-0.983369,-1.208643,1.022418,0.992638,0.710459,-0.978954,-0.947941,-0.500341,-0.835431,-1.238389,...,-0.031637,-0.776575,-0.281986,-0.138329,-0.027898,-0.014908,-0.363402,-0.52882,-0.332903,-0.229939
3,-0.983369,-0.461286,1.022418,-1.007304,-0.712198,-0.978954,-0.200157,-0.196428,-0.558457,-0.116844,...,-0.031637,1.287563,-0.281986,-0.138329,-0.027898,-0.014908,-0.363402,-0.52882,-0.332903,-0.229939
4,-0.155617,0.75317,1.022418,-1.007304,-0.712198,-0.978954,1.014992,-0.196428,-0.004509,0.296357,...,-0.031637,-0.776575,-0.281986,-0.138329,-0.027898,-0.014908,2.75147,-0.52882,-0.332903,-0.229939


In [31]:
train_enc.head()

Unnamed: 0,position,age,sex,partner,num_child,education,service_length,study_time,commute,overtime,...,agelayer_and_position_0_2,agelayer_and_position_1_0,agelayer_and_position_1_1,agelayer_and_position_1_2,agelayer_and_position_1_3,agelayer_and_position_2_0,agelayer_and_position_2_1,agelayer_and_position_2_2,agelayer_and_position_2_3,agelayer_and_position_2_4
0,1,44,2,1,2,1,24,2.0,0.6,9.2,...,0,0,0,0,0,0,1,0,0,0
1,2,31,1,0,0,0,13,9.0,0.7,12.4,...,0,0,0,0,0,0,0,1,0,0
2,2,36,1,0,0,2,14,4.0,0.4,16.9,...,0,0,0,0,0,0,0,1,0,0
3,0,22,2,0,0,0,4,3.0,0.0,6.1,...,0,1,0,0,0,0,0,0,0,0
4,0,25,2,0,0,1,5,3.0,0.2,4.9,...,0,1,0,0,0,0,0,0,0,0


In [32]:
train_norm.to_csv('../data/train_norm.csv')
test_norm.to_csv('../data/test_norm.csv')