<a href="https://colab.research.google.com/github/david3951445/ML_project/blob/main/FinalProject/project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!mkdir data # 建立資料夾

In [None]:
import numpy as np
from numpy.core.numeric import NaN
import pandas as pd
from pandas.core.frame import DataFrame
import os 

from datetime import datetime
from pandas.core.reshape.concat import concat

from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

import scipy.stats as st
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
from keras import optimizers
from keras.optimizers import Adam
from keras import backend as K


# Intended to silence the TensorFlow start-up notice
# "This TensorFlow binary is optimized with oneAPI ...".
# NOTE(review): this assignment runs after the keras imports above; confirm
# the variable is still picked up at that point, otherwise set it earlier.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def main():
    """Train a milk-yield regressor on the dairy CSVs and write ``out.csv``.

    Steps:
      1. Load ``data/report.csv``, ``data/submission.csv`` and
         ``data/birth.csv``.
      2. Build one feature row per report row: season (derived from the third
         report column), report columns '4', '9', '10', '14', '18', the
         calving interval in days and the dry-period length in days.
      3. Rows whose column '11' is missing form the prediction set; the
         remaining rows (minus any that still contain NaN) form the training
         set, with '11' as the target.
      4. Fit a small dense Keras network with an RMSE loss and store its
         predictions in column '1' of the submission frame, saved as
         ``out.csv``.
    """
    # ---- load data ----
    data_report = pd.read_csv('data/report.csv', low_memory=False)
    data_submission = pd.read_csv('data/submission.csv')
    data_birth = pd.read_csv('data/birth.csv')
    # breed.csv and spec.csv (health records) are not used yet.

    # ---- author's pre-processing notes (translated) ----
    # Features believed to matter: calving season, climate, day of peak
    # lactation, stocking rate. Milk yield rises during the first six weeks
    # up to 25-60 litres per day and then declines steadily.
    #
    # birth.csv
    #   COL 2, 3   : COL 2 - COL 3 (of the previous parity) = dry period
    #   COL 4, 5   : calf 1, calf 2            -> too sparse, dropped
    #   COL 6      : dam body weight
    #   COL 7, 9   : registration date, parity -> duplicated elsewhere
    #   COL 8      : computed parity           -> meaningless, dropped
    #   COL 10     : calving difficulty
    #   COL 11, 12 : calf size, calf sex       -> too sparse, dropped
    #   COL 13     : dairy farm code           -> duplicated, dropped
    #
    # report.csv
    #   COL 2      : year                      -> dropped
    #   COL 3      : month                     -> mapped to a season
    #   COL 4      : farm code
    #   COL 5      : cow ID
    #   COL 6, 7   : sire, dam                 -> dropped
    #   COL 8      : birth date                -> dropped
    #   COL 9      : parity (inversely related to yield)
    #   COL 10     : days in milk (COL 15 - COL 12)
    #   COL 11     : milk yield (the target)
    #   COL 12     : most recent calving date
    #                calving interval = COL 12 - COL 19 when COL 19 is set,
    #                otherwise COL 12 - COL 8 (first calving - birth date)
    #   COL 13     : sampling date (COL 15 minus 1-3 days) -> dropped
    #   COL 14     : age in months (inversely related to yield)
    #   COL 15     : test date (year/month = COL 2 / COL 3) -> dropped
    #   COL 16     : last breeding (insemination) date
    #   COL 17     : last breeding semen
    #   COL 18     : number of breedings (inversely related to yield)
    #   COL 19     : previous calving date     -> dropped
    #   COL 20     : first breeding date
    #   COL 21     : first breeding semen

    # ---- season + raw report columns ----
    season_of_month = {
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn',
        12: 'winter', 1: 'winter', 2: 'winter',
    }
    season = data_report.iloc[:, 2].replace(season_of_month)
    # Column '11' (the target) travels with the features until the split.
    features = pd.concat(
        [season, data_report[['4', '9', '10', '11', '14', '18']]], axis=1
    )

    # ---- calving interval (days) ----
    latest_calving = data_report['12'].copy()
    previous_calving = data_report['19'].copy()

    # Placeholder imputation kept from the original: every missing "latest
    # calving" date takes the value stored right after the first missing row.
    # Guarded so that "nothing missing" or "missing in the last row" no
    # longer raises IndexError.
    missing_latest = np.flatnonzero(latest_calving.isna().to_numpy())
    if missing_latest.size:
        donor_position = missing_latest[0] + 1
        if donor_position < len(latest_calving):
            latest_calving.iloc[missing_latest] = latest_calving.iloc[donor_position]

    # No previous calving recorded: fall back to the eighth report column
    # (birth date per the notes above), i.e. first calving - birth date.
    previous_calving = previous_calving.fillna(data_report.iloc[:, 7])

    birth_interval = day_interval(latest_calving, previous_calving, 'birth_interval')

    # ---- dry period (days), derived from birth.csv ----
    births = data_birth.sort_values(by=['1', '9'])  # cow ID, then parity
    is_first_record = ~births.duplicated(subset=['1'])

    # Shift the third birth column down one row so each calving is paired
    # with the value recorded on the same cow's previous parity row.
    previous_parity_date = births.iloc[:, 2].shift()
    # A cow's first row has no previous parity; blank it so the shift does
    # not leak the preceding cow's value (imputed with the mean below).
    previous_parity_date.loc[is_first_record] = np.nan
    dry_interval = day_interval(births.iloc[:, 1], previous_parity_date, 'dry_interval')
    births = pd.concat([births, dry_interval], axis=1)

    # Discard implausible values; only 0..150 days (and NaN) are kept.
    days = births['dry_interval']
    out_of_range = (days < 0) | (days > 5 * 30)
    births = births.loc[~out_of_range]
    mean_dry_interval = births['dry_interval'].mean()

    # One dry-period value per (cow, parity); the first row wins when the
    # key repeats. Missing values (first parity) take the mean.
    dry_lookup = (
        births[['1', '9', 'dry_interval']]
        .dropna(subset=['1', '9'])
        .fillna({'dry_interval': mean_dry_interval})
        .drop_duplicates(subset=['1', '9'])
        .rename(columns={'1': '5'})  # match the report's cow-ID column name
    )

    # Left merge keeps every report row in its original order. Cows present
    # only in birth.csv are ignored; report rows without a birth record get
    # the mean dry period.
    matched = data_report[['5', '9']].merge(
        dry_lookup, how='left', on=['5', '9'], validate='many_to_one'
    )
    dry_feature = pd.Series(
        matched['dry_interval'].fillna(mean_dry_interval).to_numpy(),
        index=data_report.index,
        name='dry_interval',
    )

    features = pd.concat([features, birth_interval, dry_feature], axis=1)

    # One-hot encode the categorical columns. dtype=float keeps the frame
    # purely numeric; recent pandas would otherwise emit bool columns and the
    # mixed frame would reach Keras as an object array.
    features = pd.get_dummies(features, dtype=float)

    # ---- split: rows without a target are the prediction set ----
    is_unlabelled = features['11'].isna()
    x_test = features.loc[is_unlabelled].drop(columns=['11'])
    x_train = features.loc[~is_unlabelled].dropna()  # no NaN reaches training
    y_train = x_train.pop('11')

    # NaN inputs would propagate to NaN predictions, so fill any gaps in the
    # prediction rows with the training column means.
    x_test = x_test.fillna(x_train.mean())

    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)

    # ---- model ----
    # (A scikit-learn DecisionTreeRegressor was tried here as an alternative.)

    def rmse(y_true, y_pred):
        """Root-mean-squared error; Keras passes (y_true, y_pred)."""
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

    feature_number = x_train.shape[1]

    model = Sequential()
    model.add(Dense(256, input_dim=feature_number, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1))

    # RMSE is both the loss and the reported metric; 'Adamax' and 'Nadam'
    # were also tried as optimizers.
    model.compile(loss=rmse, optimizer="adam", metrics=[rmse])

    model.fit(x_train, y_train, epochs=100, batch_size=64, verbose=1)

    # ---- predict and export ----
    y_predict = model.predict(x_test)
    # Flatten before assigning so a single column receives a 1-D array.
    data_submission['1'] = np.ravel(y_predict)
    data_submission.to_csv('out.csv', index=False)

# Day interval between two Series of date strings
def day_interval(temp1, temp2, name) :
    """Return ``temp1 - temp2`` in whole days as a one-column DataFrame.

    Both inputs are parsed with ``pd.to_datetime`` and paired by position
    (not by index label). The result has a single column called ``name``
    and reuses ``temp1.index`` as its index.
    """
    later = pd.to_datetime(temp1)
    earlier = pd.to_datetime(temp2)

    elapsed_days = []
    for end, start in zip(later, earlier):
        elapsed_days.append((end - start).days)

    # Keep temp1's index so the column lines up with the caller's frame.
    return pd.DataFrame(elapsed_days, columns=[name], index=temp1.index)

# Run the whole pipeline when this code is executed directly
# (script or notebook cell) rather than imported.
if __name__ == '__main__':
    main()

(33253, 13)
(33253,)
(4263, 13)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Ep

In [None]:
# scikit-learn
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Benchmark files: header-less CSVs whose last column is the target.
data_train = pd.read_csv('test/Training_set.csv', header=None).to_numpy()
data_test = pd.read_csv('test/Validation_set.csv', header=None).to_numpy()

# Training split: every column except the last one is a feature.
train_array = data_train
data_number = train_array.shape[0]
feature_number = train_array.shape[1] - 1
x_train, y_train = train_array[:, :feature_number], train_array[:, feature_number]

# Validation split, cut at the same column as the training split.
test_array = data_test
test_number = test_array.shape[0]
x_test, y_test = test_array[:, :feature_number], test_array[:, feature_number]

# Choose the model, train it, then predict on the validation features.
model = KNeighborsRegressor().fit(x_train, y_train)
y_predictions = model.predict(x_test)

print("prediction:", y_predictions)
print("true values:", y_test)


prediction: [0.57  0.848 0.704 0.686 0.95  0.662 0.674 0.82  0.828 0.858 0.64  0.804
 0.564 0.648 0.708 0.622 0.74  0.668 0.93  0.726 0.416 0.762 0.66  0.858
 0.936 0.942 0.71  0.734 0.626 0.674 0.736 0.81  0.702 0.69  0.476 0.952
 0.918 0.798 0.646 0.822 0.642 0.814 0.622 0.626 0.704 0.78  0.836 0.714
 0.482 0.704 0.72  0.61  0.756 0.776 0.672 0.936 0.938 0.482 0.92  0.834
 0.922 0.88  0.626 0.94  0.93  0.872 0.676 0.662 0.832 0.688 0.614 0.69
 0.478 0.628 0.644 0.828 0.652 0.64  0.894 0.708 0.81  0.88  0.64  0.624
 0.636 0.79  0.62  0.888 0.784 0.838 0.664 0.71  0.904 0.7   0.74  0.94
 0.588 0.636 0.718 0.82 ]
true values: [0.56 0.85 0.63 0.66 0.96 0.46 0.66 0.81 0.73 0.8  0.62 0.79 0.59 0.49
 0.49 0.67 0.76 0.7  0.94 0.73 0.34 0.74 0.66 0.79 0.91 0.94 0.64 0.73
 0.71 0.81 0.67 0.85 0.8  0.73 0.64 0.89 0.9  0.88 0.69 0.72 0.56 0.86
 0.69 0.48 0.77 0.78 0.92 0.8  0.54 0.75 0.83 0.73 0.81 0.52 0.71 0.92
 0.94 0.59 0.93 0.89 0.9  0.86 0.79 0.93 0.87 0.91 0.61 0.71 0.82 0.62
 0.68 0.64 0

In [None]:
# Keras Sequential 
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
from keras import optimizers
from keras.optimizers import Adam
from keras import backend as K

# Benchmark data: header-less CSVs whose last column is the target.
data_train = pd.read_csv('test/Training_set.csv', header=None).to_numpy()
data_test = pd.read_csv('test/Validation_set.csv', header=None).to_numpy()

# Training split: every column except the last one is a feature.
train_array = data_train
data_number = len(train_array[:, 0])
feature_number = len(train_array[0, :])-1
x_train = train_array[:, :feature_number]
y_train = train_array[:, feature_number]

# Validation split, cut at the same column as the training split.
test_array = data_test
test_number = len(test_array[:, 0])
x_test = test_array[:, :feature_number]
y_test = test_array[:, feature_number]

# Error measure
def rmse(y_true, y_pred):
    """Root-mean-squared error.

    Parameters are named in the order Keras passes them to a loss/metric
    (``y_true`` first); the value itself is symmetric in its two arguments.
    """
    return K.sqrt(K.mean(K.square(y_pred-y_true)))

# Build the Sequential model. The input width follows the data instead of
# being hard-coded, so the cell keeps working if the CSV gains or loses
# feature columns.
model=Sequential()
model.add(Dense(256,input_dim=feature_number,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.08))
model.add(Dense(1))

# Compile: RMSE as both loss and metric; optimizer Adamax ("adam" was also tried).
model.compile(loss=rmse,optimizer='Adamax',metrics=[rmse])

# Train
model.fit(x_train,y_train,epochs=200,batch_size=64)

# Predict
y_predictions=model.predict(x_test)

print("prediction:", y_predictions)
print("true values:", y_test)



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# xgboost
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Benchmark files: header-less CSVs whose last column is the target.
data_train = pd.read_csv('test/Training_set.csv', header=None).to_numpy()
data_test = pd.read_csv('test/Validation_set.csv', header=None).to_numpy()

# Training split: every column except the last one is a feature.
train_array = data_train
data_number = len(train_array[:, 0])
feature_number = len(train_array[0, :])-1
x_train = train_array[:, :feature_number]
y_train = train_array[:, feature_number]

# Validation split, cut at the same column as the training split.
test_array = data_test
test_number = len(test_array[:, 0])
x_test = test_array[:, :feature_number]
y_test = test_array[:, feature_number]

# xgboost model
# `silent` is a deprecated XGBoost parameter that newer releases no longer
# honour; `verbosity=0` is the supported way to keep training quiet.
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, verbosity=0, objective='reg:gamma')
model.fit(x_train,y_train) # train

y_predictions = model.predict(x_test)  # predict

print("prediction:", y_predictions)
print("true values:", y_test)


prediction: [0.5709306  0.8351755  0.70201826 0.6911322  0.96599716 0.7858964
 0.6373249  0.8277904  0.80317855 0.7976576  0.7251878  0.7776781
 0.57484615 0.6988175  0.74707496 0.63468206 0.7941694  0.65663046
 0.91994554 0.7113621  0.43061587 0.7608472  0.7268247  0.8029149
 0.92513216 0.9475089  0.7413105  0.7163768  0.58394164 0.64377785
 0.7289882  0.8069674  0.7289882  0.65723336 0.58043885 0.9634623
 0.9105843  0.8162116  0.5980303  0.8079289  0.7059652  0.7527299
 0.6303393  0.68979484 0.7259644  0.7148053  0.8311399  0.7028618
 0.50037354 0.6924369  0.7494806  0.6127671  0.6917654  0.76573235
 0.6944614  0.94026005 0.9056089  0.383981   0.8910601  0.86239535
 0.9134456  0.8643629  0.6157113  0.942719   0.91472447 0.9110675
 0.7407952  0.69480336 0.81844103 0.74840564 0.67167646 0.7048507
 0.4756728  0.6363284  0.6593508  0.84696376 0.60374427 0.6628752
 0.93887573 0.73014146 0.8069674  0.8643629  0.6025801  0.6529295
 0.6525026  0.775861   0.5527689  0.87952244 0.8046529  0.76

In [None]:
import numpy as np
# Scratch check of NumPy broadcasting: subtract the per-column maximum
# from every row of a nested list.
x = [
    [1, 2, 0],
    [2, 3, 4],
    [0, 1, 4],
    [1, 2, 1],
]
y = np.asarray(x).max(axis=0)  # column-wise maximum
z = np.subtract(x, y)          # y is broadcast across the rows of x
for value in (x, y, z):
    print(value)

[[1, 2, 0], [2, 3, 4], [0, 1, 4], [1, 2, 1]]
[2 3 4]
[[-1 -1 -4]
 [ 0  0  0]
 [-2 -2  0]
 [-1 -1 -3]]
