In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
from tensorflow import keras
import matplotlib.pyplot as plt
import copy
from sklearn.metrics import confusion_matrix, f1_score
from matplotlib.dates import DateFormatter
from statsmodels.stats.proportion import proportion_confint
import piecewise_regression
from sklearn import metrics
from numpy import sqrt, argmax

In [8]:
### Import of files
# The two testing datasets are semicolon-separated and are read the same way.
testing_dataset1, testing_dataset2 = (
    pd.read_csv(csv_name, sep=";")
    for csv_name in ("testing_dataset1.csv", "testing_dataset2.csv")
)

# Detections are comma-separated; the "date" column is parsed with an explicit
# timestamp format so that it can be used for time-based grouping later on.
merged_df = pd.read_csv("detections_testing.csv", sep=",")
detection_timestamps = pd.to_datetime(merged_df["date"], format="%Y-%m-%d %H:%M:%S")
merged_df["date"] = detection_timestamps

In [3]:
### Building of vectors (input of CNN models)

# Every "year" gets a window that starts on 1 September of the previous
# calendar year, lasts WINDOW_DAYS days and is cut into BIN_FREQ bins.
WINDOW_DAYS = 517
BIN_FREQ = "12H"
# The vectors passed to the models are VECTOR_LENGTH consecutive bins taken
# from bin VECTOR_OFFSET onwards; the sex model uses only their first
# SEX_VECTOR_LENGTH bins.
VECTOR_OFFSET = 58
VECTOR_LENGTH = 975
SEX_VECTOR_LENGTH = 425

out = []   # one array per (rfid, year): last "antenna" value per bin, forward-filled, cast to int
out3 = []  # one array per (rfid, year): number of detections per bin
rfid_year_records = []  # (rfid, year) of every array appended to out / out3, in the same order

for start_year in range(1998, 2022):
    start_date = datetime.datetime(year=start_year - 1, month=9, day=1)
    end_date = start_date + timedelta(days=WINDOW_DAYS)
    dRange = pd.date_range(start=start_date, end=end_date, freq=BIN_FREQ)

    # Filter the detections of this year once and build the (bin, rfid) grouping once;
    # both channels are aggregated from the same groups, so their columns match.
    season_detections = merged_df[merged_df["year"] == start_year]
    antenna_by_bin = season_detections.groupby(
        [pd.Grouper(key="date", freq=BIN_FREQ), "rfid"]
    )["antenna"]

    # Pivot to a (time bin x rfid) grid aligned on the window of this year.
    last_antenna = antenna_by_bin.last().unstack().reindex(dRange)
    detection_count = antenna_by_bin.size().unstack().reindex(dRange)

    # Bins without detection inherit the previous value; bins before the first
    # detection become 0.
    values = last_antenna.ffill().fillna(False).astype(int).values
    values_detect_number = detection_count.fillna(value=0).astype(int).values

    out.extend(values.T)
    out3.extend(values_detect_number.T)
    # Take the identifiers from the pivoted columns (not from the raw rows) so
    # that row k of rfid_year_list always describes vector k, whatever the row
    # order of merged_df and even if some rows have no usable date.
    rfid_year_records.extend((rfid, start_year) for rfid in last_antenna.columns)

rfid_year_list = pd.DataFrame(rfid_year_records, columns=["rfid", "year"])
assert len(rfid_year_list) == len(out) == len(out3), "identifiers and vectors are misaligned"

# Stack the two channels into one (VECTOR_LENGTH, 2) array per individual-year.
vector_window = slice(VECTOR_OFFSET, VECTOR_OFFSET + VECTOR_LENGTH)
final_value = [
    np.stack([antenna_track[vector_window], count_track[vector_window]], axis=-1)
    for antenna_track, count_track in zip(out, out3)
]

### Smaller vectors for determination of the sex
final_value_425 = [vector[:SEX_VECTOR_LENGTH] for vector in final_value]

In [4]:
### Import of models
# One saved Keras model per task: sex (M vs. F), breeding status (NB vs. B)
# and breeding outcome (S vs. F), loaded in that order.
model_M_F, model_NB_B, model_S_F = map(
    keras.models.load_model,
    (
        'models/model_determination_M_F_without_augmentation',
        'models/model_determination_B_NB_without_augmentation',
        'models/model_determination_S_F_without_augmentation',
    ),
)


In [9]:
### Predictions with the models

# Build the input tensors once and run every model a single time; the predicted
# class is the arg-max of the very same probabilities that are reported.
cnn_input = np.array(final_value)
cnn_input_425 = np.array(final_value_425)

raw_NB_B = model_NB_B.predict(cnn_input)
raw_S_F = model_S_F.predict(cnn_input)
classif_M_F = model_M_F.predict(cnn_input_425)

proba_NB_B = pd.DataFrame(raw_NB_B, columns=['proba_NB', 'proba_B'])
prediction_NB_B = np.argmax(raw_NB_B, axis=-1)
proba_S_F = pd.DataFrame(raw_S_F, columns=['proba_F', 'proba_S'])
prediction_S_F = np.argmax(raw_S_F, axis=-1)
data_MF = pd.DataFrame(classif_M_F, columns=["F", "M"])

# Assemble one row per individual-year. Plain arrays are used so that the frame
# gets a fresh 0..n-1 index matching the row order of the model outputs,
# whatever index rfid_year_list carries.
data_proba = pd.DataFrame({
    'rfid': rfid_year_list["rfid"].to_numpy(),
    'year': rfid_year_list["year"].to_numpy(),
    'pred_NB_B': prediction_NB_B,
    'pred_S_F': prediction_S_F,
})
data_proba = pd.concat([data_proba, proba_NB_B, proba_S_F], axis=1)

# Class index -> label, following the column order of the probability frames.
data_proba['pred_NB_B'] = data_proba['pred_NB_B'].map({0: "NB", 1: "B"})
data_proba['pred_S_F'] = data_proba['pred_S_F'].map({0: "F", 1: "S"})

data_proba = pd.concat([data_proba, data_MF], axis=1)

### lifetime sex determination
# Average the sex probabilities over the years predicted as breeding ("B") and
# keep the most likely sex per individual. Only the numeric F/M columns are
# averaged: averaging the whole frame would also hit the string label columns,
# which raises a TypeError on recent pandas versions.
data_temp_B = data_proba.loc[data_proba["pred_NB_B"] == "B"]
mean_sex_proba = data_temp_B.groupby('rfid')[['F', 'M']].mean()
sex_list = mean_sex_proba.idxmax(axis=1).to_frame('pred_M_F')
data_proba = data_proba.drop(columns=['F', 'M'])
data_proba = data_proba.merge(sex_list, on='rfid', how='left')
# Individuals never predicted as breeding have no sex estimate: flag them "I".
data_proba['pred_M_F'] = data_proba['pred_M_F'].fillna('I')

### Compilation of B vs. NB and F vs. S to have the outcomes
# Predicted breeders take the F/S prediction, every other row is "NB".
predicted_breeder = data_proba["pred_NB_B"] == "B"
data_proba["pred_outcome"] = data_proba["pred_S_F"].where(predicted_breeder, "NB")
pred_outcome = data_proba["pred_outcome"].tolist()

In [10]:
### Merge of result with the testing datasets

# First testing dataset: merge on individual and year (default inner merge),
# discard its duplicated key columns, then tag its label columns.
data_proba = (
    data_proba
    .merge(testing_dataset1, left_on=['rfid', 'year'], right_on=['RFID', 'Year'])
    .drop(columns=['RFID', 'Year'])
    .rename(columns={'Output': 'dataset1_outcome', 'BreedingDate': 'dataset1_breeding_date'})
)

# Second testing dataset: same treatment, tagged with its own prefix.
data_proba = (
    data_proba
    .merge(testing_dataset2, left_on=['rfid', 'year'], right_on=['RFID', 'Year'])
    .drop(columns=['RFID', 'Year'])
    .rename(columns={'Output': 'dataset2_outcome', 'BreedingDate': 'dataset2_breeding_date'})
)

# print(data_proba)

                           rfid  year pred_NB_B pred_S_F      proba_NB  \
0       R 0000 0000000046627082  2001        NB        F  9.999959e-01   
1       R 0000 0000000046627082  2002        NB        S  9.998114e-01   
2       R 0000 0000000046627110  2002        NB        S  1.000000e+00   
3       R 0000 0000000055038113  2002         B        F  4.329582e-08   
4       R 0000 0000000072520688  2002        NB        F  8.988630e-01   
..                          ...   ...       ...      ...           ...   
838  A 00000 0 964 001002875260  2020         B        S  1.815171e-08   
839  A 00000 0 964 001002875266  2020         B        S  8.366056e-17   
840  A 00000 0 964 001002875555  2020         B        F  1.660751e-07   
841     R 0000 0000000088473069  2020         B        F  3.424121e-11   
842     R 0000 0000000088473069  2021         B        F  1.140490e-02   

          proba_B       proba_F       proba_S pred_M_F pred_outcome  \
0    4.077511e-06  9.978701e-01  2.12988

In [11]:
## Print of confusion matrices
# NOTE(review): scikit-learn's signature is confusion_matrix(y_true, y_pred) and
# rows follow the first argument. The model predictions are passed first here,
# so rows are the predictions and columns the testing dataset - confirm that
# this orientation is the intended one before reading the matrices.
outcome_labels = ["NB", "F", "S"]
matrix_pairs = (
    ("pred_outcome", "dataset1_outcome"),
    ("pred_outcome", "dataset2_outcome"),
    ("dataset1_outcome", "dataset2_outcome"),
)
for row_column, col_column in matrix_pairs:
    print(confusion_matrix(data_proba[row_column], data_proba[col_column], labels=outcome_labels))

[[183  39   0]
 [ 10 334  15]
 [  2  45 215]]
[[158  63   1]
 [ 14 336   9]
 [  2  47 213]]
[[164  30   1]
 [ 10 404   4]
 [  0  12 218]]


In [12]:
# Share of rows on which two outcome columns agree, for each pair of sources.
print('Global accuracy for compiled NB vs. F vs. S:')
row_total = len(data_proba)
for caption, left_column, right_column in (
    ("Pred. vs. Dat. 1: ", "pred_outcome", "dataset1_outcome"),
    ("Pred. vs. Dat. 2: ", "pred_outcome", "dataset2_outcome"),
    ("Dat. 1 vs. Dat. 2: ", "dataset1_outcome", "dataset2_outcome"),
):
    agreeing_rows = sum(data_proba[left_column] == data_proba[right_column])
    print(caption + str(agreeing_rows / row_total))

Global accuracy for compiled NB vs. F vs. S:
Pred. vs. Dat. 1: 0.8683274021352313
Pred. vs. Dat. 2: 0.8386714116251482
Dat. 1 vs. Dat. 2: 0.9323843416370107


In [13]:
# Kept for parity with the original cell; not used in the cells visible here.
comparison_NB_B = []


def _collapse_to_NB_B(outcome):
    """Return "NB" for a non-breeding outcome and "B" for any other value."""
    return "NB" if outcome == "NB" else "B"


# Reduce the outcome of each testing dataset to breeding / non-breeding.
data_proba["dataset1_outcome_NB_B"] = data_proba["dataset1_outcome"].map(_collapse_to_NB_B)
data_proba["dataset2_outcome_NB_B"] = data_proba["dataset2_outcome"].map(_collapse_to_NB_B)

print('Global accuracy for compiled NB vs. B:')
row_total = len(data_proba)
for caption, left_column, right_column in (
    ("Pred. vs. Dat. 1: ", "pred_NB_B", "dataset1_outcome_NB_B"),
    ("Pred. vs. Dat. 2: ", "pred_NB_B", "dataset2_outcome_NB_B"),
    ("Dat. 1 vs. Dat. 2: ", "dataset1_outcome_NB_B", "dataset2_outcome_NB_B"),
):
    agreeing_rows = sum(data_proba[left_column] == data_proba[right_column])
    print(caption + str(agreeing_rows / row_total))

Global accuracy for compiled NB vs. B:
Pred. vs. Dat. 1: 0.9395017793594306
Pred. vs. Dat. 2: 0.9051008303677343
Dat. 1 vs. Dat. 2: 0.9513641755634639


In [14]:
# Keep only the rows that both testing datasets label as something other than
# "NB", as an independent copy of that selection.
breeding_in_dataset1 = data_proba["dataset1_outcome"] != "NB"
breeding_in_dataset2 = data_proba["dataset2_outcome"] != "NB"
data_comparison_SF = data_proba.loc[breeding_in_dataset1 & breeding_in_dataset2].copy()

print('Global accuracy for compiled F vs. S:')
row_total = len(data_comparison_SF)
for caption, left_column, right_column in (
    ("Pred. vs. Dat. 1: ", "pred_S_F", "dataset1_outcome"),
    ("Pred. vs. Dat. 2: ", "pred_S_F", "dataset2_outcome"),
    ("Dat. 1 vs. Dat. 2: ", "dataset1_outcome", "dataset2_outcome"),
):
    agreeing_rows = sum(data_comparison_SF[left_column] == data_comparison_SF[right_column])
    print(caption + str(agreeing_rows / row_total))

Global accuracy for compiled F vs. S:
Pred. vs. Dat. 1: 0.9043887147335423
Pred. vs. Dat. 2: 0.9106583072100314
Dat. 1 vs. Dat. 2: 0.9749216300940439
