# Feature engineering 

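This notebook contains the feature-engineering step: time-based features, a quick imputation of missing values, donor/recipient similarity features, and bag-of-words / TF-IDF features built from the free-text medical history.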

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
#nltk.download()


In [43]:
# Load the merged dataset (relative path) into the working frame `data`.
data = pd.read_csv(filepath_or_buffer='../../../data/data_merged.csv')

In [44]:
# Name of the identifier column, and the list of date columns.
id_col = 'numero'
date_col = ['date_transplantation']

# Categorical variables. A later cell lower-cases these names and keeps only
# those present in `data.columns`, so the capitalisation used here is free.
var_category = [
    "pathologie",
    # NOTE(review): a later cell reads `data.sexe`; confirm whether the dataset
    # column is really called "sex" -- if not, this entry is filtered out.
    "sex",
    "other_organ_transplantation",
    "transplanted_twice_during_study_period",
    "super_urgence",
    "retransplant",
    "preoperative_ICU",
    "preoperative_vasopressor",
    "preoperative_mechanical_ventilation",
    "PFO",
    "diabetes",
    # The trailing comma below is required: without it Python concatenates this
    # string with the next one into a single, non-existent column name.
    "preoperative_pulmonary_hypertension",
    "Insuffisance_renale",
    "CMV_receveur",
    "plasmapherese",
    "preoperative_ECMO",
    "thoracic_surgery_history",
    "CMV_donneur",
    "EBV_donneur",
    "Sex_donor",
]

# Continuous variables; the order of this list is kept as-is because the list
# is pickled further down in the notebook.
var_continuous = [
    "BMI_donor", "age", "heure_arrivee_bloc",
    "Poids", "Taille", "time_on_waiting_liste",
    "LAS", "body_mass_index", "PAPS",
    "Age_donor", "Poids_donor", "Taille_donor",
    "Donneur_CPT", "Tabagisme_donor", "Aspirations_donor",
    "RX_donor", "PF_donor", "oto_score",
]

# Free-text columns.
text_col = ["atcd_medicaux", "atcd_chirugicaux"]

# Names listed as targets.
target = ["immediate_extubation", "secondary_intubation"]

In [45]:
# Keep only the variables that exist in the dataset (names compared in lower
# case). Names that are not found are reported instead of vanishing silently,
# so that a typo in the lists above is noticed.
_known_columns = set(data.columns)
for _label, _names in (("var_continuous", var_continuous), ("var_category", var_category)):
    _missing = [i for i in _names if i.lower() not in _known_columns]
    if _missing:
        print("{}: not found in data, dropped -> {}".format(_label, _missing))

var_continuous = [i.lower() for i in var_continuous if i.lower() in _known_columns]
var_category = [i.lower() for i in var_category if i.lower() in _known_columns]

# Time based features

In [46]:
# Parse the transplantation date and derive calendar features from it.
data["date_transplantation"] = pd.to_datetime(data["date_transplantation"])

data["month"] = data["date_transplantation"].dt.month
data["dayofweek"] = data["date_transplantation"].dt.dayofweek

# Keep only the part before the first ':' of the arrival time and store it as a
# number: the column is listed among the continuous variables, and a missing
# value must stay NaN (not the string "nan") so the imputation cell can fill it.
# NOTE(review): assumes values look like "HH:MM"; anything whose leading part
# is not numeric becomes NaN -- confirm against the raw data.
data["heure_arrivee_bloc"] = pd.to_numeric(
    data["heure_arrivee_bloc"].astype(str).str.split(":").str[0],
    errors="coerce",
)

# The raw date is no longer needed once the calendar features exist.
data = data.drop(columns="date_transplantation")

# Impute missing values (quick and dirty)

In [47]:
# TODO: change this strategy -- every missing value, in every column, is
# replaced by 0.
data = data.fillna(value=0)
#data[var_continuous] = data[var_continuous].astype(float)
#data[var_category] = data[var_category].replace("NF", 'NAN')

# Similarities between patient and donor

In [48]:
# Absolute gaps between recipient and donor for age, height, weight and BMI,
# plus a boolean flag telling whether donor and recipient sex codes differ.
data["diff_age_donor_patient"] = (data["age"] - data["age_donor"]).abs()
data["diff_taille_donor_patient"] = (data["taille"] - data["taille_donor"]).abs()
data["diff_poids_donor_patient"] = (data["poids"] - data["poids_donor"]).abs()
data["diff_sex_donor_patient"] = data["sex_donor"].astype(int) != data["sexe"].astype(int)
data["diff_body_mass_index"] = (data["body_mass_index"] - data["bmi_donor"]).abs()

# Register the new continuous features. Names already present are skipped so
# that re-running this cell does not append duplicates to the list.
# NOTE(review): "diff_body_mass_index" and "diff_sex_donor_patient" are computed
# above but not added to any variable list -- confirm whether that is intended.
var_continuous = var_continuous + [
    name
    for name in (
        "diff_age_donor_patient",
        "diff_taille_donor_patient",
        "diff_poids_donor_patient",
    )
    if name not in var_continuous
]

# Quick NLP features

In [49]:
# Extra words removed from the free-text fields, in addition to the stop words.
voc_drop = [
    "depuis", "sous", "non", "oui", "puis", "sans",
    "mars", "mai", "gauche", "post", "ans",
]

In [50]:
def text_cleaning(x, voc_drop) :
    """Normalise one free-text value into a space-separated string of tokens.

    The value is converted with ``str()``, punctuation is turned into spaces,
    the text is lower-cased, "è"/"é" are replaced by "e", and the tokens that
    are French stop words, purely numeric, or listed in ``voc_drop`` are
    removed.

    Parameters
    ----------
    x : any
        Raw cell value; it is passed through ``str()`` first.
    voc_drop : container of str
        Additional tokens to discard.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).
    """
    stop_words = set(stopwords.words('french'))
    # Replace each punctuation character by a space instead of deleting it:
    # deleting glues neighbouring words together ("d'asthme" -> "dasthme",
    # "a,b" -> "ab"), which pollutes the vocabulary built afterwards.
    table = str.maketrans(string.punctuation, " " * len(string.punctuation))

    x = str(x).translate(table)
    x = x.lower().replace("è", "e").replace("é", "e")
    # NOTE(review): tokens are compared with the stop-word list after the
    # accent replacement above, so accented entries of that list (if any)
    # cannot match any more.
    # split() without argument splits on any whitespace run (newlines
    # included) and never yields empty tokens.
    tokens = [
        i for i in x.split()
        if i not in stop_words and not i.isdigit() and i not in voc_drop
    ]

    return " ".join(tokens)

In [51]:
# Clean both free-text history columns with the same routine and drop list.
for _text_column in ("atcd_medicaux", "atcd_chirugicaux"):
    data[_text_column] = data[_text_column].map(
        lambda raw_text: text_cleaning(raw_text, voc_drop)
    )

In [58]:
# Binary-encode the serology columns: 1 when the value is exactly "Positif",
# 0 for anything else.
def _is_positive(status):
    """Return 1 if `status` equals "Positif", otherwise 0."""
    if status == "Positif":
        return 1
    return 0


for _serology_column in ("cmv_receveur", "cmv_donneur", "ebv_donneur"):
    data[_serology_column] = data[_serology_column].map(_is_positive)

In [59]:
# Bag-of-words counts of the cleaned medical history, then their TF-IDF
# weighting. Only `atcd_medicaux` is vectorised here.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data["atcd_medicaux"])

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Split and export dataset 

In [60]:
import pickle

# Persist the count and TF-IDF matrices. The `with` blocks make sure each file
# is flushed and closed as soon as the dump is done.
with open("../../../data/counts_vec.p", "wb") as _out_file:
    pickle.dump(X_train_counts, _out_file)
with open("../../../data/tfidf_vec.p", "wb") as _out_file:
    pickle.dump(X_train_tfidf, _out_file)

In [61]:
# Persist the lists of categorical and continuous variable names. The `with`
# blocks make sure each file is flushed and closed as soon as the dump is done.
with open("../../../data/var_category.p", "wb") as _out_file:
    pickle.dump(var_category, _out_file)
with open("../../../data/var_continuous.p", "wb") as _out_file:
    pickle.dump(var_continuous, _out_file)

In [62]:
# Binary target: 1 where `target` equals "successful IE", 0 otherwise.
# dtype=int keeps the exported values as 0/1; recent pandas versions would
# otherwise produce booleans and write True/False to the CSV.
y = pd.get_dummies(data["target"], dtype=int)["successful IE"]

# Features: everything except the label and the identifier column.
# NOTE(review): if the outcome columns named in the `target` list above
# ('immediate_extubation', 'secondary_intubation') are present in `data`, they
# remain in X -- confirm that this does not leak the label.
X = data.drop(columns=["target", "numero"])

X.to_csv('../../../data/X.csv', index=False)
y.to_frame(name='target').to_csv('../../../data/y.csv', index=False)