# Importation des modules 

In [2]:
import ast
from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from mlxtend.plotting import plot_decision_regions

import numpy as np

import pandas as pd
# pd.options.display.max_columns = None
# pd.set_option('display.max_rows', 500)

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

import seaborn as sns

import re

from sklearn import datasets
from sklearn.datasets import( load_iris, load_digits, fetch_lfw_people )
from sklearn.ensemble import( AdaBoostClassifier, AdaBoostRegressor, ExtraTreesClassifier, 
                              GradientBoostingClassifier,RandomForestClassifier, StackingClassifier )

from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn.impute import( KNNImputer, SimpleImputer )
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import( LogisticRegression, LinearRegression, Ridge, Lasso )
from sklearn.model_selection import( cross_val_score, GridSearchCV, StratifiedShuffleSplit, train_test_split )
from sklearn.metrics import( accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, 
                             f1_score, mean_squared_error, r2_score, RocCurveDisplay )
                             
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import ( DecisionTreeClassifier, DecisionTreeRegressor )
from sklearn.utils import resample

from scipy.stats import mode 

import ssl
# Replace the default HTTPS context factory with the unverified one.
# NOTE(review): this turns off certificate verification for every caller of the
# default context in this process - presumably needed for dataset downloads;
# confirm it is still required and never reuse it outside a sandbox.
ssl._create_default_https_context = ssl._create_unverified_context

import warnings
warnings.filterwarnings( "ignore", category=DeprecationWarning ) # silence DeprecationWarning messages

# Register the Jedha colour palette as a Plotly template and make it the default
pio.templates["jedha"] = go.layout.Template( layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6",
                                                              "#2A7FAF", "#23B1AB", "#0E3449", "#015955"] )
pio.templates.default = "jedha"
pio.renderers.default = "svg" # switch to "iframe", "iframe_connected" or "notebook" when working in Jupyter

from xgboost import XGBRegressor


## Import Dataset

In [5]:
# Tabular dataset used by most cells below (path is relative to the working directory)
dataset = pd.read_csv("src/titanic.csv")

# Labelled faces dataset; only `faces.target_names` is used later, in the SVM report cell
faces = fetch_lfw_people(min_faces_per_person=60)
# Alternative data source (disabled):
# data = pd.read_csv('s3://full-stack-bigdata-datasets/Machine Learning Supervisé/Régression logistique Cross validation/Datasets/ex2data1.txt', header=None)

## Graphe de corrélation

In [12]:
import plotly.figure_factory as ff

# Keep only the two columns whose correlation we want to inspect
short_data = dataset.filter(items=['Age', 'Survived'])

# Pairwise correlation matrix of the selected columns
corr_matrix = short_data.corr()

# Annotated heatmap: cell values come from the matrix, axis labels from its index/columns
fig = ff.create_annotated_heatmap(
    corr_matrix.to_numpy(),
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
)
fig.show()

## Séparation des variables explicatives X et de la variable cible (target) Y

In [None]:
# Name of the column to predict
target_name = 'Survived'

# Target vector: the single column named above
Y = dataset[target_name]
# Explanatory variables: every remaining column
X = dataset.drop(target_name, axis=1)

## Supprimer les variables explicatives ultra corrélées

In [None]:
# Drop explanatory variables that are almost perfectly correlated with another one.
CORR_THRESHOLD = 0.9  # absolute correlation above which two features count as redundant

# Restrict to numeric/boolean columns: since pandas 2.0 `DataFrame.corr` raises on
# non-numeric columns instead of silently ignoring them.
corr = X.select_dtypes(include=["number", "bool"]).corr()

cols = corr.columns

# Every correlated pair is listed in both orientations: (i, j) and (j, i).
# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
high_corr_list = [
    (i, j)
    for j in cols
    for i, item in corr[j].items()
    if i != j and abs(item) > CORR_THRESHOLD
]
high_corr_list

# Walk the columns in order and discard a column as soon as it is highly correlated
# with an earlier column that is still kept; afterwards no two kept columns exceed
# the threshold. (Picking every other entry of `high_corr_list` could discard both
# members of a pair when one column is correlated with several others.)
no_keep = []
for position, candidate in enumerate(cols):
    kept_so_far = [c for c in cols[:position] if c not in no_keep]
    if (corr.loc[kept_so_far, candidate].abs() > CORR_THRESHOLD).any():
        no_keep.append(candidate)

# Columns absent from `corr` (non-numeric ones) are never in `no_keep`, so they are kept
columns_to_keep = [c for c in X.columns if c not in no_keep]

X_clean = X.loc[:, columns_to_keep]
X_clean.columns

## Conversions Numpy 

In [None]:
# NOTE(review): after this cell X no longer exposes `.columns` / `.select_dtypes`,
# which other cells of this file call on X - run it only when those are not needed.
X = X.values # NumPy array (column names are lost)
Y = Y.tolist() # plain Python list

## Séparation en Entrainement & Test 

In [None]:
# Hold out 15 % of the rows as test set; random_state=0 makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
# stratify = Y # optional: keep the same class ratio as in Y in both splits

## Préprocessing Pipeline
### Moyenne quantitative & SVM

In [None]:
# Missing values are replaced by the column mean, then features are standardized
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()

# Both transformers learn their statistics on the training set only ...
X_train = scaler.fit_transform(imputer.fit_transform(X_train))

# ... and the test set is transformed with those already-fitted statistics
X_test = scaler.transform(imputer.transform(X_test))

### Médiane quantitative & plus fréquent qualitatif

In [None]:
# Numeric branch: median imputation followed by standardization.
# Columns are detected from the dtypes of X, so X must still be a DataFrame here.
numeric_features = X.select_dtypes(include=[np.number]).columns
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# the first level of each feature is dropped to avoid perfectly collinear columns.
categorical_features = X.select_dtypes(include="object").columns
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each group of columns to its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training set only, then apply the fitted transformation to the test set
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

### Encodage valeur cible qualitative (Random Forest)

In [None]:
# Encode the class labels as integers.
encoder = LabelEncoder()
# The label -> integer mapping is learned on the training labels only ...
Y_train = encoder.fit_transform(Y_train)
# ... and re-used for the test labels, so that Y_test lives in the same label space as
# the model's predictions when scores are computed later.
# `transform` raises if the test set contains a label never seen during fit.
Y_test = encoder.transform(Y_test)

### Imputation (KNN) pour Adaboost

In [None]:
# Fill missing values with a KNNImputer (default settings)
imputer = KNNImputer()

# Fit on the training set, then re-use the fitted imputer on the test set
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

## Training model

### Qualitatif (Classification)

In [None]:

model = LogisticRegression() # Instantiate the classifier (fitting happens in the "Training" cell)

### Quantitatif (Numérique)

In [None]:
model = LinearRegression() # Instantiate the regressor (fitting happens in the "Training" cell)

### Ridge model - Quantitatif (Numérique)

In [None]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score obtained with a Ridge model
# Announce the 3-fold cross-validation; this cell only instantiates the Ridge model,
# the cross_val_score call itself is in the "Performances evaluation" section.
print("3-fold cross-validation...")
model = Ridge()

# Alternatives
## model = Ridge(alpha = 10) # small alpha (likely to overfit)
## model = Ridge(alpha = 10000) # large alpha (likely to underfit)

### Lasso model - Quantitatif (Numérique)

In [None]:
# Lasso regressor with regularization strength alpha = 1
model = Lasso(alpha = 1)

# Alternatives with a weaker regularization
## model = Lasso(alpha = 0.01)
## model = Lasso(alpha = 0.0001)

### Decision trees

In [None]:
model = DecisionTreeClassifier() # Decision tree for classification, default hyperparameters

In [None]:
model = DecisionTreeRegressor(max_depth=3) # Decision tree for regression, depth capped at 3

### Random Forest

In [None]:
model = RandomForestClassifier() # Random forest classifier, default hyperparameters

### SVC

In [None]:
# Support vector classifier with an RBF kernel and class_weight='balanced'
model = SVC(kernel='rbf', class_weight='balanced')
# Alternative with a linear kernel:
# model = SVC(kernel="linear")

### XGBoost

In [None]:
model = XGBRegressor(max_depth=3) # Gradient-boosted regressor, trees capped at depth 3

### Adaboost

In [None]:
model = AdaBoostRegressor() # AdaBoost regressor, default hyperparameters

## Training

In [None]:

model.fit(X_train, Y_train) # Fit whichever `model` was instantiated above, on the training set only

## Predictions

### single prediction

In [None]:
Y_train_pred = model.predict(X_train) # Predictions on the training set
Y_test_pred = model.predict(X_test) # Predictions on the test set

### Ridge model : Grid search

In [None]:
# Candidate regularization strengths; alpha = 0 corresponds to no regularization
params = dict(alpha=[0.0, 0.1, 0.5, 1.0])

# Exhaustive search over `params`, each candidate evaluated with 3-fold cross-validation
gridsearch = GridSearchCV(estimator=model, param_grid=params, cv=3)
# grid = GridSearchCV(model, params, cv = 10, verbose = 1) # verbose: print progress while searching
gridsearch.fit(X_train, Y_train)

### Random Forest model : Grid search

In [None]:
# Grid of values to be tested
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gridsearch = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)

### SVC model : Grid search

In [None]:

# Values of C and gamma to combine
param_grid = {
    'C': [1, 5, 10, 50],
    'gamma': [0.0001, 0.0005, 0.001, 0.005],
}

# n_jobs=-1 runs the candidates on all available cores; verbose=2 logs each fit
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=2, n_jobs=-1)
grid.fit(X_train, Y_train)

### Adaboost : Grid search

In [None]:
# Number of boosting rounds and learning rates to combine
parameters = {
    'n_estimators': [50, 100, 150, 200],
    "learning_rate": [1.0, 0.5, 0.1],
}

# Search with GridSearchCV's default cross-validation settings
grid = GridSearchCV(estimator=model, param_grid=parameters)
grid.fit(X_train, Y_train)

### stacking classifier

In [None]:
# First-level (base) estimators
tree = DecisionTreeClassifier(max_depth=5)
knn = KNeighborsClassifier()
nb = GaussianNB()
logreg = LogisticRegression()
svc = SVC(kernel='rbf', probability=True)

# Stack them; the final estimator is left to StackingClassifier's default
st = StackingClassifier(
    estimators=[("tree", tree), ("knn", knn), ("nb", nb), ("logreg", logreg), ("svc", svc)]
)

# Outputs of the fitted base estimators on the training set, one column per estimator.
# NOTE(review): assumes each estimator contributes exactly one column (binary target) - confirm.
stacked_features = st.fit_transform(X_train, Y_train)
first_order_pred = pd.DataFrame(stacked_features, columns=st.named_estimators_.keys())
first_order_pred

## Performances evaluation

### Qualitatif (Classification)

In [None]:
# Share of correctly classified samples on each split
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print("Accuracy on training set : ", train_accuracy)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print("Accuracy on test set : ", test_accuracy)

### Quantitatif (Numérique)

In [None]:
# Coefficient of determination on each split
train_r2 = r2_score(Y_train, Y_train_pred)
print("R2 score on training set : ", train_r2)
test_r2 = r2_score(Y_test, Y_test_pred)
print("R2 score on test set : ", test_r2)

### Ridge model - Quantitatif (Numérique)

In [None]:

# One score per fold (3 folds), computed with the estimator's default scorer
scores = cross_val_score(estimator=model, X=X_train, y=Y_train, cv=3)

# Average and spread of the fold scores
mean_score = scores.mean()
print('The cross-validated R2-score (accuracy) is : ', mean_score)
std_score = scores.std()
print('The standard deviation is : ', std_score)

### Lasso model - Quantitatif (Numérique)

In [None]:

# Constant baseline: predict the mean of the test targets for every test sample.
# The prediction list must have one entry per *test* sample, otherwise r2_score raises
# on the length mismatch; np.mean also accepts a plain list (no `.values` needed).
# By construction this baseline has an R2 of 0 on the test set.
baseline_pred = [np.mean(Y_test)] * len(Y_test)
print("Score Baseline : ",  r2_score(Y_test, baseline_pred))
# Score of the fitted model on the training set
print("Score Lasso1 (accuracy) : ",  model.score(X_train, Y_train))

### Grid search

In [None]:
# Best combination found by the search and its mean cross-validated score
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score (accuracy): ", gridsearch.best_score_)

# Predict through the fitted search object on both splits
Y_train_pred = gridsearch.predict(X_train) # Predictions on the training set
Y_test_pred = gridsearch.predict(X_test) # Predictions on the test set

#### Grid search : Ridge & Lasso

In [None]:

# Estimator refitted by the grid search with the best hyperparameters
best_model = gridsearch.best_estimator_
print('Test score for the best model : ', best_model.score(X_test, Y_test))

# 10-fold cross-validation of that estimator on the training set
scores = cross_val_score(estimator=best_model, X=X_train, y=Y_train, cv=10)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

# Lasso: mean squared error on both splits
print("train Mean Squared Error (MSE)")
train_mse = mean_squared_error(Y_train, best_model.predict(X_train))
print(train_mse)
print("test Mean Squared Error (MSE)")
test_mse = mean_squared_error(Y_test, best_model.predict(X_test))
print(test_mse)

# Features whose coefficient is not exactly zero (requires X to still be a DataFrame)
print("Important Predictors")
nonzero_mask = best_model.coef_ != 0
df = pd.DataFrame({
    "Gene": X.columns[nonzero_mask],
    "Parameter": best_model.coef_[nonzero_mask],
})
print(df)

#### Grid search : Random Forest

In [None]:
# Accuracy on each split
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print("accuracy on training set : ", train_accuracy)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print("accuracy on test set : ", test_accuracy)

# F1-score on each split (default averaging of f1_score)
train_f1 = f1_score(Y_train, Y_train_pred)
print("f1-score on training set : ", train_f1)
test_f1 = f1_score(Y_test, Y_test_pred)
print("f1-score on test set : ", test_f1)

#### Grid search : SVM

In [None]:
# Test score of the model before the search. The value used to be computed and thrown
# away (only the last expression of a cell is displayed), so it is printed explicitly.
print("Test score of the initial model : ", model.score(X_test, Y_test))

# Best hyperparameters found by the search
print(grid.best_params_)

# Test score of the estimator refitted with those hyperparameters
best = grid.best_estimator_
print("Test score of the best estimator : ", best.score(X_test, Y_test))

# Per-class precision / recall / f1, labelled with the names from the faces dataset
Y_pred = best.predict(X_test)
cr = classification_report(Y_test,Y_pred, target_names= faces.target_names)
print(cr)

#### XGBoost & Adaboost

In [None]:

# Default score of the fitted model on each split
train_score = model.score(X_train, Y_train)
print("score XGBoost/Adaboost default train {}".format(train_score))
test_score = model.score(X_test, Y_test)
print("score XGBoost/Adaboost default test {}".format(test_score))

#### DecisionTreeRegressor

In [None]:

# Default score of the fitted tree on each split
train_score = model.score(X_train, Y_train)
print("score Tree max depth 3 train {}".format(train_score))
test_score = model.score(X_test, Y_test)
print("score Tree max depth 3 test {}".format(test_score))

#### Adaboost grid search

In [None]:
# The Adaboost grid search is stored in `grid` (see the "Adaboost : Grid search" cell);
# `gridsearch` belongs to the Ridge / Random Forest searches and would report another model.
best_adaboost = grid.best_estimator_
print("score Adaboost grid search train {}".format(best_adaboost.score(X_train, Y_train)))
print("score Adaboost grid search test {}".format(best_adaboost.score(X_test, Y_test)))

### stacking classifier

In [None]:

# Score of the stacked model on each split
stack_train_score = st.score(X_train, Y_train)
print("Score for the stacking classifier on the train set : {}".format(stack_train_score))
stack_test_score = st.score(X_test, Y_test)
print("Score for the stacking classifier on the test set : {}".format(stack_test_score))

# Base estimators compared individually against the stack (svc is not part of this comparison)
individual_models = [
    ("knn", knn),
    ("tree", tree),
    ("naive bayes", nb),
    ("logistic regression", logreg),
]

# Fit each of them on its own on the training set
for _, estimator in individual_models:
    estimator.fit(X_train, Y_train)

# Report train and test score for each of them
report_template = " the score for the {0} model on the train set is : \n {1} \n the score for the {0} model on the test set is : \n {2} \n"
for label, estimator in individual_models:
    print(report_template.format(label, estimator.score(X_train, Y_train), estimator.score(X_test, Y_test)))

## Coefficients des variables explicatives

In [None]:
# Coefficients learned by the fitted model (only available on models exposing `coef_`)
list_coeficients = model.coef_
# The string literal below is a disabled snippet, not executed: it would build a table
# comparing the coefficients of three models. `model2` and `model3` are not defined in
# the visible part of this file.
'''
# Tableau de comparaison
coef = pd.DataFrame()
coef['features'] = X.columns
coef['coef_linear_regressor'] = model.coef_
coef['coef_ridge_small_alpha'] = model2.coef_
coef['coef_ridge_large_alpha'] = model3.coef_
coef
'''

## Classement des variables explicatives

In [None]:
# Univariate F-test of every feature against the target: returns (f-scores, p-values)
feature_importance = f_regression(X_train, Y_train)

# Feature names = every dataset column except the target, wherever the target sits
# (slicing off the last column mislabels the features when the target is not last).
# NOTE(review): assumes X_train still has one column per original feature, in dataset order.
feature_names = dataset.columns.drop(target_name)

# Create DataFrame with feature importance: one row per statistic, one column per feature
feature_ranking = pd.DataFrame(columns=feature_names, data=feature_importance, index=["f-score", "p-value"])
# Reshape to one row per feature and sort by decreasing f-score
feature_ranking = feature_ranking.transpose().sort_values(["f-score", "p-value"], ascending=False)
# Turn the feature names (index) into a regular column
feature_ranking = feature_ranking.reset_index().rename(columns = {'index': 'feature'})

# Bar chart, re-sorted in ascending order for plotting
px.bar(feature_ranking.sort_values(["f-score", "p-value"]), x = 'f-score', y = 'feature')

## Visualize confusion matrices

In [None]:

# Only the axes object of the new figure is needed
ax = plt.subplots()[1]
# Title of the axes that ConfusionMatrixDisplay will draw on
ax.set_title("Confusion Matrix on Train set")
# Confusion matrix of `model` on the training set, drawn on those axes
ConfusionMatrixDisplay.from_estimator(model, X_train, Y_train, ax=ax)
plt.show()

## Visualize confusion matrices (SVM)

In [None]:

# Test-set predictions of the best estimator found by the grid search
y_pred = grid.best_estimator_.predict(X_test)
# Confusion matrix of true vs predicted labels
cm = confusion_matrix(Y_test, y_pred)
# Heatmap annotated with integer counts
sns.heatmap(cm, annot=True, fmt="d")

## Visualize ROC curves

In [None]:

_ , ax = plt.subplots() # Get a matplotlib axes to draw on
ax.set(title="ROC Curve on Train set") # Title of the axes that RocCurveDisplay will draw on
RocCurveDisplay.from_estimator(model, X_train, Y_train, ax=ax) # ROC curve of `model` on the training set
plt.show()

## Visualize the model

In [None]:
# Class probabilities predicted on the training set
Y_train_proba = model.predict_proba(X_train)

# Flattened feature values used as the x axis of every trace
# NOTE(review): presumably X_train holds a single feature here - confirm.
train_x = X_train.flatten().tolist()

# Observed targets, hard predictions and probability of the second class on one figure
fig = px.scatter(x=train_x, y=Y_train, title="training set")
fig.add_trace(go.Scatter(x=train_x, y=Y_train_pred, name="predictions"))
fig.add_trace(go.Scatter(x=train_x, y=Y_train_proba[:, 1], name="probabilities"))
fig.show()

In [None]:
# Class probabilities predicted on the test set
Y_test_proba = model.predict_proba(X_test)

# Flattened feature values used as the x axis of every trace
# NOTE(review): presumably X_test holds a single feature here - confirm.
test_x = X_test.flatten().tolist()

# Observed targets, hard predictions and probability of the second class on one figure
fig = px.scatter(x = test_x, y = Y_test, title = "test set")
fig.add_trace(go.Scatter(x = test_x, y = Y_test_pred, name = "predictions"))
# Legend label spelled like on the training-set figure ("probabilities")
fig.add_trace(go.Scatter(x = test_x, y = Y_test_proba[:,1], name = "probabilities"))
fig.show()

### Graph comparaison Train/Test Qualitatif (Classification)

In [None]:
# Partie préparative
df = pd.DataFrame(X_train, columns = ['Col_1', 'Col_2'])
df["proba"] = model.predict_proba(X_train)[:,1]
df["y"] = Y_train.tolist()
df.sort_values(by="proba", ascending=False)

X = df.iloc[:,:2].values
Y = df.iloc[:,-1].values

# Plotting decision regions
plot_decision_regions(X, Y, clf=model, legend=2)

# Adding axes annotations
plt.xlabel('Col_1')
plt.ylabel('Col_2')
plt.title('Logistic Regression Decision Boundary')
plt.show()

## Visualisation de la corrélation

In [None]:
# Red/blue diverging colour map for the heatmap
colormap = plt.cm.RdBu

plt.figure(figsize=(14, 12))
plt.title('Correlation of Features', y=1.05, size=15)

# Pairwise correlations between the columns of the training features
train_corr = pd.DataFrame(X_train).corr()
sns.heatmap(
    train_corr,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
)