# Projet Big Data
Théo Dupuis - Clement Frade - Yann Moulaire

## Préparations

### Importation des données via Boto3

In [None]:
import os
import tempfile

import boto3
import pandas as pd

# Fetch the two project CSV files from the 'pbd-cty' S3 bucket and load them
# into the `predict` and `train` DataFrames.
s3 = boto3.resource('s3')
bucket = s3.Bucket('pbd-cty')

# Download into a throwaway directory rather than the working directory: an
# existing local predict.csv / train.csv is then neither overwritten nor
# deleted, and the downloaded copies are cleaned up even when a download or
# the CSV parsing raises.
with tempfile.TemporaryDirectory() as download_dir:
    predict_path = os.path.join(download_dir, 'predict.csv')
    train_path = os.path.join(download_dir, 'train.csv')

    s3.Object(bucket.name, 'predict.csv').download_file(predict_path)
    s3.Object(bucket.name, 'train.csv').download_file(train_path)

    # Both files are parsed while the temporary directory still exists.
    predict = pd.read_csv(predict_path)
    train = pd.read_csv(train_path)

In [None]:
import pandas as pd
import os

# Load the prediction set and the training set from CSV files located in the
# current working directory.
predict = pd.read_csv(filepath_or_buffer="predict.csv")
train = pd.read_csv(filepath_or_buffer="train.csv")

### Division de la base en entraînement et en test

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Prepare the modelling data: fill missing values, label-encode four columns,
# split into train/test, then standardise and project with PCA.

# Every missing value, in every column, is replaced with 0.
train = train.fillna(0)

# Label-encode these columns (cast to str first). The same encoder object is
# refitted for each column, so after the loop `le` only holds the mapping of
# the last one.
le = preprocessing.LabelEncoder()
for column in ("Product_Info_2", "InsuredInfo_7", "InsuredInfo_8", "InsuredInfo_9"):
    train[column] = le.fit_transform(train[column].astype(str))

features = train.drop(columns='Response')
y = train.Response

# Split BEFORE fitting the scaler and the PCA. Fitting them on the full data
# set would let statistics of the held-out rows leak into the preprocessing
# and make the test scores optimistic.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=1
)

# Both transformers are fitted on the training rows only and then applied
# unchanged to the test rows.
scaler = StandardScaler()
# A float n_components asks scikit-learn's PCA to keep just enough components
# to explain that fraction of the variance (here 20%).
# NOTE(review): 20% is a very aggressive reduction - confirm it is intended.
pca = PCA(.2)
X_train = pca.fit_transform(scaler.fit_transform(features_train))
X_test = pca.transform(scaler.transform(features_test))

# Whole data set projected with the train-fitted transformers, kept under the
# name `X` so that any code expecting the projected matrix still finds it.
X = pca.transform(scaler.transform(features))

# Test-set scores appended by the model cells that follow.
scores = []

### Préparation du calcul des scores

In [None]:
from sklearn.metrics import r2_score

def score(model, X_test, y_test):
    """Return the R² of the model's rounded predictions against the true values.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True target values the predictions are compared with.

    Returns:
        The result of ``r2_score`` computed on ``y_test`` and the predictions
        after rounding them to the nearest integer.
    """
    predicted = model.predict(X_test)
    rounded = predicted.round()
    return r2_score(y_test, rounded)

## Algorithmes

Les différents algorithmes mis en place sont testés afin de déterminer lequel répond le mieux à nos attentes. Plutôt que de mesurer, classiquement, la précision en vérifiant si chaque prédiction tombe exactement dans la bonne classe (entre 1 et 8), nous prenons en compte l'écart entre la valeur prédite et la valeur attendue. En effet, prédire un risque de 2 alors que l'on attendait 3 est une erreur peu grave ; en revanche, prédire 8 alors que la valeur attendue était 2 est une erreur importante.

### K plus proches voisins

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor for several neighbourhood sizes, print
# the test and train scores of each, and record the test score in `scores`.
#
# (n_neighbors, printed label) pairs, in evaluation order. The labels are kept
# exactly as they were printed before, so the cell output is unchanged.
knn_runs = [
    (5, "KNN score avec 5 voisins: "),
    (25, "KNN score avec 25 voisins: "),
    (1, "KNN score avec 1 seul voisin: "),
    (100, "KNN score avec 100 voisins: "),
    (200, "KNN score avec 200 voisins: "),
    (50, "KNN score avec 50  voisins: "),
]

for n_neighbors, label in knn_runs:
    KNN = KNeighborsRegressor(n_neighbors=n_neighbors)
    KNN.fit(X_train, y_train)
    # Score each split once: every call to score() runs a full k-NN
    # prediction, so the test score is reused for printing and recording.
    test_score = score(KNN, X_test, y_test)
    train_score = score(KNN, X_train, y_train)
    print(label, test_score, "score de train", train_score)
    scores.append(test_score)


### Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression for several regularisation settings, print the
# test and train scores of each, and record the test score in `scores`.
#
# (printed label, constructor keyword arguments) pairs, in evaluation order.
# The first run leaves C at the estimator's default and uses fewer iterations.
logistic_runs = [
    ("Linear model avec C=1 score: ", {"max_iter": 500}),
    ("Linear model avec C=0.2 score: ", {"C": 0.2, "max_iter": 1000}),
    ("Linear model avec C=5 score: ", {"C": 5, "max_iter": 1000}),
]

for label, params in logistic_runs:
    Linear = LogisticRegression(solver='lbfgs', **params)
    Linear.fit(X_train, y_train)
    # Compute each score once and reuse the test score for the record.
    test_score = score(Linear, X_test, y_test)
    train_score = score(Linear, X_train, y_train)
    print(label, test_score, "score de train", train_score)
    scores.append(test_score)


### Arbre de décision

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor for a range of min_samples_split values, print
# the test and train scores of each, and record the test score in `scores`.
#
# Three hand-picked values first, then a sweep from 700 to 975 in steps of 25.
for min_split in [2, 50, 100, *range(700, 1000, 25)]:
    Tree = DecisionTreeRegressor(min_samples_split=min_split)
    Tree.fit(X_train, y_train)
    # Compute each score once and reuse the test score for the record.
    test_score = score(Tree, X_test, y_test)
    train_score = score(Tree, X_train, y_train)
    print("Decision tree avec min_samples_split=", min_split, " score: ", test_score, "score de train", train_score)
    scores.append(test_score)



### Machine à vecteurs de support

In [None]:
# Support Vector Machine
from sklearn.svm import LinearSVC

# Fit a linear SVM classifier for several values of C, print the test and
# train scores of each, and record the test score in `scores`.
#
# The first run uses the estimator's default C. Each printed line now shows
# the C actually used (read back from the estimator) and labels the train
# score, so the three runs can be told apart and the output matches the
# format of the other model cells.
for params in ({}, {"C": 0.1}, {"C": 10}):
    svm = LinearSVC(**params)
    svm.fit(X_train, y_train)
    # Compute each score once and reuse the test score for the record.
    test_score = score(svm, X_test, y_test)
    train_score = score(svm, X_train, y_train)
    print("SVM avec C=", svm.C, " score: ", test_score, "score de train", train_score)
    scores.append(test_score)


### Perceptron multicouche

In [None]:
from sklearn.neural_network import MLPClassifier

# Train one multi-layer perceptron classifier per solver and report, for each,
# the train accuracy, the test accuracy and the rounded-prediction R² score.
nn_adam = MLPClassifier(solver="adam", hidden_layer_sizes=(100,50), max_iter=3000)
nn_sgd = MLPClassifier(solver="sgd", max_iter=3000)
nn_lbfgs = MLPClassifier(solver="lbfgs")

# Networks are fitted and reported one after the other, in this order.
for solver_title, network in (("Adam", nn_adam), ("SGD", nn_sgd), ("LBFGS", nn_lbfgs)):
    network.fit(X_train, y_train)
    print(solver_title)
    # MLPClassifier.score is the classification accuracy; score() is the
    # project's own R² helper.
    print("  Train accuracy:", network.score(X_train, y_train))
    print("  Test  accuracy:", network.score(X_test, y_test))
    print("score:", score(network, X_test, y_test))

In [None]:
# Save Model Using joblib
import pandas
from sklearn import model_selection
from sklearn.neighbors import KNeighborsRegressor
import joblib

# Train the estimator that gets persisted: a k-NN regressor with 50
# neighbours, fitted on the training split only.
# NOTE(review): X_train was scaled and PCA-projected in an earlier cell, so
# the saved model presumably needs the same fitted scaler and PCA at
# prediction time. Neither is written to disk here - confirm how consumers of
# the file handle that.
filename = 'finalized_model.sav'
KNN = KNeighborsRegressor(n_neighbors=50)
model = KNN
model.fit(X_train, y_train)
# Serialise the fitted model to `filename` with joblib.
joblib.dump(model, filename)