In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read the training CSV, remove the columns listed in TRAIN_DROP_COLS and
# preview the first rows of the resulting frame.
TRAIN_DROP_COLS = [
    'moyeneGeneral',
    'Note1', 'Note2', 'Note3', 'Note4',
    'Note5', 'Note6', 'Note7', 'Note8',
    'Nom',
    'ID',
]
data = pd.read_csv("C:/Users/hp/Downloads/train_set.csv/train_set.csv").drop(columns=TRAIN_DROP_COLS)
data.head()

Unnamed: 0.1,Unnamed: 0,"Serie,x",GroupeAnonymat,LieuNaissance,Centre,Etablissement,Decision,Willaya,moughataa,DateNaissance
0,27634,SN,SN09,Bathet meit,Lycée Jedida,Lycée Jedida,Ajourné,Nouakchott 2 (Ouest),KSAR,12 ديسمبر 1998
1,36311,LO,LO1,Beir Taouress,Lycée Arafat 2 (A),Lycée Arafat 2,Ajourné,Nouakchott 3 (Sud),ARAFAT,26 ديسمبر 1991
2,21059,LM,LM3,Akjoujt,Lycée Akjoujt,Lycée Akjoujt,Ajourné,Inchiri,AKJOUJT,8 مايو 1995
3,24532,SN,SN06,Chelkhet Tiyab,Lycée Toujounine 1,Lycée Toujounine 1,Ajourné,Nouakchott 1 (Nord),TOUJOUNINE,24 avr. 1999
4,17861,SN,SN03,Aere M'Bar,Lycée Nouadhibou 2,Lycée Wava Nouadhibou,Ajourné,Dakhlet NDB,NOUADHIBOU,10 mai 2001


## Preprocessing

In [3]:
### Add an 'age' column derived from the 'DateNaissance' string
def _birth_year(date_text):
    """Return the third field of the date string as an int.

    The string is split on '-' when it contains one, otherwise on single spaces.
    """
    separator = "-" if '-' in date_text else " "
    return int(date_text.split(separator)[2])


def _age_from_year(year):
    """Convert a birth year into an age relative to 2021.

    Values above 1900 are treated as full years (2021 - year). Smaller values
    are treated as two-digit years: above 21 -> 121 - year, otherwise 21 - year.
    """
    if year > 1900:
        return 2021 - year
    if year > 21:
        return 121 - year
    return 21 - year


data['age'] = data['DateNaissance'].apply(_birth_year).apply(_age_from_year)

In [8]:
#data=data.drop('DateNaissance',axis=1)

In [4]:
# Remove the 'Unnamed: 0' column from the working frame.
data = data.drop(columns='Unnamed: 0')

In [5]:
# Map every decision label listed below onto the single label 'Ajourné'.
labels_to_merge = [
    'Ajourné',
    'Sessionnaire',
    'Abscent',
    'Examen annulé à cause du Téléphone',
    'Examen annulé à cause du comportement',
]
data['Decision'] = data['Decision'].replace(to_replace=labels_to_merge, value='Ajourné')

In [6]:
### Encode the target variable as an explicit binary flag
# 1 = 'Ajourné', 0 = any other decision label.
# The previous form assigned the DataFrame returned by
# pd.get_dummies(..., drop_first=True) to a single column. That only works
# when exactly two labels remain, picks the positive class by alphabetical
# order, and yields bool instead of uint8 on recent pandas versions.
# The numeric-dtype guard keeps the cell safe to re-run: once encoded, the
# column is left untouched instead of being turned into all zeros.
if not pd.api.types.is_numeric_dtype(data['Decision']):
    data['Decision'] = (data['Decision'] == 'Ajourné').astype('uint8')
data.head()

Unnamed: 0,"Serie,x",GroupeAnonymat,LieuNaissance,Centre,Etablissement,Decision,Willaya,moughataa,DateNaissance,age
0,SN,SN09,Bathet meit,Lycée Jedida,Lycée Jedida,1,Nouakchott 2 (Ouest),KSAR,12 ديسمبر 1998,23
1,LO,LO1,Beir Taouress,Lycée Arafat 2 (A),Lycée Arafat 2,1,Nouakchott 3 (Sud),ARAFAT,26 ديسمبر 1991,30
2,LM,LM3,Akjoujt,Lycée Akjoujt,Lycée Akjoujt,1,Inchiri,AKJOUJT,8 مايو 1995,26
3,SN,SN06,Chelkhet Tiyab,Lycée Toujounine 1,Lycée Toujounine 1,1,Nouakchott 1 (Nord),TOUJOUNINE,24 avr. 1999,22
4,SN,SN03,Aere M'Bar,Lycée Nouadhibou 2,Lycée Wava Nouadhibou,1,Dakhlet NDB,NOUADHIBOU,10 mai 2001,20


In [13]:
# Preview the first rows of the working frame.
data.head()

Unnamed: 0,"Serie,x",GroupeAnonymat,LieuNaissance,Centre,Etablissement,Decision,Willaya,moughataa,age
0,SN,SN09,Bathet meit,Lycée Jedida,Lycée Jedida,1,Nouakchott 2 (Ouest),KSAR,23
1,LO,LO1,Beir Taouress,Lycée Arafat 2 (A),Lycée Arafat 2,1,Nouakchott 3 (Sud),ARAFAT,30
2,LM,LM3,Akjoujt,Lycée Akjoujt,Lycée Akjoujt,1,Inchiri,AKJOUJT,26
3,SN,SN06,Chelkhet Tiyab,Lycée Toujounine 1,Lycée Toujounine 1,1,Nouakchott 1 (Nord),TOUJOUNINE,22
4,SN,SN03,Aere M'Bar,Lycée Nouadhibou 2,Lycée Wava Nouadhibou,1,Dakhlet NDB,NOUADHIBOU,20


## Préparer les données pour le modèle

In [7]:
# Separate the target column 'Decision' (y) from the remaining columns (X).
y = data['Decision']
X = data.drop(columns=['Decision'])


In [8]:
# Hold out 20% of the rows as a test split, stratified on the target and
# seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# Column names handed to the preprocessing step: categorical columns first,
# then the single numeric column.
cat_cols = [
    'Serie,x',
    'GroupeAnonymat',
    'LieuNaissance',
    'Centre',
    'Etablissement',
    'Willaya',
    'moughataa',
]
num_cols = ['age']

In [None]:
# Data preprocessing: scale the numeric columns and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time).
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, then reuse the fitted transformer for the
# test split. NOTE: both names are overwritten with the transformed matrices,
# so this cell cannot be re-run without re-running the split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


## Implémenter le modèle

In [13]:
# Decision tree tuned by grid search over depth and the number of features
# considered at each split.
# random_state is fixed because max_features is smaller than the number of
# input features, so each split draws a random feature subset. Without a seed
# the selected model and every downstream score change from run to run.
classifier = DecisionTreeClassifier(random_state=42)

params={
    'max_depth':[4,6,8,10,12],
    'max_features':[2,3,4,5,6]
}

# GridSearchCV is used with its defaults for cross-validation and scoring,
# and is fitted on the preprocessed training split.
best_classifier=GridSearchCV(classifier,params)
best_classifier.fit(X_train,y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [4, 6, 8, 10, 12],
                         'max_features': [2, 3, 4, 5, 6]})

In [14]:
# Predict class labels for the held-out test split
y_pred = best_classifier.predict(X_test)

In [15]:
# Display the predicted labels for the test split
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=uint8)

In [16]:
# Evaluate the model on the held-out test split: one printed line per metric.
metric_table = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
)
for label, metric in metric_table:
    print(label, metric(y_test, y_pred))

Accuracy: 0.9175559740531491
Precision: 0.9175387191293428
Recall: 1.0
F1 score: 0.9569962890198647


## Tester le modèle sur les données de test

In [17]:
### Load the test set (semicolon-separated) and drop the 'Unnamed: 0' and
### 'prediction' columns
new = (
    pd.read_csv("C:/Users/hp/Downloads/test_set.csv/test_set.csv", sep=";")
    .drop(columns='Unnamed: 0')
    .drop(columns='prediction')
)

In [18]:
# Display the loaded test frame (after dropping 'Unnamed: 0' and 'prediction')
new

Unnamed: 0,Nom,"Serie,x",GroupeAnonymat,LieuNaissance,Centre,Etablissement,Willaya,moughataa,DateNaissance,ID
0,Mohamed Hama Ahmed Lely,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,15 يوليو 2001,ID_6
1,Fatimatou Mohamed Sid Abbe,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,12 août 1999,ID_15
2,El Alye Cheikh Ahmed Maihmatt,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,18 sept. 1999,ID_18
3,Haine Mohamed Lemin Telba,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,24 août 1998,ID_19
4,Aichata Sid'Ahmed Ahmed Ely,SN,SN09,El Megve,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,31-mai-95,ID_20
...,...,...,...,...,...,...,...,...,...,...
15925,Djeinaba Mountaga Dia,SN,SN08,Riyad,Lycée El Mina 3,Patriote El Mina,Nouakchott 3 (Sud),EL MINA,5 سبتمبر 2002,ID_39814
15926,Magatt Daouda Diaw,SN,SN08,Rosso,Lycée El Mina 3,Patriote El Mina,Nouakchott 3 (Sud),EL MINA,11 ديسمبر 1998,ID_39818
15927,Zeinebou Tourad Messoud,LM,LM4,Sebkha,Lycée El Mina 4,Patriote El Mina,Nouakchott 3 (Sud),EL MINA,2 janv. 2000,ID_39819
15928,Hajeratou Moussa Diallo,SN,SN08,Blajmil,Lycée El Mina 3,Patriote El Mina,Nouakchott 3 (Sud),EL MINA,30 ديسمبر 1997,ID_39821


In [19]:
# Working copy of the test frame without the 'Nom' column.
new1 = new.drop(columns=['Nom'])


In [None]:
# The fitted preprocessor scales the numeric column 'age' (see num_cols), but
# the test frame only carries the raw 'DateNaissance' string, so transform()
# would fail on the missing column. Derive 'age' with the same rule used for
# the training data (third date field -> age relative to 2021) before
# transforming. The guard keeps the cell safe to re-run.
if 'age' not in new1.columns:
    new1['age'] = new1['DateNaissance'].apply(
        lambda x: int(x.split("-")[2] if '-' in x else x.split(" ")[2])
    ).apply(lambda x: 2021 - x if x > 1900 else (121 - x if x > 21 else 21 - x))
X_test2 = preprocessor.transform(new1)

In [None]:
# Predict on the preprocessed matrix: the classifier was fitted on the output
# of `preprocessor`, so it cannot consume the raw DataFrame `new1`.
y_pred2 = best_classifier.predict(X_test2)

In [136]:
# Display the predicted labels for the external test set
y_pred2

array([1, 1, 1, ..., 1, 1, 1], dtype=uint8)

In [137]:
# Tabulate how many rows received each predicted label
g=pd.DataFrame(y_pred2)
g.value_counts()

1    15915
0       15
dtype: int64

In [138]:
# Class-probability estimates for the preprocessed external test matrix
proba=best_classifier.predict_proba(X_test2)

In [139]:
# Display the probability matrix (one row per sample, one column per class)
proba

array([[0.08198585, 0.91801415],
       [0.08198585, 0.91801415],
       [0.08198585, 0.91801415],
       ...,
       [0.09617058, 0.90382942],
       [0.08198585, 0.91801415],
       [0.03215434, 0.96784566]])

In [140]:
# Keep column 0 of the probability matrix, i.e. the probability of the first
# entry of best_classifier.classes_.
# NOTE(review): with the target encoding displayed earlier ('Ajourné' shown
# as 1) this is presumably the probability of the non-'Ajourné' outcome —
# confirm this is the value the submission is supposed to contain.
pre=proba[:, 0]
len(pre)

15930

In [144]:
# Preview the test frame.
# NOTE(review): the execution count shows this cell was last run after the
# cell that adds 'predicted2', although it appears before it in the notebook.
new1.head()

Unnamed: 0,ID,"Serie,x",GroupeAnonymat,LieuNaissance,Centre,Etablissement,Willaya,moughataa,age,predicted,predicted2
0,ID_15,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,22,0.076045,0.081986
1,ID_22,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,18,0.076045,0.081986
2,ID_23,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,18,0.076045,0.081986
3,ID_29,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,21,0.076045,0.081986
4,ID_34,SN,SN09,Vassala,Lycée Vassala,Lycée Vassala,Hod Charghy,BASSEKNOU,21,0.076045,0.081986


In [141]:
# Store the selected probability column on the test frame as 'predicted2'
new1['predicted2']=pre

In [142]:
# Keep only the identifier and the probability column for the submission
submission=new1[['ID','predicted2']]

In [143]:
# Display the submission frame
submission

Unnamed: 0,ID,predicted2
0,ID_15,0.081986
1,ID_22,0.081986
2,ID_23,0.081986
3,ID_29,0.081986
4,ID_34,0.081986
...,...,...
15925,ID_13569,0.017544
15926,ID_31670,0.081986
15927,ID_31759,0.096171
15928,ID_5855,0.081986


In [145]:
# Write the submission to 'DecisionTree1.csv' without the index column
submission.to_csv('DecisionTree1.csv',index=False)