In [34]:
import numpy as np

**Chargement des données**

In [35]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive
# so that files stored under MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import pandas as pd

# Location of the raw CSV inside the mounted Google Drive.
csv_path = '/content/drive/MyDrive/205e1808-6-dataset/data.csv'

# Load the whole file into a DataFrame and report (rows, columns).
dataset = pd.read_csv(csv_path)
print(dataset.shape)


(45200, 10)


### **Conversion de la colonne 'datetime'**


In [37]:
# Parse the raw 'datetime' column once and reuse the parsed Series below.
timestamps = pd.to_datetime(dataset['datetime'])
dataset['datetime'] = timestamps

# Derive two calendar features: hour of day and day of week
# (pandas numbers the days with Monday = 0).
dataset['hour'] = timestamps.dt.hour
dataset['day_of_week'] = timestamps.dt.dayofweek
print(dataset.shape)

(45200, 12)


### **Séparation des caractéristiques (X) et de la variable cible (y)**

In [38]:
# Select the model inputs and the target by column name instead of by
# position, so the selection is self-describing and does not silently pick
# the wrong columns if the CSV layout changes.
# 'ID' and the raw 'datetime' column are deliberately left out of the features.
FEATURE_COLUMNS = [
    'siteid', 'offerid', 'category', 'merchant', 'countrycode',
    'browserid', 'devid', 'hour', 'day_of_week',
]
TARGET_COLUMN = 'click'

# X is a 2-D NumPy array whose column order follows FEATURE_COLUMNS;
# y is the matching 1-D vector of labels.
X = dataset[FEATURE_COLUMNS].values
y = dataset[TARGET_COLUMN].values
print(X.shape)

(45200, 9)


### **Affichage du nombre de valeurs manquantes pour chaque variable**


In [39]:
# Count the missing entries in every column of the DataFrame.
missing_per_column = dataset.isna().sum()
print(missing_per_column)

# Preview the feature matrix (NumPy abbreviates large arrays when printing).
print(X)

ID                0
datetime          0
siteid         4518
offerid           0
category          0
merchant          0
countrycode       0
browserid      2345
devid          6864
click             0
hour              0
day_of_week       0
dtype: int64
[[6310005.0 99217 41706 ... nan 19 2]
 [nan 287164 89522 ... 'Tablet' 15 5]
 [2463708.0 948989 12052 ... nan 10 4]
 ...
 [2656998.0 324233 48430 ... nan 21 6]
 [nan 908599 904 ... 'Tablet' 7 0]
 [9223301.0 487571 41640 ... 'Mobile' 13 1]]


## **Remplacement des valeurs manquantes**


In [40]:
from sklearn.impute import SimpleImputer

# Column positions inside X (X was built from dataset columns 2-8, 10, 11):
#   0 = siteid, 5 = browserid, 6 = devid -- the three columns with missing values.
# NOTE(review): both imputers are fitted on the full X, i.e. before the
# train/test split; consider fitting on the training part only.

# browserid / devid: replace each NaN with the column's most frequent value.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, [5, 6]] = imputer.fit_transform(X[:, [5, 6]])

# siteid: mark missing ids with the sentinel value -1.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)
X[:, [0]] = imputer.fit_transform(X[:, [0]])



### **Encodage des colonnes catégorielles**

In [41]:
from sklearn.preprocessing import LabelEncoder

# Positions (within X) of the features that get integer-encoded:
#   4 = countrycode, 5 = browserid, 6 = devid, 7 = hour, 8 = day_of_week.
categorical_columns = [4, 5, 6, 7, 8]

# A single encoder object is re-fitted for each column in turn, so once the
# loop finishes it only remembers the mapping of the last column.
label_encoder = LabelEncoder()
for col_idx in categorical_columns:
    encoded_values = label_encoder.fit_transform(X[:, col_idx])
    X[:, col_idx] = encoded_values

### **Séparation du jeu de données**

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it each execution produced a different split and therefore
# slightly different metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### **Standardisation des caractéristiques**

In [51]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both partitions.
scaler = StandardScaler(with_mean=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): this counts NaNs in the original DataFrame, not in X; the
# imputation step wrote into X only, so the counts here are unchanged.
print(dataset.isnull().sum())

ID                0
datetime          0
siteid         4518
offerid           0
category          0
merchant          0
countrycode       0
browserid      2345
devid          6864
click             0
hour              0
day_of_week       0
dtype: int64


In [52]:
from sklearn.ensemble import RandomForestClassifier

# Train a 150-tree random forest on the scaled training data.
# random_state fixes the bootstrap samples and feature sub-sampling so that
# re-running the notebook yields the same model and the same metrics.
classifier = RandomForestClassifier(n_estimators=150, random_state=42)
classifier.fit(X_train, y_train)

In [53]:
# Predict a class label for every row of the held-out test set.
y_pred = classifier.predict(X_test)

In [54]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.900995575221239


In [55]:
from sklearn.metrics import f1_score

# F1 score (harmonic mean of precision and recall) on the test set.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)


0.888550779764169


In [56]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[4657  384]
 [ 495 3504]]


In [49]:
from sklearn.metrics import f1_score

# NOTE(review): this repeats the earlier F1 computation and carries an older
# execution count; keep only one of the two cells.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")

F1 Score: 0.8877900342335489


In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Class balance of each partition: number of positive and negative labels.
for heading, labels in (("Training Data:", y_train), ("\nTesting Data:", y_test)):
    print(heading)
    print("Number of '1's:", (labels == 1).sum())
    print("Number of '0's:", (labels == 0).sum())

# Confusion matrix of the test predictions (rows = true class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Fraction of each true class that was predicted correctly: the diagonal
# entry divided by its row total. This is the per-class recall, although the
# printed labels call it "accuracy".
row_totals = conf_matrix.sum(axis=1)
accuracy_0 = conf_matrix[0, 0] / row_totals[0]
accuracy_1 = conf_matrix[1, 1] / row_totals[1]

print("\nAccuracy for '0':", accuracy_0)
print("Accuracy for '1':", accuracy_1)


Training Data:
Number of '1's: 16001
Number of '0's: 20159

Testing Data:
Number of '1's: 3999
Number of '0's: 5041

Confusion Matrix:
[[4654  387]
 [ 498 3501]]

Accuracy for '0': 0.9232295179527872
Accuracy for '1': 0.8754688672168042
