In [2]:
import ast
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from mlxtend.plotting import plot_decision_regions

import numpy as np

import pandas as pd
# pd.options.display.max_columns = None
# pd.set_option('display.max_rows', 500)

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

import seaborn as sns

import re

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn.impute import( KNNImputer, SimpleImputer )
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import( LogisticRegression, LinearRegression, Ridge, Lasso )
from sklearn.model_selection import( cross_val_score, GridSearchCV, StratifiedShuffleSplit, train_test_split )
from sklearn.metrics import( accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, 
                             f1_score, mean_squared_error,plot_confusion_matrix, r2_score, RocCurveDisplay, )
                             
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import ( DecisionTreeClassifier, DecisionTreeRegressor )
from sklearn import tree
from sklearn.utils import resample



In [3]:
# Load the raw sessions that the saved model will score.
dataset = pd.read_csv('src/conversion_data_test.csv')

# Recode the new_user flag as 'Yes' / 'No' labels.
# The comparison is done numerically rather than against the string '1': if the
# column is parsed as float (e.g. because it contains missing values) its string
# form is '1.0', and a string match would silently label every row 'No'.
# Any value that is not 1 (including missing values) is labelled 'No', as before.
is_new_user = pd.to_numeric(dataset['new_user'], errors='coerce').eq(1)
dataset['new_user'] = is_new_user.map({True: 'Yes', False: 'No'})

# Work on independent copies so that columns added to `dataset` later in the
# notebook (such as the predictions) never leak into the feature frames when a
# cell is re-run.
X = dataset.copy()
X_test = dataset.copy()

## Preprocessing Pipeline

In [4]:
# Preprocessing applied to the scoring data before it is handed to the model.
# NOTE(review): the preprocessor is fitted on the scoring data itself, so the
# imputation values, scaling statistics and one-hot categories come from this
# file. Presumably the model was trained with a preprocessor fitted on the
# training set - confirm, and if that fitted preprocessor was persisted, load it
# and call .transform() here instead of .fit_transform().

# Numeric columns: detected from the dtypes of X.
numeric_features = X.select_dtypes([np.number]).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values with the column median
    ('scaler', StandardScaler())
])

# Categorical columns: every object-dtype column of X.
categorical_features = X.select_dtypes("object").columns
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing values with the most frequent level
    ('encoder', OneHotEncoder(drop='first'))  # drop the first level so the dummy columns are not perfectly collinear
    ])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build the model input from the feature frame X instead of re-assigning X_test
# from itself: the cell can then be re-run without feeding an already-transformed
# array back into the column-name based transformer.
X_test = preprocessor.fit_transform(X)

## Apply Model 

In [5]:

# Load the previously saved estimator from disk.
# NOTE(review): joblib.load unpickles the file and can therefore execute arbitrary
# code - only load model artifacts that come from a trusted source.
model = joblib.load('src/model.joblib')
# Predict from the preprocessed matrix produced by the preprocessing cell.
Y_pred = model.predict(X_test)


# Store the predictions on the loaded frame as the 'converted' column
# (this mutates `dataset` in place).
dataset['converted'] = Y_pred

In [6]:
# Keep only the rows whose predicted label is 1.
converted = dataset.loc[dataset['converted'].eq(1)]

In [9]:
# Sanity check: column dtypes first, then the first rows of the filtered frame.
print(converted.dtypes, converted.head(), sep="\n")

country                object
age                     int64
new_user               object
source                 object
total_pages_visited     int64
converted               int64
dtype: object
    country  age new_user  source  total_pages_visited  converted
0        UK   28       No     Seo                   16          1
22       US   19       No  Direct                   14          1
41       US   17       No     Seo                   17          1
43       US   43      Yes     Ads                   16          1
190      US   31      Yes  Direct                   15          1


## Explore Predicted Converted Users

In [10]:
# Draw one chart per explanatory column of the predicted converters.
dataset_plot = converted
target = 'converted'
explain_values = dataset_plot.drop(columns=[target])

for column in explain_values.columns:
    plot_title = f"{column.replace('_', ' ')}"

    if dataset_plot[column].dtypes != "object":
        # Quantitative (non-object) column: bar chart of the summed target per distinct value.
        cat_data = dataset_plot.groupby(column)[target].sum()
        fig = px.bar(x=cat_data.index, y=cat_data, labels=dict(x="", y=""))
        fig.update_layout(title=plot_title)
    else:
        # Qualitative (object) column: histogram of the category counts, legend hidden.
        fig = px.histogram(dataset_plot[column])
        fig.update_layout(title=plot_title, showlegend=False)

    fig.show()