# Part 3: Machine learning

## Regression to see if there is a link between price and visiting time

In [49]:
# Common imports
import numpy as np
import os
import pandas as pd

# To plot pretty figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
import matplotlib.pyplot as plt
#%matplotlib notebook
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings = lambda *a, **kw: None

# to make this notebook's output identical at every run
np.random.seed(42)

In [None]:
import sklearn

Load data

In [50]:
df=pd.read_excel("webscraping.xlsx")
df.head()

Unnamed: 0,Name,Type,City,Visiting time,Adults,Children,Latitude,Longitude
0,Récréalle Centre Récréatif,recreational,Alle-sur-Semois,Not Available,,,49.842121,4.972678
1,Château de Jehay,cultural,Amay,1h,5.0,3.0,50.549776,5.324099
2,Espace muséal d'Andenne - Le Phare,cultural,Andenne,1h30 à 2h00,8.0,0.0,50.489398,5.096547
3,Les Jardins d'eau d'Annevoie,natural,Annevoie,1h30,10.0,6.0,50.341611,4.838456
4,Dinant Evasion - Lesse kayaks,recreational,Anseremme (Dinant),5h (21 km) • 2h30 (12 km) • 2h00 (9 km),,,50.23791,4.907923


#### Clean the column visiting time 

In [51]:
import numpy as np
import re

# Function to extract numeric values from a string
def extract_numeric_values(string):
    numeric_values = re.findall(r'\d+', string)
    return [int(value) for value in numeric_values]

# Replace invalid values with NaN
df['Visiting time'] = df['Visiting time'].replace('Not Available', np.nan)

# Convert values to string type
df['Visiting time'] = df['Visiting time'].astype(str)

# Extract numeric values from string values
df['Visiting time'] = df['Visiting time'].apply(lambda x: extract_numeric_values(x))

# Calculate average visiting time in minutes
df['Visiting time'] = df['Visiting time'].apply(lambda x: np.mean(x) * 60 if len(x) > 0 else np.nan)

# Verify the updated 'visiting time' column
print(df['Visiting time'])

0         NaN
1        60.0
2       495.0
3       930.0
4       607.5
        ...  
231       NaN
232     210.0
233       NaN
234     930.0
235    1800.0
Name: Visiting time, Length: 236, dtype: float64


- removed the h symbol
- convert hours to minutes
- changed unreadable data to NaN
- if range I took the average time of the range

In [52]:
df

Unnamed: 0,Name,Type,City,Visiting time,Adults,Children,Latitude,Longitude
0,Récréalle Centre Récréatif,recreational,Alle-sur-Semois,,,,49.842121,4.972678
1,Château de Jehay,cultural,Amay,60.0,5,3,50.549776,5.324099
2,Espace muséal d'Andenne - Le Phare,cultural,Andenne,495.0,8,0,50.489398,5.096547
3,Les Jardins d'eau d'Annevoie,natural,Annevoie,930.0,10,6,50.341611,4.838456
4,Dinant Evasion - Lesse kayaks,recreational,Anseremme (Dinant),607.5,,,50.237910,4.907923
...,...,...,...,...,...,...,...,...
231,Aqualibi,recreational,Wavre,,25,25,50.716969,4.610416
232,Aventure Parc Wavre,recreational,Wavre,210.0,,,50.716969,4.610416
233,Walibi,recreational,Wavre,,45,45,50.716969,4.610416
234,Musée de la Fraise et Jardin des petits fruits,cultural,Wépion,930.0,5,4,50.414660,4.855069


In [1]:
df.to_excel('webscraping.xlsx', index=False)

NameError: name 'df' is not defined

## Pre-processing 

In [53]:
cols_M1 = ['Visiting time']

Visiting time can be considered as a predictor variable for the attraction prices. This model assumes that the visiting time alone can explain the variation in prices.

In [54]:
cols_M2 = ['Visiting time', 'Type', 'City']

Here we include both the visiting time, the type, and the city of attraction as predictor variables. This model considers that both the visiting time and the city can influence the attraction prices. 

In [55]:
cols_M3 = ['Visiting time', 'Type','Adults', 'Children'] 

We include the visiting time, type, adults price, and children price as predictor variables. This model assumes that all four variables contribute to the variation in attraction prices. The adults price and children price variables account for the pricing differences based on the age category of visitors.

#### Create the label variable y

In [56]:
y = df.iloc[:, -1]
x= df.iloc[:, :-1]

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

In [59]:
x_train_M1=x_train[cols_M1]
x_test_M1 =x_test[cols_M1]

x_train_M2=x_train[cols_M2]
x_test_M2 =x_test[cols_M2]

x_train_M3=x_train[cols_M3]
x_test_M3 =x_test[cols_M3]

#### Building the pre-processors

In [60]:
from sklearn.compose import make_column_selector as selector

In [61]:
#For M1
categorical_columns_selector = selector(dtype_include=object)
categorical_columns_M1 = categorical_columns_selector(x_train_M1)

numeric_columns_M1 = [col for col in x_train_M1.columns if col not in categorical_columns_M1]

print(categorical_columns_M1)
print(numeric_columns_M1)

[]
['Visiting time']


In [62]:
#For M2
categorical_columns_selector = selector(dtype_include=object)
categorical_columns_M2 = categorical_columns_selector(x_train_M2)

numeric_columns_M2 = [col for col in x_train_M2.columns if col not in categorical_columns_M2]

print(categorical_columns_M2)
print(numeric_columns_M2)

['Type', 'City']
['Visiting time']


In [63]:
#For M3
categorical_columns_selector = selector(dtype_include=object)
categorical_columns_M3 = categorical_columns_selector(x_train_M3)

numeric_columns_M3 = [col for col in x_train_M3.columns if col not in categorical_columns_M3]

print(categorical_columns_M3)
print(numeric_columns_M3)

['Type', 'Adults', 'Children']
['Visiting time']


In [64]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [65]:
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [66]:
from sklearn.preprocessing import MinMaxScaler

In [67]:
numeric_transformer = Pipeline(
    steps=[("minmax", MinMaxScaler())]
)
numeric_transformer

Pipeline(steps=[('minmax', MinMaxScaler())])

## Training and model comparison

In [68]:
test_accuracy={}
test_auc_roc={}

### Model M1: simple logistic regression

In [69]:
from sklearn.compose import ColumnTransformer

In [70]:
preprocessor_M1 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns_M1),
        ("cat", categorical_transformer, categorical_columns_M1),
    ]
)
preprocessor_M1

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('minmax', MinMaxScaler())]),
                                 ['Visiting time']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 [])])

In [71]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [72]:
from sklearn.model_selection import KFold

In [73]:
k=KFold(n_splits=5,shuffle=True,random_state=42)

In [74]:
logistic = LogisticRegressionCV(
    cv=k,
    refit=True,
    solver="newton-cg",
    tol=1e-7,
    random_state=42,
)

In [75]:
pipeline_logit_M1 = Pipeline(
    steps=[("preprocessor", preprocessor_M1), ("classifier", logistic)]
)
pipeline_logit_M1

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('minmax',
                                                                   MinMaxScaler())]),
                                                  ['Visiting time']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  [])])),
                ('classifier',
                 LogisticRegressionCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                                      random_state=42, solver='newton-cg',
                                      tol=1e-07))])

In [76]:
from sklearn.impute import SimpleImputer

In [79]:
from sklearn.impute import SimpleImputer

# Create an instance of SimpleImputer with the most_frequent strategy
imputer = SimpleImputer(strategy='most_frequent')

# Fit the imputer on the training data and transform the training data
x_train_imputed = imputer.fit_transform(x_train)

# Transform the test data using the fitted imputer
x_test_imputed = imputer.transform(x_test)

In [80]:
imputer = SimpleImputer(strategy='mean')
x_train_M1_imputed = imputer.fit_transform(x_train_M1)
x_test_M1_imputed = imputer.transform(x_test_M1)

# Scaling numeric features
scaler = MinMaxScaler()
x_train_M1_scaled = scaler.fit_transform(x_train_M1_imputed)
x_test_M1_scaled = scaler.transform(x_test_M1_imputed)

# Training the logistic regression model
logreg = LogisticRegression()
logreg.fit(x_train_M1_scaled, y_train)

# Predicting on the test set
y_pred = logreg.predict(x_test_M1_scaled)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

nettoyer la base de donnée

In [81]:
y_test_hat= pipeline_logit_M1.predict(x_test_M1)
y_test_hat_probs = pipeline_logit_M1.predict_proba(x_test_M1)[:,1]

test_accuracy['logit_m1'] = accuracy_score(y_test, y_test_hat)*100
test_auc_roc['logit_m1'] = roc_auc_score(y_test, y_test_hat_probs)*100

print('Confusion matrix:\n', confusion_matrix(y_test, y_test_hat))
print('Testing AUC: %.4f %%' % test_auc_roc['logit_m1'])
print('Testing accuracy: %.4f %%' % test_accuracy['logit_m1']) 

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### Model M2: logistic regression with more variables

In [82]:
preprocessor_M2 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns_M2),
        ("cat", categorical_transformer, categorical_columns_M2),
    ]
)
preprocessor_M2

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('minmax', MinMaxScaler())]),
                                 ['Visiting time']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Type', 'City'])])

In [83]:
pipeline_logit_M2 = Pipeline(
    steps=[("preprocessor", preprocessor_M2), ("classifier", logistic)]
)
pipeline_logit_M2

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('minmax',
                                                                   MinMaxScaler())]),
                                                  ['Visiting time']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Type', 'City'])])),
                ('classifier',
                 LogisticRegressionCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                                      random_state=42, solver='newton-cg',
                                      tol=1e-07))])

In [84]:
pipeline_logit_M2.fit(x_train_M2,y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [85]:
y_test_hat= pipeline_logit_M2.predict(x_test_M2)
y_test_hat_probs = pipeline_logit_M2.predict_proba(x_test_M2)[:,1]

test_accuracy['logit_m2'] = accuracy_score(y_test, y_test_hat)*100
test_auc_roc['logit_m2'] = roc_auc_score(y_test, y_test_hat_probs)*100

print('Confusion matrix:\n', confusion_matrix(y_test, y_test_hat))
print('Testing AUC: %.4f %%' % test_auc_roc['logit_m2'])
print('Testing accuracy: %.4f %%' % test_accuracy['logit_m2']) 

NotFittedError: This LogisticRegressionCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Model M3: logistic regression with more variables

In [86]:
preprocessor_M3 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns_M3),
        ("cat", categorical_transformer, categorical_columns_M3),
    ]
)
preprocessor_M3

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('minmax', MinMaxScaler())]),
                                 ['Visiting time']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Type', 'Adults', 'Children'])])

In [87]:
pipeline_logit_M3 = Pipeline(
    steps=[("preprocessor", preprocessor_M3), ("classifier", logistic)]
)
pipeline_logit_M3

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('minmax',
                                                                   MinMaxScaler())]),
                                                  ['Visiting time']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Type', 'Adults',
                                                   'Children'])])),
                ('classifier',
                 LogisticRegressionCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                                      random_state=42, solver='newton-cg',
                                      tol=1e-07))])

In [88]:
pipeline_logit_M3.fit(x_train_M3,y_train)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'int', 'str']

In [89]:
y_test_hat= pipeline_logit_M3.predict(x_test_M3)
y_test_hat_probs = pipeline_logit_M3.predict_proba(x_test_M3)[:,1]

test_accuracy['logit_m3'] = accuracy_score(y_test, y_test_hat)*100
test_auc_roc['logit_m3'] = roc_auc_score(y_test, y_test_hat_probs)*100

print('Confusion matrix:\n', confusion_matrix(y_test, y_test_hat))
print('Testing AUC: %.4f %%' % test_auc_roc['logit_m3'])
print('Testing accuracy: %.4f %%' % test_accuracy['logit_m3']) 

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

### Model choice: which model do you prefer? 

In [90]:
test_auc_roc

{}

In [91]:
test_accuracy

{}

### Re-fit the prefered model on the full data set

In [92]:
x_M2=x[cols_M2]
pipeline_logit_M2.fit(x_M2,y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
y_hat= pipeline_logit_M2.predict(x_M2)
y_hat_probs = pipeline_logit_M2.predict_proba(x_M2)[:,1]

test_accuracy['logit_m2'] = accuracy_score(y,y_hat,)*100
test_auc_roc['logit_m2'] = roc_auc_score(y,y_hat_probs)*100

print('Confusion matrix:\n', confusion_matrix(y, y_hat))
print('Testing AUC: %.4f %%' % test_auc_roc['logit_m2'])
print('Testing accuracy: %.4f %%' % test_accuracy['logit_m2']) 

## Use your model to predict the prices of museum in a city

In [None]:
df=pd.read_csv("webscraping.xlsx")
df.shape