In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

import warnings
warnings.filterwarnings("ignore")




In [2]:
import os
os.listdir('/kaggle/input/datacamps-data-science-associate-certification')

['fitness_class_2212.csv']

Data Preprocssing

In [3]:
df = pd.read_csv('/kaggle/input/datacamps-data-science-associate-certification/fitness_class_2212.csv') 
df.head()

Unnamed: 0,booking_id,months_as_member,weight,days_before,day_of_week,time,category,attended
0,1,17,79.56,8,Wed,PM,Strength,0
1,2,10,79.01,2,Mon,AM,HIIT,0
2,3,16,74.53,14,Sun,AM,Strength,0
3,4,5,86.12,10,Fri,AM,Cycling,0
4,5,15,69.29,8,Thu,AM,HIIT,0


In [4]:
df.shape

(1500, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   booking_id        1500 non-null   int64  
 1   months_as_member  1500 non-null   int64  
 2   weight            1480 non-null   float64
 3   days_before       1500 non-null   object 
 4   day_of_week       1500 non-null   object 
 5   time              1500 non-null   object 
 6   category          1500 non-null   object 
 7   attended          1500 non-null   int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 93.9+ KB


In [6]:
df.months_as_member.unique()

array([ 17,  10,  16,   5,  15,   7,  11,   9,  23,  13,   8,  22,   6,
        33,  24,  14,   2,  12,  26,  28,  27,   1,   3,  21,  18,  19,
        53,  20,  34,  25,  32,  73,  55,   4,  35,  54,  76,  62,  42,
       105,  90,  29,  60,  30, 107,  52,  37,  38,  48,  51,  40,  89,
        57,  36,  44,  39,  41,  47,  58,  66,  45,  43,  61,  50,  65,
        31,  97,  59,  93, 148, 111,  69])

This data seems fine

In [7]:
df.days_before.unique()

array(['8', '2', '14', '10', '6', '4', '9', '12', '5', '3', '7', '13',
       '12 days', '20', '1', '15', '6 days', '11', '13 days', '3 days',
       '16', '1 days', '7 days', '8 days', '10 days', '14 days', '17',
       '5 days', '2 days', '4 days', '29'], dtype=object)

- We see that this include days mentioned resulting it as Object type. Let's fix it


In [8]:
df['days_before'] = df['days_before'].str.replace(' days', '').astype(int)


In [9]:
df['days_before'].value_counts().index.sort_values()

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 29], dtype='int64')

In [10]:
df.day_of_week.unique()

array(['Wed', 'Mon', 'Sun', 'Fri', 'Thu', 'Wednesday', 'Fri.', 'Tue',
       'Sat', 'Monday'], dtype=object)

In [11]:
df['day_of_week'] = df['day_of_week'].str.replace('.', '')


In [12]:
day_mapping = {
    'Wednesday': 'Wed',
    'Monday': 'Mon'
}

df['day_of_week'] = df['day_of_week'].replace(day_mapping)

In [13]:
df.day_of_week.unique()

array(['Wed', 'Mon', 'Sun', 'Fri', 'Thu', 'Tue', 'Sat'], dtype=object)

In [14]:
df.category.unique()

array(['Strength', 'HIIT', 'Cycling', 'Yoga', '-', 'Aqua'], dtype=object)

In [15]:
df['category']=df['category'].replace('-', np.nan)

In [16]:
df.category.value_counts()

HIIT        667
Cycling     376
Strength    233
Yoga        135
Aqua         76
Name: category, dtype: int64

In [17]:
df.isnull().sum()

booking_id           0
months_as_member     0
weight              20
days_before          0
day_of_week          0
time                 0
category            13
attended             0
dtype: int64

In [18]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
df['weight'] = imputer.fit_transform(df[['weight']])
imputer=SimpleImputer(strategy='most_frequent')
df['category'] = imputer.fit_transform(df[['category']])

In [19]:
df.isnull().sum()

booking_id          0
months_as_member    0
weight              0
days_before         0
day_of_week         0
time                0
category            0
attended            0
dtype: int64

In [20]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['day_of_week'] = label_encoder.fit_transform(df['day_of_week'])

In [21]:
df.duplicated().sum()

0

In [22]:
df.describe()

Unnamed: 0,booking_id,months_as_member,weight,days_before,day_of_week,attended
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,750.5,15.628667,82.610378,8.346667,2.604,0.302667
std,433.157015,12.926543,12.680411,4.077938,1.948088,0.459565
min,1.0,1.0,55.41,1.0,0.0,0.0
25%,375.75,8.0,73.5625,4.0,1.0,0.0
50%,750.5,12.0,81.035,9.0,3.0,0.0
75%,1125.25,19.0,89.3825,12.0,4.0,1.0
max,1500.0,148.0,170.52,29.0,6.0,1.0


In [23]:
import plotly.express as px

category_counts = df['category'].value_counts().reset_index()
category_counts.columns = ['category', 'count']
category_counts = category_counts.sort_values(by='count', ascending=False)

fig = px.bar(category_counts, x='category', y='count', color='category', 
             labels={'category': 'Category', 'count': 'Count'},
             title='Count of Attendance by Category')

fig.update_traces(texttemplate='%{y}', textposition='outside')
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(xaxis_title='Category', yaxis_title='Count', legend_title='Attended',
                  legend=dict(x=0, y=1.15))

fig.show()

In [24]:
import plotly.graph_objects as go
import pandas as pd
cross_tab = pd.crosstab(df['category'], df['attended'], normalize='index') * 100

fig = go.Figure(data=go.Heatmap(
    x=cross_tab.columns,
    y=cross_tab.index,
    z=cross_tab.values,
    colorscale='YlGnBu',
    text=cross_tab.values.round(1),
    hovertemplate='Category: %{y}<br>Attended: %{x}<br>Percentage: %{text}%'
))

fig.update_xaxes(title='Attended')
fig.update_yaxes(title='Category')
fig.update_layout(title='Percentage of Attendance by Category', xaxis_showgrid=False, yaxis_showgrid=False)

fig.show()


In [25]:
fig = px.histogram(
    df,
    x='months_as_member',
    marginal='rug',
    nbins=30,
    title='Histogram with KDE for Months as Member',
    labels={'months_as_member': 'Months as Member'},
    opacity=0.7
)
fig.show()

In [26]:
fig = px.box(
    df,
    x='attended',
    y='months_as_member',
    orientation='v',
    labels={'attended': 'Attendance', 'months_as_member': 'Number of Months'},
    title='Relationship between Attendance and Number of Months'
)

fig.update_xaxes(gridcolor='lightgray')
fig.update_yaxes(gridcolor='lightgray')
fig.update_layout(showlegend=False)

fig.show()

In [27]:
Q1 = df['months_as_member'].quantile(0.25)
Q3 = df['months_as_member'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


In [28]:

df1 = df[(df['months_as_member'] >= lower_bound) & (df['months_as_member'] <= upper_bound)]

fig = px.box(
    df1,
    x='attended',
    y='months_as_member',
    orientation='v',
    labels={'attended': 'Attendance', 'months_as_member': 'Number of Months'},
    title='Relationship between Attendance and Number of Months (Without Outliers)'
)

fig.update_xaxes(gridcolor='lightgray')
fig.update_yaxes(gridcolor='lightgray')
fig.update_layout(showlegend=False)

fig.show()

In [29]:
df1 = pd.get_dummies(df, columns=['category', 'time'],drop_first=True)
df1

Unnamed: 0,booking_id,months_as_member,weight,days_before,day_of_week,attended,category_Cycling,category_HIIT,category_Strength,category_Yoga,time_PM
0,1,17,79.56,8,6,0,0,0,1,0,1
1,2,10,79.01,2,1,0,0,1,0,0,0
2,3,16,74.53,14,3,0,0,0,1,0,0
3,4,5,86.12,10,0,0,1,0,0,0,0
4,5,15,69.29,8,4,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1495,1496,21,79.51,10,0,0,0,1,0,0,0
1496,1497,29,89.55,2,1,0,0,0,1,0,0
1497,1498,9,87.38,4,5,0,0,1,0,0,0
1498,1499,34,68.64,14,3,0,0,0,0,0,0


In [30]:
X =df1.copy()
y=X.pop('attended')


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import metrics

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30 , random_state=1)

In [33]:
regression_model = LogisticRegression()
regression_model.fit(X_train, y_train)

In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,accuracy_score
y_true = y_test
y_pred = regression_model.predict(X_test) 

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Accuracy Score: {accuracy:.3f}")


Mean Absolute Error (MAE): 0.2111111111111111
Mean Squared Error (MSE): 0.2111111111111111
Root Mean Squared Error (RMSE): 0.45946829173634074
R-squared (R²): 0.014976958525345418
Accuracy Score: 0.789
