In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from imodels import RuleFitClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from rulefit import RuleFit
import warnings
warnings.filterwarnings("ignore")



DATA OVERVIEW

In [None]:
# Read the CSV export and discard the 'Unnamed: 12' and 'Output' columns in one step.
data = pd.read_csv('onlinefoods.csv').drop(columns=['Unnamed: 12', 'Output'])

data.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Horizontal bar chart: number of rows for each Gender value.
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
sns.countplot(data=data, y='Gender', hue='Gender', palette='deep', legend=False, ax=ax)
ax.set_title("Gender - Distribution")
ax.set_xlabel("Count")
ax.set_ylabel("Gender")
plt.show()

In [None]:
# Youngest and oldest age in the dataset.
# Series.min()/.max() are vectorised and skip NaN; the Python builtins min()/max()
# iterate element by element and give order-dependent results when NaN is present.
min_age = data['Age'].min()
max_age = data['Age'].max()
print(min_age)
print(max_age)

In [None]:
# Age histogram (7 bins) with a KDE overlay; dashed vertical lines mark the
# median and the first mode returned by Series.mode().
median_age = data['Age'].median()
mode_age = data['Age'].mode()[0]

dashed = {'linestyle': '--', 'linewidth': 2}

fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
sns.histplot(data['Age'], bins=7, kde=True, color="skyblue", ax=ax)
ax.axvline(median_age, color='orange', label=f'Median: {median_age}', **dashed)
ax.axvline(mode_age, color='purple', label=f'Mode: {mode_age}', **dashed)
ax.set_title("Age - Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
ax.legend()
plt.show()

In [None]:
# Horizontal bar chart: number of rows for each Marital Status value.
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
sns.countplot(
    data=data,
    y='Marital Status',
    hue='Marital Status',
    palette='pastel',
    legend=False,
    ax=ax,
)
ax.set_title("Marital Status - Distribution")
ax.set_xlabel("Count")
ax.set_ylabel("Marital Status")
plt.show()

In [None]:
# Vertical bar chart: number of rows for each Family size value.
plt.figure(figsize=(8, 4), dpi=100)
# hue duplicates the x variable purely to colour the bars, so the legend would only
# repeat the x tick labels; suppress it, as the other count plots in this notebook do.
sns.countplot(x='Family size', data=data, hue='Family size', palette='pastel',
              dodge=False, legend=False)
plt.title("Family size - Distribution")
plt.ylabel("Count")
plt.xlabel("Family size")
plt.show()

In [None]:
# Horizontal bar chart: number of rows for each Occupation value.
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
sns.countplot(data=data, y='Occupation', hue='Occupation', palette='muted', legend=False, ax=ax)
ax.set_title("Occupation - Distribution")
ax.set_xlabel("Count")
ax.set_ylabel("Occupation")
plt.show()

In [None]:
# Horizontal bar chart: number of rows for each Educational Qualifications value.
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
sns.countplot(
    data=data,
    y='Educational Qualifications',
    hue='Educational Qualifications',
    palette='bright',
    legend=False,
    ax=ax,
)
ax.set_title("Educational Qualifications - Distribution")
ax.set_xlabel("Count")
ax.set_ylabel("Educational Qualifications")
plt.show()

In [None]:

# Pie chart: share of rows falling in each Monthly Income bracket.
income_counts = data['Monthly Income'].value_counts()
colors = sns.color_palette("pastel")

fig, ax = plt.subplots(figsize=(10, 8), dpi=100)
ax.pie(
    income_counts,
    labels=income_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1},
    textprops={'fontsize': 12},
)
ax.set_title("Monthly Income - Distribution", fontsize=16, fontweight='bold')
plt.show()


In [None]:
# Place every row on a map by its latitude/longitude columns and label it with its pin code.
fig = px.scatter_geo(
    data_frame=data,
    lat='latitude',
    lon='longitude',
    text='Pin code',
    title='Geographical Chart with Pincodes',
)
fig.show()

In [None]:
# Strip surrounding whitespace from the labels so that variants such as 'Negative '
# (if present) collapse into one category and match the palette keys below.
data['Feedback'] = data['Feedback'].str.strip()

feedback_palette = {'Positive': '#66ff66', 'Negative': '#ff6666'}

plt.figure(figsize=(8, 4), dpi=100)
# hue only colours the bars, so the legend would just repeat the x tick labels;
# legend=False matches the other count plots in this notebook.
ax = sns.countplot(x='Feedback', data=data, hue='Feedback', palette=feedback_palette,
                   dodge=False, legend=False)

# Write each bar's share of all rows in the middle of the bar.
total = len(data)
for p in ax.patches:
    height = p.get_height()
    if height == 0:
        # Nothing to label. Some seaborn versions can also leave empty placeholder
        # rectangles in ax.patches, which would otherwise show up as a stray "0.00%".
        continue
    percentage = (height / total) * 100
    ax.annotate(f'{percentage:.2f}%', (p.get_x() + p.get_width() / 2., height / 2),
                ha='center', va='center', fontsize=12, color='black')

plt.title("Feedback - Distribution")
plt.ylabel("Count")
plt.xlabel("Feedback")
plt.show()

DATA PREPROCESSING

 Make Ordinal Columns Numerical

In [None]:
def _ordinal_codes(column, order):
    """Encode ``column`` as integer ranks following ``order`` (0 = first label).

    Args:
        column: pandas Series of category labels.
        order: list of every expected label, from lowest to highest rank.

    Returns:
        Array of integer codes, one per row of ``column``.

    Raises:
        ValueError: if ``column`` contains a value (or a missing entry) that is not
            listed in ``order``. ``pd.Categorical`` would otherwise encode it as -1
            without warning, and that -1 would be fed to the models as a real rank.
    """
    codes = pd.Categorical(column, categories=order, ordered=True).codes
    unknown = codes == -1
    if unknown.any():
        unexpected = sorted(map(str, column[unknown].unique()))
        raise ValueError(f"{column.name!r} has values outside the expected order: {unexpected}")
    return codes


# Lowest to highest income bracket.
income_order = [
    'No Income',
    'Below Rs.10000',
    '10001 to 25000',
    '25001 to 50000',
    'More than 50000'
]
data['Monthly Income Ordinal'] = _ordinal_codes(data['Monthly Income'], income_order)
data = data.drop('Monthly Income', axis=1)

# Lowest to highest education level.
educational_order = [
    'Uneducated',
    'School',
    'Graduate',
    'Post Graduate',
    'Ph.D'
]
data['Educational Qualifications Ordinal'] = _ordinal_codes(
    data['Educational Qualifications'], educational_order
)
data = data.drop('Educational Qualifications', axis=1)


data.head(10)

 Make Nominal Columns Numerical

In [None]:
# Gender -> integer codes (Gender_uniques keeps the code -> label mapping).
gender_codes, Gender_uniques = pd.factorize(data['Gender'])
data['Gender'] = gender_codes

# Feedback -> binary target: 1 for 'Positive', 0 for everything else.
is_positive = data['Feedback'] == 'Positive'
data['Feedback'] = np.where(is_positive, 1, 0)

# One-hot encode the remaining nominal columns; the dummy columns are appended in
# the listed order (Occupation first, then Marital Status).
data = pd.get_dummies(data, columns=['Occupation', 'Marital Status'])

data.head(10)

TRAIN AND TEST MODELS

In [None]:
# RuleFit model: train on 80% of the rows, report macro-averaged metrics on the other 20%.
X = data.drop('Feedback', axis=1).values
y = data['Feedback'].values

# Split BEFORE scaling. Fitting the scaler on the full dataset would let test-set
# statistics (mean/std) leak into training; the other model cells in this notebook
# already fit the scaler on the training split only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept only so any later cell that still reads X_scaled keeps working; it is not
# used for training or evaluation here.
X_scaled = scaler.transform(X)

# Fixed seed so that Restart & Run All reproduces the same rules and metrics.
clf = RuleFit(max_rules=200, random_state=42)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# predict() output is thresholded at 0.5 to obtain 0/1 class labels.
predictions_binary = (predictions > 0.5).astype(int)

precision = precision_score(y_test, predictions_binary, average='macro')
recall = recall_score(y_test, predictions_binary, average='macro')
f1 = f1_score(y_test, predictions_binary, average='macro')
accuracy = accuracy_score(y_test, predictions_binary)

print(f"Rule-Based Accuracy: {accuracy:.4f}")
print(f"Rule-Based Precision: {precision:.4f}")
print(f"Rule-Based Recall: {recall:.4f}")
print(f"Rule-Based F1-Score: {f1:.4f}")



In [37]:
# Decision tree: tune depth / leaf size with stratified 5-fold CV on the training
# split, then evaluate the refit best estimator once on the held-out test split.
X = data.drop('Feedback', axis=1).values
y = data['Feedback'].values

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fit on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

params = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_leaf': [2, 3, 4]
}

# Seed the tree: without random_state, ties between equally good splits are broken
# randomly, so the selected parameters and the reported metrics could change from
# one run to the next.
clf = DecisionTreeClassifier(random_state=42)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(clf, params, cv=skf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_full_scaled, y_train_full)

print("DTree Best parameters from cross-validation:", grid_search.best_params_)
print("DTree Best cross-validation score:", grid_search.best_score_)

# Re-run the same folds with the winning configuration to show per-fold accuracy.
best_clf = grid_search.best_estimator_
fold_accuracies = cross_val_score(best_clf, X_train_full_scaled, y_train_full, cv=skf,
                                  scoring='accuracy')

print("DTree Cross-validation scores for each fold:", fold_accuracies)
print("DTree Mean cross-validation accuracy:", fold_accuracies.mean())

y_pred = best_clf.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred, average='macro')
test_recall = recall_score(y_test, y_pred, average='macro')
test_f1 = f1_score(y_test, y_pred, average='macro')

print(f"DTree Test Accuracy: {test_accuracy:.4f}")
print(f"DTree Test Precision: {test_precision:.4f}")
print(f"DTree Test Recall: {test_recall:.4f}")
print(f"DTree Test F1-Score: {test_f1:.4f}")


DTree Best parameters from cross-validation: {'max_depth': 3, 'min_samples_leaf': 3}
DTree Best cross-validation score: 0.8064516129032258
DTree Cross-validation scores for each fold: [0.80645161 0.79032258 0.83870968 0.79032258 0.80645161]
DTree Mean cross-validation accuracy: 0.8064516129032258
DTree Test Accuracy: 0.7692
DTree Test Precision: 0.5237
DTree Test Recall: 0.5237
DTree Test F1-Score: 0.5237


In [38]:
# Gaussian Naive Bayes: same 80/20 split and train-only scaling as the decision
# tree cell, evaluated with macro-averaged metrics on the test split.
y = data['Feedback'].values
X = data.drop('Feedback', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = GaussianNB().fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

macro = {'average': 'macro'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro)
recall = recall_score(y_test, y_pred, **macro)
f1 = f1_score(y_test, y_pred, **macro)

print(f"NB Accuracy: {accuracy:.4f}")
print(f"NB Precision: {precision:.4f}")
print(f"NB Recall: {recall:.4f}")
print(f"NB F1-Score: {f1:.4f}")

NB Accuracy: 0.8077
NB Precision: 0.6136
NB Recall: 0.6221
NB F1-Score: 0.6175
