# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import Dataset

In [3]:
df = pd.read_csv(r'./lifestyle_sustainability_data.csv',index_col='ParticipantID')
df.head(15)

Unnamed: 0_level_0,Age,Location,DietType,LocalFoodFrequency,TransportationMode,EnergySource,HomeType,HomeSize,ClothingFrequency,SustainableBrands,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,Gender,UsingPlasticProducts,DisposalMethods,PhysicalActivities,Rating
ParticipantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,35,Urban,Mostly Plant-Based,Often,Bike,Renewable,Apartment,800,Rarely,True,5,High,100,1500,Female,Rarely,Composting,High,5
2,28,Suburban,Balanced,Sometimes,Public Transit,Mixed,House,1500,Sometimes,True,4,Moderate,250,3000,Male,Sometimes,Recycling,Moderate,4
3,65,Rural,Mostly Animal-Based,Rarely,Car,Non-Renewable,House,2500,Often,False,2,Low,400,4500,Male,Often,Landfill,Low,1
4,42,Urban,Mostly Plant-Based,Often,Walk,Renewable,Apartment,950,Sometimes,True,4,Moderate,150,2000,Female,Rarely,Recycling,High,5
5,31,Suburban,Balanced,Sometimes,Public Transit,Mixed,House,1800,Often,True,3,Low,300,3500,Non-Binary,Sometimes,Combination,Moderate,3
6,58,Rural,Mostly Animal-Based,Rarely,Car,Non-Renewable,House,2200,Always,False,1,,450,5000,Male,Often,Landfill,,1
7,25,Urban,Mostly Plant-Based,Always,Bike,Renewable,Other,600,Rarely,True,5,High,80,1200,Female,Never,Composting,High,5
8,39,Suburban,Balanced,Often,Public Transit,Renewable,House,1600,Sometimes,True,4,Moderate,200,2800,Female,Sometimes,Recycling,Moderate,4
9,52,Rural,Mostly Plant-Based,Sometimes,Walk,Mixed,House,2000,Often,True,3,Low,350,4000,Male,Rarely,Combination,Low,3
10,29,Urban,Mostly Animal-Based,Rarely,Car,Non-Renewable,Apartment,1100,Always,False,2,,380,4200,Non-Binary,Often,Landfill,,1


In [4]:
df.PhysicalActivities.unique()

array(['High', 'Moderate', 'Low', nan], dtype=object)

In [5]:
# Import EDA library

In [6]:
import sweetviz as sv

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# Create the report
my_report = sv.analyze(df)

# Display the report in an HTML file
my_report.show_html("EDA_Report.html")

Done! Use 'show' commands to display/save.   |██████████████████| [100%]   00:03 -> (00:00 left)


Report EDA_Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Preprocessing 

### segregrate data into continuos , nominal  and ordinal
### create order for ordinal data

In [8]:
continuous_columns = ['Age','HomeSize','MonthlyElectricityConsumption','MonthlyWaterConsumption']
nominal_columns = ['Location', 'DietType','TransportationMode','SustainableBrands','Gender','DisposalMethods' ]
ordinal_columns = ['LocalFoodFrequency','EnergySource','ClothingFrequency','EnvironmentalAwareness','CommunityInvolvement','UsingPlasticProducts','PhysicalActivities']

In [9]:
order = [['Rarely','Sometimes','Often','Always'],
       ['Renewable','Mixed','Non-Renewable',],
       ['Rarely', 'Sometimes', 'Often', 'Always'],
       [1,2,3,4,5],
       ['Low','Moderate','High'],
       ['Never','Rarely','Sometimes','Often'],
       ['Low','Moderate','High']]

# Pipeline and Imputation
#### CommunityInvolvement and PhysicalActivities are having missing values

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

In [11]:
# continuous_transformer = Pipeline([("Imputer",SimpleImputer(strategy='mean')),("step 2 ",StandardScaler(),)])
# nominal_transformer = Pipeline([('S Imputer', SimpleImputer(strategy='most_frequent'),),('sstep 2', OneHotEncoder(handle_unknown='ignore'),)])
# ordinal_transformer = Pipeline([('S Imputer',SimpleImputer(strategy='most_frequent'),), ('step 2',OrdinalEncoder(categories = order),)])


In [45]:
continuous_transformer = Pipeline([("step 2 ",StandardScaler(),)])
nominal_transformer = Pipeline([('sstep 2', OneHotEncoder(handle_unknown='ignore'),)])
ordinal_transformer = Pipeline([('S Imputer',SimpleImputer(strategy='most_frequent'),), ('step 2',OrdinalEncoder(categories = order),)])


# Column Transformers

In [46]:
from sklearn.compose import ColumnTransformer

In [47]:
preprocess = ColumnTransformer([("Continuous Data",continuous_transformer,continuous_columns),
                               ("Nominal Data", nominal_transformer,nominal_columns),
                               ("Ordinal Data", ordinal_transformer,ordinal_columns)])
preprocess

# Model Building using pipeline

1. Logistics Regression
2. SVC
3. KNN Classfication
4. Decision Tree
5. Naive Bayes

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Model Training

In [49]:
X = df.drop(['Rating'] , axis =1 )
Y = df['Rating']

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [51]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=23)


allmodels = [LogisticRegression(),SVC(),KNeighborsClassifier(),DecisionTreeClassifier(),GaussianNB()]
# models = [ Pipeline([("DataPreprocessing",preprocess),("Algorithm",ml_model)]) for ml_model in allmodels]

accuracy_score_all_models = {}
for ml_model in allmodels:
    trained_model = Pipeline([("DataPreprocessing",preprocess),("Algorithm",ml_model)])
    trained_model.fit(X_train,Y_train)
    Y_predict = trained_model.predict(X_test)
    score = accuracy_score(Y_test, Y_predict)
    accuracy_score_all_models[ml_model] = score

# for (k,v)  in accuracy_score_all_models:
#     print(f"{k} : {v}")

accuracy_score_all_models
    
# model = Pipeline([("DataPreprocessing",preprocess),("Algorithm",LogisticRegression())])
# model.fit(X_train,Y_train)
# Y_predict = model.predict(X_test)
# accuracy_score(Y_test, Y_predict)

{LogisticRegression(): 0.65,
 SVC(): 0.71,
 KNeighborsClassifier(): 0.68,
 DecisionTreeClassifier(): 0.67,
 GaussianNB(): 0.64}