# Testing other regression models via lazypredict


### Importing the dataset and libraries


The data comes from Kaggle: https://www.kaggle.com/datasets/dumanmesut/individual-carbon-footprint-calculation/data

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns

# Display setting: never truncate the column list when a DataFrame is rendered
pd.set_option('display.max_columns', None)

# Load the survey data; the file is semicolon-separated
df = pd.read_csv('Carbon_Emission.csv', sep=";")

df

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,obese,male,omnivore,twice a day,coal,private,hybrid,sometimes,230,never,268,medium,5,12,27,9,Yes,[],['Microwave'],2408
9996,normal,female,vegan,twice a day,coal,private,lpg,never,234,frequently,5316,extra large,3,14,8,24,Sometimes,"['Paper', 'Plastic']","['Stove', 'Microwave']",3084
9997,overweight,female,vegetarian,daily,electricity,walk/bicycle,,sometimes,298,very frequently,96,extra large,5,11,5,24,Yes,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']",2377
9998,underweight,male,vegan,more frequently,coal,private,petrol,often,179,rarely,8688,medium,5,19,14,5,Sometimes,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']",4574


### Sorting the variables into groups

In [26]:
# Numeric columns.
# ("CarbonEmission" holds numbers as well, but is not part of this list.)
variables_quantitative = [
    "Monthly Grocery Bill",
    "Vehicle Monthly Distance Km",
    "Waste Bag Weekly Count",
    "How Long TV PC Daily Hour",
    "How Many New Clothes Monthly",
    "How Long Internet Daily Hour",
]

# Columns in which one cell can hold several answers at once.
variables_mit_mehreren_antwortmoeglichkeiten = [
    "Recycling",
    "Cooking_With",
]

# Categorical columns that are one-hot encoded.
variables_for_one_hot_encoded = [
    'Body Type',
    'Sex',
    'Diet',
    'How Often Shower',
    'Heating Energy Source',
    'Social Activity',
    'Frequency of Traveling by Air',
    'Waste Bag Size',
    'Energy efficiency',
]

# The two columns whose values are merged into a single column.
variables_transport_vehicle_type = [
    "Transport",
    "Vehicle Type",
]

### variables "Transport" and "Vehicle Type" (they have combined information)


In [27]:
## Create the merged column: take the value of "Vehicle Type" where there is one,
## otherwise fall back to the value of "Transport" (rows are matched by index).
transport_vehicle_column = "Transport Vehicle Type"
df[transport_vehicle_column] = df["Vehicle Type"].fillna(df["Transport"])


## Register the merged column for one-hot encoding.
## The membership check keeps the list free of duplicates when this cell is re-run.
if transport_vehicle_column not in variables_for_one_hot_encoded:
    variables_for_one_hot_encoded.append(transport_vehicle_column)


## Show the two source columns next to the merged column
df[["Transport", "Vehicle Type", "Transport Vehicle Type"]]


Unnamed: 0,Transport,Vehicle Type,Transport Vehicle Type
0,public,,public
1,walk/bicycle,,walk/bicycle
2,private,petrol,petrol
3,walk/bicycle,,walk/bicycle
4,private,diesel,diesel
...,...,...,...
9995,private,hybrid,hybrid
9996,private,lpg,lpg
9997,walk/bicycle,,walk/bicycle
9998,private,petrol,petrol


### variables "Recycling" and "Cooking_With" (these variables allow multiple answers)

In [28]:
import ast  # standard library; parses the stringified answer lists without executing them


def add_multi_answer_indicators(frame, column, prefix):
    """Add one 0/1 indicator column per answer option of a multi-answer column.

    Every cell of ``frame[column]`` must contain the text of a Python list
    literal, e.g. "['Paper', 'Metal']" or "[]".

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    column : str
        Name of the column holding the stringified answer lists.
    prefix : str
        Text placed in front of each option to form the new column name.

    Returns
    -------
    list
        The distinct answer options, sorted, so that the order of the created
        columns is the same in every kernel session.

    Raises
    ------
    ValueError, SyntaxError
        Raised by ``ast.literal_eval`` when a cell is not a valid literal.
    """
    # Parse each distinct raw string only once. literal_eval accepts literals
    # only, so text from the CSV can never run as code (unlike eval).
    parsed_by_raw = {raw: ast.literal_eval(raw) for raw in frame[column].unique()}

    # Sorting replaces iteration over a set, whose order for strings is not
    # stable between interpreter runs.
    options = sorted({option for answers in parsed_by_raw.values() for option in answers})

    answers_per_row = frame[column].map(parsed_by_raw.get)
    for option in options:
        # Membership is tested on the parsed list, not on the raw text, so an
        # option can never match merely as a substring of another option.
        frame[prefix + str(option)] = answers_per_row.apply(
            lambda answers, option=option: int(option in answers)
        )
    return options


## Create the indicator columns
unique_values_Recycling = add_multi_answer_indicators(df, 'Recycling', 'Recycling ')
unique_values_cooking_With = add_multi_answer_indicators(df, 'Cooking_With', 'Cooking With ')


## Add the new columns to variables_quantitative for the linear regression
columns_recycling = ['Recycling ' + str(item) for item in unique_values_Recycling]  # list of column names
columns_cooking_with = ['Cooking With ' + str(item) for item in unique_values_cooking_With]  # list of column names
# Each name is checked individually: re-running the cell adds nothing twice,
# and an empty list of options no longer raises an IndexError.
variables_quantitative.extend(
    [name for name in columns_recycling + columns_cooking_with
     if name not in variables_quantitative]
)


# Display the last 13 columns for two rows:
# person 282 cooks with "nothing" and person 283 does not recycle at all, so
# (unlike in the one-hot encoding) none of the indicator columns can be dropped.
df.iloc[282:284, -13:]


Unnamed: 0,Recycling,Cooking_With,CarbonEmission,Transport Vehicle Type,Recycling Paper,Recycling Plastic,Recycling Metal,Recycling Glass,Cooking With Airfryer,Cooking With Oven,Cooking With Grill,Cooking With Stove,Cooking With Microwave
282,"['Paper', 'Plastic', 'Metal']",[],1484,public,1,1,1,0,0,0,0,0,0
283,[],"['Stove', 'Grill', 'Airfryer']",2955,public,0,0,0,0,1,0,1,1,0


### One-Hot-Encoding for categorical variables

In [29]:
# Manual one-hot encoding of every categorical variable

for column_name in variables_for_one_hot_encoded:
    # Categories in the order in which they first appear in the data
    categories = list(df[column_name].unique())
    dummy_names = [str(column_name) + ": " + str(category) for category in categories]

    # One new column per category, filled with 1 where the row has that category, else 0
    for category, dummy_name in zip(categories, dummy_names):
        df[dummy_name] = df[column_name].apply(lambda value: 1 if category == value else 0)

    # Only the dummies after the first one are registered as features for the
    # linear regression (presumably to avoid perfectly collinear columns - TODO confirm).
    # The check on the first name prevents duplicates when the cell is re-run.
    dummies_to_register = dummy_names[1:]
    if dummies_to_register[0] not in variables_quantitative:
        variables_quantitative.extend(dummies_to_register)


# Feature matrix: all numeric, indicator and dummy columns collected so far
X_transformed = df[variables_quantitative]

### Evaluating some other models

Source: https://pypi.org/project/lazypredict/

how to fix the error according to chatGPT:

The error message indicates that the installed scikit-learn version no longer accepts the `sparse` argument of `OneHotEncoder`. The argument was deprecated in scikit-learn 1.2 in favour of `sparse_output` and removed in 1.4, while the installed lazypredict version still passes `sparse=False`.

Here's how you can fix this issue:

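1. Align the package versions: scikit-learn and lazypredict must agree on the argument name. Upgrading scikit-learn alone cannot remove this error, because newer versions only accept `sparse_output`. Either upgrade lazypredict to a release that passes `sparse_output` (if one is available), or install a scikit-learn version below 1.4, which still accepts `sparse`.

2. Modify the LazyRegressor implementation: if changing the installed versions is not an option, you may need to manually modify the LazyRegressor implementation in lazypredict to replace the `sparse` argument with `sparse_output`.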

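#### Update scikit-learn
To update scikit-learn, you can use pip. Note that this only helps when scikit-learn is the outdated package; the `sparse` error above occurs because scikit-learn is already newer than what the installed lazypredict expects.

```bash
pip install --upgrade scikit-learn
```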

#### Modify LazyRegressor Implementation
If you cannot update scikit-learn, you can modify the lazypredict library source code. Here's how you can do it:

1. Locate the lazypredict Library: Find where the lazypredict library is installed. You can typically find this in your Python environment's site-packages directory.

2. Edit the Source Code: Open the Supervised.py file from the lazypredict library. Find the line with OneHotEncoder(handle_unknown="ignore", sparse=False) and change it to OneHotEncoder(handle_unknown="ignore", sparse_output=False).

Here is the modified part of the Supervised.py file:

```python
# Original line
("encoding", OneHotEncoder(handle_unknown="ignore", sparse=False)),

# Modified line
("encoding", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
```

In [31]:


from lazypredict.Supervised import LazyRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split  # was missing: NameError on a fresh kernel
from sklearn.utils import shuffle
import numpy as np

# Installation notes:
#   - activate the environment first: mamba activate elisabeth_python-umgebung_nr1
#   - (Control + R searches the shell history for earlier commands)
#   - "mamba install lazypredict" did not work, so "pip install lazypredict" was used instead
#
# NOTE: a lazypredict version that still calls OneHotEncoder(sparse=False) fails
# with "TypeError: ... unexpected keyword argument 'sparse'" under
# scikit-learn >= 1.4, where that argument was removed. The environment must
# pair compatible versions of the two packages for this cell to run.

########################################

# Target variable
y = df["CarbonEmission"]

# Half of the rows are held out for testing; the fixed random_state makes the split reproducible
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_transformed, y, test_size=.5, random_state=123
)

# Initialize LazyRegressor (the example this was adapted from used LazyClassifier)
clf = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

# Fit on the training data and evaluate on the test data
models, predictions = clf.fit(X_train2, X_test2, y_train2, y_test2)


print(models)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'