In [9]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



# Render every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Read the dataset; fields in the CSV are parsed with ';' as the separator.
df = pd.read_csv('Carbon_Emission.csv', sep=";")

# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


# Encoding

### Combining the variables "Transport" and "Vehicle Type"

In [10]:
# Build the combined column "Transport_Vehicle_Type": take the value of
# "Vehicle Type" where one is present, and fall back to the value of "Transport"
# (matched by row index) for rows whose "Vehicle Type" is NaN.
df["Transport_Vehicle_Type"] = df["Vehicle Type"].fillna(df["Transport"])

# Show the two source columns next to the new combined column.
df[["Transport", "Vehicle Type", "Transport_Vehicle_Type"]].head()

Unnamed: 0,Transport,Vehicle Type,Transport_Vehicle_Type
0,public,,public
1,walk/bicycle,,walk/bicycle
2,private,petrol,petrol
3,walk/bicycle,,walk/bicycle
4,private,diesel,diesel


### Dummy variables (one-hot encoding)

In [11]:
# Categorical columns that are expanded into indicator (dummy) columns.
variables_for_one_hot_encoded = [
    'Body Type',
    'Sex',
    'Diet',
    'How Often Shower',
    'Heating Energy Source',
    'Social Activity',
    'Frequency of Traveling by Air',
    'Waste Bag Size',
    'Energy efficiency',
    'Transport_Vehicle_Type',
]

# Replace each listed column with its indicator columns.
# drop_first=True omits the first level of every variable (avoids the
# dummy-variable trap); dummy_na=False adds no extra column for missing values.
df = pd.get_dummies(
    data=df,
    columns=variables_for_one_hot_encoded,
    dummy_na=False,
    drop_first=True,
)

df

Unnamed: 0,Transport,Vehicle Type,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Recycling,Cooking_With,CarbonEmission,Body Type_obese,Body Type_overweight,Body Type_underweight,Sex_male,Diet_pescatarian,Diet_vegan,Diet_vegetarian,How Often Shower_less frequently,How Often Shower_more frequently,How Often Shower_twice a day,Heating Energy Source_electricity,Heating Energy Source_natural gas,Heating Energy Source_wood,Social Activity_often,Social Activity_sometimes,Frequency of Traveling by Air_never,Frequency of Traveling by Air_rarely,Frequency of Traveling by Air_very frequently,Waste Bag Size_large,Waste Bag Size_medium,Waste Bag Size_small,Energy efficiency_Sometimes,Energy efficiency_Yes,Transport_Vehicle_Type_electric,Transport_Vehicle_Type_hybrid,Transport_Vehicle_Type_lpg,Transport_Vehicle_Type_petrol,Transport_Vehicle_Type_public,Transport_Vehicle_Type_walk/bicycle
0,public,,230,210,4,7,26,1,['Metal'],"['Stove', 'Oven']",2238,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False
1,walk/bicycle,,114,9,3,9,38,5,['Metal'],"['Stove', 'Microwave']",1892,True,False,False,False,False,False,True,True,False,False,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
2,private,petrol,138,2472,1,14,47,6,['Metal'],"['Oven', 'Microwave']",2595,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False
3,walk/bicycle,,157,74,3,20,5,7,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074,False,True,False,True,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,True,False,False,False,False,False,False,True
4,private,diesel,266,8457,1,3,5,6,['Paper'],['Oven'],4743,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,True,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,private,hybrid,230,268,5,12,27,9,[],['Microwave'],2408,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,True,False,False,True,False,True,False,False,False,False
9996,private,lpg,234,5316,3,14,8,24,"['Paper', 'Plastic']","['Stove', 'Microwave']",3084,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False
9997,walk/bicycle,,298,96,5,11,5,24,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']",2377,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True
9998,private,petrol,179,8688,5,19,14,5,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']",4574,False,False,True,True,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,True,False,False,False,False,True,False,False


### Encoding "Recycling" and "Cooking_With"

In [12]:
# Indicator columns for "Recycling": Recycling_<material> is 1 when the material's
# name occurs in the row's "Recycling" entry and 0 otherwise.
for material in ('Glass', 'Metal', 'Paper', 'Plastic'):
    df[f'Recycling_{material}'] = df['Recycling'].apply(
        lambda entry: 1 if material in entry else 0
    )

# Indicator columns for "Cooking_With", built the same way.
# No 'Cooking_With_Airfryer' column is created: in this dataset every row that
# lists Airfryer also lists Grill (a data error), so it would carry no additional
# information next to 'Cooking_With_Grill'.
for appliance in ('Oven', 'Grill', 'Stove', 'Microwave'):
    df[f'Cooking_With_{appliance}'] = df['Cooking_With'].apply(
        lambda entry: 1 if appliance in entry else 0
    )

# Spot-check two rows: one with an empty "Cooking_With" entry and one with an
# empty "Recycling" entry.
df.loc[282:283, :]

Unnamed: 0,Transport,Vehicle Type,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Recycling,Cooking_With,CarbonEmission,Body Type_obese,Body Type_overweight,Body Type_underweight,Sex_male,Diet_pescatarian,Diet_vegan,Diet_vegetarian,How Often Shower_less frequently,How Often Shower_more frequently,How Often Shower_twice a day,Heating Energy Source_electricity,Heating Energy Source_natural gas,Heating Energy Source_wood,Social Activity_often,Social Activity_sometimes,Frequency of Traveling by Air_never,Frequency of Traveling by Air_rarely,Frequency of Traveling by Air_very frequently,Waste Bag Size_large,Waste Bag Size_medium,Waste Bag Size_small,Energy efficiency_Sometimes,Energy efficiency_Yes,Transport_Vehicle_Type_electric,Transport_Vehicle_Type_hybrid,Transport_Vehicle_Type_lpg,Transport_Vehicle_Type_petrol,Transport_Vehicle_Type_public,Transport_Vehicle_Type_walk/bicycle,Recycling_Glass,Recycling_Metal,Recycling_Paper,Recycling_Plastic,Cooking_With_Oven,Cooking_With_Grill,Cooking_With_Stove,Cooking_With_Microwave
282,public,,129,1351,6,5,13,7,"['Paper', 'Plastic', 'Metal']",[],1484,False,False,True,True,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,0,1,1,1,0,0,0,0
283,public,,65,436,2,9,46,23,[],"['Stove', 'Grill', 'Airfryer']",2955,False,True,False,False,False,False,True,True,False,False,False,True,False,True,False,False,False,True,False,False,True,False,True,False,False,False,False,True,False,0,0,0,0,0,1,1,0


# Linear Regression

In [15]:
# Names of all boolean and numeric columns, excluding the regression target
# 'CarbonEmission'; these are the candidate feature columns.
bool_and_numeric_cols = (
    df.select_dtypes(include=['bool', 'number'])
    .columns
    .drop('CarbonEmission')
    .tolist()
)

print(bool_and_numeric_cols)

['Monthly Grocery Bill', 'Vehicle Monthly Distance Km', 'Waste Bag Weekly Count', 'How Long TV PC Daily Hour', 'How Many New Clothes Monthly', 'How Long Internet Daily Hour', 'Body Type_obese', 'Body Type_overweight', 'Body Type_underweight', 'Sex_male', 'Diet_pescatarian', 'Diet_vegan', 'Diet_vegetarian', 'How Often Shower_less frequently', 'How Often Shower_more frequently', 'How Often Shower_twice a day', 'Heating Energy Source_electricity', 'Heating Energy Source_natural gas', 'Heating Energy Source_wood', 'Social Activity_often', 'Social Activity_sometimes', 'Frequency of Traveling by Air_never', 'Frequency of Traveling by Air_rarely', 'Frequency of Traveling by Air_very frequently', 'Waste Bag Size_large', 'Waste Bag Size_medium', 'Waste Bag Size_small', 'Energy efficiency_Sometimes', 'Energy efficiency_Yes', 'Transport_Vehicle_Type_electric', 'Transport_Vehicle_Type_hybrid', 'Transport_Vehicle_Type_lpg', 'Transport_Vehicle_Type_petrol', 'Transport_Vehicle_Type_public', 'Transpor

In [14]:


# Feature matrix: the boolean/numeric columns listed in bool_and_numeric_cols.
X = df[bool_and_numeric_cols]

# Target variable to predict.
y = df["CarbonEmission"]

# Split into 75 % training and 25 % test data. The fixed random_state makes the
# shuffle, and therefore the R-squared values printed below, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42
)

# Fit a linear regression on the training data.
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model: predict the target for the training and the test set and
# compare the two R-squared scores (a large gap would indicate overfitting).
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("R-squared (from train-set):",r2_score(y_train, y_train_pred))
print("R-squared (from test-set):",r2_score(y_test, y_test_pred))



R-squared (from train-set): 0.9338866684384936
R-squared (from test-set): 0.9325984966954334
