# different model and fewer features

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold


df=pd.read_csv('Carbon_Emission.csv',sep=";")

#df = pd.read_csv('/kaggle/input/individual-carbon-footprint-calculation/Carbon Emission.csv')
#data can be found at https://www.kaggle.com/datasets/dumanmesut/individual-carbon-footprint-calculation/data


# change display settings to show all columns
pd.set_option('display.max_columns', None)


#rename
# rename columns: replace spaces with underscores
df.columns = df.columns.str.replace(' ', '_')


df.head()

Unnamed: 0,Body_Type,Sex,Diet,How_Often_Shower,Heating_Energy_Source,Transport,Vehicle_Type,Social_Activity,Monthly_Grocery_Bill,Frequency_of_Traveling_by_Air,Vehicle_Monthly_Distance_Km,Waste_Bag_Size,Waste_Bag_Weekly_Count,How_Long_TV_PC_Daily_Hour,How_Many_New_Clothes_Monthly,How_Long_Internet_Daily_Hour,Energy_efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


# **variables "Transport" and "Vehicle Type"**

In [27]:
##create new column "Transport Vehicle Type" 
df["Transport_Vehicle_Type"]=df["Vehicle_Type"] #create a new column
df.loc[df["Transport_Vehicle_Type"].isna(), "Transport_Vehicle_Type"] = df["Transport"] # Werte aus 'Transport' übernehmen, wenn 'Vehicle Type' NaN ist


##veranschaulichen der neuen Spalten und ihrer Werte
df[["Transport","Vehicle_Type","Transport_Vehicle_Type"]].head()

Unnamed: 0,Transport,Vehicle_Type,Transport_Vehicle_Type
0,public,,public
1,walk/bicycle,,walk/bicycle
2,private,petrol,petrol
3,walk/bicycle,,walk/bicycle
4,private,diesel,diesel


# **Encoding & Scaling - ColumnTransformer**

In [28]:



variables_quantitative =["Vehicle_Monthly_Distance_Km", "How_Many_New_Clothes_Monthly","Waste_Bag_Weekly_Count"]
variables_for_one_hot_encoded=['Frequency_of_Traveling_by_Air','Body_Type','Transport_Vehicle_Type']


X = df[variables_quantitative + variables_for_one_hot_encoded]  

###########################################

cf = ColumnTransformer(
    [(col, OneHotEncoder(drop="first"), [col]) for col in variables_for_one_hot_encoded] +
    [(col, MinMaxScaler(), [col]) for col in variables_quantitative],  
    remainder="passthrough")

cf.fit(X)
X_transformed = cf.transform(X) # Data after scaling
 

#  Regression

In [29]:

y = df["CarbonEmission"]

############################################

kf = RepeatedKFold(n_splits = 4, n_repeats = 10)

X_transformed = pd.DataFrame(X_transformed)
train_scores = []
test_scores = []

for train_index, test_index in kf.split(X):
    X_train = X_transformed.loc[train_index]  #train_test_split muss man jetzt nicht mehr nehmen
    y_train = y.loc[train_index]
    X_test  = X_transformed.loc[test_index]
    y_test  = y.loc[test_index]
    
    model = LGBMRegressor()
    model.fit(X_train, y_train)

   ##Evaluating the Model  ######################################################
    # Predict the target variable for the training and test sets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)


    train_scores.append(r2_score(y_train, y_train_pred))
    test_scores.append(r2_score(y_test, y_test_pred))
    
print(f"average R-squared (from train-set): {np.mean(train_scores):.3f}") #sollte auch train und nicht nur test-score ausgeben lassen damit overfitting überprüfen kann
print(f"average R-squared (from test-set): {np.mean(test_scores):.3f}")  



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 337
[LightGBM] [Info] Number of data points in the train set: 7500, number of used features: 15
[LightGBM] [Info] Start training from score 2265.216533
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 337
[LightGBM] [Info] Number of data points in the train set: 7500, number of used features: 15
[LightGBM] [Info] Start training from score 2262.754800
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 337
[LightGBM] [Info] Number of data points in the train set: 7500, number of used features: 15
[LightGBM] [Info] Start tra