In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries.
# NOTE(review): the URL below is a signed link (X-Goog-Date=20240317T060259Z,
# X-Goog-Expires=259200), so presumably it only works for a limited time after
# that date and must be regenerated for later runs — confirm. Signed URLs are
# credentials of a sort; avoid committing them to shared repositories.
DATA_SOURCE_MAPPING = 'ml-olympiad-co2-emissions-prediction-challenge:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F70983%2F7794824%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240317%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240317T060259Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D762c34adabd536d4116c4fea1d7a19111503f395b4183cf641c5d536c0a70ec243e8d483010f058b16482956c58633eaf0e4c0de4caef49d2132f262e3bcb4e3b2478addac04106fb585ac379447e874666db8f3b17947681ac4839b3a18b264c6a16ece8e6cb7b8799b1cbdbc3c9c2e9296883f47d37d04c3d86dbf1b024ba9c20d0346729e7d48c01384fa2928d0aa8c6fbd952df5ef10633ac97cd381c2cc42728ac48a8e1e5297285733095228918c9ea7d653033153f92af4af91d5494b848d4ec2dc143f2898f94c9bd24b86925f1a3a01295ac9f5e767d30adfa1eda4ad87889d09b8b5fc59c168cee3cb77e1c1f81fed007d780f7e22ee1e688fd514'

# Directories that mirror the layout the notebook code expects.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape (not plain Python): unmount any existing mount at the
# input path, discarding error output.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory. The path is spelled out literally here
# rather than taken from KAGGLE_INPUT_PATH; keep the two in sync.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working (relative to the current working directory)
# as symlinks to the directories above; an already existing link is tolerated.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it into KAGGLE_INPUT_PATH/<directory>.
# HTTP and OS-level failures are reported and the next entry is attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so an unexpected literal ':' later in the
    # entry cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length may be missing; in that case show only the byte
            # counter instead of failing on int(None).
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every buffered byte is in the file and rewind before
            # handing the same file object to the archive reader.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the archive handle: binding it to
                # `tarfile` would shadow the module for later iterations.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading ml-olympiad-co2-emissions-prediction-challenge, 321052 bytes compressed
Downloaded and uncompressed: ml-olympiad-co2-emissions-prediction-challenge
Data source import complete.


In [10]:
# %% [code] {"execution":{"iopub.status.busy":"2024-03-11T15:26:48.831005Z","iopub.execute_input":"2024-03-11T15:26:48.831523Z","iopub.status.idle":"2024-03-11T15:26:48.889648Z","shell.execute_reply.started":"2024-03-11T15:26:48.831487Z","shell.execute_reply":"2024-03-11T15:26:48.888722Z"}}
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Load the competition tables: training data, test data and the submission template.
Train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge/train.csv'
)
Test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge/test.csv'
)
sample_submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge/sample_submission.csv'
)

# %% [code] {"execution":{"iopub.status.busy":"2024-03-11T15:26:48.891712Z","iopub.execute_input":"2024-03-11T15:26:48.892389Z","iopub.status.idle":"2024-03-11T15:26:48.917882Z","shell.execute_reply.started":"2024-03-11T15:26:48.892357Z","shell.execute_reply":"2024-03-11T15:26:48.916618Z"}}
# Handle missing data: KNN-impute the numeric columns of Train and Test and keep
# the non-numeric columns that both frames have in common.
numeric_cols_train = Train.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols_train = Train.columns.difference(numeric_cols_train)

numeric_cols_test = Test.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols_test = Test.columns.difference(numeric_cols_test)

# Walk Train's columns instead of intersecting two sets, so the resulting column
# order is the same on every run (set iteration order is not guaranteed).
common_non_numeric_cols = [
    col for col in Train.columns
    if col in non_numeric_cols_train and col in non_numeric_cols_test
]

if len(numeric_cols_train) > 0:
    imputer = KNNImputer(n_neighbors=5)
    # NOTE(review): KNNImputer's documentation describes dropping columns that
    # are entirely missing at fit time; if that can happen here the column
    # count below will not match — confirm against the data.
    Train_numeric_imputed = pd.DataFrame(
        imputer.fit_transform(Train[numeric_cols_train]),
        columns=numeric_cols_train,
        index=Train.index,  # keep the original index so concat(axis=1) aligns rows
    )
    if len(numeric_cols_test) == 0:
        # Nothing numeric to impute on the test side.
        Test_numeric_imputed = Test[numeric_cols_test]
    else:
        if list(numeric_cols_test) == list(numeric_cols_train):
            # Same features in the same order: reuse the imputer fitted on Train.
            test_values = imputer.transform(Test[numeric_cols_test])
        else:
            # The fitted imputer only accepts the columns it was fitted on, so
            # a Test frame with different numeric columns gets its own imputer.
            test_values = KNNImputer(n_neighbors=5).fit_transform(Test[numeric_cols_test])
        Test_numeric_imputed = pd.DataFrame(
            test_values,
            columns=numeric_cols_test,
            index=Test.index,
        )
else:
    # No numeric columns in Train: pass both numeric selections through unchanged.
    Train_numeric_imputed = Train[numeric_cols_train]
    Test_numeric_imputed = Test[numeric_cols_test]

Train_imputed = pd.concat([Train[common_non_numeric_cols], Train_numeric_imputed], axis=1)
Test_imputed = pd.concat([Test[common_non_numeric_cols], Test_numeric_imputed], axis=1)

# Unique country for iteration
unique = Train_imputed['Country Name'].unique()

# %% [code] {"execution":{"iopub.status.busy":"2024-03-11T15:26:48.929443Z","iopub.execute_input":"2024-03-11T15:26:48.930452Z","iopub.status.idle":"2024-03-11T15:27:02.902191Z","shell.execute_reply.started":"2024-03-11T15:26:48.930404Z","shell.execute_reply":"2024-03-11T15:27:02.900943Z"}}
# Training and inference with LogisticRegression, ARIMA, and an ensemble.
#
# For every country: build X / y from Train_imputed, fit the models, then (best
# effort) predict from the matching Test_imputed rows and write the first six
# forecasts into that country's row of sample_submission.
#
# NOTE(review): several modelling calls below should be confirmed against the
# library documentation before any output is trusted:
#   * LogisticRegression is a classifier, yet y holds the values of the 'YR' columns.
#   * GridSearchCV receives a statsmodels ARIMA object rather than a scikit-learn
#     estimator, and `out_of_sample_size` / `seasonal_order` need checking
#     against the statsmodels ARIMA signature.
#   * transposed_df['CO2_Emissions'] is read from a transposed frame; confirm
#     such a column exists after the transpose.
j = 0  # countries without target values or whose inference step failed
failed_countries = []  # (country, error) pairs so failures are visible, not silent
for i in unique:
    fg = Train_imputed[Train_imputed['Country Name'] == i].reset_index(drop=True)
    columns_to_drop = [
        col for col in ('Country Code', 'Country Name', 'Indicator') if col in fg.columns
    ]
    fg = fg.drop(columns_to_drop, axis=1)

    # Separate features (X) and target (y): every column whose name contains
    # 'YR' (case-insensitive) is treated as target, the rest as features.
    yr_mask = fg.columns.str.contains('YR', case=False)
    X = fg.drop(fg.columns[yr_mask], axis=1)
    # Flatten the target columns into a 1D array.
    y = fg.loc[:, yr_mask].values.ravel()

    # Skip countries that have no target values at all.
    if y.shape[0] == 0:
        j += 1
        continue

    # Separate training/validation split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Logistic regression with regularization
    logistic_regression = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
    logistic_regression.fit(X_train, y_train)

    # ARIMA hyperparameter tuning
    param_grid = {
        'order': [(0, 1, 0), (1, 1, 0), (2, 1, 0), (0, 1, 1), (1, 1, 1)],
        'seasonal_order': [(0, 0, 0, 0), (0, 1, 1, 12)]
    }
    model = ARIMA(endog=y_train, exog=X_train, out_of_sample_size=16)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X=X_train)
    best_arima = grid_search.best_estimator_

    # Ensemble model
    estimators = [
        ('logistic', logistic_regression),
        ('arima', best_arima),
        ('xgb', XGBRegressor(random_state=42))
    ]
    ensemble = VotingRegressor(estimators)
    ensemble.fit(X_train, y_train)

    try:
        fgt = Test_imputed[Test_imputed['Country Name'] == i].reset_index(drop=True)
        # Guard the drop the same way as on the training side so a missing
        # column does not abort inference for this country.
        columns_to_drop = [col for col in ('Country Name', 'Indicator') if col in fgt.columns]
        fgt = fgt.drop(columns_to_drop, axis=1)
        transposed_df = fgt.transpose()

        # Feature engineering for test data
        transposed_df['lag_1'] = transposed_df['CO2_Emissions'].shift(1)
        transposed_df['rolling_mean'] = transposed_df['CO2_Emissions'].rolling(window=3).mean()
        transposed_df['poly_2'] = np.power(transposed_df['CO2_Emissions'], 2)

        X_test = transposed_df.drop('CO2_Emissions', axis=1)

        # Make predictions using the ensemble model
        forecast = ensemble.predict(X_test)

        row_index = sample_submission[sample_submission.eq(i).any(axis=1)].index[0]
        new_values = [i, forecast[0], forecast[1], forecast[2], forecast[3], forecast[4], forecast[5]]
        sample_submission.loc[row_index] = new_values

        # Evaluate the model on the validation set
        y_val_pred = ensemble.predict(X_val)
        val_mse = mean_squared_error(y_val, y_val_pred)
        val_rmse = np.sqrt(val_mse)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_mape = mean_absolute_percentage_error(y_val, y_val_pred)
        print(f"Validation Metrics for {i}:")
        print(f"MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}, MAPE: {val_mape:.4f}")

    except Exception as exc:
        # Still best effort per country, but remember what went wrong instead
        # of hiding it (and let KeyboardInterrupt / SystemExit propagate).
        j += 1
        failed_countries.append((i, repr(exc)))

# Surface swallowed failures: a submission that was never updated would
# otherwise look like a successful run.
if failed_countries:
    print(f"{len(failed_countries)} countries failed during inference; first failure: {failed_countries[0]}")

# %% [code] {"execution":{"iopub.status.busy":"2024-03-11T15:27:02.903585Z","iopub.execute_input":"2024-03-11T15:27:02.903944Z","iopub.status.idle":"2024-03-11T15:27:02.921941Z","shell.execute_reply.started":"2024-03-11T15:27:02.903915Z","shell.execute_reply":"2024-03-11T15:27:02.920545Z"}}
# Show the submission table as plain text in the cell output.
print(sample_submission)

# %% [code] {"execution":{"iopub.status.busy":"2024-03-11T15:27:02.92391Z","iopub.execute_input":"2024-03-11T15:27:02.92439Z","iopub.status.idle":"2024-03-11T15:27:02.947365Z","shell.execute_reply.started":"2024-03-11T15:27:02.924345Z","shell.execute_reply":"2024-03-11T15:27:02.946038Z"}}
# Write the submission file to the current working directory, without the index column.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)

                                   Country Name  2016 [YR2016]  2017 [YR2017]  \
0                                   Afghanistan         3.2253         1.1712   
1                                       Albania         2.3565         2.3681   
2                                       Algeria         1.0482         1.2839   
3                                American Samoa         3.5276         3.1392   
4                                       Andorra         0.8961         3.9242   
..                                          ...            ...            ...   
261                          Sub-Saharan Africa         0.8879         1.0192   
262  Sub-Saharan Africa (excluding high income)         3.2523         1.5872   
263   Sub-Saharan Africa (IDA & IBRD countries)         1.7697         1.7440   
264                         Upper middle income         3.1682         3.6737   
265                                       World         0.7747         2.6921   

     2018 [YR2018]  2019 [Y

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
# Load the raw competition tables: training data, test data and the submission template.
Train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge/train.csv'
)
Test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge/test.csv'
)
sample_submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge/sample_submission.csv'
)

In [None]:
# Replace every '..' cell with 0.100 in both frames.
# NOTE(review): '..' looks like a missing-value marker and 0.100 an arbitrary
# stand-in — confirm this substitution is intended.
Train = Train.replace(to_replace='..', value=0.100)
Test = Test.replace(to_replace='..', value=0.100)

In [None]:
# Distinct country names, in order of first appearance; the training loop iterates over them.
unique = Train.loc[:, 'Country Name'].unique()

In [None]:
# Training and inference with LogisticRegression and ARIMA
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Per-country loop: fit LogisticRegression on rows 0-10 (as columns after the
# transpose) against row 11, predict from the matching Test rows, fit an ARIMA
# on row 11, and write six values into the country's row of sample_submission.
j = 0  # number of countries for which fitting or inference failed
failed_countries = []  # (country, error) pairs so failures are visible, not silent
for i in unique:
    fg = Train[Train['Country Name'] == i].reset_index(drop=True)
    columns_to_drop = ['Country Code', 'Country Name', 'Indicator']
    fg = fg.drop(columns_to_drop, axis=1)
    # After the transpose each original row becomes a column labelled 0, 1, 2, ...
    transposed_df = fg.transpose()
    Xtrain = transposed_df[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]
    Ytrain = transposed_df[11]
    logistic_regression = LogisticRegression()
    try:
        logistic_regression.fit(Xtrain, Ytrain)
        fgt = Test[Test['Country Name'] == i].reset_index(drop=True)
        columns_to_drop = ['Country Name', 'Indicator']
        fgt = fgt.drop(columns_to_drop, axis=1)
        transposed_df = fgt.transpose()

        Xtest = transposed_df[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]
        forecast = logistic_regression.predict(Xtest.astype(float))

        model = ARIMA(Ytrain.astype(float), order=(0, 1, 0))  # Adjust order as needed
        model_fit = model.fit()

        forecast2 = model_fit.forecast(steps=16)

        row_index = sample_submission[sample_submission.eq(i).any(axis=1)].index[0]

        # NOTE(review): forecast2[31] is a lookup by label 31 on the 16-step
        # forecast; presumably it relies on the forecast index continuing from
        # the length of Ytrain — confirm it selects the intended year.
        new_values = [i, forecast[0], forecast[1], forecast[2], forecast[3], forecast[4], forecast2[31]]
        sample_submission.loc[row_index] = new_values
    except Exception as exc:
        # Still best effort per country, but remember what went wrong instead
        # of hiding it (and let KeyboardInterrupt / SystemExit propagate).
        j += 1
        failed_countries.append((i, repr(exc)))

# Surface swallowed failures: a submission that was never updated would
# otherwise look like a successful run.
if failed_countries:
    print(f"{len(failed_countries)} countries failed; first failure: {failed_countries[0]}")





In [None]:
# Bare expression: the notebook renders the submission table as this cell's output.
sample_submission

In [None]:
# Write the submission file to the current working directory, without the index column.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)