<a href="https://colab.research.google.com/github/codsalah/Machine-Learning-Projects/blob/main/grain_volume_XGboost_GradBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:


import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive to disk.
CHUNK_SIZE = 40960
# DATA_SOURCE_MAPPING (below) is a comma-separated list of
# "<directory>:<percent-encoded download URL>" entries; the download loop splits
# each entry on ':' and unquotes the URL. The URL carries a signature with
# X-Goog-Date=20240410T234043Z and X-Goog-Expires=259200, so it is time-limited;
# the loop reports HTTP failures as "likely expired".
DATA_SOURCE_MAPPING = 'grains-and-cereals-futures:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3639012%2F6461889%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240410%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240410T234043Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db69098954e761f85bf34a8ec9651f7805b7d110da98b70e52972a64f568516c38e5226013bb5645e7803380ed15b9e17730d9ee2e04d71f5a9e1c599159919d2d3ae134c42bfbf963a2da8f5cc7532a7fcb7e7c1fb988c0a55984d37a98eb9231bd8c20693f7695bcf3540bb3e7de55124547e668eae35851e23825e71c50155620ab5d9d1bdcc5a41e5ee48eb2b3ea1be24c9a18f18dd9197d24a34879e68471dc68f1e4337ee51bbeaa627a7a8328ccab8663eb07dfa8a97e32ec33ba1763a106ada6d7001ed0b41e93de8a7e5be18796c66fefe13dfd4a3777d5ef3c398c2ebc54c1c5fb323d6f4f2897b2ce9dc77829ae8ad86e9e626fc8e5e605a2d5c4f'

# Directory under which every downloaded dataset is extracted (one sub-directory per dataset).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining entries still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' later in the entry cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header is optional: fall back to a plain byte
            # counter instead of crashing on int(None) or dividing by zero.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = int(50 * dl / total_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would rebind the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading grains-and-cereals-futures, 908671 bytes compressed
Downloaded and uncompressed: grains-and-cereals-futures
Data source import complete.


<center><h1>Introduction</h1></center>

1. Date: The date when the data was recorded. Format: YYYY-MM-DD.
2. Open: Market's opening price for the day.
3. High: Maximum price reached during the trading session.
4. Low: Minimum traded price during the day.
5. Close: Market's closing price.
6. Volume: Number of contracts traded during the session.
7. Ticker: Unique market quotation symbol for the grain future.
8. Commodity: Specifies the type of grain the future contract represents (e.g., corn, oat).

<center><h1>Importing Libraries</h1></center>

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.preprocessing import (StandardScaler,
                                   OneHotEncoder,
                                   )

from sklearn.model_selection import train_test_split


import warnings
# Hide FutureWarning messages so they do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Lift pandas' display limits on column count and cell width when rendering frames.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style="whitegrid")

from sklearn.metrics import (mean_absolute_error,
                            r2_score)


<center><h1>Explore Data</h1></center>

In [4]:
# Read the grain-futures CSV from the directory the download cell extracts into
# (/kaggle/input/grains-and-cereals-futures) and preview the first rows.
df = pd.read_csv("/kaggle/input/grains-and-cereals-futures/all_grains_data.csv")
df.head()

Unnamed: 0,ticker,commodity,date,open,high,low,close,volume
0,ZC=F,Corn,2000-07-17,182.25,182.25,178.25,179.25,19385
1,ZC=F,Corn,2000-07-18,179.0,183.25,179.0,180.75,25205
2,ZC=F,Corn,2000-07-19,180.0,182.75,178.75,182.0,17126
3,ZC=F,Corn,2000-07-20,181.5,187.0,181.0,186.0,18742
4,ZC=F,Corn,2000-07-21,185.5,188.0,185.0,187.75,16814


In [5]:
# Preview the last rows to see where the recorded date range ends.
df.tail()

Unnamed: 0,ticker,commodity,date,open,high,low,close,volume
34860,ZS=F,Soybean,2023-09-06,1365.5,1365.5,1359.5,1360.25,102
34861,ZS=F,Soybean,2023-09-07,1349.5,1350.0,1345.0,1345.0,310
34862,ZS=F,Soybean,2023-09-08,1339.0,1349.5,1339.0,1349.5,198
34863,ZS=F,Soybean,2023-09-11,1352.5,1352.5,1352.0,1352.5,36
34864,ZS=F,Soybean,2023-09-12,1344.75,1350.0,1342.5,1343.75,8642


In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(34865, 8)

In [7]:
# Column dtypes as loaded; `date` is still a plain object column at this point.
df.dtypes

ticker        object
commodity     object
date          object
open         float64
high         float64
low          float64
close        float64
volume         int64
dtype: object

In [8]:
# Parse the YYYY-MM-DD strings into datetime values so the .dt accessor can be used,
# then re-display the dtypes to confirm the conversion.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df.dtypes

ticker               object
commodity            object
date         datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume                int64
dtype: object

In [9]:
# Break the parsed date into separate Year / Month / Day columns.
date_accessor = df['date'].dt
df['Year'], df['Month'], df['Day'] = (
    date_accessor.year,
    date_accessor.month,
    date_accessor.day,
)

In [10]:
# The original timestamp column is dropped now that Year/Month/Day exist.
df = df.drop(columns=['date'])
df.head()

Unnamed: 0,ticker,commodity,open,high,low,close,volume,Year,Month,Day
0,ZC=F,Corn,182.25,182.25,178.25,179.25,19385,2000,7,17
1,ZC=F,Corn,179.0,183.25,179.0,180.75,25205,2000,7,18
2,ZC=F,Corn,180.0,182.75,178.75,182.0,17126,2000,7,19
3,ZC=F,Corn,181.5,187.0,181.0,186.0,18742,2000,7,20
4,ZC=F,Corn,185.5,188.0,185.0,187.75,16814,2000,7,21


In [11]:
# Count null entries per column.
df.isnull().sum()

ticker       0
commodity    0
open         0
high         0
low          0
close        0
volume       0
Year         0
Month        0
Day          0
dtype: int64

No missing data: every column reports zero null values.

In [12]:
columns_to_analyze = ["ticker","commodity"]

# Print the number of distinct values of each categorical column, one per line,
# followed by a separator rule.
for column_name, distinct_count in df[columns_to_analyze].nunique().items():
    print(f"Number of unique values in '{column_name}' column: {distinct_count}")
    print("____________________________________________________________________")

Number of unique values in 'ticker' column: 6
____________________________________________________________________
Number of unique values in 'commodity' column: 6
____________________________________________________________________


In [13]:
# Number of rows that exactly repeat an earlier row across all columns.
duplicates = df.duplicated().sum()
duplicates

0

In [14]:
# Column names after the date decomposition (these feed the feature list below).
df.columns

Index(['ticker', 'commodity', 'open', 'high', 'low', 'close', 'volume', 'Year',
       'Month', 'Day'],
      dtype='object')

<center><h1>Preprocessing</h1></center>

In [15]:
def get_X_y(df):
    """Split a grain-futures frame into model inputs and the regression target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ticker', 'commodity', 'open', 'high', 'low',
        'close', 'Year', 'Month', 'Day' and 'volume'. Any other column is ignored.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the nine feature columns in the order
        listed above and ``y`` is the 'volume' column.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    target_column = 'volume'

    # Feature order: identifiers, then prices, then calendar parts.
    identifier_columns = ['ticker', 'commodity']
    price_columns = ['open', 'high', 'low', 'close']
    calendar_columns = ['Year', 'Month', 'Day']
    feature_columns = identifier_columns + price_columns + calendar_columns

    return df[feature_columns], df[target_column]

In [16]:
# Separate the feature columns (X) from the 'volume' target (y).
X, y = get_X_y(df)


In [17]:
# One-hot encode the two string columns as 0/1 integer indicator columns.
# NOTE(review): 'ticker' and 'commodity' each have 6 distinct values and look
# like two labels for the same thing — confirm both are needed.
categorical_columns = ['ticker', 'commodity']

X = pd.get_dummies(X, columns=categorical_columns, dtype=int)

In [18]:
# Hold out a quarter of the rows for evaluation, shuffling with a fixed seed
# so the split is reproducible.
split_options = dict(test_size=0.25, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [19]:
# Report the shape of each split; each line is labelled with the array it describes
# (previously all four lines were labelled "X_train shape").
print(f"X_train shape : {X_train.shape}\ny_train shape : {y_train.shape}\nX_test shape : {X_test.shape}\ny_test shape : {y_test.shape}")

X_train shape : (26148, 19)
X_train shape : (26148,)
X_train shape : (8717, 19)
X_train shape : (8717,)


<center><h1>Models</h1></center>

In [20]:
# Gradient-boosted trees (XGBoost) trained on the training split.
# 'reg:squarederror' is the current name of the squared-error objective;
# the old 'reg:linear' spelling is deprecated.
model = xgb.XGBRegressor(max_depth=10, learning_rate=0.24, n_estimators=50, objective='reg:squarederror', booster='gbtree')
XGB = model.fit(X_train, y_train)
# Predictions on the held-out rows, scored in the next cell.
prediction = XGB.predict(X_test)



In [21]:
# Score the XGBoost predictions on the held-out split.
r2 = r2_score(y_test, prediction)

mae = mean_absolute_error(y_test, prediction)

print("R2 Score:", r2)
# The value comes from mean_absolute_error, so label it as MAE rather than MSE.
print("Mean Absolute Error (MAE):", mae)

R2 Score: 0.904820677876828
Mean Squared Error (MSE): 6778.563880129563


In [22]:
# Random forest baseline: 60 trees, each limited to depth 60 and 120 leaves,
# with a fixed seed. fit() returns the estimator itself, so construction and
# training are chained.
forest_settings = dict(n_estimators=60, max_depth=60, max_leaf_nodes=120, random_state=17)
DFR = RandomForestRegressor(**forest_settings).fit(X_train, y_train)
prediction33 = DFR.predict(X_test)

In [23]:
# Score the random-forest predictions on the held-out split.
r2 = r2_score(y_test, prediction33)

mae = mean_absolute_error(y_test, prediction33)

print("R2 Score:", r2)
# The value comes from mean_absolute_error, so label it as MAE rather than MSE.
print("Mean Absolute Error (MAE):", mae)

R2 Score: 0.870630667199224
Mean Squared Error (MSE): 8658.100027291395


In [24]:
# Candidate tree hyperparameters.
# NOTE(review): no visible cell reads `param_grid` — presumably meant for a
# hyperparameter search; wire it up or remove it.
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [25]:
# scikit-learn gradient boosting: 100 stages at learning rate 0.1, each tree
# limited to depth 60 and 120 leaves, with a fixed seed.
GBR2 = GradientBoostingRegressor(learning_rate=0.1,
                      n_estimators=100,
                      max_depth=60,
                      max_leaf_nodes=120,
                      random_state=17)
GBR2 = GBR2.fit(X_train, y_train)
predictionGBR = GBR2.predict(X_test)

# Score the predictions on the held-out split.
r2 = r2_score(y_test, predictionGBR)

mae = mean_absolute_error(y_test, predictionGBR)

print("R2 Score:", r2)
# The value comes from mean_absolute_error, so label it as MAE rather than MSE.
print("Mean Absolute Error (MAE):", mae)

R2 Score: 0.9067547705637365
Mean Squared Error (MSE): 6860.042710899027
