# Collect Data

In [1]:
import pandas as pd

In [2]:
# Read the raw dataset from a relative CSV path and preview the first rows.
df = pd.read_csv(filepath_or_buffer="Stock_Market_Dataset.csv")
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,0,02-02-2024,2.079,,72.28,,3.8215,,43194.7,42650.0,...,589498,10580.0,564.64,4030000.0,171.81,117220000.0,474.99,84710000.0,2053.7,
1,1,01-02-2024,2.05,161340.0,73.82,577940.0,3.8535,,43081.4,47690.0,...,581600,9780.0,567.51,3150000.0,159.28,66360000.0,394.78,25140000.0,2071.1,260920.0
2,2,31-01-2024,2.1,142860.0,75.85,344490.0,3.906,,42580.5,56480.0,...,578020,9720.0,564.11,4830000.0,155.2,49690000.0,390.14,20010000.0,2067.4,238370.0
3,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.911,,42946.2,55130.0,...,584680,9750.0,562.85,6120000.0,159.0,42290000.0,400.06,18610000.0,2050.9,214590.0
4,4,29-01-2024,2.49,3590.0,76.78,331930.0,3.879,,43299.8,45230.0,...,578800,13850.0,575.79,6880000.0,161.26,42840000.0,401.02,17790000.0,2034.9,1780.0


# Data Preprocessing / Feature Engineering

In [3]:
# Print each column's dtype and non-null count to see what needs cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         1243 non-null   int64  
 1   Date               1243 non-null   object 
 2   Natural_Gas_Price  1243 non-null   float64
 3   Natural_Gas_Vol.   1239 non-null   float64
 4   Crude_oil_Price    1243 non-null   float64
 5   Crude_oil_Vol.     1220 non-null   float64
 6   Copper_Price       1243 non-null   float64
 7   Copper_Vol.        1206 non-null   float64
 8   Bitcoin_Price      1243 non-null   object 
 9   Bitcoin_Vol.       1243 non-null   float64
 10  Platinum_Price     1243 non-null   object 
 11  Platinum_Vol.      636 non-null    float64
 12  Ethereum_Price     1243 non-null   object 
 13  Ethereum_Vol.      1243 non-null   float64
 14  S&P_500_Price      1243 non-null   object 
 15  Nasdaq_100_Price   1243 non-null   object 
 16  Nasdaq_100_Vol.    1242 

### Remove unnecessary columns

In [4]:
# Per-company price/volume columns, plus 'Unnamed: 0' (looks like a row index
# that was saved into the CSV -- confirm against the file). All of them are
# removed from the working frame.
columns_to_drop = ['Apple_Price','Apple_Vol.',
                   'Tesla_Price','Tesla_Vol.',
                   'Microsoft_Price','Microsoft_Vol.',
                   'Google_Price','Google_Vol.',
                   'Nvidia_Price','Nvidia_Vol.',
                   'Berkshire_Price','Berkshire_Vol.',
                   'Netflix_Price','Netflix_Vol.',
                   'Amazon_Price','Amazon_Vol.',
                   'Meta_Price','Meta_Vol.',
                   'Unnamed: 0'
                  ]

# Drop only the columns that are still present, so re-running this cell after
# the columns are already gone does not raise a KeyError. Names that are not
# found are reported rather than ignored silently.
missing_columns = [name for name in columns_to_drop if name not in df.columns]
for name in missing_columns:
    print(f"Column '{name}' not found in DataFrame!")

df = df.drop(columns=[name for name in columns_to_drop if name in df.columns])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               1243 non-null   object 
 1   Natural_Gas_Price  1243 non-null   float64
 2   Natural_Gas_Vol.   1239 non-null   float64
 3   Crude_oil_Price    1243 non-null   float64
 4   Crude_oil_Vol.     1220 non-null   float64
 5   Copper_Price       1243 non-null   float64
 6   Copper_Vol.        1206 non-null   float64
 7   Bitcoin_Price      1243 non-null   object 
 8   Bitcoin_Vol.       1243 non-null   float64
 9   Platinum_Price     1243 non-null   object 
 10  Platinum_Vol.      636 non-null    float64
 11  Ethereum_Price     1243 non-null   object 
 12  Ethereum_Vol.      1243 non-null   float64
 13  S&P_500_Price      1243 non-null   object 
 14  Nasdaq_100_Price   1243 non-null   object 
 15  Nasdaq_100_Vol.    1242 non-null   float64
 16  Silver_Price       1243 

### Fill in NaN values with column mean value

In [5]:
# Columns whose missing values are replaced by the column mean.
columns_with_null = [
    'Natural_Gas_Vol.',
    'Crude_oil_Vol.',
    'Copper_Vol.',
    'Platinum_Vol.',
    'Nasdaq_100_Vol.',
    'Silver_Vol.',
    'Gold_Vol.',
]

for column in columns_with_null:
    # Guard first: report a name that is absent from the frame and move on.
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame!")
        continue
    # Series.mean() skips NaN by default, so this is the mean of the
    # observed values only.
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)


In [6]:
# Re-check the non-null counts after filling the NaN values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               1243 non-null   object 
 1   Natural_Gas_Price  1243 non-null   float64
 2   Natural_Gas_Vol.   1243 non-null   float64
 3   Crude_oil_Price    1243 non-null   float64
 4   Crude_oil_Vol.     1243 non-null   float64
 5   Copper_Price       1243 non-null   float64
 6   Copper_Vol.        1243 non-null   float64
 7   Bitcoin_Price      1243 non-null   object 
 8   Bitcoin_Vol.       1243 non-null   float64
 9   Platinum_Price     1243 non-null   object 
 10  Platinum_Vol.      1243 non-null   float64
 11  Ethereum_Price     1243 non-null   object 
 12  Ethereum_Vol.      1243 non-null   float64
 13  S&P_500_Price      1243 non-null   object 
 14  Nasdaq_100_Price   1243 non-null   object 
 15  Nasdaq_100_Vol.    1243 non-null   float64
 16  Silver_Price       1243 

### Convert object datatypes to float

In [9]:
# Price columns that df.info() reported as non-numeric; the comma removal
# below suggests the raw values carry thousands separators (verify in the CSV).
columns = ['Bitcoin_Price',
           'Platinum_Price',
           'Ethereum_Price',
           'S&P_500_Price',
           'Nasdaq_100_Price',
           'Gold_Price'
          ]
for column in columns:
    # Convert every listed column that is not numeric yet. Testing for
    # "not numeric" rather than `dtype == 'object'` also covers pandas
    # versions that load text as a dedicated string dtype, where the old
    # comparison is False and the conversion would be skipped silently.
    # Already-converted columns are left alone, so the cell can be re-run.
    if not pd.api.types.is_numeric_dtype(df[column]):
        # Strip the literal ',' characters, then parse the text as numbers.
        df[column] = pd.to_numeric(df[column].str.replace(',', '', regex=False))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               1243 non-null   object 
 1   Natural_Gas_Price  1243 non-null   float64
 2   Natural_Gas_Vol.   1243 non-null   float64
 3   Crude_oil_Price    1243 non-null   float64
 4   Crude_oil_Vol.     1243 non-null   float64
 5   Copper_Price       1243 non-null   float64
 6   Copper_Vol.        1243 non-null   float64
 7   Bitcoin_Price      1243 non-null   float64
 8   Bitcoin_Vol.       1243 non-null   float64
 9   Platinum_Price     1243 non-null   float64
 10  Platinum_Vol.      1243 non-null   float64
 11  Ethereum_Price     1243 non-null   float64
 12  Ethereum_Vol.      1243 non-null   float64
 13  S&P_500_Price      1243 non-null   float64
 14  Nasdaq_100_Price   1243 non-null   float64
 15  Nasdaq_100_Vol.    1243 non-null   float64
 16  Silver_Price       1243 

### Convert Date column to datetime object

In [12]:
# Parse the 'Date' strings into datetime values, preferring day-first order
# when a date is ambiguous, then store the result back in the same column.
parsed_dates = pd.to_datetime(df['Date'], dayfirst=True)
df['Date'] = parsed_dates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1243 non-null   datetime64[ns]
 1   Natural_Gas_Price  1243 non-null   float64       
 2   Natural_Gas_Vol.   1243 non-null   float64       
 3   Crude_oil_Price    1243 non-null   float64       
 4   Crude_oil_Vol.     1243 non-null   float64       
 5   Copper_Price       1243 non-null   float64       
 6   Copper_Vol.        1243 non-null   float64       
 7   Bitcoin_Price      1243 non-null   float64       
 8   Bitcoin_Vol.       1243 non-null   float64       
 9   Platinum_Price     1243 non-null   float64       
 10  Platinum_Vol.      1243 non-null   float64       
 11  Ethereum_Price     1243 non-null   float64       
 12  Ethereum_Vol.      1243 non-null   float64       
 13  S&P_500_Price      1243 non-null   float64       
 14  Nasdaq_1

### Add days of the week using Date column

In [15]:
# Day name of every row's date; used only to build the indicator columns.
weekday_names = df['Date'].dt.day_name()
columns = ['Monday','Tuesday','Wednesday','Thursday','Friday']

# Add one 0/1 indicator column per listed weekday, then drop 'Date'.
# A row whose date falls on a day not in the list gets 0 in all five columns.
weekday_flags = {day: weekday_names.eq(day).astype(int) for day in columns}
df = df.assign(**weekday_flags).drop(columns=['Date'])
df.head()

Unnamed: 0,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,Platinum_Price,Platinum_Vol.,...,Nasdaq_100_Vol.,Silver_Price,Silver_Vol.,Gold_Price,Gold_Vol.,Monday,Tuesday,Wednesday,Thursday,Friday
0,2.079,131624.116223,72.28,398903.778689,3.8215,35406.616915,43194.7,42650.0,901.6,9082.515723,...,315620000.0,22.796,67695.41806,2053.7,211127.671233,0,0,0,0,1
1,2.05,161340.0,73.82,577940.0,3.8535,35406.616915,43081.4,47690.0,922.3,9082.515723,...,240640000.0,23.236,85160.0,2071.1,260920.0,0,0,0,1,0
2,2.1,142860.0,75.85,344490.0,3.906,35406.616915,42580.5,56480.0,932.6,9082.515723,...,366450000.0,23.169,66910.0,2067.4,238370.0,0,0,1,0,0
3,2.077,139750.0,77.82,347240.0,3.911,35406.616915,42946.2,55130.0,931.7,9082.515723,...,236210000.0,23.225,53370.0,2050.9,214590.0,0,1,0,0,0
4,2.49,3590.0,76.78,331930.0,3.879,35406.616915,43299.8,45230.0,938.3,9082.515723,...,238750000.0,23.134,330.0,2034.9,1780.0,1,0,0,0,0


# Statistical Tests

### Hypothesis Results

# Visualize Correlation

# Model Creation

### Train Model

### Evaluate Model

### Hyperparameter Tuning

### Evaluate Model

# Testing Model

# Results