In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression , Ridge , Lasso , ElasticNet
from sklearn.metrics import mean_squared_error  , mean_absolute_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.stats import zscore
from scipy.stats import f_oneway
from sklearn.metrics import r2_score
from tkinter import ttk
import tkinter as tk
data= pd.read_csv('FINAL_USO.csv')
%matplotlib inline

# Problem statement
# Given historical gold price data, build a model that reads the data, keeps the
# informative features while ignoring the uninformative ones, learns from them,
# and predicts future prices.


In [None]:
# Overview of the dataset: column names, dtypes, non-null counts and memory usage
data.info()

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

In [None]:
# Peek at the first 5 rows (row count spelled out to match the comment)
data.head(n = 5)

In [None]:
# Peek at the last 5 rows of the dataset
data.tail(n = 5)

In [None]:
#converting data into the same type to normalize it
data['Date'] = pd.to_datetime(data['Date'])

# Cast columns 1..80 to float; column 0 is excluded from the cast.
# NOTE(review): the hard-coded 81 presumably equals the CSV's column count - confirm.
value_columns = data.columns[1:81]
data[value_columns] = data[value_columns].astype(float)

# rename() returns a new frame by default, so the result is re-bound to `data`.
data = data.rename(columns = {'Open' : 'Opening Price', 'Close' : 'Closing Price in $'})

# A bare expression in the middle of a cell is evaluated and discarded (only the
# last expression of a cell is displayed), so print the dtypes explicitly.
print(data.dtypes)

#histogram of closing price
data['Closing Price in $'].hist()


# TODO: data outlier removal (no code for this step exists in this cell yet)


In [None]:
# Count the missing (null / NaN) entries in every column
count_nan = data.isna().sum()
print(count_nan)

In [None]:
# Gold price over time: 2-D histogram of closing price against date, 150 bins
sns.displot(data = data, x = "Date", y = "Closing Price in $", bins = 150)

In [None]:
#gold price compared to GDX Low
gdx_low = list(data['GDX_Low'])
close_price = list(data['Closing Price in $'])

plt.figure(figsize = (20, 20))

# Both variables are continuous, so draw one marker per row. The previous bar
# chart placed a width-1 bar at every GDX low value; bars overlap whenever two
# values are less than 1 apart, hiding observations instead of showing the
# relationship the title refers to.
plt.scatter(gdx_low, close_price, color = 'lightblue')

plt.xlabel("GDX Low")
plt.ylabel("Close Price")
plt.title("Correlation between close price and GDX Low")
plt.show()

In [None]:
#correlation between the closing price and other variables
target_var = 'Closing Price in $'

# Select the numeric columns explicitly instead of relying on corr() to drop the
# datetime 'Date' column: since pandas 2.0 `numeric_only` defaults to False, so
# non-numeric columns are no longer silently excluded.
correlations = data.select_dtypes(include = 'number').corr()[target_var]

plt.figure(figsize = (20, 30))
sns.barplot(x = correlations.values, y = correlations.index, color = 'green')
plt.title(f'Variable Correlation with {target_var}')
plt.xlabel('Correlations')
plt.ylabel('Variable Columns')
plt.xticks(rotation = 90)
# Render the figure and suppress the tick-object repr of the previous call.
plt.show()

In [None]:
#getting all strongly correlated variables (|r| > 0.5, positive or negative)
#in a dataframe together

# Pairwise correlations of the numeric columns only; the datetime 'Date' column
# is excluded explicitly rather than relying on corr() to drop it.
corr_columns_matrix = data.select_dtypes(include = 'number').corr()

# Correlation of every numeric column with the closing price.
target_corr = corr_columns_matrix['Closing Price in $']

# Keep the filter result (it was previously computed and thrown away) and use the
# matching column NAMES to subset the data. Indexing `data` with the correlation
# matrix itself is not a column selection.
strong_columns = target_corr[target_corr.abs() > .5].index

# The closing price correlates 1.0 with itself, so it is part of the selection.
new_data = data[strong_columns]

print(target_corr[strong_columns])

In [None]:
# ANOVA: candidate predictor columns to test against the closing price
PredictorList = [
    'GDX_Low',
    'SF_Price',
    'High',
    'EG_open',
]

def AnovaTest(input_data, TargetVar, Predictors, alpha = 0.05):
    """Return the predictors whose groups differ significantly in the target.

    For each predictor, the rows of ``input_data`` are grouped by the
    predictor's distinct values and a one-way ANOVA (``scipy.stats.f_oneway``)
    is run on the per-group ``TargetVar`` values.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame containing ``TargetVar`` and every column named in ``Predictors``.
    TargetVar : str
        Name of the target column.
    Predictors : iterable of str
        Names of the candidate predictor columns.
    alpha : float, default 0.05
        Significance level; a predictor is kept when its p-value is strictly
        below this value.

    Returns
    -------
    list of str
        The predictors that passed the test, in the order they were given.

    Notes
    -----
    Prints the test result for every accepted predictor and ``'No relation'``
    for every rejected one.
    NOTE(review): grouping by a continuous column yields one group per distinct
    value - confirm that ANOVA is appropriate for the predictors passed in.
    """
    finalpredictors = []

    for pred in Predictors:
        # One list of target values per distinct value of the predictor.
        group_list = input_data.groupby(pred)[TargetVar].apply(list)
        results = f_oneway(*group_list)
        p_value = results[1]

        # A NaN p-value compares False here and therefore counts as "No relation".
        if p_value < alpha:
            print(results)
            finalpredictors.append(pred)
        else:
            print('No relation')

    return finalpredictors

# Run the ANOVA screen for every candidate predictor against the closing price.
AnovaTest(data, 'Closing Price in $', PredictorList)

In [None]:
#using the predictor columns as the basis for ai model
final_selection = ['GDX_Low', 'SF_Price', 'High', 'EG_open']

aidata = data

# Modelling frame consumed by the train/test cell: the selected predictors plus
# the target column. Without this definition `final_data` does not exist on a
# fresh kernel and `final_selection` is never applied. The copy keeps later
# changes to the modelling frame from touching `data`.
final_data = data[final_selection + ['Closing Price in $']].copy()

In [None]:
# Train / test split and feature scaling

# Features are every column except the target; the target is the closing price.
X_axis = final_data.drop(columns = 'Closing Price in $')
Y_axis = final_data['Closing Price in $']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
Xtrain , Xtest , Ytrain , Ytest = train_test_split(
    X_axis,
    Y_axis,
    test_size = 0.2,
    random_state = 69,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transform to both partitions.
s = StandardScaler()
s.fit(Xtrain)
scale_x_TRAIN = s.transform(Xtrain)
scale_x_TEST = s.transform(Xtest)

In [None]:
# Ridge regression (alpha = 1) fitted on the scaled training features;
# fit() returns the estimator itself, so construction and fitting are chained.
ridge_model = Ridge(alpha = 1).fit(scale_x_TRAIN, Ytrain)

# Predicted closing prices for the scaled test features (shown as the cell output).
ridge_model.predict(scale_x_TEST)
