# Problem Statement

# Importing Libraries

In [None]:
# pip install pandas-profiling

In [None]:
# pip install xgboost

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import datetime 
import random
# from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import OrdinalEncoder,RobustScaler

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import warnings
warnings.filterwarnings("ignore")

# Data Gathering

### Bitcoin Dataset

In [None]:
df_bitcoin = pd.read_csv("coin_Bitcoin.csv")
df_bitcoin.head()

In [None]:
df_bitcoin.shape

### Cardano Dataset

In [None]:
df_cardano = pd.read_csv("coin_Cardano.csv")
df_cardano.head()

In [None]:
df_cardano.shape

### Dogecoin Dataset

In [None]:
df_dogecoin = pd.read_csv("coin_Dogecoin.csv")
df_dogecoin.head()

In [None]:
df_dogecoin.shape

### Ethereum Dataset 

In [None]:
df_ethereum = pd.read_csv("coin_Ethereum.csv")
df_ethereum.head()

In [None]:
df_ethereum.shape

### Concatenating the above four datasets using pandas

In [None]:
df = pd.concat([df_bitcoin,df_cardano,df_dogecoin,df_ethereum],ignore_index=True)
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.columns

### Name

In [None]:
df["Name"].value_counts()

## Feature Engineering and EDA

In [None]:
df["Date"]=pd.to_datetime(df["Date"])

In [None]:
df.head()

In [None]:
df.drop(["SNo","Symbol"],axis=1,inplace=True)

In [None]:
df["Name"].unique()

# Data Analysis

### Function to get Outlier detection by boxplot and distribution of Features

In [None]:
def get_distribution_outlier(col):
    """Draw a distribution plot and a boxplot of ``col`` side by side.

    Both panels sit on one 18x5 figure and share a single colour that is
    picked at random from red, green and blue. Nothing is returned; the
    figure is shown as a side effect.
    """
    shade = random.choice(['r', 'g', 'b'])
    figure, (dist_axis, box_axis) = plt.subplots(1, 2, figsize=(18, 5))
    # Left panel: distribution; right panel: boxplot for spotting outliers.
    sns.distplot(col, ax=dist_axis, color=shade)
    sns.boxplot(col, ax=box_axis, color=shade)
    plt.suptitle('Distribution & Boxplot for Outlier Detection')
    figure.show()

### Bitcoin price over Years

In [None]:
df_bt = df[df["Name"]=="Bitcoin"]
df_bt.head()

In [None]:
df_bt.set_index('Date', inplace=True) # Set date column as the index

In [None]:
df_bt.describe()

In [None]:
# Plot the four daily Bitcoin price series against the Date index.
# Each legend label names the column it actually draws.
plt.plot(df_bt["High"], label='High Prices')
plt.plot(df_bt["Low"], label='Low Prices')
plt.plot(df_bt["Open"], label='Open Prices')
plt.plot(df_bt["Close"], label='Close Prices')
plt.title('Bitcoin crypto Prediction Dataset')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
numeric_columns = ['High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']
for i in numeric_columns:
    get_distribution_outlier(df_bt[i])

### Cardano price over Years

In [None]:
df_cd = df[df["Name"]=="Cardano"]
df_cd.head()

In [None]:
df_cd.set_index('Date', inplace=True)

In [None]:
df_cd.describe()

In [None]:
# Plot the four daily Cardano price series against the Date index.
# Each legend label names the column it actually draws.
plt.plot(df_cd["High"], label='High Prices')
plt.plot(df_cd["Low"], label='Low Prices')
plt.plot(df_cd["Open"], label='Open Prices')
plt.plot(df_cd["Close"], label='Close Prices')
plt.title('Cardano crypto Prediction Dataset')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
for i in numeric_columns:
    get_distribution_outlier(df_cd[i])

### Dogecoin price over years

In [None]:
df_dg = df[df["Name"]=="Dogecoin"]
df_dg.head()

In [None]:
df_dg.set_index('Date', inplace=True)

In [None]:
df_dg.describe()

In [None]:
# Plot the four daily Dogecoin price series against the Date index.
# Each legend label names the column it actually draws.
plt.plot(df_dg["High"], label='High Prices')
plt.plot(df_dg["Low"], label='Low Prices')
plt.plot(df_dg["Open"], label='Open Prices')
plt.plot(df_dg["Close"], label='Close Prices')
plt.title('Dogecoin Prediction Dataset')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
for i in numeric_columns:
    get_distribution_outlier(df_dg[i])

### Ethereum price over years

In [None]:
df_et = df[df["Name"]=="Ethereum"]
df_et.head()

In [None]:
df_et.set_index('Date', inplace=True)

In [None]:
df_et.describe()

In [None]:
# Plot the four daily Ethereum price series against the Date index.
# Each legend label names the column it actually draws.
plt.plot(df_et["High"], label='High Prices')
plt.plot(df_et["Low"], label='Low Prices')
plt.plot(df_et["Open"], label='Open Prices')
plt.plot(df_et["Close"], label='Close Prices')
plt.title('Ethereum Prediction Dataset')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
for i in numeric_columns:
    get_distribution_outlier(df_et[i])

# EDA by pandas profiling

In [None]:
# prof = ProfileReport(df)
# prof.to_file(output_file='EDA_profile_crptocurrency_dataset.html')

# Feature Encoding

In [None]:
df["Name"].unique()

In [None]:
oe = OrdinalEncoder(categories=[['Bitcoin', 'Cardano', 'Dogecoin', 'Ethereum']])
df["Name"] = oe.fit_transform(df[["Name"]])

In [None]:
df["Name"].unique()

# Checking Assumptions of regression 

### Checking for Linearity

In [None]:
sns.pairplot(df)

In [None]:
# df still contains the datetime "Date" column at this point; restrict the
# correlation to numeric columns explicitly so the result does not depend on
# how the installed pandas version treats non-numeric columns in corr().
df.select_dtypes(include="number").corr()

In [None]:
# Correlate numeric columns only (df still holds the datetime "Date" column),
# then annotate every cell of the heatmap with its coefficient.
sns.heatmap(df.select_dtypes(include="number").corr(),annot=True)

# Scaling of Data

In [None]:
df.head()

In [None]:
df['Date_year'] =df["Date"].dt.year
df['Date_month'] =df["Date"].dt.month
df['Date_day'] =df["Date"].dt.day
df.drop("Date",axis=1,inplace=True)

In [None]:
df.dtypes

In [None]:
x = df.drop("Close",axis=1)
y = df["Close"]

In [None]:
# Scale every feature column with RobustScaler and rebuild a DataFrame so the
# column names survive the round-trip through a NumPy array.
# NOTE(review): the scaler is fitted on all rows of x, before the train/test
# split performed further down, so held-out rows contribute to the scaling
# statistics. Consider fitting on x_train only and transforming x_test.
rs_model = RobustScaler()
x_array = rs_model.fit_transform(x)
x = pd.DataFrame(x_array,columns=x.columns)

# Train Test Split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state=45)

In [None]:
x_train.shape,x_test.shape

# Model Training

## LR Model

In [None]:
lr_model = LinearRegression()
lr_model.fit(x_train,y_train)

### Model Evaluation

In [None]:
# Score the linear regression on the training rows.
y_pred = lr_model.predict(x_train)

# Compute all four metrics up front; RMSE is derived from the MSE.
LR_train_MAE = mean_absolute_error(y_train, y_pred)
LR_train_MSE = mean_squared_error(y_train, y_pred)
LR_train_RMSE = np.sqrt(LR_train_MSE)
LR_train_R2 = r2_score(y_train, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", LR_train_MAE)
print("The mean Squared error >>", LR_train_MSE)
print("The RMSE value >>", LR_train_RMSE)
print("The R2 value >>", LR_train_R2)

In [None]:
# Score the linear regression on the held-out test rows.
y_pred = lr_model.predict(x_test)

# Compute all four metrics up front; RMSE is derived from the MSE.
LR_test_MAE = mean_absolute_error(y_test, y_pred)
LR_test_MSE = mean_squared_error(y_test, y_pred)
LR_test_RMSE = np.sqrt(LR_test_MSE)
LR_test_R2 = r2_score(y_test, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", LR_test_MAE)
print("The mean Squared error >>", LR_test_MSE)
print("The RMSE value >>", LR_test_RMSE)
print("The R2 value >>", LR_test_R2)

## DT Model

In [None]:
# Seed the tree with the same value used for the train/test split so the fitted
# model, and every metric computed from it, is reproducible between runs.
dt_model = DecisionTreeRegressor(random_state=45)
dt_model.fit(x_train,y_train)

### Model Evaluation 

In [None]:
# Score the decision tree on the training rows.
y_pred = dt_model.predict(x_train)

# Compute all four metrics up front; RMSE is derived from the MSE.
DT_train_MAE = mean_absolute_error(y_train, y_pred)
DT_train_MSE = mean_squared_error(y_train, y_pred)
DT_train_RMSE = np.sqrt(DT_train_MSE)
DT_train_R2 = r2_score(y_train, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", DT_train_MAE)
print("The mean Squared error >>", DT_train_MSE)
print("The RMSE value >>", DT_train_RMSE)
print("The R2 value >>", DT_train_R2)

In [None]:
# Score the decision tree on the held-out test rows.
y_pred = dt_model.predict(x_test)

# Compute all four metrics up front; RMSE is derived from the MSE.
DT_test_MAE = mean_absolute_error(y_test, y_pred)
DT_test_MSE = mean_squared_error(y_test, y_pred)
DT_test_RMSE = np.sqrt(DT_test_MSE)
DT_test_R2 = r2_score(y_test, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", DT_test_MAE)
print("The mean Squared error >>", DT_test_MSE)
print("The RMSE value >>", DT_test_RMSE)
print("The R2 value >>", DT_test_R2)

## RF Model 

In [None]:
# Seed the forest with the same value used for the train/test split so the
# bootstrap samples, and every metric computed from the model, are reproducible
# between runs.
rf_model = RandomForestRegressor(random_state=45)
rf_model.fit(x_train,y_train)

### Model Evaluation

In [None]:
# Score the random forest on the training rows.
y_pred = rf_model.predict(x_train)

# Compute all four metrics up front; RMSE is derived from the MSE.
RF_train_MAE = mean_absolute_error(y_train, y_pred)
RF_train_MSE = mean_squared_error(y_train, y_pred)
RF_train_RMSE = np.sqrt(RF_train_MSE)
RF_train_R2 = r2_score(y_train, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", RF_train_MAE)
print("The mean Squared error >>", RF_train_MSE)
print("The RMSE value >>", RF_train_RMSE)
print("The R2 value >>", RF_train_R2)

In [None]:
# Score the random forest on the held-out test rows.
y_pred = rf_model.predict(x_test)

# Compute all four metrics up front; RMSE is derived from the MSE.
RF_test_MAE = mean_absolute_error(y_test, y_pred)
RF_test_MSE = mean_squared_error(y_test, y_pred)
RF_test_RMSE = np.sqrt(RF_test_MSE)
RF_test_R2 = r2_score(y_test, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", RF_test_MAE)
print("The mean Squared error >>", RF_test_MSE)
print("The RMSE value >>", RF_test_RMSE)
print("The R2 value >>", RF_test_R2)

## Hyperparameter Tuning 

In [None]:
# rf_model = RandomForestRegressor()
# hyper_parameter = {"criterion":["squared_error", "absolute_error"],
#                    "max_depth":np.arange(3,7),
#                    "min_samples_split":np.arange(3,7),
#                    "min_samples_leaf":np.arange(3,7)}

# rscv = RandomizedSearchCV(rf_model,hyper_parameter,cv=5)
# rscv.fit(x_train,y_train)

In [None]:
# rscv.best_estimator_

## XGB Model

In [None]:
xgb_model = XGBRegressor()
xgb_model.fit(x_train,y_train)

In [None]:
# Score the XGBoost regressor on the training rows.
y_pred = xgb_model.predict(x_train)

# Compute all four metrics up front; RMSE is derived from the MSE.
XGB_train_MAE = mean_absolute_error(y_train, y_pred)
XGB_train_MSE = mean_squared_error(y_train, y_pred)
XGB_train_RMSE = np.sqrt(XGB_train_MSE)
XGB_train_R2 = r2_score(y_train, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", XGB_train_MAE)
print("The mean Squared error >>", XGB_train_MSE)
print("The RMSE value >>", XGB_train_RMSE)
print("The R2 value >>", XGB_train_R2)

In [None]:
# Score the XGBoost regressor on the held-out test rows.
y_pred = xgb_model.predict(x_test)

# Compute all four metrics up front; RMSE is derived from the MSE.
XGB_test_MAE = mean_absolute_error(y_test, y_pred)
XGB_test_MSE = mean_squared_error(y_test, y_pred)
XGB_test_RMSE = np.sqrt(XGB_test_MSE)
XGB_test_R2 = r2_score(y_test, y_pred)

# Report MAE, MSE, RMSE and R2, one per line.
print("The mean absolute error >>", XGB_test_MAE)
print("The mean Squared error >>", XGB_test_MSE)
print("The RMSE value >>", XGB_test_RMSE)
print("The R2 value >>", XGB_test_R2)

# Comparison of Model results 

### MAE Value Comparison

In [None]:
# Collect train and test MAE for every model in one table.
result_MAE = pd.DataFrame({"Model_name":["LR_model","DT_model","RF_model","XGB_model"],
                       "MAE_value_train":[LR_train_MAE,DT_train_MAE,RF_train_MAE,XGB_train_MAE],
                       "MAE_value_test":[LR_test_MAE,DT_test_MAE,RF_test_MAE,XGB_test_MAE]})
# Display the table, matching the RMSE and R2 comparison cells.
result_MAE


In [None]:
result_MAE.plot(x="Model_name",y=["MAE_value_train","MAE_value_test"],kind="bar",figsize=(8,6))

### RMSE value comparison

In [None]:
result_RMSE = pd.DataFrame({"Model_name":["LR_model","DT_model","RF_model","XGB_model"],
                       "RMSE_value_train":[LR_train_RMSE,DT_train_RMSE,RF_train_RMSE,XGB_train_RMSE],
                       "RMSE_value_test":[LR_test_RMSE,DT_test_RMSE,RF_test_RMSE,XGB_test_RMSE]})
result_RMSE

In [None]:
result_RMSE.plot(x="Model_name",y=["RMSE_value_train","RMSE_value_test"],kind="bar",figsize=(8,6))

### Accuracy comparison for each model

In [None]:
result_r2 = pd.DataFrame({"Model_name":["LR_model","DT_model","RF_model","XGB_model"],
                       "r2_value_train":[LR_train_R2,DT_train_R2,RF_train_R2,XGB_train_R2],
                       "r2_value_test":[LR_test_R2,DT_test_R2,RF_test_R2,XGB_test_R2]})
result_r2

In [None]:
result_r2.plot(x="Model_name",y=["r2_value_train","r2_value_test"],kind="line",figsize=(8,6))

In [None]:
model_names = ["LR_model","DT_model","RF_model","XGB_model"]
# Plot the R2 scores computed on the held-out split earlier in the notebook
# instead of hand-copied literals, so the chart always reflects the current run.
accuracy = [LR_test_R2,DT_test_R2,RF_test_R2,XGB_test_R2]
colors = ['yellow', 'green', 'blue', 'orange']

plt.bar(model_names, accuracy, color=colors)

# Add labels and title
# Keep the zoomed-in lower bound, but widen it when a score falls below it so
# that no bar is clipped out of view.
plt.ylim(min(0.95555, min(accuracy) - 0.01), 1)
plt.xlabel('model_names')
plt.ylabel('accuracy')
plt.title('Model accuracy comparision with different accuracy')

# Show the plot
plt.show()

# Pickle Files 

In [None]:
import json
import pickle

In [None]:
#Feature Name JSON File
# Maps each coin name to its encoded value and records the column order the
# models were trained on. The two entries must be separated by a comma.
features = {"coin_name":{'Bitcoin':0, 'Cardano':1, 'Dogecoin':2, 'Ethereum':3},
            "feature_names":list(x_train.columns)}


In [None]:
with open("features_names.json","w") as f:
    json.dump(features,f)

In [None]:
with open("lr_model.pkl","wb") as f:
    pickle.dump(lr_model,f)

In [None]:
with open("robust_scaling.pkl","wb") as f:
    pickle.dump(rs_model,f)

In [None]:
ts = df.sample().drop("Close",axis=1)
ts

In [None]:
# Look up the true Close of the row that was actually sampled into `ts`,
# rather than a fixed row number that only matched one particular run.
df.loc[ts.index[0], "Close"]

In [None]:
test_sample = rs_model.transform(ts.to_numpy())
test_sample

# Taking Input and predicting Close Price

In [None]:
x_test.columns

In [None]:
def get_predicted_value(name,dt,high_value,low_value,open_value,volume,marketcap):
    """Predict the closing price for one coin on one day.

    Parameters
    ----------
    name : str or number
        Coin code: 'BT', 'CD', 'DG' or 'ET'. The already-encoded values
        0, 1, 2 and 3 are accepted as well.
    dt : str
        Date in ``YYYY-MM-DD`` format.
    high_value, low_value, open_value, volume, marketcap : number or str
        Feature values for that day; anything ``float()`` accepts.

    Returns
    -------
    tuple
        ``(message, predicted_close)``.

    Raises
    ------
    ValueError
        If ``name`` is not a known coin, ``dt`` is not a valid date in the
        expected format, or a numeric field cannot be converted to float.
    """
    coin_codes = {"BT": 0, "CD": 1, "DG": 2, "ET": 3}

    day = datetime.datetime.strptime(dt,"%Y-%m-%d").date()

    # Translate the short code; a value that is already encoded passes through.
    coin = coin_codes.get(name, name)
    if coin not in coin_codes.values():
        raise ValueError(
            f"Unknown coin {name!r}; expected one of {sorted(coin_codes)}"
        )

    # Convert explicitly so values typed at a prompt (strings) become numbers
    # instead of turning the whole feature vector into a string array.
    test_array = np.array(
        [
            float(coin),
            float(high_value),
            float(low_value),
            float(open_value),
            float(volume),
            float(marketcap),
            day.year,
            day.month,
            day.day,
        ],
        dtype=float,
    )
    # The scaler and model expect a 2-D input: one row, nine features.
    scaled_test_array = rs_model.transform([test_array])
    close_value = lr_model.predict(scaled_test_array)
    return "Predicted closing price for given cyptocurrency for given date is :",close_value[0]

# Prediction of Cryptocurrency price

In [None]:
# Read the coin code and date as text; the numeric fields are converted to
# float straight away so a mistyped number fails here at the prompt.
name = input("Give input as 'BT'-Bitcoin,'CD'-cardano,'DG'-dogecoin,'ET'-Ethereum : ")
dt = input("Input date in format YYYY-MM-DD : ")
high_value = float(input("Give highest value for given day : "))
low_value = float(input("Give lowest value for given day : "))
open_value = float(input("Give open value for given day : "))
volume = float(input("Give volume shared for given day : "))
marketcap = float(input("Give market capital for given coin : "))
op = get_predicted_value(name,dt,high_value,low_value,open_value,volume,marketcap)
print(op)