### 5) Data Pre-Procesing

#### Import libraries

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import kagglehub as kh
import os

# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

### Import data from kaggle and create a single data frame

In [4]:
path = kh.dataset_download("svaningelgem/crypto-currencies-daily-prices")
print("Path to dataset files:", path)

df_list = []

for file in files:
    file_path = os.path.join(path, file)
    
    # Load the CSV, explicitly parse date column
    df = pd.read_csv(file_path, parse_dates=['date'], dtype={'ticker': 'category'})
    
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True)

combined_df = combined_df.sort_values(by=['ticker', 'date'])

print(combined_df.info())  
combined_df.head()
combined_df.shape


Path to dataset files: C:\Users\gerar\.cache\kagglehub\datasets\svaningelgem\crypto-currencies-daily-prices\versions\593
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216550 entries, 0 to 216549
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   ticker  216550 non-null  object        
 1   date    216550 non-null  datetime64[ns]
 2   open    216550 non-null  float64       
 3   high    216550 non-null  float64       
 4   low     216550 non-null  float64       
 5   close   216550 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 9.9+ MB
None


(216550, 6)

#### Creating features for ML

In [5]:

# Ensure chronological order
combined_df = combined_df.sort_values(by=["ticker", "date"])

# 1. Calculate daily returns
combined_df["return"] = combined_df.groupby("ticker")["close"].pct_change()

# 2. Create lagged returns (previous day's return)
combined_df["return_lag1"] = combined_df.groupby("ticker")["return"].shift(1)

# 3. Rolling mean return (past 7 days)
combined_df["return_7d"] = (
    combined_df.groupby("ticker")["return"]
    .rolling(window=7)
    .mean()
    .reset_index(level=0, drop=True)
)

# 4. Moving Averages (SMA & EMA)
combined_df["SMA_20"] = combined_df.groupby("ticker")["close"].transform(
    lambda x: x.rolling(window=20).mean()
)
combined_df["EMA_20"] = combined_df.groupby("ticker")["close"].transform(
    lambda x: x.ewm(span=20, adjust=False).mean()
)

# 5. Rolling Volatility (std dev of returns, 7-day)
combined_df["volatility_7d"] = (
    combined_df.groupby("ticker")["return"]
    .rolling(window=7)
    .std()
    .reset_index(level=0, drop=True)
)

# Drop NaNs (from rolling calculations)
combined_df = combined_df.dropna()

print(combined_df.head())


   ticker       date   open   high    low  close    return  return_lag1  \
19  1INCH 2021-01-27  2.686  2.733  2.233  2.518 -0.062547     0.139101   
20  1INCH 2021-01-28  2.518  3.368  2.440  3.122  0.239873    -0.062547   
21  1INCH 2021-01-29  3.122  3.500  2.850  3.341  0.070147     0.239873   
22  1INCH 2021-01-30  3.341  4.676  3.187  4.565  0.366357     0.070147   
23  1INCH 2021-01-31  4.565  5.555  4.468  4.940  0.082147     0.366357   

    return_7d   SMA_20    EMA_20  volatility_7d  
19   0.053862  1.72500  1.872089       0.144704  
20   0.108018  1.82110  1.991128       0.130655  
21   0.102576  1.92595  2.119687       0.131435  
22   0.139662  2.09300  2.352574       0.165119  
23   0.111670  2.28365  2.598996       0.153973  


#### Add the target column

In [6]:
# Create target variable: 1 if next day's return is positive, else 0
combined_df["target"] = (combined_df.groupby("ticker")["return"].shift(-1) > 0).astype(int)

# Drop NaNs (last row for each ticker will have no target)
combined_df = combined_df.dropna()

print(combined_df[["date", "ticker", "return", "target"]].head(10))

         date ticker    return  target
19 2021-01-27  1INCH -0.062547       1
20 2021-01-28  1INCH  0.239873       1
21 2021-01-29  1INCH  0.070147       1
22 2021-01-30  1INCH  0.366357       1
23 2021-01-31  1INCH  0.082147       1
24 2021-02-01  1INCH  0.022874       0
25 2021-02-02  1INCH -0.076786       1
26 2021-02-03  1INCH  0.079743       1
27 2021-02-04  1INCH  0.111376       1
28 2021-02-05  1INCH  0.040372       0


#### Feature selection

In [7]:
# Features for training
features = ["return_lag1", "return_7d", "SMA_20", "EMA_20", "volatility_7d"]

X = combined_df[features]
y = combined_df["target"]

#### Train-Test Split

In [8]:
from sklearn.model_selection import train_test_split

# Split into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Print shapes
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")


Train size: (171476, 5), Test size: (42870, 5)


#### Train logistic regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.5297
              precision    recall  f1-score   support

           0       0.53      0.89      0.67     22676
           1       0.50      0.12      0.20     20194

    accuracy                           0.53     42870
   macro avg       0.52      0.51      0.43     42870
weighted avg       0.52      0.53      0.45     42870



In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False),
    "AdaBoost": AdaBoostClassifier(),
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_test)  # Predict

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print("=" * 40)


Model: Logistic Regression
Accuracy: 0.5297
              precision    recall  f1-score   support

           0       0.53      0.89      0.67     22676
           1       0.50      0.12      0.20     20194

    accuracy                           0.53     42870
   macro avg       0.52      0.51      0.43     42870
weighted avg       0.52      0.53      0.45     42870

Model: K-Neighbors Classifier
Accuracy: 0.5150
              precision    recall  f1-score   support

           0       0.54      0.55      0.54     22676
           1       0.49      0.48      0.48     20194

    accuracy                           0.52     42870
   macro avg       0.51      0.51      0.51     42870
weighted avg       0.51      0.52      0.51     42870

Model: Decision Tree
Accuracy: 0.5127
              precision    recall  f1-score   support

           0       0.54      0.53      0.54     22676
           1       0.48      0.49      0.49     20194

    accuracy                           0.51     42870

In [16]:
# Assuming combined_df is your DataFrame containing the data
ticker_counts = combined_df.groupby('ticker').size()

# Print the counts for each ticker
print(ticker_counts)

ticker
1INCH    1443
AAVE     1533
ACH       942
ADA      2548
ALGO     2000
         ... 
XRP      3619
XTZ      2361
ZEC      2976
ZIL      2418
ZRX       975
Length: 116, dtype: int64


In [13]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)