<a href="https://colab.research.google.com/github/dellavecchiaemiliano/Models_for_risk_and_forecasting/blob/main/Grid_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Mount Google Drive so the data set stored there is reachable from the Colab runtime.
drive.mount('/content/drive')

# Load the raw panel; the first CSV column becomes the row index.
DATA_CSV = '/content/drive/MyDrive/Classroom/Models for Risk and Forecasting/Models for Risk and Forecasting/Part 2: Factor Investing/data_ml.csv'
data_raw = pd.read_csv(DATA_CSV, index_col=0)

# Keep rows dated strictly after 1999-12-31 and strictly before 2019-01-01.
# The 'date' column is compared against date strings.
idx_date = data_raw['date'].gt('1999-12-31') & data_raw['date'].lt('2019-01-01')
data_ml = data_raw.loc[idx_date, :].copy()

# All distinct stock identifiers in the filtered sample.
stock_ids = data_ml['stock_id'].unique()

# Number of observations (dates) available for each stock.
stock_days = data_ml.groupby('stock_id')['date'].count().reset_index()

# Stocks whose observation count equals the maximum count, i.e. the stocks
# with the longest history in the sample.
has_max_history = stock_days['date'] == stock_days['date'].max()
stock_ids_short = stock_days.loc[has_max_history, 'stock_id'].unique()

# Returns of those stocks in matrix form: one row per date, one column per stock.
is_stock_ids_short = data_ml['stock_id'].isin(stock_ids_short)
returns = data_ml.loc[is_stock_ids_short].pivot(
    index='date', columns='stock_id', values='R1M_Usd'
)

# Feature names taken by position (columns 3..94 of data_ml).
# NOTE(review): positional slice - because the CSV is read with index_col=0,
# confirm that columns 3..94 are exactly the intended feature columns.
features = data_ml.columns[3:95].tolist()

# Reduced, hand-picked set of features used by the models below.
features_short = [
    "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd",
    "Ocf", "Pb", "Vol1Y_Usd",
]

# Create binary (categorical) labels: 1.0 when a row's R1M_Usd (resp. R12M_Usd)
# is strictly above the median of that column over all rows sharing the same
# date, 0.0 otherwise (ties and missing values therefore map to 0.0).
#
# groupby().transform('median') returns the per-date medians already aligned
# to data_ml's own index, so no merged copy of the whole frame is needed and
# the comparison does not rely on a merge preserving the row order.
date_median = data_ml.groupby('date')[['R1M_Usd', 'R12M_Usd']].transform('median')
data_ml['R1M_Usd_C'] = np.where(data_ml['R1M_Usd'] > date_median['R1M_Usd'], 1.0, 0.0)
data_ml['R12M_Usd_C'] = np.where(data_ml['R12M_Usd'] > date_median['R12M_Usd'], 1.0, 0.0)
del date_median  # temporary helper, not needed afterwards

# Chronological train/test split: rows dated before `separation_date` form the
# training sample, rows dated on or after it form the testing sample.
separation_date = "2014-01-15"
is_train = data_ml['date'] < separation_date
is_test = data_ml['date'] >= separation_date

# Index labels of each sample, kept as plain lists.
idx_train = data_ml.index[is_train].tolist()
idx_test = data_ml.index[is_test].tolist()

# Select rows with the boolean masks rather than with the label lists: this
# avoids a label lookup per row and cannot duplicate rows if index labels
# are not unique.
training_sample = data_ml.loc[is_train, :]
testing_sample = data_ml.loc[is_test, :]

y_train = training_sample['R1M_Usd_C'].values     # label / dependent variable, training sample
X_train = training_sample[features_short].values  # features / predictors, training sample
y_test = testing_sample['R1M_Usd_C'].values       # label / dependent variable, testing sample
X_test = testing_sample[features_short].values    # features / predictors, testing sample

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hyper-parameter grid for the SVM classifier, one sub-grid per kernel.
# Every kernel shares the same list of C values; the rbf sub-grid previously
# listed C=0.01 twice, which added 4 redundant candidates to the search.
c_values = [0.001, 0.01, 0.1, 1, 10]
param_grid = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']},
    {'C': c_values, 'degree': [2, 3, 4], 'kernel': ['poly']},
]

# refit=True: once the search is over, the best configuration is fitted again
# on all the data passed to fit(); verbose=3 prints one line per CV fit.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# Only the first 1000 training rows are used, presumably to keep the search fast.
# NOTE(review): these rows are taken in file order, not at random - confirm
# they are representative of the whole training sample.
grid.fit(X_train[:1000], y_train[:1000])

Fitting 5 folds for each of 44 candidates, totalling 220 fits
[CV 1/5] END ............C=0.001, kernel=linear;, score=0.530 total time=   0.0s
[CV 2/5] END ............C=0.001, kernel=linear;, score=0.525 total time=   0.0s
[CV 3/5] END ............C=0.001, kernel=linear;, score=0.525 total time=   0.0s
[CV 4/5] END ............C=0.001, kernel=linear;, score=0.525 total time=   0.0s
[CV 5/5] END ............C=0.001, kernel=linear;, score=0.525 total time=   0.0s
[CV 1/5] END .............C=0.01, kernel=linear;, score=0.530 total time=   0.0s
[CV 2/5] END .............C=0.01, kernel=linear;, score=0.525 total time=   0.0s
[CV 3/5] END .............C=0.01, kernel=linear;, score=0.525 total time=   0.0s
[CV 4/5] END .............C=0.01, kernel=linear;, score=0.525 total time=   0.0s
[CV 5/5] END .............C=0.01, kernel=linear;, score=0.525 total time=   0.0s
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.515 total time=   0.0s
[CV 2/5] END ..............C=0.1, kernel=linear

In [None]:
# Report the winning hyper-parameter combination and the corresponding
# estimator, one per line.
print(grid.best_params_, grid.best_estimator_, sep='\n')

{'C': 0.001, 'kernel': 'linear'}
SVC(C=0.001, kernel='linear')


In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score # introducing buit-in function for accuracy

# Binary labels of the classification task for the training and testing samples.
y_c_train = training_sample['R1M_Usd_C'].values
y_c_test = testing_sample['R1M_Usd_C'].values

# Build the classifier from the hyper-parameters selected by the grid search
# instead of re-typing them by hand, so this cell cannot drift out of sync
# with the search result when the grid or the data change.
model_svm_c = svm.SVC(**grid.best_params_)

# Fit on the first 3000 training rows; fit() returns the estimator itself.
# NOTE(review): the grid search was run on the first 1000 rows only - confirm
# that the different sample size is intentional.
fit_svm_c = model_svm_c.fit(X_train[:3000], y_c_train[:3000])

# Hit ratio = share of testing-sample labels predicted correctly (accuracy).
hitratio = accuracy_score(y_c_test, fit_svm_c.predict(X_test))
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.49628247493163175
