In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

!pip install -U pymfe
from pymfe.mfe import MFE

from sklearn.model_selection import TimeSeriesSplit

%pip install xgboost
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

!pip install yfinance
from pandas_datareader import data as pdr
import yfinance as yfin
yfin.pdr_override()

import time
import datetime

# Source code for installing pycatch22 properly
!pip install setuptools --upgrade
!pip install pycatch22
import pycatch22

## Standardizing S&P 500 data

In [None]:
# General structure of how the stocks are grouped

#+------+-------------------------+
#| Name |     Sector              | 
#+------+-------------------------+
#| ABC  | ENERGY                  |
#| BCE  | MATERIALS               |
#| CDE  | INDUSTRIALS             |
#| DEF  | CONSUMER DISCRETIONARY  |
#| EFG  | CONSUMER STAPLES        |
#| FGH  | HEALTH CARE             |
#| GHI  | FINANCIALS              |
#| HIJ  | INFORMATION TECHNOLOGY  |
#| IJK  | COMMUNICATION SERVICES  |
#| JKL  | UTILITIES               |
#| KLM  | REAL ESTATE             |
#+------+-------------------------+

In [None]:
# Loads the raw S&P 500 constituents and writes them out with the standardized
# column names and sector nomenclature used by the rest of the notebook

SP500 = pd.read_csv('/content/SP500_raw_constituents.csv')

# Keep ticker and sector only, expose the ticker as 'Name', map
# 'Telecommunication Services' onto 'Communication Services', and order the
# rows by sector (then ticker) instead of alphabetically by ticker
standardized_SP500 = (
    SP500[['Symbol', 'Sector']]
    .rename(columns={'Symbol': 'Name'})
    .assign(Sector=lambda frame: frame['Sector'].replace(['Telecommunication Services'], 'Communication Services'))
    .sort_values(by=['Sector', 'Name'])
)

print("Amount of companies found on S&P500:", len(standardized_SP500))
standardized_SP500.to_csv('SP500_constituents.csv')

## Standardizing Wilshire 5000 data

In [None]:
# Loads the raw Wilshire 5000 constituents and keeps ticker + sector, exposing the
# ticker as 'Name'. Null sectors and sector nomenclature are handled further down.
wilshire = pd.read_csv('/content/wilshire_raw_constituents.csv')
standardized_Wilshire = wilshire[['Ticker', 'Sector']].rename(columns={'Ticker': 'Name'})

# Selection of the rows that have no sector; the result is evaluated but not stored
standardized_Wilshire.loc[standardized_Wilshire['Sector'].isna()]

# Fills in a missing sector by looking the company ticker up on yfinance
def corrijeNulos(row):
  """Return the sector for ``row``, querying yfinance when it is missing.

  Parameters
  ----------
  row : mapping with a 'Name' (ticker) entry and a 'Sector' entry; in this
      notebook it is a DataFrame row supplied by ``DataFrame.apply(..., axis=1)``.

  Returns
  -------
  The existing sector when it is not null. Otherwise the 'sector' entry of
  ``yfin.Ticker(row['Name']).info``. When that lookup fails for any ordinary
  reason, the original (null) value is returned unchanged so the caller can
  still detect and drop the row.
  """
  # Nothing to repair when the sector is already known
  if not pd.isna(row['Sector']):
    return row['Sector']

  try:
    return yfin.Ticker(row['Name']).info['sector']
  except Exception:
    # Best-effort lookup: keep the null on failure. `Exception` (rather than a
    # bare except) lets KeyboardInterrupt/SystemExit stop a long-running apply.
    return row['Sector']

# Repairs null sectors through yfinance and reports how many could be fixed
qtd_nulos_inicial = standardized_Wilshire['Sector'].isna().sum()
print("Initial number of sectors with null values:", qtd_nulos_inicial)
standardized_Wilshire['Sector'] = standardized_Wilshire.apply(corrijeNulos, axis=1) # Fix possible null sectors
qtd_nulos_restantes = standardized_Wilshire['Sector'].isna().sum()
print("Number of null values corrected:", qtd_nulos_inicial - qtd_nulos_restantes)
print("Number of null values removed due to error:", qtd_nulos_restantes)

# Rows whose sector is still null after the lookup are discarded
tam_anterior = len(standardized_Wilshire)
standardized_Wilshire = standardized_Wilshire.dropna()

# Standardizes the sectors names (one mapping instead of six separate passes)
standardized_Wilshire['Sector'] = standardized_Wilshire['Sector'].replace({
    'Healthcare': 'Health Care',
    'Technology': 'Information Technology',
    'Consumer Cyclical': 'Consumer Discretionary',
    'Consumer Defensive': 'Consumer Staples',
    'Financial Services': 'Financials',
    'Basic Materials': 'Materials',
})

# sort_values returns a new frame; it has to be assigned, otherwise the
# ordering by sector is silently lost before the CSV is written
standardized_Wilshire = standardized_Wilshire.sort_values(by=['Sector', 'Name'])
print("Amount of companies found on Wilshire:", len(standardized_Wilshire))
standardized_Wilshire.to_csv('Wilshire_constituents.csv')

## Standardizing Crawler data

In [None]:
# Obtain the list of all stocks available on stockmonitor.com grouping by sectors
import requests
from lxml.html import parse
from urllib.request import Request, urlopen

# Sector slugs as they appear in the stockmonitor.com URLs
sectors = ['utilities', 'basic-materials', 'consumer-defensive', 'communication-services', 'energy',
           'real-estate', 'consumer-cyclical', 'technology', 'industrials', 'healthcare', 'financial-services']

# User-Agent sent with every request
headers = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)" + " "
    "AppleWebKit/537.36 (KHTML, like Gecko)" + " " + "Chrome/35.0.1916.47" +
    " " + "Safari/537.36"
]

# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop
crawled_rows = []

for sector in sectors:

  # General settings that helps the crawler getting the target information
  url = 'https://www.stockmonitor.com/sector/'+sector+'/'
  req = Request(url, headers={'User-Agent': headers[0]})

  # The response is closed as soon as the page has been parsed
  with urlopen(req) as webpage:
    tree = parse(webpage)

  # 'basic-materials' -> 'Basic Materials'
  sector_label = sector.replace('-', ' ').title()

  sector_companies = [element.text for element in tree.xpath("//tbody/tr/td[@class='text-left']/a")]
  crawled_rows.extend({'Name': name, 'Sector': sector_label} for name in sector_companies)
  print("Sector " + sector + ":", len(sector_companies))

companies_list = pd.DataFrame(crawled_rows, columns=['Name', 'Sector'])
print("Amount of companies found on crawler:", len(companies_list))

# Fix sectors that does not uses the standardized nomenclature
# ('Communication Services' already matches it, so it needs no entry)
companies_list['Sector'] = companies_list['Sector'].replace({
    'Basic Materials': 'Materials',
    'Consumer Defensive': 'Consumer Staples',
    'Consumer Cyclical': 'Consumer Discretionary',
    'Technology': 'Information Technology',
    'Healthcare': 'Health Care',
    'Financial Services': 'Financials',
})

# sort_values returns a new frame; assign it so the ordering reaches the CSV
companies_list = companies_list.sort_values(by=['Sector', 'Name'])

companies_list.to_csv('Stockmonitor_constituents.csv')

## Generating the Meta-dataset

In [2]:
class learning:
    """Base-level learning for one stock price time series.

    Given a price DataFrame that contains a 'Close' column, the class labels
    each day as up/down, trains a fixed set of classifiers with a time-series
    split, scores each one by the cumulative return of the trades its
    predictions imply, and extracts meta-features (catch22 + PyMFE) from the
    series.

    Attributes
    ----------
    dataframe : private copy of the DataFrame passed to the constructor.
    performance : DataFrame with columns 'Model' and 'Cumulative Return',
        one row per trained model.
    """

    # Models that are fitted on min-max scaled inputs
    _SCALED_MODELS = ('KNN', 'SVM')

    # Class construtor
    def __init__(self, dataframe):
        # Work on a copy: target() adds a column, and that must not leak into
        # the frame owned by the caller.
        self.dataframe = dataframe.copy()
        # Created here so insert() is usable before training() has run
        self.performance = pd.DataFrame(columns=['Model', 'Cumulative Return'])

    # Applies the binary class for the target(up or down)
    def target(self):
        """Add a 'Target' column: 1 when tomorrow's close is above today's, else 0.

        The last row has no following close; the comparison against NaN is
        False, so that row is labelled 0.
        """
        next_close = self.dataframe['Close'].shift(-1)
        self.dataframe['Target'] = np.where(next_close - self.dataframe['Close'] > 0, 1, 0)

    # Inserts a new row on performance dataset with model and obtained return
    def insert(self, model, cumulativeReturn):
        """Append ``[model, cumulativeReturn]`` as the next row of ``self.performance``."""
        next_position = 0 if self.performance.empty else self.performance.index[-1] + 1
        self.performance.loc[next_position] = [model, cumulativeReturn]

    # Calculates the sum of return according to the prediction array
    def calculateCumulativeReturn(self, y_pred, tradingBook):
        """Sum the per-day profit of trading one unit according to ``y_pred``.

        For each day, a prediction of 1 buys at today's close and sells at
        tomorrow's close (long trade); any other prediction sells today and
        buys back tomorrow (short trade). The last day has no following close
        and is left out of the sum.

        Parameters
        ----------
        y_pred : array of predictions, one per row of ``tradingBook``.
        tradingBook : DataFrame with a 'Close' column.

        Returns
        -------
        The summed return rounded to 2 decimals.
        """
        close = tradingBook['Close']
        next_close = close.shift(-1)
        daily_return = np.where(y_pred == 1, next_close - close, close - next_close)
        daily_return = daily_return[~np.isnan(daily_return)]
        return round(np.sum(daily_return), 2)

    # Extracts meta-data using the Catch22 library
    def extractsCatch22(self):
        """Return ``(names, values)`` of the catch22 features of the 'Close' series."""
        catch = pycatch22.catch22_all(self.dataframe.Close)

        return catch['names'], catch['values']

    # Extracts meta-data using the PyMFE library
    def extractsPymfe(self, target):
        """Return ``(names, values)`` of the PyMFE 'general' and 'statistical' meta-features.

        ``target`` is the label vector that is fitted together with
        ``self.dataframe``.
        """
        mfe = MFE(groups=["general", "statistical"])
        mfe.fit(np.asarray(self.dataframe), np.asarray(target))
        ft = mfe.extract()

        return ft[0], ft[1]

    # Extracts meta-data using the two previous libraries, and concatenates the result
    def extractsMetadata(self, target):
        """Return one dict ``{feature name: value}`` with catch22 and PyMFE features.

        Names and values are paired positionally. On a name clash the PyMFE
        value wins, because it is merged last.
        """
        catch_names, catch_values = self.extractsCatch22()
        pymfe_names, pymfe_values = self.extractsPymfe(target)

        # zip pairs the i-th name with the i-th value without consuming the lists
        return {**dict(zip(catch_names, catch_values)), **dict(zip(pymfe_names, pymfe_values))}

    # Main function of the class, used for training the model
    def training(self):
        """Train every base-level model and characterize the series.

        Returns
        -------
        (performance, metadata) : ``performance`` is the DataFrame with the
        cumulative return of each model summed over the 4 time-series folds;
        ``metadata`` is the dict produced by ``extractsMetadata``.
        """
        self.target() # Defines the target, using the time series passed during class instantiation
        self.dataframe = self.dataframe.dropna()

        # Isolates the target column for training step
        target = self.dataframe.pop('Target')

        # instantiate the models of base-level learning
        models = [
            ('RF', RandomForestClassifier(random_state=14)),
            ('XGB', XGBClassifier(random_state=14)),
            ('KNN', KNeighborsClassifier()),
            ('SVM', SVC()),
            ('NB', GaussianNB()),
            ('ADA', AdaBoostClassifier(random_state=14)),
            ('LogReg', LogisticRegression(random_state=14, max_iter=1000)),
            ('DT', DecisionTreeClassifier(random_state=14)),
        ]

        # Creates the performance dataframe, that stores the cumulative return for each model
        self.performance = pd.DataFrame(columns=['Model', 'Cumulative Return'])

        # Splits the data and starts the training
        timesplit = TimeSeriesSplit(n_splits = 4, test_size = int(0.2*len(target)))
        for model_name, model in models:
            retsum = 0

            # Since there are 4 splits of data, we need to store the sum of returns in each split
            for train_index, test_index in timesplit.split(self.dataframe):
                X_train = self.dataframe.iloc[train_index]
                X_test = self.dataframe.iloc[test_index]
                y_train = target.iloc[train_index].values.ravel()

                if model_name in self._SCALED_MODELS: # Models that need data normalization
                    # The scaler is fitted on the training fold only. Fitting it on
                    # the whole series would leak the min/max of the test period
                    # into the model.
                    scaler = MinMaxScaler().fit(X_train)
                    X_fit = pd.DataFrame(scaler.transform(X_train), columns = self.dataframe.columns)
                    X_eval = pd.DataFrame(scaler.transform(X_test), columns = self.dataframe.columns)
                else:
                    X_fit, X_eval = X_train, X_test

                clf = model.fit(X_fit, y_train)
                # Returns are always computed on the unscaled prices of the test fold
                retsum += self.calculateCumulativeReturn(clf.predict(X_eval), X_test)

            self.insert(model_name, retsum) # Inserts the result of the model in the performance dataframe

        return self.performance, self.extractsMetadata(target) # Returns the performance dataset and the extracted meta-data

In [3]:
# Generates the complete Meta-dataset for a specific list of stocks
def generatesMetadataset(list_source, end_date="2022-04-22", min_rows=800, max_rows=1500, resume=False):
  """Build the Meta-dataset for every stock of one constituents list.

  For each sector of '/content/<list_source>_constituents.csv', every ticker
  is downloaded from Yahoo Finance, run through ``learning(...).training()``
  and stored as one row holding its meta-features, the cumulative return of
  each model ('ret_<model>'), the best model ('Model'), and the name, sector
  and date range of the series.

  Since this process takes a lot of time, the environment can shut down. A
  backup of the Meta-dataset ('Metadataset_return_<list_source>.csv') and of
  the per-sector statistics ('performance_<list_source>.csv') is therefore
  written to the working directory every time a sector is finished.

  Parameters
  ----------
  list_source : name of the constituents list; it is used verbatim in the
      input and output file names.
  end_date : last date requested from Yahoo Finance.
  min_rows : a series is used only when it has more than this many rows.
  max_rows : longer series are cut to their last ``max_rows`` rows.
  resume : when True, start from the backups found under '/content/' and
      skip the sectors already present in the performance backup, instead of
      starting a new Meta-dataset.

  Returns
  -------
  None. The results are the two CSV files described above.
  """
  if resume:
    # Runnning from the last backup saved
    metadataset = pd.read_csv('/content/Metadataset_return_' + list_source +'.csv')
    performance = pd.read_csv('/content/performance_'+ list_source +'.csv')
  else:
    # Starting a new meta-dataset
    metadataset = pd.DataFrame(columns=['Name', 'Model', 'Sector'])
    performance = pd.DataFrame(columns=['Sector', 'Time(s)', 'Total', 'Success', 'Not found', 'Uncompatible period'])

  companies = pd.read_csv('/content/'+ list_source +'_constituents.csv')

  # For each sector
  for sector in companies['Sector'].unique():
    print("    Sector:", sector)

    if sector in performance['Sector'].unique(): # If running a backup
      continue

    # Flags used to track the yfinance results
    success, not_found, uncompatible = 0, 0, 0
    # Rows of this sector; added to the Meta-dataset in one step after the loop
    # (DataFrame.append was removed in pandas 2.0)
    sector_rows = []
    companies_by_sector = companies[companies.Sector == sector]

    starting_time = time.time() # Stores the time generating each sector of the Meta-dataset

    # For each stock in the list
    for companie in companies_by_sector['Name']:

      try:
        # Verifies the stock time series is listed, and if has the desirable size
        df = pdr.get_data_yahoo(companie, end=end_date, progress=False)

        if len(df) > min_rows: # More than min_rows rows

          if len(df) > max_rows: # If it is longer than max_rows, just gets the last max_rows lines of data
            df = df[-max_rows:]

          # Runs the base-level learning: performance extraction and data characterization
          ranking, metadata_dict = learning(df).training()

          # Stores the Cumulative Return of each model, enables the regression of the Meta-dataset
          for model_name, model_return in zip(ranking['Model'], ranking['Cumulative Return']):
            metadata_dict["ret_" + model_name] = model_return

          # Stores the model with greatest Cumulative Return, enables the classification of the Meta-dataset
          ranking = ranking.sort_values(by=['Cumulative Return'], ascending=False)

          # Additional data used on the Meta-dataset
          metadata_dict['Name'] = companie
          metadata_dict['Sector'] = sector
          metadata_dict['Model'] = ranking['Model'].iloc[0]
          metadata_dict['Starting_date'] = df.index[0].date()
          metadata_dict['Closing_date'] = df.index[-1].date()

          sector_rows.append(metadata_dict)
          # Counted only once the row exists, so a failure during training is
          # not counted as both a success and a "not found"
          success+=1

        else:
          # NOTE(review): presumably this forces a lookup so that an unknown ticker
          # raises and is counted as "not found" rather than as too short - confirm
          yfin.Ticker(companie).actions.size
          uncompatible+=1 # Time serie with min_rows rows or fewer

      except Exception:
        # Any failure while downloading or processing this ticker is counted as
        # "not found". `Exception` keeps KeyboardInterrupt able to stop the run.
        not_found+=1

    # Everytime a Sector is completely finished, stores information of performance and creates a backup of the Meta-dataset
    if sector_rows:
      metadataset = pd.concat([metadataset, pd.DataFrame(sector_rows)], ignore_index=True)
    metadataset = metadataset.dropna(axis=1,how='all')

    # Performance data
    performance_dict = {
        'Sector': sector,
        'Time(s)': round(time.time() - starting_time, 2),
        'Total': success+not_found+uncompatible,
        'Success': success,
        'Not found': not_found,
        'Uncompatible period': uncompatible,
    }
    performance = pd.concat([performance, pd.DataFrame([performance_dict])], ignore_index=True)

    # Most recent version of the Meta-dataset, used as backup
    metadataset.to_csv('Metadataset_return_'+ list_source + '.csv', encoding='utf-8',index=False)
    performance.to_csv('performance_' + list_source + '.csv', encoding='utf-8',index=False)

  # Finished all companies from the list
  tempo_total = performance['Time(s)'].sum()
  print("Total time for generating Meta-dataset:", str(datetime.timedelta(seconds=round(tempo_total))) )

In [None]:
# Creates three Meta-datasets for three different lists of stocks.
# Each name is used verbatim in the file names, so it must match the
# '<name>_constituents.csv' file exactly (case matters on a case-sensitive
# filesystem): the Wilshire list is stored as 'Wilshire_constituents.csv'.
generatesMetadataset('SP500')
generatesMetadataset('Wilshire')
generatesMetadataset('Stockmonitor')

## Concatenating Meta-datasets

In [None]:
# Concatenates the three Meta-datasets and remove duplicates
def _load_metadataset(list_source):
  """Read one Meta-dataset backup from '/content/' and drop incomplete rows.

  The backups are written with ``index=False``, so their first column is
  'Name' and must be kept. Only a leftover index column (named 'Unnamed: ...'
  by pandas) is removed, in case the file was saved with its index.
  """
  frame = pd.read_csv('/content/Metadataset_return_' + list_source + '.csv')
  index_columns = [column for column in frame.columns if str(column).startswith('Unnamed')]
  return frame.drop(columns=index_columns).dropna()

stockmonitor_metadataset = _load_metadataset('Stockmonitor')
sp500_metadataset = _load_metadataset('SP500')
wilshire_metadataset = _load_metadataset('Wilshire')

# A stock present in more than one list is kept once (first occurrence wins)
metadataset = pd.concat([stockmonitor_metadataset, sp500_metadataset, wilshire_metadataset]).reset_index(drop=True)
metadataset = metadataset.drop_duplicates(subset='Name').reset_index(drop=True)

# This is the version of the Meta-dataset that uses the Cumulative Return. The Balanced Accuracy version
# can be obtained in analogous way. The results obtained are stored in Meta-dataset.csv  
metadataset.to_csv('Meta-dataset.csv', encoding='utf-8',index=False)