Added more flexible ticker calls

cuemacro · May 26, 2021 · 1a1e4d0 · 1a1e4d0
1 parent 8f2ad8b
commit 1a1e4d0
Show file tree

Hide file tree

Showing 5 changed files with 153 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -128,6 +128,8 @@ individual data providers)
 
 # Coding log
 
+* 26 May 2021
+  * Added more flexible ticker calls (with a '_' prefix)
 * 23 May 2021
   * Fixed various bugs with reading ECO_RELEASE_DT etc. dates from Bloomberg
   * Fixed bugs when predefined ticker is defined differently in different categories  

diff --git a/findatapy/market/market.py b/findatapy/market/market.py
@@ -365,13 +365,15 @@ def fetch_market(self, md_request=None, md_request_df=None, md_request_str=None,
 
         return data_frame
 
-    def create_md_request_from_dataframe(self, md_request_df, md_request=None, start_date=None, finish_date=None):
+    def create_md_request_from_dataframe(self, md_request_df, md_request=None, start_date=None, finish_date=None, smart_group=True):
 
         md_list = []
 
         # Aggregate/shrink dataframe grouping it by common attributes
         # tickers, vendor_tickers, fields, vendor_fields
-        md_request_df = ConfigManager().get_instance().smart_group_dataframe_tickers(md_request_df, ret_fields=md_request_df.columns.tolist())
+        if smart_group:
+            md_request_df = ConfigManager().get_instance().smart_group_dataframe_tickers(
+                md_request_df, ret_fields=md_request_df.columns.tolist())
 
         # Now populate MarketDataRequests based on the DataFrame
         for index, row in md_request_df.iterrows():
@@ -406,7 +408,7 @@ def create_md_request_from_dict(self, md_request_dict, md_request=None, start_da
 
         return md_request
 
-    def create_md_request_from_str(self, md_request_str, md_request=None, start_date=None, finish_date=None):
+    def create_md_request_from_str(self, md_request_str, md_request=None, start_date=None, finish_date=None, best_match_only=False, smart_group=True):
 
         json_md_request = None
 
@@ -427,7 +429,10 @@ def create_md_request_from_str(self, md_request_str, md_request=None, start_date
         except:
             pass
 
+        # If we failed to parse it as JSON, let's try parsing it as a string
         if json_md_request is None:
+
+            # Create a list of input parameters for our MarketDataRequest
             split_lst = []
 
             word = ''
@@ -457,7 +462,8 @@ def create_md_request_from_str(self, md_request_str, md_request=None, start_date
             if environment in constants.possible_data_environment:
                 i = 0
 
-            elif environment == 'raw':
+            # Otherwise, what if the user wants to specify each property manually?
+            elif environment == 'raw' or environment == 'r':
                 # Here the user can specify any tickers/fields etc. they want, they don't have to be predefined
                 # eg. raw.data_source.bloomberg.tickers.EURUSD.vendor_tickers.EURUSD Curncy
                 if md_request is None:
@@ -480,10 +486,23 @@ def create_md_request_from_str(self, md_request_str, md_request=None, start_date
 
                 return self.create_md_request_from_freeform(md_request)
 
+            # Otherwise we do a partial match of predefined tickers
+            elif environment == "_":
+                # Try a heuristic/approximate match eg. _.quandl.fx
+                md_request_df = ConfigManager().get_instance().free_form_tickers_query(md_request_params[1:],
+                                                                                       best_match_only=best_match_only,
+                                                                                       smart_group=smart_group)
+
+                return self.create_md_request_from_dataframe(md_request_df,
+                                                             md_request=md_request, start_date=start_date,
+                                                             finish_date=finish_date)
+
             else:
                 i = -1
                 environment = None
 
+            # Otherwise the user has specified the MarketDataRequest str in the form
+            # category.data_source.freq.cut.tickers.field eg. fx.bloomberg.daily.NYC.EURUSD.close
             category = md_request_params[i + 1]
             data_source = md_request_params[i + 2]
 

diff --git a/findatapy/util/configmanager.py b/findatapy/util/configmanager.py
@@ -17,9 +17,12 @@
 import csv
 import pandas as pd
 
+
+from findatapy.timeseries import Calculations
 from findatapy.util.dataconstants import DataConstants
 from findatapy.util.singleton import Singleton
 from findatapy.util.loggermanager import LoggerManager
+
 from dateutil.parser import parse
 
 import re
@@ -170,7 +173,14 @@ def populate_time_series_dictionaries(data_constants=None):
                             else:
                                 ConfigManager._dict_time_series_category_tickers_library_to_library[key] = [tickers]
 
-        ConfigManager._data_frame_time_series_tickers = pd.concat(df_tickers)
+        try:
+            df_tickers = pd.concat(df_tickers).sort_values(by=['category', 'data_source', 'freq', 'cut'])
+        except:
+            pass
+
+        df_tickers = df_tickers.reset_index().drop('level_0', axis=1).reset_index()
+
+        ConfigManager._data_frame_time_series_tickers = df_tickers
 
         ## Populate fields conversions
         reader = csv.DictReader(open(data_constants.time_series_fields_list))
@@ -249,11 +259,96 @@ def free_form_tickers_regex_query(self, category=None, data_source=None, freq=No
 
         return df
 
+    def free_form_tickers_query(self, free_form_query, best_match_only=False,
+                                ret_fields=['category', 'data_source', 'freq', 'cut', 'tickers', 'vendor_tickers', 'fields'],
+                                smart_group=True):
+        """From a string or list of properties for predefined tickers, we create a DataFrame that can be used to populate a
+        MarketDataRequest. We search through all the predefined tickers, and "guess" any matches to our query, without
+        having to use the standard query format which consists of category.data_source.freq.cut.ticker such as this example
+        fx.bloomberg.daily.NYC.EURUSD.close
+
+        eg. quandl.fx will match all tickers which are from "quandl" and have a "category" fx
+
+        We must be careful to make sure that categories, data_sources etc. are unique and do not overlap with other
+        properties like tickers, because each keyword is filtered on the first column where it has an exact match
+
+        Parameters
+        ----------
+        free_form_query : str or list(str)
+            A query that can be used to generate a MarketDataRequest, either '.' delimited or already split into keywords
+
+            eg. quandl.fx
+
+        best_match_only : bool
+            Only return at most 1 row of a DataFrame (default: False)
+
+        ret_fields : str(list)
+            Which properties of a MarketDataRequest to return
+
+        smart_group : bool
+            Smart group tickers of a particular category in a specific row
+
+        Returns
+        -------
+        DataFrame (or None if an exception is raised while matching the keywords)
+        """
+        logger = LoggerManager().getLogger(__name__)
+
+        if isinstance(free_form_query, str):
+            keywords = free_form_query.split('.')
+        else:
+            keywords = free_form_query
+
+        logger.info("Finding ticker combination which matches " + str(free_form_query))
+
+        df = ConfigManager._data_frame_time_series_tickers
+
+        df_joined = df
+
+        # Search through all the keywords, and see if matches with any columns of our predefined tickers
+        try:
+            for k in keywords:
+                for c in df.columns:
+                    try:
+                        df_temp = df_joined[df_joined[c] == k]
+                    except:
+                        df_temp = pd.DataFrame()
+
+                    if not(df_temp.empty):
+                        df_joined = df_temp
+                        break
+
+            df = df_joined
+        except Exception as e:
+            return None
+
+        if len(df.index) > 1:
+            logger.info("Found multiple matches for ticker combination, first trying smart group...")
+
+            if smart_group:
+                df = self.smart_group_dataframe_tickers(df, ret_fields=ret_fields)
+
+            if best_match_only:
+                logger.info("Taking only top match...")
+                df = pd.DataFrame(df.head(1))
+
+        return df
+
     @staticmethod
     def smart_group_dataframe_tickers(df, ret_fields=['category', 'data_source', 'freq', 'cut']):
-        """Groups together a dataframe of metadata associated with assets
+        """Groups together a DataFrame of metadata associated with assets, which can be used to create MarketDataRequest
+        objects
         """
 
+        if ret_fields is None:
+            ret_fields = df.columns.to_list()
+        elif isinstance(ret_fields, 'str'):
+            if ret_fields == 'all':
+                ret_fields = df.columns.to_list()
+        elif isinstance(ret_fields, list):
+            if ret_fields == []:
+                ret_fields =  df.columns.to_list()
+
         if set(['category', 'data_source', 'freq', 'cut']).issubset(ret_fields):
             group_fields = ret_fields.copy()
 
@@ -270,12 +365,22 @@ def smart_group_dataframe_tickers(df, ret_fields=['category', 'data_source', 'fr
                 group_fields.remove('vendor_tickers')
 
             if agg_dict != {}:
-                df = df.groupby(group_fields).agg(agg_dict)
 
-                for i, g in enumerate(group_fields):
-                    df[g] = df.index.get_level_values(i)
+                try:
+                    df = df.drop(data_constants.drop_cols_smart_tickers_grouping, axis=1)
+                except:
+                    pass
+
+                df_temp = df.groupby(group_fields).agg(agg_dict)
+
+                # If grouping fails (when there aren't multiple elements to group!)
+                if df_temp.empty:
+                   pass
+                else:
+                    for i, g in enumerate(group_fields):
+                        df_temp[g] = df_temp.index.get_level_values(i)
 
-                df = df.reset_index(drop=True)
+                    df = df_temp.reset_index(drop=True)
 
         return df
 

diff --git a/findatapy/util/dataconstants.py b/findatapy/util/dataconstants.py
@@ -107,6 +107,9 @@ class DataConstants(object):
     events_category = 'events'
     events_category_dt = 'events_dt'
 
+    # Ignore these columns when doing smart grouping
+    drop_cols_smart_tickers_grouping = ['level_0']
+
     ###### FOR CURRENT VERSION
 
     # which marketdatagenerator type to use?

diff --git a/findatapy_examples/freeform_md_request.py b/findatapy_examples/freeform_md_request.py
@@ -50,8 +50,21 @@
     # We can also create MarketDataRequest by passing a str that relates to predefined tickers
     md_request = market.create_md_request_from_str('backtest.fx.bloomberg.daily.NYC.EURUSD.close')
 
+    print(md_request)
+
+    # We can do an approximate match for predefined tickers, and findatapy will "guess" the closest match
+    md_request = market.create_md_request_from_str('_.bloomberg.EURUSD.NYC')
+
+    print(md_request)
+
+    # We get *all* the predefined tickers which match in any column with quandl and fx, and these will be smart grouped
+    # into the smallest number of requests
+    md_request = market.create_md_request_from_str('_.quandl.fx', best_match_only=False, smart_group=True)
+
+    print(md_request)
+
     # We can also create MarketDataRequest by passing a str for an arbitrary Parquet, note, if we have dots in the path
-    # we need to use {} to denote them
+    # we need to use {} to denote them (we can use either 'raw' to denote this or just 'r')
     md_request = market.create_md_request_from_str('raw.data_source.{c:\parquet_files\dump.parquet}.tickers.EURUSD')
 
     print(md_request)