In [1]:
import os
from collections import Counter
from glob import glob 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def fs_results_aggregation(fs_paths, n):
    """
    Build a per-ticker summary of feature selection results.

    Each csv in 'fs_paths' is read, rows containing missing
    values are dropped, and the first n and last n entries of
    its 'feature' column are kept. We assume that the feature
    selection csv is sorted from best to worst, so these are
    the top n and the bottom n features.

    The ticker name is the file name without its directory and
    without its final extension (dots inside the name are kept).
    When a file has fewer than n usable rows, the missing
    top/bottom slots are filled with None so that every value
    stays under the right column.

    :param fs_paths: paths with feature selection results
    :type fs_paths: [str]
    :param n: number features to display
    :type n: int
    :return: dataframe with one row per path and the columns
             'ticker', 'top_1'..'top_n', 'bottom_1'..'bottom_n'
    :rtype: pd.DataFrame
    """

    columns = ["ticker"]
    columns += ["top_{}".format(i + 1) for i in range(n)]
    columns += ["bottom_{}".format(i + 1) for i in range(n)]

    result = []
    for path in tqdm(fs_paths):
        features = pd.read_csv(path).dropna()["feature"]
        top_n = list(features.head(n).values)
        bottom_n = list(features.tail(n).values)
        # Pad each group on its own: a short file would otherwise shift
        # its bottom features into the remaining 'top_*' columns.
        top_n += [None] * (n - len(top_n))
        bottom_n += [None] * (n - len(bottom_n))
        # basename/splitext only strip the directory and the last
        # extension, so a name such as "BRK.B US Equity" is kept whole
        # and the platform's path separator is honoured.
        name = os.path.splitext(os.path.basename(path))[0]
        result.append([name] + top_n + bottom_n)
    return pd.DataFrame(result, columns=columns)



def get_top_features_from_fs_results(fs_results, top_k=10):
    """
    get the word, lag, and frequency of the most common
    top1 features in the dataframe 'fs_results'.

    Every value of the 'top_1' column is read as
    '<word>_<lag>': the text after the last underscore is
    parsed as the integer lag and the remaining underscores
    are turned into spaces to form the word. The frequency is
    the share of rows of 'fs_results' whose top1 feature is
    that value, formatted as a percentage with one decimal.

    :param fs_results: feature selection dataframe
    :type fs_results: DataFrame
    :param top_k: number features to display
    :type top_k: int
    :return: dataframe with the columns 'word', 'lags' and
             'frequency', most frequent feature first
    :rtype: pd.DataFrame
    :raises ValueError: if the text after the last underscore
                        of a feature is not an integer
    """

    size = fs_results.shape[0]
    # Rows without a top1 feature cannot be parsed, so they are skipped
    # in the count but still belong to the denominator 'size'.
    top_1 = fs_results["top_1"].dropna()
    tops = Counter(top_1.tolist()).most_common(top_k)

    rows = []
    for feature, count in tops:
        word, _, lag = feature.rpartition("_")
        # The percentage string is built here instead of being written
        # back into a numeric column, which pandas flags as an
        # incompatible-dtype assignment.
        frequency = "{:.1%}".format(count / size)
        rows.append((word.replace("_", " "), int(lag), frequency))
    return pd.DataFrame(rows, columns=["word", "lags", "frequency"])

# SFI NYSE Results

In [3]:
# SFI csv files for NYSE, in sorted order so the rows come out deterministically.
path_sfi_nyse = sorted(glob("results/sfi/nyse/*.csv"))
# First/last 5 features per ticker, then the most frequent top-1 features.
sfi_nyse_result = fs_results_aggregation(fs_paths=path_sfi_nyse, n=5)
get_top_features_from_fs_results(fs_results=sfi_nyse_result)

100%|██████████| 1628/1628 [00:09<00:00, 172.42it/s]


Unnamed: 0,word,lags,frequency
0,DOW JONES,5,2.0%
1,dow jones,8,1.4%
2,happy,19,1.3%
3,dow jones,15,1.2%
4,bonds,1,1.0%
5,chance,18,0.9%
6,happy,17,0.9%
7,headlines,2,0.9%
8,debt,10,0.8%
9,derivatives,18,0.7%


# MDI NYSE Results

In [4]:
# MDI csv files for NYSE, in sorted order so the rows come out deterministically.
path_mdi_nyse = sorted(glob("results/mdi/nyse/*.csv"))
# First/last 5 features per ticker, then the most frequent top-1 features.
mdi_nyse_result = fs_results_aggregation(fs_paths=path_mdi_nyse, n=5)
get_top_features_from_fs_results(fs_results=mdi_nyse_result)

100%|██████████| 1628/1628 [00:09<00:00, 164.68it/s]


Unnamed: 0,word,lags,frequency
0,short selling,16,4.7%
1,short selling,18,4.0%
2,short selling,15,3.6%
3,short sell,16,3.1%
4,short selling,14,2.6%
5,short selling,13,2.5%
6,short selling,17,2.1%
7,short selling,20,1.4%
8,short sell,18,1.2%
9,short selling,8,1.2%


# SFI NASDAQ Results

In [5]:
# SFI csv files for NASDAQ, in sorted order so the rows come out deterministically.
path_sfi_nasdaq = sorted(glob("results/sfi/nasdaq/*.csv"))
# First/last 5 features per ticker, then the most frequent top-1 features.
sfi_nasdaq_result = fs_results_aggregation(fs_paths=path_sfi_nasdaq, n=5)
get_top_features_from_fs_results(fs_results=sfi_nasdaq_result)

100%|██████████| 7868/7868 [00:58<00:00, 133.57it/s]


Unnamed: 0,word,lags,frequency
0,happy,19,0.5%
1,labor,4,0.4%
2,DOW JONES,5,0.4%
3,lifestyle,18,0.4%
4,bonds,1,0.4%
5,dow jones,15,0.3%
6,dow jones,8,0.3%
7,chance,18,0.3%
8,happy,17,0.3%
9,debt,10,0.3%


# MDI NASDAQ Results

In [6]:
# MDI csv files for NASDAQ, in sorted order so the rows come out deterministically.
path_mdi_nasdaq = sorted(glob("results/mdi/nasdaq/*.csv"))
# First/last 5 features per ticker, then the most frequent top-1 features.
mdi_nasdaq_result = fs_results_aggregation(fs_paths=path_mdi_nasdaq, n=5)
get_top_features_from_fs_results(fs_results=mdi_nasdaq_result)

100%|██████████| 7868/7868 [00:51<00:00, 152.66it/s]


Unnamed: 0,word,lags,frequency
0,short selling,16,1.6%
1,short selling,18,1.4%
2,short selling,14,1.2%
3,short selling,15,1.2%
4,short sell,16,1.2%
5,short selling,13,1.0%
6,short selling,12,0.7%
7,short sell,15,0.6%
8,short sell,14,0.6%
9,short sell,18,0.6%


# Case analysis
## APPLE

In [25]:
# Index both NASDAQ summaries by ticker so a single stock can be looked up by name.
df1 = sfi_nasdaq_result.set_index("ticker")
df2 = mdi_nasdaq_result.set_index("ticker")

ticker_name = "AAPL US Equity"

# One column per method (SFI from df1, MDI from df2), one row per top/bottom slot.
# `axis` is passed by keyword: the positional form is deprecated and is
# rejected by pandas >= 2.0.
apple = pd.concat(
    [
        df1.loc[ticker_name, :].to_frame().add_prefix("SFI "),
        df2.loc[ticker_name, :].to_frame().add_prefix("MDI "),
    ],
    axis=1,
)
apple

Unnamed: 0,SFI AAPL US Equity,MDI AAPL US Equity
top_1,nyse_10,short_sell_15
top_2,massachusetts_17,short_selling_6
top_3,votes_12,short_sell_6
top_4,fed_20,short_selling_15
top_5,nyse_19,financial_markets_5
bottom_1,dow_jones_2,rare_earths_8
bottom_2,headlines_3,short_sell_20
bottom_3,senate_17,BUY_AND_HOLD_11
bottom_4,headlines_14,BUY_AND_HOLD_14
bottom_5,DOW_JONES_2,rare_earths_15


## GOOGLE

In [26]:
ticker_name = "GOOGL US Equity"

# One column per method (SFI from df1, MDI from df2), one row per top/bottom slot.
# `axis` is passed by keyword: the positional form is deprecated and is
# rejected by pandas >= 2.0.
google = pd.concat(
    [
        df1.loc[ticker_name, :].to_frame().add_prefix("SFI "),
        df2.loc[ticker_name, :].to_frame().add_prefix("MDI "),
    ],
    axis=1,
)
google

Unnamed: 0,SFI GOOGL US Equity,MDI GOOGL US Equity
top_1,stats_2,short_sell_6
top_2,unemployment_20,BUY_AND_HOLD_20
top_3,wisconsin_20,short_selling_6
top_4,federal_20,bank_19
top_5,economics_20,short_selling_9
bottom_1,governor_2,BUY_AND_HOLD_10
bottom_2,senate_4,financial_markets_6
bottom_3,senate_11,short_sell_4
bottom_4,senate_2,BUY_AND_HOLD_3
bottom_5,senate_8,short_sell_19


## TESLA

In [27]:
ticker_name = "TSLA US Equity"

# One column per method (SFI from df1, MDI from df2), one row per top/bottom slot.
# Stored as `tesla` rather than the copy-pasted `google`, so the Google table
# is not overwritten. `axis` is passed by keyword: the positional form is
# deprecated and is rejected by pandas >= 2.0.
tesla = pd.concat(
    [
        df1.loc[ticker_name, :].to_frame().add_prefix("SFI "),
        df2.loc[ticker_name, :].to_frame().add_prefix("MDI "),
    ],
    axis=1,
)
tesla

Unnamed: 0,SFI TSLA US Equity,MDI TSLA US Equity
top_1,police_15,fond_7
top_2,risk_13,voters_5
top_3,political_18,votes_13
top_4,sell_4,rich_5
top_5,portfolio_18,headlines_8
bottom_1,votes_14,nasdaq_6
bottom_2,votes_8,headlines_17
bottom_3,votes_1,elected_17
bottom_4,votes_4,gains_15
bottom_5,votes_5,served_20


## AMAZON

In [28]:
ticker_name = "AMZN US Equity"

# One column per method (SFI from df1, MDI from df2), one row per top/bottom slot.
# Stored as `amazon` rather than the copy-pasted `google`, so the Google table
# is not overwritten. `axis` is passed by keyword: the positional form is
# deprecated and is rejected by pandas >= 2.0.
amazon = pd.concat(
    [
        df1.loc[ticker_name, :].to_frame().add_prefix("SFI "),
        df2.loc[ticker_name, :].to_frame().add_prefix("MDI "),
    ],
    axis=1,
)
amazon

Unnamed: 0,SFI AMZN US Equity,MDI AMZN US Equity
top_1,portfolio_6,return_7
top_2,present_7,texas_14
top_3,firm_4,marriage_10
top_4,founded_9,politics_17
top_5,fond_15,social_12
bottom_1,headlines_4,BUY_AND_HOLD_10
bottom_2,DOW_JONES_2,short_sell_18
bottom_3,fond_18,short_selling_17
bottom_4,DOW_JONES_7,BUY_AND_HOLD_12
bottom_5,virginia_7,rare_earths_20
