In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import os
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from multiprocessing import Pool, cpu_count

In [23]:
def similarity_prob(kwargs):
    """Estimate how often windows resembling ``other`` preceded a large move.

    Every complete ``n_days`` window of ``series`` is correlated (Pearson,
    via ``np.corrcoef``) with ``other``. Among the windows whose correlation
    exceeds ``threshold``, the function measures how often the maximum of the
    following ``n_days`` values rose above ``(1 + perc)`` times the window's
    last value, and how often the minimum fell below ``(1 - perc)`` times it.

    Parameters
    ----------
    kwargs : dict
        Single payload argument (this is how ``find_prob_total`` hands work to
        ``Pool.map``) with the keys:

        - ``'series'``: pandas Series to scan.
        - ``'other'``: reference pattern; must contain ``n_days`` values.
        - ``'n_days'``: window length and look-ahead horizon.
        - ``'threshold'``: correlation a window must exceed to count as similar.
        - ``'perc'``: relative move, e.g. ``0.1`` for 10%.
        - ``'name'``: label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with columns ``total`` (number of similar
        windows), ``inc_prob`` and ``dec_prob`` (fraction of those windows
        followed by a rise / a fall). Both fractions are 0 when ``total`` is 0.
    """
    series = kwargs['series']
    n_days = kwargs['n_days']
    threshold = kwargs['threshold']
    perc = kwargs['perc']
    name = kwargs['name']
    # Convert the reference pattern once instead of inside every window.
    other = np.asarray(kwargs['other'])

    rolling = series.rolling(window=n_days)
    # Positions n_days-1 .. len-n_days-1: windows that are complete AND have
    # n_days further observations after them. Slicing by position (rather than
    # dropna) keeps the four series below aligned even when a correlation is
    # NaN, which happens for any constant window (zero variance).
    usable = slice(n_days - 1, -n_days)
    with np.errstate(divide='ignore', invalid='ignore'):
        corrs = rolling.apply(lambda x: np.corrcoef(x, other)[0, 1]).iloc[usable]
    # shift(-n_days) puts the max/min of the NEXT n_days values on each row.
    max_values = rolling.max().shift(-n_days).iloc[usable]
    min_values = rolling.min().shift(-n_days).iloc[usable]
    values = series.iloc[usable]

    # NaN correlations compare as False, i.e. they never count as similar.
    similar = (corrs > threshold).values
    total = int(similar.sum())
    if total == 0:
        # Dividing a NumPy integer by 0 gives NaN with a warning instead of
        # raising ZeroDivisionError, so the empty case is handled explicitly.
        return pd.DataFrame({'total': 0, 'inc_prob': 0.0, 'dec_prob': 0.0}, index=[name])

    inc_op = (max_values > (values * (1 + perc))).loc[similar]
    dec_op = (min_values < (values * (1 - perc))).loc[similar]
    return pd.DataFrame(
        {
            'total': total,
            'inc_prob': inc_op.sum() / total,
            'dec_prob': dec_op.sum() / total,
        },
        index=[name],
    )

In [24]:
# Directory containing the yearly '<year>price.csv' files. Set the
# ASX_PRICE_PATH environment variable to read them from another location;
# the original local path remains the default.
price_path = os.environ.get(
    'ASX_PRICE_PATH',
    '/Users/feizhan/Dropbox/Project2M/ASXYearlyCompanyConsolidation/',
)

In [25]:
# Read one price CSV per year and stack them with a single concat call.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copied the accumulated frame on every iteration.
# Row labels from each file are kept as-is (no ignore_index), matching the
# frame the append loop produced.
years = [2013, 2014, 2015, 2016]
dat = pd.concat(
    [pd.read_csv(os.path.join(price_path, str(year) + 'price.csv')) for year in years]
)

In [26]:
# Group the combined price rows by their 'code' column; iterating the result
# yields (code, rows-for-that-code) pairs, which find_prob_total consumes.
gpd = dat.groupby('code')

In [27]:
def find_prob_total(gpd, func, n_days=10, threshold=.7, perc=.1):
    """Apply ``func`` to every group of ``gpd`` in a process pool.

    For each ``(name, group)`` pair produced by iterating ``gpd``, a payload
    dict is built from the group's ``'Close'`` column, and ``func`` is called
    with that dict in a worker process.

    Parameters
    ----------
    gpd : iterable of (name, group)
        Each ``group`` must support ``group['Close']`` returning an object
        with ``.iloc`` (e.g. a pandas groupby over a frame with a ``'Close'``
        column).
    func : callable
        Called with one payload dict holding the keys ``'n_days'``,
        ``'threshold'``, ``'perc'``, ``'series'`` (the group's ``'Close'``
        values), ``'other'`` (the last ``n_days`` of them) and ``'name'``.
        Its return values are passed to ``pd.concat``.
    n_days, threshold, perc
        Forwarded unchanged in every payload.

    Returns
    -------
    The result of ``pd.concat`` over the per-group results, in group order.
    """
    payloads = []
    for name, group in gpd:
        close = group['Close']
        payloads.append({
            'n_days': n_days,
            'threshold': threshold,
            'perc': perc,
            'series': close,
            # Reference pattern: the most recent n_days values of this group.
            'other': close.iloc[-n_days:],
            'name': name,
        })

    # One worker per CPU reported by cpu_count(); map keeps payload order.
    with Pool(cpu_count()) as pool:
        per_group = pool.map(func, payloads)
    return pd.concat(per_group)

In [28]:
# Run similarity_prob for every 'code' group in parallel and time the whole call.
%time similarity_analysis = find_prob_total(gpd, similarity_prob)

CPU times: user 2.58 s, sys: 264 ms, total: 2.84 s
Wall time: 46.4 s


In [30]:
# Preview the first rows of the per-code results (total, inc_prob, dec_prob).
similarity_analysis.head()

Unnamed: 0,dec_prob,inc_prob,total
1AL.AX,0.0,0.0,5
1PG.AX,0.238095,0.428571,21
1ST.AX,0.315789,0.280702,57
3DM.AX,0.337209,0.430233,86
3PL.AX,0.135802,0.074074,81


In [4]:
# Date range and ticker for the single-code example below.
start_year = 2011
end_year = 2016
# pd.datetime (an alias of datetime.datetime) was deprecated in pandas 1.0 and
# is no longer available in current releases. pd.Timestamp is a datetime
# subclass, so it is accepted wherever the old objects were.
start = pd.Timestamp(start_year, 1, 1)
end = pd.Timestamp(end_year, 12, 31)
code = 'AJL.AX'
# Network call: downloads the price history for `code` over [start, end].
# NOTE(review): availability of the 'yahoo' source depends on the installed
# pandas_datareader version - confirm before relying on a fresh run.
code_dat = web.DataReader(code, 'yahoo', start, end)

In [20]:
def single_similarity_prob(series, n_days=10, threshold=.7, perc=0.1):
    """Estimate how often windows resembling the latest one preceded a large move.

    The last ``n_days`` values of ``series`` are the reference pattern. Every
    complete ``n_days`` window of ``series`` is correlated (Pearson, via
    ``np.corrcoef``) with it. Among the windows whose correlation exceeds
    ``threshold``, the function measures how often the maximum of the
    following ``n_days`` values rose above ``(1 + perc)`` times the window's
    last value, and how often the minimum fell below ``(1 - perc)`` times it.

    Parameters
    ----------
    series : pandas.Series
        Values to scan, oldest first.
    n_days : int
        Window length and look-ahead horizon.
    threshold : float
        Correlation a window must exceed to count as similar.
    perc : float
        Relative move, e.g. ``0.1`` for 10%.

    Returns
    -------
    pandas.DataFrame
        A single row (default index) with columns ``total`` (number of similar
        windows), ``inc_prob`` and ``dec_prob`` (fraction of those windows
        followed by a rise / a fall). Both fractions are 0 when ``total`` is 0.
    """
    # Convert the reference pattern once instead of inside every window.
    other = np.asarray(series.iloc[-n_days:])

    rolling = series.rolling(window=n_days)
    # Positions n_days-1 .. len-n_days-1: windows that are complete AND have
    # n_days further observations after them. Slicing by position (rather than
    # dropna) keeps the four series below aligned even when a correlation is
    # NaN, which happens for any constant window (zero variance).
    usable = slice(n_days - 1, -n_days)
    with np.errstate(divide='ignore', invalid='ignore'):
        corrs = rolling.apply(lambda x: np.corrcoef(x, other)[0, 1]).iloc[usable]
    # shift(-n_days) puts the max/min of the NEXT n_days values on each row.
    max_values = rolling.max().shift(-n_days).iloc[usable]
    min_values = rolling.min().shift(-n_days).iloc[usable]
    values = series.iloc[usable]

    # NaN correlations compare as False, i.e. they never count as similar.
    similar = (corrs > threshold).values
    total = int(similar.sum())
    if total == 0:
        # Dividing a NumPy integer by 0 gives NaN with a warning instead of
        # raising ZeroDivisionError, so the empty case is handled explicitly.
        return pd.DataFrame({'total': [0], 'inc_prob': [0.0], 'dec_prob': [0.0]})

    inc_op = (max_values > (values * (1 + perc))).loc[similar]
    dec_op = (min_values < (values * (1 - perc))).loc[similar]
    return pd.DataFrame({
        'total': [total],
        'inc_prob': [inc_op.sum() / total],
        'dec_prob': [dec_op.sum() / total],
    })

In [21]:
# Example on the downloaded 'Close' column: 10-day pattern, moves of 15%,
# counting only windows whose correlation with the latest 10 days exceeds 0.8.
single_similarity_prob(code_dat.loc[:, 'Close'], n_days=10, perc=.15, threshold=.8)

Unnamed: 0,dec_prob,inc_prob,total
0,0.058824,0.235294,17
