In [None]:
"""
The following code shows how to calculate the daily return of stock price, stock volume and sector price

"""
import pandas as pd
import datetime 

def f(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``."""
    return datetime.datetime.strptime(x, '%Y-%m-%d')


# Load the raw table: the first row is the header, the first column becomes the
# index, and every 'Date' cell is parsed by ``f`` while reading.
data_1 = pd.read_csv(
    'price_sector.csv',
    header=0,
    index_col=0,
    converters={'Date': f},
)
# Distinct raw ticker strings, in order of first appearance.
Ticker_List = data_1['Ticker'].unique()

def ticker_convert(a):
    """Return ``a`` with its last 10 characters removed.

    Strings of 10 characters or fewer come back as the empty string.
    NOTE(review): the 10-character tail is presumably a fixed vendor suffix
    on the raw ticker -- confirm against the input file.
    """
    suffix_length = 10
    return a[:-suffix_length]
# Derive the shortened ticker for every row, then keep only the five columns
# used by the rest of the notebook (in this order).
data_1 = data_1.assign(
    New_Ticker=data_1['Ticker'].apply(ticker_convert)
)[['New_Ticker', 'Date', 'Price', 'Volume', 'Sector']]

def time_convert(time):
    """Re-format a ``MM/DD/YYYY`` date string as ``YYYY-MM-DD``.

    Raises ``ValueError`` (from ``strptime``) when ``time`` does not match
    the ``MM/DD/YYYY`` layout.
    """
    parsed = datetime.datetime.strptime(time, "%m/%d/%Y")
    return parsed.strftime("%Y-%m-%d")
# Compute daily price and volume returns per ticker.
#
# Two consecutive observations of one ticker that are more than ``max_gap``
# apart are treated as a break in the series: the ticker's rows are cut into
# gap-free segments and returns are computed inside each segment only, so the
# first row of every segment gets NaN returns instead of a return that spans
# the gap.
max_gap = datetime.timedelta(days=10)

new_ticker_list = data_1.New_Ticker.unique()
ticker_frames = []  # one frame per gap-free segment; concatenated once below
for new_ticker in new_ticker_list:
    # drop=True throws the old row labels away directly, so this no longer
    # depends on the old index surfacing as a column literally named 'index'.
    sub_data = (
        data_1[data_1.New_Ticker == new_ticker]
        .sort_values(['Date'])
        .reset_index(drop=True)
    )
    date_diff = sub_data['Date'].diff()

    # Segment boundaries: position 0, every row that follows a long gap, and
    # the end of the frame. With no long gap this is simply [0, n], i.e. one
    # segment covering the whole ticker.
    split_index = (
        [0]
        + list(date_diff.index[date_diff > max_gap])
        + [len(sub_data.index)]
    )
    for start, stop in zip(split_index[:-1], split_index[1:]):
        # .copy() gives an independent frame, so adding columns below is not a
        # chained assignment on a slice of sub_data.
        sub_sub_data = sub_data.iloc[start:stop].copy()
        sub_sub_data['Price_Return'] = sub_sub_data.Price.pct_change()
        sub_sub_data['Volume_Return'] = sub_sub_data.Volume.pct_change()
        ticker_frames.append(sub_sub_data)

# A single concat instead of growing a DataFrame inside the loops; fall back to
# an empty frame because pd.concat raises on an empty list.
output = pd.concat(ticker_frames, ignore_index=True) if ticker_frames else pd.DataFrame()

def f_2(x):
    """Parse a ``MM/DD/YYYY`` string into a ``datetime.datetime``."""
    return datetime.datetime.strptime(x, '%m/%d/%Y')


# Load the sector table; the first column becomes the index and every 'Date'
# cell is parsed by ``f_2`` while reading.
data_2 = pd.read_csv(
    'Sector Performance.csv',
    index_col=0,
    converters={'Date': f_2},
)

# Compute the daily return of each sector's price series.
sector_list = data_2.Sector.unique()
sector_frames = []  # one date-sorted frame per sector; concatenated once below
for sector in sector_list:
    sub_data = (
        data_2[data_2.Sector == sector]
        .sort_values(['Date'])
        .reset_index(drop=True)
    )
    # Return relative to the previous row of the same sector; the first row of
    # each sector is NaN.
    sub_data['Sector_Return'] = sub_data.Sector_Price.pct_change()
    sector_frames.append(sub_data)

# A single concat instead of growing a DataFrame inside the loop; fall back to
# an empty frame because pd.concat raises on an empty list.
output_2 = pd.concat(sector_frames, ignore_index=True) if sector_frames else pd.DataFrame()
    
# Attach the sector rows to every stock row with the same Sector and Date
# (a left join keeps every stock row), discard rows containing any missing
# value, and persist the result for the next cell.
output_3 = pd.merge(output, output_2, how='left', on=['Sector', 'Date'])
output_4 = output_3.dropna(axis='index')
output_4.to_csv('return.csv')



In [None]:
"""
The following code shows how to generate 1/0 target based on cross-sectional median of SP 500 constituents' daily return

"""
import pandas as pd
# Reload the merged returns, ordered by date and, within a date, by price
# return; row labels are renumbered from 0 after sorting.
data = (
    pd.read_csv('return.csv', index_col=0)
    .sort_values(['Date', 'Price_Return'])
    .reset_index(drop=True)
)
# Distinct dates in the order they appear in the sorted frame.
date_list = list(data['Date'].unique())
output = pd.DataFrame()

def target_class(equity_return, median):
    """Return 1 when ``equity_return`` is at or above ``median``, otherwise 0.

    A NaN on either side compares false and therefore yields 0.
    """
    return 1 if equity_return >= median else 0
    
# Label every row with Target = 1 when its Price_Return is at or above the
# median Price_Return of all rows sharing its Date, else 0.
#
# The per-date median is computed once with a grouped transform and broadcast
# back to the rows. This replaces one full-frame scan per date, avoids
# assigning a column on a filtered slice, and avoids growing a DataFrame with
# concat inside a loop. `data` was sorted by Date when it was loaded, so the
# row order here is the same as concatenating the per-date slices in date
# order.
daily_median = data.groupby('Date')['Price_Return'].transform('median')
output = data.assign(
    # NaN on either side compares false and therefore maps to 0.
    Target=(data['Price_Return'] >= daily_median).astype('int64')
).reset_index(drop=True)


In [None]:
"""
The following code shows how to separate our dataset into 13 study periods
and normalize all three features using the mean and standard deviation of the training set.

"""

def normalize(a, mean, std):
    """Return ``a`` centred on ``mean`` and scaled by ``std``: ``(a - mean) / std``."""
    centered = a - mean
    return centered / std
    

# Split the dates into overlapping study periods and z-score the three return
# features of each period.
#
# Study period i starts at date index days_per_block * i and spans
# study_blocks consecutive blocks of dates; its first train_blocks blocks form
# the training set. The mean and standard deviation are taken from the
# training rows only and then applied to the whole study period, so statistics
# of the later rows never enter the normalisation.
study_period_count = 13
days_per_block = 250
train_blocks = 3
study_blocks = 4
feature_columns = ['Price_Return', 'Volume_Return', 'Sector_Return']

for i in range(study_period_count):
    first_day = days_per_block * i
    train_date = date_list[first_day : first_day + days_per_block * train_blocks]
    study_date = date_list[first_day : first_day + days_per_block * study_blocks]

    train_data = output[output.Date.isin(train_date)]
    # .copy() gives an independent frame, so the Normalized_* columns are not
    # assigned on a filtered slice of `output`.
    study_period = output[output.Date.isin(study_date)].copy()

    for column in feature_columns:
        train_mean = train_data[column].mean()
        train_std = train_data[column].std()
        # normalize() is plain arithmetic, so it is applied to the whole column
        # at once instead of element by element.
        study_period['Normalized_' + column] = normalize(
            study_period[column], train_mean, train_std
        )

    study_period.to_csv('normalized_return_' + str(i) + '.csv')



In [None]:
"""
The following code shows how to split our dataset into training set and testing set within each study period

"""
import pandas as pd

# Geometry of one study period, expressed in trading-day positions of its
# date list.
STUDY_PERIODS = 13
LOOKBACK = 240                 # feature days per sample
WINDOW = LOOKBACK + 1          # feature days plus the target day
TRAIN_DAYS = 750               # target days at positions [240, 750) -> training set
STUDY_DAYS = 1000              # target days at positions [750, 1000) -> testing set
FEATURE_COLUMNS = [
    'Normalized_Price_Return',
    'Normalized_Volume_Return',
    'Normalized_Sector_Return',
]


def _build_samples(data, date_list, ticker_list, window_starts):
    """Turn one study period into one flat sample row per (ticker, window).

    For every ticker and every start position ``i`` in ``window_starts`` the
    window is ``date_list[i : i + WINDOW]``. A sample is produced only when the
    ticker has a row on each of those dates. The sample holds the first
    ``LOOKBACK`` values of each feature column laid out feature by feature
    (columns ``0 .. 3 * LOOKBACK - 1``), followed by ``target`` and
    ``target_date`` taken from the last row of the window and ``ticker`` and
    ``sector`` taken from its first row.

    Prints each ticker once as a progress indicator.

    Raises:
        ValueError: if a window holds more than ``WINDOW`` rows (duplicate
            dates for a ticker) or a feature value is missing. Either case
            means the input is malformed and cannot be laid out in a
            fixed-width row.
    """
    columns = list(range(LOOKBACK * len(FEATURE_COLUMNS))) + [
        'target', 'ticker', 'target_date', 'sector',
    ]
    rows = []  # collected as plain lists; the DataFrame is built once at the end
    for ticker in ticker_list:
        # Loop-invariant for all windows of this ticker, so filter only once.
        ticker_data = data[data.New_Ticker == ticker]
        for i in window_starts:
            sub_date_list = date_list[i : i + WINDOW]
            window = ticker_data[ticker_data.Date.isin(sub_date_list)]
            if len(window.index) < WINDOW:
                continue  # ticker is missing on at least one day of the window
            if len(window.index) > WINDOW:
                raise ValueError(
                    'ticker %s has duplicate dates in the window starting at position %d'
                    % (ticker, i)
                )
            features = window[FEATURE_COLUMNS].iloc[:-1]  # exclude the target day
            if features.isnull().values.any():
                raise ValueError(
                    'ticker %s has missing feature values in the window starting at position %d'
                    % (ticker, i)
                )
            # Transpose before flattening so all days of feature 0 come first,
            # then all days of feature 1, and so on.
            rows.append(
                features.values.T.ravel().tolist()
                + [
                    window['Target'].iloc[-1],
                    window['New_Ticker'].iloc[0],
                    window['Date'].iloc[-1],
                    window['Sector'].iloc[0],
                ]
            )
        print(str(ticker))
    return pd.DataFrame(rows, columns=columns) if rows else pd.DataFrame()


for m in range(STUDY_PERIODS):
    # Read from the working directory, which is where the normalisation cell
    # writes these files; the previous absolute 'C:/Users/Yu/Desktop/Data/'
    # prefix only worked on one machine.
    data = pd.read_csv('normalized_return_'+str(m)+'.csv',index_col = 0)
    data = data[['Date','New_Ticker','Sector','Normalized_Price_Return','Normalized_Volume_Return','Normalized_Sector_Return','Target']]
    data = data.sort_values(['Date','New_Ticker'],ascending = True)
    date_list = list(data.Date.unique())
    ticker_list = list(data.New_Ticker.unique())
    ticker_list.sort()

    # Generate Training Set: windows whose target day lies in the first
    # TRAIN_DAYS dates (start positions 0 .. 509).
    final_output = _build_samples(
        data, date_list, ticker_list, range(TRAIN_DAYS - LOOKBACK)
    )
    final_output.to_csv('Set_'+str(m)+'_Train.csv')

    # Generate Testing Set: windows whose target day lies in the remaining
    # dates (start positions 510 .. 759).
    final_output_2 = _build_samples(
        data, date_list, ticker_list, range(TRAIN_DAYS - LOOKBACK, STUDY_DAYS - LOOKBACK)
    )
    final_output_2.to_csv('Set_'+str(m)+'_Test.csv')
