# Creating New Style Feature Set
Where 60-day period (t-60 to t) is used to predict time t+30

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

### Convert Date

In [3]:
def convert_date(df, datecolname):
    converted_list = []

    original_list = df[datecolname].values
    
    for i in range(len(original_list)):
        try:
            date = datetime.strptime(original_list[i], '%Y-%m-%d')
        except:
            date = datetime.strptime(original_list[i], '%Y-%m')
        
        converted_list.append(date.strftime('%-m/%-d/%y'))

    df[datecolname] = converted_list
    
    return df

### Create a Datetime Column

In [4]:
def create_dt_col(df):
    # Remove nan columns
    df = df[pd.isnull(df['Date']) == 0]

    dates = df['Date'].values
    datetime_list = []

    for i in range(df.shape[0]):

        date = datetime.strptime(dates[i], '%m/%d/%y')
        datetime_list.append(date)

    df['Datetimes'] = datetime_list
    df['Datetimes'] = df['Datetimes'].dt.to_pydatetime()
    
    return df

### Load Files

In [8]:
# load Sharpe calculations
file1 = '../Data/Sharpe_RUA_complete.csv'
main = pd.read_csv(file1)

In [9]:
main = create_dt_col(main)
main.head(n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0.1,Unnamed: 0,Date,RUA Close,Risk-Free TNX,RUA_Returns,Dates,Annual_Return_RUA,Numerator,Std_RUA,Ratio,Ratio_smoothed,Unnamed: 11,Datetimes
0,837.0,1/2/91,178.979996,7.97,-0.010395,1/2/91,0.094524,0.014824,2.256054,0.006571,0.008547,-0.001976,1991-01-02
1,838.0,1/3/91,176.639999,7.93,-0.013074,1/3/91,0.079362,6.2e-05,2.234674,2.8e-05,0.008775,-0.008747,1991-01-03
2,839.0,1/4/91,176.259995,8.02,-0.002151,1/4/91,0.074637,-0.005563,2.216556,-0.00251,0.009003,-0.011513,1991-01-04
3,840.0,1/7/91,173.199997,8.13,-0.017361,1/7/91,0.085293,0.003993,2.154587,0.001853,0.009232,-0.007379,1991-01-07
4,841.0,1/8/91,172.779999,8.16,-0.002425,1/8/91,0.084374,0.002774,2.154293,0.001288,0.00946,-0.008172,1991-01-08


In [10]:
main = main.drop(['Unnamed: 0', 'Unnamed: 11', 'RUA_Returns', 'Dates', 'Numerator', 'Ratio_smoothed'], axis = 1)
main.head()

Unnamed: 0,Date,RUA Close,Risk-Free TNX,Annual_Return_RUA,Std_RUA,Ratio,Datetimes
0,1/2/91,178.979996,7.97,0.094524,2.256054,0.006571,1991-01-02
1,1/3/91,176.639999,7.93,0.079362,2.234674,2.8e-05,1991-01-03
2,1/4/91,176.259995,8.02,0.074637,2.216556,-0.00251,1991-01-04
3,1/7/91,173.199997,8.13,0.085293,2.154587,0.001853,1991-01-07
4,1/8/91,172.779999,8.16,0.084374,2.154293,0.001288,1991-01-08


In [None]:
# load VIX
file1 = '^VIX.csv'
vix = convert_date(pd.read_csv(file1), 'Date') # convert dates so its equal to main
vix = vix[['Date', 'Adj Close']]
vix = create_dt_col(vix)
vix.name = 'vix'
vix = vix.rename(index=str, columns={"Close": "VIX_Close"})
vix.head()

In [None]:
# load Real Disposable Income (too short)
file1 = 'A067RL1Q156SBEA_short.csv'
rdi = pd.read_csv(file1)
rdi = rdi.rename(index=str, columns={"DATE": "Date"})
rdi.name = 'rdi'
rdi = create_dt_col(rdi)
rdi.head(n=5)

In [None]:
# load Investor Sentiment (Weekly)
file1 = 'AAII-AAII_SENTIMENT.csv'
invest = pd.read_csv(file1)
invest = invest[['Date', 'Bullish', 'Neutral', 'Bearish', 'Total']]
invest.name = 'invest'
invest = create_dt_col(invest)
invest.head(n=5)

In [None]:
# load Consumer Price Index (too short)
file1 = 'CPIAUCSL_short.csv'
cpi = pd.read_csv(file1)
cpi = cpi.rename(index=str, columns={"DATE": "Date"})
cpi.name = 'cpi'
cpi = create_dt_col(cpi)
cpi.head(n=5)

In [None]:
# Load Consumer Confidence Index
file1 = 'CCI_USA.csv'
cci = convert_date(pd.read_csv(file1), 'TIME')
cci = cci[['TIME', 'Value']]
cci = cci.rename(index=str, columns={'Value': 'CCI_Value', 'TIME': 'Date'})
cci.name = 'cci'
cci = create_dt_col(cci)
cci.head(n=5)

In [None]:
# Gold Fixing Price: GOLDAMGBD228NLBM
file1 = 'GOLDAMGBD228NLBM.csv'
gfp = convert_date(pd.read_csv(file1), 'DATE')
gfp = gfp.rename(index=str, columns={"DATE": "Date"})
gfp.name = 'gfp'
gfp = create_dt_col(gfp)
gfp.head(n=5)

In [None]:
# CBOE Gold Volatility Index: GVZCLS
file1 = 'GVZCLS.csv'
cboe_gvi = convert_date(pd.read_csv(file1), 'DATE')
cboe_gvi = cboe_gvi.rename(index=str, columns={"DATE": "Date"})
cboe_gvi.name = 'cboe_gvi'
cboe_gvi = create_dt_col(cboe_gvi)
cboe_gvi.head(n=5)

In [6]:
# Personal Savings Rate: PSAVERT
file1 = 'PSAVERT_short.csv'
psavert = pd.read_csv(file1)
psavert = psavert.rename(index=str, columns={"DATE": "Date"})
psavert.name = 'psavert'
psavert = create_dt_col(psavert)
psavert.head(n=5)

FileNotFoundError: File b'PSAVERT_short.csv' does not exist

In [7]:
# Unemployment Rate: UNRATE
file1 = 'UNRATE_short.csv'
unrate = pd.read_csv(file1)
unrate = unrate.rename(index=str, columns={"DATE": "Date"})
unrate.name = 'unrate'
unrate = create_dt_col(unrate)
unrate.head(n=5)

FileNotFoundError: File b'UNRATE_short.csv' does not exist

In [42]:
# 30-Year Mortgage Rate: MORTGAGE30US - Weekly
file1 = 'MORTGAGE30US.csv'
mortgage30 = convert_date(pd.read_csv(file1), 'DATE')
mortgage30 = mortgage30.rename(index=str, columns={"DATE": "Date"})
mortgage30.name = 'mortgage30'
mortgage30 = create_dt_col(mortgage30)
mortgage30.head(n=5)

Unnamed: 0,Date,MORTGAGE30US,Datetimes
0,4/2/71,7.33,1971-04-02
1,4/9/71,7.31,1971-04-09
2,4/16/71,7.31,1971-04-16
3,4/23/71,7.31,1971-04-23
4,4/30/71,7.29,1971-04-30


In [43]:
# 10-Year Treasury Constant Maturity Rate: DGS10
file1 = 'DGS10.csv'
dgs10 = convert_date(pd.read_csv(file1), 'DATE')
dgs10 = dgs10.rename(index=str, columns={"DATE": "Date"})
dgs10.name = 'dgs10'
dgs10 = create_dt_col(dgs10)
dgs10.head(n=5)

Unnamed: 0,Date,DGS10,Datetimes
0,1/2/62,4.06,2062-01-02
1,1/3/62,4.03,2062-01-03
2,1/4/62,3.99,2062-01-04
3,1/5/62,4.02,2062-01-05
4,1/8/62,4.03,2062-01-08


In [44]:
# BCI
file1 = 'BCI_short.csv'
bci = pd.read_csv(file1)
bci = bci.rename(index=str, columns={"DATE": "Date"})
bci.name = 'bci'
bci = create_dt_col(bci)

In [45]:
bci.head(n=5)

Unnamed: 0,Date,BCI,BCIp,BCIg,Datetimes
0,1/3/80,102.5,-76.1,-4.7,1980-01-03
1,1/10/80,101.8,-85.6,-5.6,1980-01-10
2,1/17/80,102.0,-83.2,-6.3,1980-01-17
3,1/24/80,101.9,-85.4,-6.5,1980-01-24
4,1/31/80,101.6,-89.3,-6.6,1980-01-31


### Google Sources

In [46]:
# Hits

path1 = './Google_Trends/hits_google_1.csv'
path2 = './Google_Trends/hits_google_2.csv'
path3 = './Google_Trends/hits_google_3.csv'
path4 = './Google_Trends/hits_google_4.csv'
path5 = './Google_Trends/hits_google_5.csv'
path6 = './Google_Trends/hits_google_6.csv'
path7 = './Google_Trends/hits_google_7.csv'
path8 = './Google_Trends/hits_google_8.csv'
path9 = './Google_Trends/news_google_1.csv'
path10 = './Google_Trends/news_google_2.csv'
path11 = './Google_Trends/news_google_3.csv'
path12 = './Google_Trends/news_google_4.csv'
path13 = './Google_Trends/news_google_5.csv'
path14 = './Google_Trends/news_google_6.csv'
path15 = './Google_Trends/news_google_7.csv'
path16 = './Google_Trends/news_google_9.csv'

google_hits = pd.read_csv(path1)
google_hits = google_hits.drop(['isPartial'], axis = 1)
path_list = [path2, path3, path4, path5, path6, path7, path8, path9, path10, path11, path12, path13, path14, path15, path16]

for path in path_list:
    df = pd.read_csv(path)
    df = df.drop(['isPartial', 'date'], axis = 1)
    google_hits = pd.concat([google_hits, df], axis = 1)

google_hits = convert_date(google_hits, 'date')
google_hits = google_hits.rename(index=str, columns={"date": "Date"})
google_hits = create_dt_col(google_hits)
google_hits.name = 'google_hits'
google_hits.head()

Unnamed: 0,Date,Recession,debt,color,stocks,restaurant,portfolio,inflation,housing,dow jones,...,bonds,derivatives,headlines,wall street,banks,greed,bribery,insider trading,bankruptcy,Datetimes
0,1/1/04,0,7,46,8,77,25,15,93,8,...,0,0,0,0,0,0,0,0,0,2004-01-01
1,2/1/04,0,7,48,7,85,24,15,95,6,...,0,0,0,0,0,0,0,0,0,2004-02-01
2,3/1/04,0,6,46,6,74,25,15,95,7,...,0,0,0,0,0,0,0,0,0,2004-03-01
3,4/1/04,0,6,46,6,75,24,16,98,7,...,0,0,0,0,0,0,0,0,0,2004-04-01
4,5/1/04,0,6,43,5,76,23,16,96,6,...,0,0,0,0,0,0,0,0,0,2004-05-01


In [48]:
list(google_hits)[32]

'bribery'

In [49]:
google_hits.to_csv('GOOGLE_ONLY.csv')

#### Hyperparameters

In [132]:
m = 30 # period of days
df_list = [main, vix, rdi, invest, cpi, cci, gfp, cboe_gvi, psavert, unrate, mortgage30, dgs10, bci]
df_names = ['main', 'vix' , 'rdi', 'invest', 'cpi', 'cci', 'gfp', 'cboe_gvi', 'psavert', 'unrate', 'mortgage30', 'dgs10', 'bci']
#df_list = [google_hits]
#df_names = ['google_hits']

## Begin Code

In [133]:
# find start date
start_date = main['Datetimes'].dt.to_pydatetime()[0]
end_date = start_date + timedelta(days = m)
print(start_date, end_date)

1991-01-02 00:00:00 1991-02-01 00:00:00


**Note to self:**  
You could have repesented the data in 2 different ways. You chose to represent the harder way, which is any points that happen in the last 30 days period. But you could have chosen something like, the last 20 points, and such. I don't know which one is better  
  
Technically, Kayvan said the last 30 days, so in this case, it's better to do what I did. Plus you still need a cutoff point for the monthly measures.

In [1]:
#new_main = []
collist = []
new_main = pd.DataFrame()
#print(main.shape)
days = 20
    
for source_idx in range(1): #len(df_list)):
    new_df = pd.DataFrame()
    source = df_list[source_idx]

    print(df_names[source_idx])
    for i in range(days, 1): # main.shape[0]):
        row = pd.Series()
        end_date = main['Datetimes'][i]
        start_date = main['Datetimes'][i-days]
        #start_date = end_date - timedelta(days = m)

        #print(start_date, end_date)
        subset = source[(source['Datetimes'] >= start_date) & (source['Datetimes'] <= end_date)]
        subset_rows = subset.shape[0]

        ########### IDENTIFY DATA TYPE #################
               
        if subset.shape[0] >= 4:
            sub_date = subset['Datetimes'].dt.to_pydatetime()
            diff = np.mean([sub_date[i+1] - sub_date[i] for i in range(len(sub_date)-1)])

            # restricting the amount of records (Daily already restricted to twenty)
            if diff < timedelta(days = 14) and diff > timedelta(days = 6):
                # weekly
                subset = subset.iloc[:4] # take most recent FOUR records
                #print('weekly')

        if subset.shape[0] >= 2:
            sub_date = subset['Datetimes'].dt.to_pydatetime()
            diff = np.mean([sub_date[i+1] - sub_date[i] for i in range(len(sub_date)-1)])

            if diff >= timedelta(days = 25) and diff < timedelta(days = 33):
                # monthly
                subset = subset.iloc[:1] # take most recent ONE record
                #print('monthly')
            elif diff < timedelta(days = 3) and diff > timedelta(days = 0):
                # daily
                subset = subset.iloc[:days] # take most recent 20 records
        
        #print(subset)
        subset = subset.drop(['Date','Datetimes'], axis=1)
        #################################################
        
        for row_idx in range(subset.shape[0]):
            row = pd.concat([row, subset.iloc[row_idx]], ignore_index = True)
        
        # Account for empty
        if row.empty and len(new_df > 1):
            new_df = new_df.append(new_df.iloc[-1], ignore_index = True)
            # print('Add previous row')
        else:
            new_df = new_df.append(row, ignore_index = True)
        
    ##### COLUMN NAMES
    columns = [df_names[source_idx] + str(n) for n in list(new_df)]
    collist = collist + columns
    ####
     
    new_main = pd.concat([new_main, new_df], axis = 1)


print(len(collist))
new_main.columns = collist

NameError: name 'pd' is not defined

In [94]:
main.shape

(7140, 7)

In [95]:
new_main.shape

(7120, 217)

In [135]:
dates = main['Date'][20:len(main['Date'].values)].values
len(dates)

7120

In [136]:
#cols = list(new_main)[0:68]
#new_main = new_main[cols]
#new_main

In [137]:
new_main['Dates'] = dates

In [138]:
new_main.head()

Unnamed: 0,google_hits0,google_hits1,google_hits2,google_hits3,google_hits4,google_hits5,google_hits6,google_hits7,google_hits8,google_hits9,...,google_hits59,google_hits60,google_hits61,google_hits62,google_hits63,google_hits64,google_hits65,google_hits66,google_hits67,Dates
0,,,,,,,,,,,...,,,,,,,,,,1/30/91
1,,,,,,,,,,,...,,,,,,,,,,1/31/91
2,,,,,,,,,,,...,,,,,,,,,,2/1/91
3,,,,,,,,,,,...,,,,,,,,,,2/4/91
4,,,,,,,,,,,...,,,,,,,,,,2/5/91


In [139]:
new_main.to_csv('Revised_Dataset_1_google.csv')

In [101]:
#new_main.to_csv('Revised_Dataset_3.csv', index = False) # including RUA and SI

## Merge together sets

In [129]:
path1 = 'Revised_Dataset_3_clean.csv'
path2 = 'Revised_dataset_1_google.csv'

main1 = pd.read_csv(path1)
main2 = pd.read_csv(path2)

main3 = pd.merge(main1, main2, on='Date')
names2 = list(main2)
names2.remove('Date')
main3.columns = list(main1) + names2
main3.head()

Unnamed: 0.1,Unnamed: 0,Date,main0,main1,main2,main3,main4,main5,main6,main7,...,google_hits58,google_hits59,google_hits60,google_hits61,google_hits62,google_hits63,google_hits64,google_hits65,google_hits66,google_hits67
0,0,1/30/91,178.979996,7.97,0.094524,2.256054,0.006571,176.639999,7.93,0.079362,...,,,,,,,,,,
1,1,1/31/91,176.639999,7.93,0.079362,2.234674,2.8e-05,176.259995,8.02,0.074637,...,,,,,,,,,,
2,2,2/1/91,176.259995,8.02,0.074637,2.216556,-0.00251,173.199997,8.13,0.085293,...,,,,,,,,,,
3,3,2/4/91,173.199997,8.13,0.085293,2.154587,0.001853,172.779999,8.16,0.084374,...,,,,,,,,,,
4,4,2/5/91,172.779999,8.16,0.084374,2.154293,0.001288,171.169998,8.25,0.081169,...,,,,,,,,,,


In [130]:
main3.to_csv('Combined_Sets_from_Revised_3.csv')

In [107]:
# Combined_Sets_from_Revised_2.csv is combined with the clean set (Revised_Dataset_2)

## Clean the dataset

In [124]:
dataset_path = 'Revised_Dataset_3.csv'
dataset = pd.read_csv(dataset_path)
dataset.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,main0,main1,main2,main3,main4,main5,main6,main7,main8,main9,...,bci3,bci4,bci5,bci6,bci7,bci8,bci9,bci10,bci11,Dates
0,178.979996,7.97,0.094524,2.256054,0.006571,176.639999,7.93,0.079362,2.234674,2.8e-05,...,101.8,-187.5,-10.6,101.2,-195.8,-10.5,101.7,-189.1,-10.4,1/30/91
1,176.639999,7.93,0.079362,2.234674,2.8e-05,176.259995,8.02,0.074637,2.216556,-0.00251,...,101.8,-187.5,-10.6,101.2,-195.8,-10.5,101.7,-189.1,-10.4,1/31/91
2,176.259995,8.02,0.074637,2.216556,-0.00251,173.199997,8.13,0.085293,2.154587,0.001853,...,101.2,-195.8,-10.5,101.7,-189.1,-10.4,101.9,-186.9,-10.0,2/1/91
3,173.199997,8.13,0.085293,2.154587,0.001853,172.779999,8.16,0.084374,2.154293,0.001288,...,101.2,-195.8,-10.5,101.7,-189.1,-10.4,101.9,-186.9,-10.0,2/4/91
4,172.779999,8.16,0.084374,2.154293,0.001288,171.169998,8.25,0.081169,2.153921,-0.000618,...,101.2,-195.8,-10.5,101.7,-189.1,-10.4,101.9,-186.9,-10.0,2/5/91


In [125]:
# Deal with missing
def clean_dot(pdcol):
    new_col = []
    col = pdcol.values
    new_col.append(col[0])
    idx_changed = []
    
    for val_idx in range(1,len(col)):
        try:
            new_col.append(float(col[val_idx]))
        except:
            print(col[val_idx], col[val_idx + 1])
            if col[val_idx + 1] == '.' or col[val_idx + 1] == '':
                new_col.append((float(col[val_idx + 2]) + float(new_col[val_idx - 1])) / 2)
            else:
                new_col.append((float(col[val_idx + 1]) + float(new_col[val_idx - 1])) / 2) # imput the average
                idx_changed.append(val_idx)

    return new_col, idx_changed

In [126]:
new_df = pd.DataFrame()
new_df['Date'] = dataset['Dates']

In [127]:
for col in list(dataset[:-1]):
    new_df[col], x  = clean_dot(dataset[col])
#t,x = clean(dataset['gfp3'])

. 357.9
. 356.25
. 354.1
. 356.1
. 338.2
. 336.5
. 342.65
. 332.9
. 336.95
. 353.4
. 370.6
. .
. 385.65
. 395
. 387.4
. 375.7
. 386.15
. 381.8
. 394.1
. 389.9
. 383.25
. 387.35
. 397.6
. 393.75
. 389
. 369.55
. 349.85
. 342.8
. 325.4
. 293.9
. 309.5
. 304.2
. 277.7
. 286.7
. 278.35
. 285.6
. 254.6
. .
. 290.65
. .
. 282.05
. 280
. 275
. 274
. 274.7
. 262.15
. 265.6
. 271.6
. 278.05
. 302.65
. 312.65
. .
. 324.5
. 309.4
. 348.3
. 333.25
. 342.65
. 360.75
. 413.5
. 417.45
. 389.75
. 408.1
. .
. 443.5
. 426.8
. 426.95
. 429.2
. 436.15
. 509.15
. 616.75
. 657.75
. 616.85
. 628.3
. 676.85
. 685.9
. 667
. 822.5
. 930.65
. 878
. 809.75
. 881
. 895
. 903
. 949.75
. 1103
. 1124
. 1184.25
. 1233.5
. .
. 1403.5
. 1405.5
. 1505
. .
. 1546.5
. 1791
. 1584
. 1643.75
. 1627
. .
. 1633.25
. 1663.5
. 1655.25
. 1597.75
. 1463
. 1411
. 1209.25
. 1290.75
. 1308.5
. 1286.5
. 1194
. 1208.5
. 1187.4
. 1141.9
. 1067.25
. 1216.45
. 1296.5
. 1318.85
. 1139.75
. 1285
. 1255.8
. 1270.15
. 1323.4
. 1285.4
. 1336.6

IndexError: index 7120 is out of bounds for axis 0 with size 7120

In [None]:
new_df

In [128]:
new_df.to_csv('Revised_Dataset_3_clean.csv')