# Creating New Style Feature Set
Where 60-day period (t-60 to t) is used to predict time t+30

In [218]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

### Convert Date

In [219]:
def convert_date(df, datecolname):
    converted_list = []

    original_list = df[datecolname].values
    
    for i in range(len(original_list)):
        try:
            date = datetime.strptime(original_list[i], '%Y-%m-%d')
        except:
            date = datetime.strptime(original_list[i], '%Y-%m')
        
        converted_list.append(date.strftime('%-m/%-d/%y'))

    df[datecolname] = converted_list
    
    return df

### Create a Datetime Column

In [220]:
def create_dt_col(df):
    # Remove nan columns
    df = df[pd.isnull(df['Date']) == 0]

    dates = df['Date'].values
    datetime_list = []

    for i in range(df.shape[0]):

        date = datetime.strptime(dates[i], '%m/%d/%y')
        datetime_list.append(date)

    df['Datetimes'] = datetime_list
    df['Datetimes'] = df['Datetimes'].dt.to_pydatetime()
    
    return df

### Load Files

In [221]:
# load Sharpe calculations
file1 = '../Data/Sharpe_RUA_complete.csv'
main = pd.read_csv(file1)

In [222]:
main = create_dt_col(main)
main.head(n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0.1,Unnamed: 0,Date,RUA Close,Risk-Free TNX,RUA_Returns,Dates,Annual_Return_RUA,Numerator,Std_RUA,Ratio,Ratio_smoothed,Unnamed: 11,Datetimes
0,837.0,1/2/91,178.979996,7.97,-0.010395,1/2/91,0.094524,0.014824,2.256054,0.006571,0.008547,-0.001976,1991-01-02
1,838.0,1/3/91,176.639999,7.93,-0.013074,1/3/91,0.079362,6.2e-05,2.234674,2.8e-05,0.008775,-0.008747,1991-01-03
2,839.0,1/4/91,176.259995,8.02,-0.002151,1/4/91,0.074637,-0.005563,2.216556,-0.00251,0.009003,-0.011513,1991-01-04
3,840.0,1/7/91,173.199997,8.13,-0.017361,1/7/91,0.085293,0.003993,2.154587,0.001853,0.009232,-0.007379,1991-01-07
4,841.0,1/8/91,172.779999,8.16,-0.002425,1/8/91,0.084374,0.002774,2.154293,0.001288,0.00946,-0.008172,1991-01-08


In [223]:
main = main.drop(['Unnamed: 0', 'Unnamed: 11', 'RUA_Returns', 'Dates', 'Numerator', 'Ratio_smoothed'], axis = 1)
main.head()

Unnamed: 0,Date,RUA Close,Risk-Free TNX,Annual_Return_RUA,Std_RUA,Ratio,Datetimes
0,1/2/91,178.979996,7.97,0.094524,2.256054,0.006571,1991-01-02
1,1/3/91,176.639999,7.93,0.079362,2.234674,2.8e-05,1991-01-03
2,1/4/91,176.259995,8.02,0.074637,2.216556,-0.00251,1991-01-04
3,1/7/91,173.199997,8.13,0.085293,2.154587,0.001853,1991-01-07
4,1/8/91,172.779999,8.16,0.084374,2.154293,0.001288,1991-01-08


In [224]:
# load VIX
file1 = '../Data/^VIX.csv'

vix = convert_date(pd.read_csv(file1), 'Date') # convert dates so its equal to main
vix = vix[['Date', 'Adj Close']]
vix = create_dt_col(vix)
vix.name = 'vix'
vix = vix.rename(index=str, columns={"Close": "VIX_Close"})
vix.head()

Unnamed: 0,Date,Adj Close,Datetimes
0,1/2/90,17.24,1990-01-02
1,1/3/90,18.190001,1990-01-03
2,1/4/90,19.219999,1990-01-04
3,1/5/90,20.110001,1990-01-05
4,1/8/90,20.26,1990-01-08


In [225]:
# load Real Disposable Income (too short)
file1 = '../Data/A067RL1Q156SBEA_short.csv'
rdi = pd.read_csv(file1)
rdi = rdi.rename(index=str, columns={"DATE": "Date"})
rdi.name = 'rdi'
rdi = create_dt_col(rdi)
rdi.head(n=5)

Unnamed: 0,Date,A067RL1Q156SBEA,Datetimes
0,1/1/86,5.4,1986-01-01
1,4/1/86,5.2,1986-04-01
2,7/1/86,2.6,1986-07-01
3,10/1/86,0.6,1986-10-01
4,1/1/87,3.7,1987-01-01


In [226]:
# load Investor Sentiment (Weekly)
file1 = '../Data/AAII-AAII_SENTIMENT.csv'
invest = pd.read_csv(file1)
invest = invest[['Date', 'Bullish', 'Neutral', 'Bearish', 'Total']]
invest.name = 'invest'
invest = create_dt_col(invest)
invest.head(n=5)

Unnamed: 0,Date,Bullish,Neutral,Bearish,Total,Datetimes
0,6/26/87,,,,,1987-06-26
1,7/17/87,,,,,1987-07-17
2,7/24/87,0.36,0.5,0.14,1.0,1987-07-24
3,7/31/87,0.26,0.48,0.26,1.0,1987-07-31
4,8/7/87,0.56,0.15,0.29,1.0,1987-08-07


In [227]:
# load Consumer Price Index (too short)
file1 = '../Data/CPIAUCSL_short.csv'
cpi = pd.read_csv(file1)
cpi = cpi.rename(index=str, columns={"DATE": "Date"})
cpi.name = 'cpi'
cpi = create_dt_col(cpi)
cpi.head(n=5)

Unnamed: 0,Date,CPIAUCSL,Datetimes
0,1/1/86,109.9,1986-01-01
1,2/1/86,109.7,1986-02-01
2,3/1/86,109.1,1986-03-01
3,4/1/86,108.7,1986-04-01
4,5/1/86,109.0,1986-05-01


In [228]:
# Load Consumer Confidence Index
file1 = '../Data/CCI_USA.csv'
cci = convert_date(pd.read_csv(file1), 'TIME')
cci = cci[['TIME', 'Value']]
cci = cci.rename(index=str, columns={'Value': 'CCI_Value', 'TIME': 'Date'})
cci.name = 'cci'
cci = create_dt_col(cci)
cci.head(n=5)

Unnamed: 0,Date,CCI_Value,Datetimes
0,9/1/83,100.4003,1983-09-01
1,10/1/83,100.4069,1983-10-01
2,11/1/83,100.5977,1983-11-01
3,12/1/83,100.9262,1983-12-01
4,1/1/84,101.2544,1984-01-01


In [229]:
# Gold Fixing Price: GOLDAMGBD228NLBM
file1 = '../Data/GOLDAMGBD228NLBM.csv'
gfp = convert_date(pd.read_csv(file1), 'DATE')
gfp = gfp.rename(index=str, columns={"DATE": "Date"})
gfp.name = 'gfp'
gfp = create_dt_col(gfp)
gfp.head(n=5)

Unnamed: 0,Date,GOLDAMGBD228NLBM,Datetimes
0,4/1/68,38.0,2068-04-01
1,4/2/68,37.6,2068-04-02
2,4/3/68,37.7,2068-04-03
3,4/4/68,36.7,2068-04-04
4,4/5/68,37.2,2068-04-05


In [230]:
# CBOE Gold Volatility Index: GVZCLS
file1 = '../Data/GVZCLS.csv'
cboe_gvi = convert_date(pd.read_csv(file1), 'DATE')
cboe_gvi = cboe_gvi.rename(index=str, columns={"DATE": "Date"})
cboe_gvi.name = 'cboe_gvi'
cboe_gvi = create_dt_col(cboe_gvi)
cboe_gvi.head(n=5)

Unnamed: 0,Date,GVZCLS,Datetimes
0,6/3/08,22.89,2008-06-03
1,6/4/08,22.69,2008-06-04
2,6/5/08,22.78,2008-06-05
3,6/6/08,23.6,2008-06-06
4,6/9/08,24.47,2008-06-09


In [231]:
# Personal Savings Rate: PSAVERT
file1 = '../Data/PSAVERT_short.csv'
psavert = pd.read_csv(file1)
psavert = psavert.rename(index=str, columns={"DATE": "Date"})
psavert.name = 'psavert'
psavert = create_dt_col(psavert)
psavert.head(n=5)

Unnamed: 0,Date,PSAVERT,Datetimes
0,1/1/86,8.6,1986-01-01
1,2/1/86,9.3,1986-02-01
2,3/1/86,9.9,1986-03-01
3,4/1/86,9.7,1986-04-01
4,5/1/86,9.3,1986-05-01


In [232]:
# Unemployment Rate: UNRATE
file1 = '../Data/UNRATE_short.csv'
unrate = pd.read_csv(file1)
unrate = unrate.rename(index=str, columns={"DATE": "Date"})
unrate.name = 'unrate'
unrate = create_dt_col(unrate)
unrate.head(n=5)

Unnamed: 0,Date,UNRATE,Datetimes
0,1/1/86,6.7,1986-01-01
1,2/1/86,7.2,1986-02-01
2,3/1/86,7.2,1986-03-01
3,4/1/86,7.1,1986-04-01
4,5/1/86,7.2,1986-05-01


In [233]:
# 30-Year Mortgage Rate: MORTGAGE30US - Weekly
file1 = '../Data/MORTGAGE30US.csv'
mortgage30 = convert_date(pd.read_csv(file1), 'DATE')
mortgage30 = mortgage30.rename(index=str, columns={"DATE": "Date"})
mortgage30.name = 'mortgage30'
mortgage30 = create_dt_col(mortgage30)
mortgage30.head(n=5)

Unnamed: 0,Date,MORTGAGE30US,Datetimes
0,4/2/71,7.33,1971-04-02
1,4/9/71,7.31,1971-04-09
2,4/16/71,7.31,1971-04-16
3,4/23/71,7.31,1971-04-23
4,4/30/71,7.29,1971-04-30


In [234]:
# 10-Year Treasury Constant Maturity Rate: DGS10
file1 = '../Data/DGS10.csv'
dgs10 = convert_date(pd.read_csv(file1), 'DATE')
dgs10 = dgs10.rename(index=str, columns={"DATE": "Date"})
dgs10.name = 'dgs10'
dgs10 = create_dt_col(dgs10)
dgs10.head(n=5)

Unnamed: 0,Date,DGS10,Datetimes
0,1/2/62,4.06,2062-01-02
1,1/3/62,4.03,2062-01-03
2,1/4/62,3.99,2062-01-04
3,1/5/62,4.02,2062-01-05
4,1/8/62,4.03,2062-01-08


In [235]:
# BCI
file1 = '../Data/BCI_short.csv'
bci = pd.read_csv(file1)
bci = bci.rename(index=str, columns={"DATE": "Date"})
bci.name = 'bci'
bci = create_dt_col(bci)

In [236]:
bci.head(n=5)

Unnamed: 0,Date,BCI,BCIp,BCIg,Datetimes
0,1/3/80,102.5,-76.1,-4.7,1980-01-03
1,1/10/80,101.8,-85.6,-5.6,1980-01-10
2,1/17/80,102.0,-83.2,-6.3,1980-01-17
3,1/24/80,101.9,-85.4,-6.5,1980-01-24
4,1/31/80,101.6,-89.3,-6.6,1980-01-31


### Google Sources

In [237]:
# Hits

path1 = 'hits_1.csv'
path2 = 'hits_2.csv'
path3 = 'hits_3.csv'
path4 = 'hits_4.csv'
path5 = 'hits_5.csv'
path6 = 'hits_6.csv'
path7 = 'hits_7.csv'
path8 = 'hits_8.csv'
path9 = 'hits_9.csv'
path10 = 'news_1.csv'
path11 = 'news_2.csv'
path12 = 'news_3.csv'
path13 = 'news_4.csv'
path14 = 'news_5.csv'
path15 = 'news_6.csv'
path16 = 'news_7.csv'
path17 = 'news_8.csv'
path18 = 'news_9.csv'

google_hits = pd.read_csv(path1)
google_hits = google_hits.drop(['isPartial'], axis = 1)
path_list = [path2, path3, path4, path5, path6, path7, path8, path9, path10, path11, path12, path13, path14, path15, path16, path17, path18]

for path in path_list:
    df = pd.read_csv(path)
    df = df.drop(['isPartial', 'date'], axis = 1)
    google_hits = pd.concat([google_hits, df], axis = 1)

google_hits = convert_date(google_hits, 'date')
google_hits = google_hits.rename(index=str, columns={"date": "Date"})
google_hits = create_dt_col(google_hits)
google_hits.name = 'google_hits'
google_hits.head()

Unnamed: 0,Date,recession,debt,color,stocks,restaurant,portfolio,inflation,housing,dow jones,...,profit,society,ring,wall street,banks,greed,bribery,insider trading,bankruptcy,Datetimes
0,6/15/14,0,6,48,4,62,2,4,40,6,...,27,15,12,23,34,3,10,10,18,2014-06-15
1,6/22/14,0,6,49,4,62,2,4,40,6,...,17,14,14,22,18,1,9,2,16,2014-06-22
2,6/29/14,0,5,48,4,64,2,4,38,6,...,25,9,15,18,36,1,4,0,10,2014-06-29
3,7/6/14,0,6,49,4,64,2,4,42,6,...,19,10,11,19,22,0,4,4,20,2014-07-06
4,7/13/14,0,6,52,4,68,2,4,42,7,...,11,39,9,23,15,3,4,6,14,2014-07-13


In [41]:
list(google_hits)[32]

'bribery'

In [42]:
google_hits.to_csv('GOOGLE_ONLY.csv')

#### Hyperparameters

In [238]:
m = 30 # period of days
#df_list = [main, vix, rdi, invest, cpi, cci, gfp, cboe_gvi, psavert, unrate, mortgage30, dgs10, bci]
#df_names = ['main', 'vix' , 'rdi', 'invest', 'cpi', 'cci', 'gfp', 'cboe_gvi', 'psavert', 'unrate', 'mortgage30', 'dgs10', 'bci']
df_list = [google_hits]
df_names = ['google_hits']

## Begin Code

In [239]:
# find start date
start_date = main['Datetimes'].dt.to_pydatetime()[0]
end_date = start_date + timedelta(days = m)
print(start_date, end_date)

1991-01-02 00:00:00 1991-02-01 00:00:00


**Note to self:**  
You could have repesented the data in 2 different ways. You chose to represent the harder way, which is any points that happen in the last 30 days period. But you could have chosen something like, the last 20 points, and such. I don't know which one is better  
  
Technically, Kayvan said the last 30 days, so in this case, it's better to do what I did. Plus you still need a cutoff point for the monthly measures.

In [240]:
# Date goes old (top) to new (bottom)

#new_main = []
collist = []
new_main = pd.DataFrame()
#print(main.shape)
days = 20
    
for source_idx in range(len(df_list)):
    new_df = pd.DataFrame()
    source = df_list[source_idx]

    print(df_names[source_idx])
    for i in range(days, main.shape[0]):
        row = pd.Series()
        end_date = main['Datetimes'][i]
        start_date = main['Datetimes'][i-days]
        #start_date = end_date - timedelta(days = m)

        #print(start_date, end_date)
        subset = source[(source['Datetimes'] >= start_date) & (source['Datetimes'] <= end_date)]
        subset_rows = subset.shape[0]

        ########### IDENTIFY DATA TYPE #################
               
        if subset.shape[0] >= 4:
            sub_date = subset['Datetimes'].dt.to_pydatetime()
            diff = np.mean([sub_date[i+1] - sub_date[i] for i in range(len(sub_date)-1)])

            # restricting the amount of records (Daily already restricted to twenty)
            if diff < timedelta(days = 14) and diff > timedelta(days = 6):
                # weekly
                subset = subset.iloc[-4:] # take most recent FOUR records
                #print('weekly')

        if subset.shape[0] >= 2:
            sub_date = subset['Datetimes'].dt.to_pydatetime()
            diff = np.mean([sub_date[i+1] - sub_date[i] for i in range(len(sub_date)-1)])

            if diff >= timedelta(days = 25) and diff < timedelta(days = 33):
                # monthly
                subset = subset.iloc[-1:] # take most recent ONE record
                #print('monthly')
            elif diff < timedelta(days = 3) and diff > timedelta(days = 0):
                # daily
                subset = subset.iloc[-days:] # take most recent 20 records
                #print('daily')
        
        subset = subset.drop(['Date','Datetimes'], axis=1)

        #################################################
        
        for row_idx in range(subset.shape[0]):
            row = pd.concat([row, subset.iloc[row_idx]], ignore_index = True)
        
        # Account for empty
        if row.empty and len(new_df > 1):
            new_df = new_df.append(new_df.iloc[-1], ignore_index = True)
            # print('Add previous row')
        else:
            new_df = new_df.append(row, ignore_index = True)

        
    ##### COLUMN NAMES
    columns = [df_names[source_idx] + str(n) for n in list(new_df)]
    collist = collist + columns
    ####
     
    new_main = pd.concat([new_main, new_df], axis = 1)


print(len(collist))
new_main.columns = collist

google_hits
272


In [242]:
new_main

Unnamed: 0,google_hits0,google_hits1,google_hits2,google_hits3,google_hits4,google_hits5,google_hits6,google_hits7,google_hits8,google_hits9,...,google_hits262,google_hits263,google_hits264,google_hits265,google_hits266,google_hits267,google_hits268,google_hits269,google_hits270,google_hits271
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [142]:
main.shape

(7140, 7)

In [143]:
new_main.shape

(7120, 272)

In [243]:
dates = main['Date'][20:len(main['Date'].values)].values
len(dates)

7120

In [244]:
#cols = list(new_main)[0:68]
#new_main = new_main[cols]
#new_main

In [245]:
new_main['Dates'] = dates

In [246]:
new_main.head()

Unnamed: 0,google_hits0,google_hits1,google_hits2,google_hits3,google_hits4,google_hits5,google_hits6,google_hits7,google_hits8,google_hits9,...,google_hits263,google_hits264,google_hits265,google_hits266,google_hits267,google_hits268,google_hits269,google_hits270,google_hits271,Dates
0,,,,,,,,,,,...,,,,,,,,,,1/30/91
1,,,,,,,,,,,...,,,,,,,,,,1/31/91
2,,,,,,,,,,,...,,,,,,,,,,2/1/91
3,,,,,,,,,,,...,,,,,,,,,,2/4/91
4,,,,,,,,,,,...,,,,,,,,,,2/5/91


In [247]:
new_main.to_csv('Revised_Dataset_google.csv', index = False)

In [101]:
#new_main.to_csv('Revised_Dataset_3.csv', index = False) # including RUA and SI

## Merge together sets

In [256]:
path1 = 'Revised_Dataset_clean.csv'
path2 = 'Revised_Dataset_google_clean.csv'

main1 = pd.read_csv(path1)
main2 = pd.read_csv(path2)

main3 = pd.merge(main1, main2, on='Date')

names2 = list(main2)
names2.remove('Date')
main3.columns = list(main1) + names2
main3.head()

Unnamed: 0,Date,Label_m30,main0,main1,main2,main3,main4,main5,main6,main7,...,google_hits263,google_hits264,google_hits265,google_hits266,google_hits267,google_hits268,google_hits269,google_hits270,google_hits271,google_hits0
0,1/30/91,1,176.639999,7.93,0.079362,2.234674,2.8e-05,176.259995,8.02,0.074637,...,,,,,,,,,,
1,1/31/91,1,176.259995,8.02,0.074637,2.216556,-0.00251,173.199997,8.13,0.085293,...,,,,,,,,,,
2,2/1/91,1,173.199997,8.13,0.085293,2.154587,0.001853,172.779999,8.16,0.084374,...,,,,,,,,,,
3,2/4/91,1,172.779999,8.16,0.084374,2.154293,0.001288,171.169998,8.25,0.081169,...,,,,,,,,,,
4,2/5/91,1,171.169998,8.25,0.081169,2.153921,-0.000618,172.759995,8.16,0.080759,...,,,,,,,,,,


In [257]:
main3.to_csv('Combined_Sets_from_Revised.csv', index = False)

In [107]:
# Combined_Sets_from_Revised_2.csv is combined with the clean set (Revised_Dataset_2)

## Clean the dataset

In [248]:
dataset_path = 'Revised_Dataset_google.csv'
dataset = pd.read_csv(dataset_path)
dataset.head()

Unnamed: 0,google_hits0,google_hits1,google_hits2,google_hits3,google_hits4,google_hits5,google_hits6,google_hits7,google_hits8,google_hits9,...,google_hits263,google_hits264,google_hits265,google_hits266,google_hits267,google_hits268,google_hits269,google_hits270,google_hits271,Dates
0,,,,,,,,,,,...,,,,,,,,,,1/30/91
1,,,,,,,,,,,...,,,,,,,,,,1/31/91
2,,,,,,,,,,,...,,,,,,,,,,2/1/91
3,,,,,,,,,,,...,,,,,,,,,,2/4/91
4,,,,,,,,,,,...,,,,,,,,,,2/5/91


In [249]:
# Deal with missing
def clean_dot(pdcol):
    new_col = []
    col = pdcol.values
    new_col.append(col[0])
    idx_changed = []
    
    for val_idx in range(1,len(col)):
        try:
            new_col.append(float(col[val_idx]))

        except:
            if val_idx == len(col)-2: # last value
                new_col.append('')
                print('yes')
                continue
                
            print(col[val_idx], col[val_idx + 1])
            if col[val_idx + 1] == '.' or col[val_idx + 1] == '':
                new_col.append((float(col[val_idx + 2]) + float(new_col[val_idx - 1])) / 2)
            else:
                new_col.append((float(col[val_idx + 1]) + float(new_col[val_idx - 1])) / 2) # imput the average
                idx_changed.append(val_idx)

    return new_col, idx_changed

In [251]:
new_df = pd.DataFrame()
new_df['Date'] = dataset['Dates'] # 'Dates'

In [253]:
# dataset[:-1]
for col in list(dataset)[:-1]:
    print(col)
    new_df[col], x  = clean_dot(dataset[col])
#t,x = clean_dot(dataset['main0'])

google_hits0
google_hits1
google_hits2
google_hits3
google_hits4
google_hits5
google_hits6
google_hits7
google_hits8
google_hits9
google_hits10
google_hits11
google_hits12
google_hits13
google_hits14
google_hits15
google_hits16
google_hits17
google_hits18
google_hits19
google_hits20
google_hits21
google_hits22
google_hits23
google_hits24
google_hits25
google_hits26
google_hits27
google_hits28
google_hits29
google_hits30
google_hits31
google_hits32
google_hits33
google_hits34
google_hits35
google_hits36
google_hits37
google_hits38
google_hits39
google_hits40
google_hits41
google_hits42
google_hits43
google_hits44
google_hits45
google_hits46
google_hits47
google_hits48
google_hits49
google_hits50
google_hits51
google_hits52
google_hits53
google_hits54
google_hits55
google_hits56
google_hits57
google_hits58
google_hits59
google_hits60
google_hits61
google_hits62
google_hits63
google_hits64
google_hits65
google_hits66
google_hits67
google_hits68
google_hits69
google_hits70
google_hits71
go

In [254]:
new_df

Unnamed: 0,Date,google_hits1,google_hits2,google_hits3,google_hits4,google_hits5,google_hits6,google_hits7,google_hits8,google_hits9,...,google_hits263,google_hits264,google_hits265,google_hits266,google_hits267,google_hits268,google_hits269,google_hits270,google_hits271,google_hits0
0,1/30/91,,,,,,,,,,...,,,,,,,,,,
1,1/31/91,,,,,,,,,,...,,,,,,,,,,
2,2/1/91,,,,,,,,,,...,,,,,,,,,,
3,2/4/91,,,,,,,,,,...,,,,,,,,,,
4,2/5/91,,,,,,,,,,...,,,,,,,,,,
5,2/6/91,,,,,,,,,,...,,,,,,,,,,
6,2/7/91,,,,,,,,,,...,,,,,,,,,,
7,2/8/91,,,,,,,,,,...,,,,,,,,,,
8,2/11/91,,,,,,,,,,...,,,,,,,,,,
9,2/12/91,,,,,,,,,,...,,,,,,,,,,


In [255]:
new_df.to_csv('Revised_Dataset_google_clean.csv')