In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime, timedelta
from collections import OrderedDict

In [2]:
income = pd.read_csv("Personal Income Summary Personal Income, Population, Per Capita Personal Income.csv", skiprows=(0,1,2,3,6,7,8,9,10,11,12,13))

In [3]:
del income["GeoFips"]
del income["GeoName"]
del income["LineCode"]
del income["Description"]
income = income.T
income.reset_index(inplace = True)
income.rename(columns = {"index":"dtime", 0:"personal_income"}, inplace = True) #in millions of dollars

In [4]:
income.replace({':Q1':'-01', ':Q2':'-04',':Q3':'-07', ':Q4':'-10',}, regex=True, inplace=True)

In [5]:
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dtime            79 non-null     object 
 1   personal_income  79 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.4+ KB


In [6]:
income

Unnamed: 0,dtime,personal_income
0,2001-01,112805.3
1,2001-04,114275.1
2,2001-07,114847.8
3,2001-10,115907.8
4,2002-01,116198.9
...,...,...
74,2019-07,220385.2
75,2019-10,222895.6
76,2020-01,222543.4
77,2020-04,245881.1


In [7]:
#target features - sales - is disaggregated by month
#steps below performed to disaggregate income from quarter to month
income['dtime'] = pd.to_datetime(income['dtime']).dt.to_period('M')

In [8]:
income = income.set_index('dtime').resample('M').asfreq()
income.reset_index(inplace = True)
income

Unnamed: 0,dtime,personal_income
0,2001-01,112805.3
1,2001-02,
2,2001-03,
3,2001-04,114275.1
4,2001-05,
...,...,...
230,2020-03,
231,2020-04,245881.1
232,2020-05,
233,2020-06,


In [9]:
income.fillna(method='ffill', inplace=True)
income

Unnamed: 0,dtime,personal_income
0,2001-01,112805.3
1,2001-02,112805.3
2,2001-03,112805.3
3,2001-04,114275.1
4,2001-05,114275.1
...,...,...
230,2020-03,222543.4
231,2020-04,245881.1
232,2020-05,245881.1
233,2020-06,245881.1


In [10]:
income['dtime'] = income['dtime'].astype('object')
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dtime            235 non-null    object 
 1   personal_income  235 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.8+ KB


In [11]:
gdp = pd.read_csv("Gross Domestic Product (GDP) summary, quarterly by state.csv", skiprows=(0,1,2,3,6,7,8,9,10,11,12,13))

In [12]:
gdp

Unnamed: 0,GeoFips,GeoName,LineCode,Description,2005:Q1,2005:Q2,2005:Q3,2005:Q4,2006:Q1,2006:Q2,...,2018:Q2,2018:Q3,2018:Q4,2019:Q1,2019:Q2,2019:Q3,2019:Q4,2020:Q1,2020:Q2,2020:Q3
0,22000,Louisiana,1,Real GDP (millions of chained 2012 dollars),245979.3,248901.1,244426.8,243602.8,244602.0,243884.5,...,234649.8,235046.0,236872.9,238611.5,238443.3,240978.6,241835.4,234288.9,213207.6,229019.5


In [13]:
del gdp["GeoFips"]
del gdp["GeoName"]
del gdp["LineCode"]
del gdp["Description"]
gdp = gdp.T
gdp.reset_index(inplace = True)
gdp.rename(columns = {"index":"dtime", 0:"real_gdp"}, inplace = True) #in millions of chained 2012 dollars

In [14]:
gdp.replace({':Q1':'-01', ':Q2':'-04',':Q3':'-07', ':Q4':'-10',}, regex=True, inplace=True)

In [15]:
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dtime     63 non-null     object 
 1   real_gdp  63 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.1+ KB


In [16]:
gdp

Unnamed: 0,dtime,real_gdp
0,2005-01,245979.3
1,2005-04,248901.1
2,2005-07,244426.8
3,2005-10,243602.8
4,2006-01,244602.0
...,...,...
58,2019-07,240978.6
59,2019-10,241835.4
60,2020-01,234288.9
61,2020-04,213207.6


In [17]:
#target features - sales - is disaggregated by month
#steps below performed to disaggregate gdp from quarter to month
gdp['dtime'] = pd.to_datetime(gdp['dtime']).dt.to_period('M')
gdp

Unnamed: 0,dtime,real_gdp
0,2005-01,245979.3
1,2005-04,248901.1
2,2005-07,244426.8
3,2005-10,243602.8
4,2006-01,244602.0
...,...,...
58,2019-07,240978.6
59,2019-10,241835.4
60,2020-01,234288.9
61,2020-04,213207.6


In [18]:
gdp = gdp.set_index('dtime').resample('M').asfreq()
gdp.reset_index(inplace = True)
gdp

Unnamed: 0,dtime,real_gdp
0,2005-01,245979.3
1,2005-02,
2,2005-03,
3,2005-04,248901.1
4,2005-05,
...,...,...
182,2020-03,
183,2020-04,213207.6
184,2020-05,
185,2020-06,


In [19]:
gdp.fillna(method='ffill', inplace=True)
gdp

Unnamed: 0,dtime,real_gdp
0,2005-01,245979.3
1,2005-02,245979.3
2,2005-03,245979.3
3,2005-04,248901.1
4,2005-05,248901.1
...,...,...
182,2020-03,234288.9
183,2020-04,213207.6
184,2020-05,213207.6
185,2020-06,213207.6


In [20]:
gdp['dtime'] = gdp['dtime'].astype('object')
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dtime     187 non-null    object 
 1   real_gdp  187 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.0+ KB


In [21]:
consumer_spending = pd.read_csv("Total personal consumption expenditures (PCE).csv", skiprows=(0,1,2,3))

In [22]:
consumer_spending = consumer_spending.iloc[0:1]

In [23]:
del consumer_spending["GeoFips"]
del consumer_spending["GeoName"]
del consumer_spending["LineCode"]
del consumer_spending["Description"]
consumer_spending = consumer_spending.T
consumer_spending.reset_index(inplace = True)
consumer_spending.rename(columns = {"index":"dtime", 0:"consumer_spending"}, inplace = True) #in millions of chained 2012 dollars

In [24]:
consumer_spending['dtime'] = pd.to_datetime(consumer_spending['dtime']).dt.to_period('M')
consumer_spending.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype    
---  ------             --------------  -----    
 0   dtime              19 non-null     period[M]
 1   consumer_spending  19 non-null     float64  
dtypes: float64(1), period[M](1)
memory usage: 432.0 bytes


In [25]:
consumer_spending

Unnamed: 0,dtime,consumer_spending
0,2001-01,90022.5
1,2002-01,93570.2
2,2003-01,98760.7
3,2004-01,104689.4
4,2005-01,109377.5
5,2006-01,116018.7
6,2007-01,122382.6
7,2008-01,127072.9
8,2009-01,126669.9
9,2010-01,131421.1


In [26]:
dates = ["2019-02", "2020-01"]
start, end = [datetime.strptime(_, '%Y-%m') for _ in dates]
month_ordereddict = OrderedDict(((start + timedelta(_)).strftime("%Y-%-m"), None) for _ in range((end - start).days)).keys()
month_list = list(month_ordereddict)
new_months_df = pd.DataFrame(month_list, columns = ["dtime"])

In [27]:
#target features - sales - is disaggregated by month
#steps below performed to disaggregate consumer_spending from year to month
consumer_spending = consumer_spending.set_index('dtime').resample('M').asfreq()
consumer_spending.reset_index(inplace = True)
consumer_spending = consumer_spending.append(new_months_df)
consumer_spending

Unnamed: 0,dtime,consumer_spending
0,2001-01,90022.5
1,2001-02,
2,2001-03,
3,2001-04,
4,2001-05,
...,...,...
6,2019-8,
7,2019-9,
8,2019-10,
9,2019-11,


In [28]:
consumer_spending.fillna(method='ffill', inplace=True)
consumer_spending

Unnamed: 0,dtime,consumer_spending
0,2001-01,90022.5
1,2001-02,90022.5
2,2001-03,90022.5
3,2001-04,90022.5
4,2001-05,90022.5
...,...,...
6,2019-8,173167.0
7,2019-9,173167.0
8,2019-10,173167.0
9,2019-11,173167.0


In [29]:
consumer_spending['dtime'] = consumer_spending['dtime'].astype('object')
consumer_spending

Unnamed: 0,dtime,consumer_spending
0,2001-01,90022.5
1,2001-02,90022.5
2,2001-03,90022.5
3,2001-04,90022.5
4,2001-05,90022.5
...,...,...
6,2019-8,173167.0
7,2019-9,173167.0
8,2019-10,173167.0
9,2019-11,173167.0


In [30]:
income_gdp = pd.merge(income, gdp, on="dtime")
income_gdp

Unnamed: 0,dtime,personal_income,real_gdp
0,2005-01,131337.1,245979.3
1,2005-02,131337.1,245979.3
2,2005-03,131337.1,245979.3
3,2005-04,132933.4,248901.1
4,2005-05,132933.4,248901.1
...,...,...,...
182,2020-03,222543.4,234288.9
183,2020-04,245881.1,213207.6
184,2020-05,245881.1,213207.6
185,2020-06,245881.1,213207.6


In [31]:
economic = pd.merge(income_gdp, consumer_spending, on="dtime")
economic

Unnamed: 0,dtime,personal_income,real_gdp,consumer_spending
0,2005-01,131337.1,245979.3,109377.5
1,2005-02,131337.1,245979.3,109377.5
2,2005-03,131337.1,245979.3,109377.5
3,2005-04,132933.4,248901.1,109377.5
4,2005-05,132933.4,248901.1,109377.5
...,...,...,...,...
164,2018-09,216152.7,235046.0,168095.0
165,2018-10,218781.5,236872.9,168095.0
166,2018-11,218781.5,236872.9,168095.0
167,2018-12,218781.5,236872.9,168095.0


In [32]:
economic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dtime              169 non-null    object 
 1   personal_income    169 non-null    float64
 2   real_gdp           169 non-null    float64
 3   consumer_spending  169 non-null    float64
dtypes: float64(3), object(1)
memory usage: 6.6+ KB


In [33]:
economic['dtime'] = economic['dtime'].astype('str')
economic

Unnamed: 0,dtime,personal_income,real_gdp,consumer_spending
0,2005-01,131337.1,245979.3,109377.5
1,2005-02,131337.1,245979.3,109377.5
2,2005-03,131337.1,245979.3,109377.5
3,2005-04,132933.4,248901.1,109377.5
4,2005-05,132933.4,248901.1,109377.5
...,...,...,...,...
164,2018-09,216152.7,235046.0,168095.0
165,2018-10,218781.5,236872.9,168095.0
166,2018-11,218781.5,236872.9,168095.0
167,2018-12,218781.5,236872.9,168095.0


In [34]:
economic['dtime'] += "-01"
economic['dtime'] = pd.to_datetime(economic['dtime'])
economic

Unnamed: 0,dtime,personal_income,real_gdp,consumer_spending
0,2005-01-01,131337.1,245979.3,109377.5
1,2005-02-01,131337.1,245979.3,109377.5
2,2005-03-01,131337.1,245979.3,109377.5
3,2005-04-01,132933.4,248901.1,109377.5
4,2005-05-01,132933.4,248901.1,109377.5
...,...,...,...,...
164,2018-09-01,216152.7,235046.0,168095.0
165,2018-10-01,218781.5,236872.9,168095.0
166,2018-11-01,218781.5,236872.9,168095.0
167,2018-12-01,218781.5,236872.9,168095.0


In [35]:
economic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   dtime              169 non-null    datetime64[ns]
 1   personal_income    169 non-null    float64       
 2   real_gdp           169 non-null    float64       
 3   consumer_spending  169 non-null    float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 6.6 KB


In [36]:
economic.to_pickle('economic_data.pkl')