## Load Python modules

In [1]:
import pandas as pd
from pandas.core.series import Series
import numpy as np
import glob
import datetime
# import sklearn
# import sklearn.datasets
# import sklearn.cross_validation
# import time

# from sklearn.cross_validation import KFold
# from sklearn import ensemble
# from sklearn import metrics
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression
# from sklearn.cross_validation import cross_val_score
# from sklearn.tree import DecisionTreeRegressor

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [117]:
def convert_week(x):
    start_epoch_arr= x.values.astype(int)
    end_epoch_arr= np.array(start_epoch_arr)
    
    start_epoch_arr= (start_epoch_arr-400)*7+31900-6
    start_epoch_arr = (start_epoch_arr-25569) * 86400
    
    end_epoch_arr = (end_epoch_arr-400)*7+31900
    end_epoch_arr= (end_epoch_arr-25569) * 86400
    
    return (start_epoch_arr, end_epoch_arr)

def get_transaction_time(x, start_epoch_arr):
    transaction_time_epoch_arr= x.values.astype(int)
    transaction_time_epoch_arr= start_epoch_arr + (transaction_time_epoch_arr * 60)
    return transaction_time_epoch_arr

def get_datetime(transaction_time_epoch_arr):
    t_shape= np.shape(transaction_time_epoch_arr)
    month_arr= np.zeros(t_shape, dtype=int)
    day_arr=np.zeros(t_shape, dtype=int)
    year_arr= np.zeros(t_shape, dtype=int)
    hour_arr= np.zeros(t_shape, dtype=int)
    minute_arr= np.zeros(t_shape, dtype=int)
    dayofweek_arr= np.zeros(t_shape, dtype=int)
    
    for i in xrange(t_shape[0]):
        t_time= transaction_time_epoch_arr[i]
        dt= datetime.datetime.utcfromtimestamp(t_time)
        month_arr[i]= dt.month
        day_arr[i]= dt.day
        year_arr[i]= dt.year
        hour_arr[i]= dt.hour
        minute_arr[i]= dt.minute
        dayofweek_arr[i]= dt.weekday()
    
    return (month_arr, day_arr, year_arr, hour_arr, minute_arr, dayofweek_arr)

def parse_colupc(colupc_arr):
    t_shape= np.shape(colupc_arr)
    sy_arr= np.zeros(t_shape, dtype=int)
    ge_arr=np.zeros(t_shape, dtype=int)
    vend_arr= np.zeros(t_shape, dtype=int)
    item_arr= np.zeros(t_shape, dtype=int)
    for i, colupc in enumerate(colupc_arr):
        assert len(colupc) == 13
        # sy, gen, vend, item
        m_as= [colupc[:2], colupc[2:3], colupc[3:8], colupc[8:]]
        m_outs= [sy_arr, ge_arr, vend_arr, item_arr]
        for j in xrange(4):
            a= m_as[j]
            out= m_outs[j]
            out[i]= int(a)
    return (sy_arr, ge_arr, vend_arr, item_arr)

def parse_panel(filename):
    column_headers= None
    m_df_items= []
#     m_df = pd.read_csv(filename, engine='python')
    with open(filename) as f:
        i= 0
        for line in f:
            line= line.strip()
            s_arr= line.split(",")
            s_arr= [x.strip() for x in s_arr if len(x.strip()) > 0]
            if column_headers == None:
                column_headers= s_arr
                continue
            item= (i, s_arr)
            m_df_items.append(item)
            i+=1

    #if "MINUTE" not in m_df.columns.values.tolist():
    if "MINUTE" not in column_headers:
        raise RuntimeError("Expecting MINUTE column in dataset, unable to parse")
        
    m_df= pd.DataFrame.from_items(m_df_items, columns=column_headers, orient='index')
    
    s_week_df= m_df["WEEK"]
    start_time_epoch_arr, end_time_epoch_arr= convert_week(s_week_df)
    r1= pd.DataFrame({
        'START_TIME_EPOCH_S' : start_time_epoch_arr
    })
    r2= pd.DataFrame({
        'END_TIME_EPOCH_S' : end_time_epoch_arr
    })
    m_df= pd.concat([m_df, r1, r2], axis=1)
    #
    s_minute_df= m_df["MINUTE"]
    transaction_time_epoch_arr= get_transaction_time(s_minute_df, start_time_epoch_arr)
    t_result= pd.DataFrame({
        'TRANSACTION_TIME_EPOCH_S' : transaction_time_epoch_arr
    })
    # extract month, day, year, hour, minute
    month_arr, day_arr, year_arr, hour_arr, minute_arr, dayofweek_arr= get_datetime(transaction_time_epoch_arr)
    r1= pd.DataFrame({
        'MONTH' : month_arr
    })
    r2= pd.DataFrame({
        'DAY' : day_arr
    })
    r3= pd.DataFrame({
        'YEAR' : year_arr
    })
    r4= pd.DataFrame({
        'HOUR_OF_DAY' : hour_arr
    })
    r5= pd.DataFrame({
        'MINUTE' : minute_arr
    })
    r6= pd.DataFrame({
        'DAYOFWEEK' : dayofweek_arr
    })
    m_df.drop("MINUTE", axis=1, inplace=True)
    m_df= pd.concat([m_df, t_result, r1, r2, r3, r4, r5, r6], axis=1)
    
    # Parse COLUPC to SY GE VEND ITEM
    # 18417375121 ->   
    #  item: 75121
    #  vender: 84173
    #  generation: 1
    #  system: 00 (zero padded)
    s_colupc_df= m_df["COLUPC"]
    sy_arr, ge_arr, vend_arr, item_arr = parse_colupc(s_colupc_df)
    r1= pd.DataFrame({
        'SY' : sy_arr
    })
    r2= pd.DataFrame({
        'GE' : ge_arr
    })
    r3= pd.DataFrame({
        'VEND' : vend_arr
    })
    r4= pd.DataFrame({
        'ITEM' : item_arr
    })
    m_df.drop("COLUPC", axis=1, inplace=True)
    m_df= pd.concat([m_df, r1, r2, r3, r4], axis=1)
    
    return m_df

In [137]:
beer_dir='/home/conway/beer'

# parse panels across Year8-Year11
panel_filenames= []
panel_filenames.extend(glob.glob(beer_dir+"/Year8/*.DAT"))
panel_filenames.extend(glob.glob(beer_dir+"/Year9/*.DAT"))
panel_filenames.extend(glob.glob(beer_dir+"/Year10/*.DAT"))
panel_filenames.extend(glob.glob(beer_dir+"/Year11/*.DAT"))

master_panel_df= None
for pf in panel_filenames:
    print "Parsing {}".format(pf)
    m_df= parse_panel(pf)
    if master_panel_df is None:
        master_panel_df= m_df
    else:
        master_panel_df= pd.concat([master_panel_df, m_df], ignore_index=True)

master_panel_df.sort(["TRANSACTION_TIME_EPOCH_S"], inplace=True)
master_panel_df.reset_index(drop=True, inplace=True)
master_panel_df

Parsing /home/conway/beer/Year8/beer_PANEL_KK_1479_1530.DAT
Parsing /home/conway/beer/Year8/beer_PANEL_GK_1479_1530.DAT
Parsing /home/conway/beer/Year8/beer_PANEL_MK_1479_1530.DAT
Parsing /home/conway/beer/Year9/beer_PANEL_GK_1531_1582.DAT
Parsing /home/conway/beer/Year9/beer_PANEL_MK_1531_1582.DAT
Parsing /home/conway/beer/Year10/beer_PANEL_MK_1583_1634.DAT
Parsing /home/conway/beer/Year10/beer_PANEL_DK_1583_1634.DAT
Parsing /home/conway/beer/Year10/beer_PANEL_GK_1583_1634.DAT
Parsing /home/conway/beer/Year11/beer_PANEL_MK_1635_1686.DAT
Parsing /home/conway/beer/Year11/beer_PANEL_GK_1635_1686.DAT
Parsing /home/conway/beer/Year11/beer_PANEL_DK_1635_1686.DAT


Unnamed: 0,PANID,WEEK,UNITS,OUTLET,DOLLARS,IRI_KEY,START_TIME_EPOCH_S,END_TIME_EPOCH_S,TRANSACTION_TIME_EPOCH_S,MONTH,DAY,YEAR,HOUR_OF_DAY,MINUTE,DAYOFWEEK,SY,GE,VEND,ITEM
0,3369595,1479,4,GK,10,257871,1199059200,1199577600,1199092980,12,31,2007,9,23,0,0,1,84173,75121
1,3834697,1479,1,GK,8.99,257871,1199059200,1199577600,1199094480,12,31,2007,9,48,0,0,1,34100,17636
2,3308189,1479,1,GK,14.68,9999690,1199059200,1199577600,1199095200,12,31,2007,10,0,0,0,1,18200,53168
3,3812222,1479,1,GK,10.98,9999691,1199059200,1199577600,1199095440,12,31,2007,10,4,0,0,1,34100,57340
4,3356188,1479,1,GK,8.89,9999691,1199059200,1199577600,1199096700,12,31,2007,10,25,0,0,1,71990,30069
5,3315523,1479,1,GK,14.68,9999690,1199059200,1199577600,1199097180,12,31,2007,10,33,0,0,1,34100,57306
6,3341818,1479,1,GK,10.99,257871,1199059200,1199577600,1199098380,12,31,2007,10,53,0,0,1,18200,96418
7,3334490,1479,1,GK,7.49,9999691,1199059200,1199577600,1199099460,12,31,2007,11,11,0,0,2,80660,95605
8,3358846,1479,1,GK,5.99,9999690,1199059200,1199577600,1199099520,12,31,2007,11,12,0,0,1,84173,32130
9,3358846,1479,1,GK,6.19,9999690,1199059200,1199577600,1199099520,12,31,2007,11,12,0,0,1,84173,30130


In [138]:
master_panel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67965 entries, 0 to 67964
Data columns (total 19 columns):
PANID                       67965 non-null object
WEEK                        67965 non-null object
UNITS                       67965 non-null object
OUTLET                      67965 non-null object
DOLLARS                     67965 non-null object
IRI_KEY                     67965 non-null object
START_TIME_EPOCH_S          67965 non-null int64
END_TIME_EPOCH_S            67965 non-null int64
TRANSACTION_TIME_EPOCH_S    67965 non-null int64
MONTH                       67965 non-null int64
DAY                         67965 non-null int64
YEAR                        67965 non-null int64
HOUR_OF_DAY                 67965 non-null int64
MINUTE                      67965 non-null int64
DAYOFWEEK                   67965 non-null int64
SY                          67965 non-null int64
GE                          67965 non-null int64
VEND                        67965 non-null int64
ITE

In [139]:
s_outlet= master_panel_df["OUTLET"]
s_outlet_counts= s_outlet.value_counts()
s_outlet_counts

GK    67891
DK       26
MK       25
KK       23
dtype: int64

In [140]:
# decompose outlet categories to 0=Grocery, 1=Drug, 2=Mass, 3=KK?
mapping= {
    "GK":0,
    "DK":1,
    "MK":2,
    "KK":3,
}
master_panel_df["OUTLET"]= s_outlet.apply(lambda x: mapping[x.strip()])
master_panel_df

Unnamed: 0,PANID,WEEK,UNITS,OUTLET,DOLLARS,IRI_KEY,START_TIME_EPOCH_S,END_TIME_EPOCH_S,TRANSACTION_TIME_EPOCH_S,MONTH,DAY,YEAR,HOUR_OF_DAY,MINUTE,DAYOFWEEK,SY,GE,VEND,ITEM
0,3369595,1479,4,0,10,257871,1199059200,1199577600,1199092980,12,31,2007,9,23,0,0,1,84173,75121
1,3834697,1479,1,0,8.99,257871,1199059200,1199577600,1199094480,12,31,2007,9,48,0,0,1,34100,17636
2,3308189,1479,1,0,14.68,9999690,1199059200,1199577600,1199095200,12,31,2007,10,0,0,0,1,18200,53168
3,3812222,1479,1,0,10.98,9999691,1199059200,1199577600,1199095440,12,31,2007,10,4,0,0,1,34100,57340
4,3356188,1479,1,0,8.89,9999691,1199059200,1199577600,1199096700,12,31,2007,10,25,0,0,1,71990,30069
5,3315523,1479,1,0,14.68,9999690,1199059200,1199577600,1199097180,12,31,2007,10,33,0,0,1,34100,57306
6,3341818,1479,1,0,10.99,257871,1199059200,1199577600,1199098380,12,31,2007,10,53,0,0,1,18200,96418
7,3334490,1479,1,0,7.49,9999691,1199059200,1199577600,1199099460,12,31,2007,11,11,0,0,2,80660,95605
8,3358846,1479,1,0,5.99,9999690,1199059200,1199577600,1199099520,12,31,2007,11,12,0,0,1,84173,32130
9,3358846,1479,1,0,6.19,9999690,1199059200,1199577600,1199099520,12,31,2007,11,12,0,0,1,84173,30130


In [141]:
master_panel_df[master_panel_df['OUTLET'] == 1].groupby('PANID').get_group("3107334")
#master_panel_df[master_panel_df['OUTLET'] == 1].groupby('PANID').describe()

Unnamed: 0,PANID,WEEK,UNITS,OUTLET,DOLLARS,IRI_KEY,START_TIME_EPOCH_S,END_TIME_EPOCH_S,TRANSACTION_TIME_EPOCH_S,MONTH,DAY,YEAR,HOUR_OF_DAY,MINUTE,DAYOFWEEK,SY,GE,VEND,ITEM
45809,3107334,1619,1,1,7.99,651444,1283731200,1284249600,1284299280,9,12,2010,13,48,6,0,1,34100,57636
53657,3107334,1644,1,1,8.61,651444,1298851200,1299369600,1299336540,3,5,2011,14,49,5,0,1,34100,57636
55790,3107334,1651,1,1,8.99,651444,1303084800,1303603200,1303658160,4,24,2011,15,16,6,0,1,34100,57636
60228,3107334,1663,1,1,9.49,651444,1310342400,1310860800,1310494020,7,12,2011,18,7,1,0,1,34100,57636


In [142]:
master_panel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67965 entries, 0 to 67964
Data columns (total 19 columns):
PANID                       67965 non-null object
WEEK                        67965 non-null object
UNITS                       67965 non-null object
OUTLET                      67965 non-null int64
DOLLARS                     67965 non-null object
IRI_KEY                     67965 non-null object
START_TIME_EPOCH_S          67965 non-null int64
END_TIME_EPOCH_S            67965 non-null int64
TRANSACTION_TIME_EPOCH_S    67965 non-null int64
MONTH                       67965 non-null int64
DAY                         67965 non-null int64
YEAR                        67965 non-null int64
HOUR_OF_DAY                 67965 non-null int64
MINUTE                      67965 non-null int64
DAYOFWEEK                   67965 non-null int64
SY                          67965 non-null int64
GE                          67965 non-null int64
VEND                        67965 non-null int64
ITEM

In [143]:
master_panel_df.to_csv("master_panel.csv")

In [144]:
day_store_size = master_panel_df.groupby(['DAYOFWEEK','IRI_KEY']).size()
type(day_store_size)
typday_store_size[day_store_size > 10]

NameError: name 'typday_store_size' is not defined

In [145]:
#len(master_panel_df['IRI_KEY'].unique())
master_panel_df.groupby('IRI_KEY').size().describe()
#len(master_panel_df['IRI_KEY'].unique())

count       29.000000
mean      2343.620690
std       4654.027307
min          1.000000
25%          6.000000
50%         26.000000
75%        719.000000
max      15634.000000
dtype: float64

In [146]:
master_panel_df['IRI_KEY'].unique().tolist()

['257871',
 '9999690',
 '9999691',
 '9999661',
 '1085053',
 '9999875',
 '9999879',
 '9999671',
 '9979673',
 '9999864',
 '9999821',
 '9887890',
 '9979630',
 '9979872',
 '9929162',
 '9979664',
 '9979652',
 '9979648',
 '9979663',
 '9979633',
 '9921205',
 '9979874',
 '6000981',
 '9979668',
 '9979672',
 '9837261',
 '9979854',
 '9979619',
 '651444']

In [147]:
len(master_panel_df['PANID'].unique())

3685

In [148]:
master_panel_df.groupby('PANID').size().describe()

count    3685.000000
mean       18.443691
std        51.250777
min         1.000000
25%         2.000000
50%         6.000000
75%        16.000000
max      1539.000000
dtype: float64

In [158]:
store_data_filenames = []
store_dict= {}
store_data_filenames.append(beer_dir+"/Year8/Delivery_Stores")
store_data_filenames.append(beer_dir+"/Year9/Delivery_Stores")
store_data_filenames.append(beer_dir+"/Year10/Delivery_Stores")
store_data_filenames.append(beer_dir+"/Year11/Delivery_Stores")
for f in store_data_filenames:
    with open(f) as openfile:
        i = 0
        for line in openfile:
            if i > 0: # skip header
                iri_key = line[:7].strip()
                market = line[20:45].strip()
                store_dict[iri_key] = market
            i+=1
print len(store_dict)

2314


In [174]:
master_panel_dict= {}
for row in master_panel_df.iterrows():
    row= row[1]
    week= str(row['WEEK'])
    iri= str(row['IRI_KEY'])
    sy= str (row['SY'])
    ge= str(row['GE'])
    vend= str(row['VEND'])
    item= str(row['ITEM'])
    m_key= (week, iri, sy, ge, vend, item)
#     if m_key in master_panel_dict:
#         raise RuntimeError("{} present in dictionary".format(m_key))
    master_panel_dict[m_key]= 0

test_key= ('1479', '257871', '0', '1', '84173', '75121')
print master_panel_dict[test_key]
print len(master_panel_dict)

0
47711


In [176]:
# test_key= ('1479', '681530', '0', '2', '18200', '646')
# print master_panel_dict[test_key]

In [177]:
### WARNING: This cell takes a long time to run.

# for each line of store data
    # split by whitespace
    # choose fields iri_key, week, sy, ge, vend, item, f, d, pr
    # 0-5, 8-10

beer_drug_filenames= []
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year8/beer_drug*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year8/beer_groc*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year9/beer_drug*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year9/beer_groc*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year10/beer_drug*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year10/beer_groc*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year11/beer_drug*"))
beer_drug_filenames.extend(glob.glob(beer_dir+"/Year11/beer_groc*"))

for f in beer_drug_filenames:
    with open(f) as openfile:
        i = 0
        for line in openfile:
            if i != 0:
                vals = line.strip().split(' ')
                vals = [x.strip() for x in vals if len(x.strip()) > 0]
                week = vals[1]
                iri_key = vals[0]
                sy = vals[2]
                ge = vals[3]
                vend = vals[4]
                item = vals[5]
                f = vals[8]
                d = vals[9]
                pr = vals[10]
                m_key = (week, iri_key, sy, ge, vend, item)
                if m_key not in master_panel_dict:
                    continue
                assert master_panel_dict[m_key] == 0 
                master_panel_dict[m_key] = (f, d, pr)
            i += 1

In [179]:
print len(master_panel_dict)
test_key= ('1479', '257871', '0', '1', '84173', '75121')
print master_panel_dict[test_key]

47711
('NONE', '0', '0')


In [200]:
m_df_items= []
i =0
for k,v in master_panel_dict.iteritems():
    m_arr= []
    m_arr.extend(k)
    if type(v) == int:
        v= (np.NAN, np.NAN, np.NAN)
    m_arr.extend(v)
    m_df_items.append((i, m_arr))
    i += 1

# test_key= ('1479', '257871', '0', '1', '84173', '75121')
column_headers= ['WEEK', 'IRI_KEY', 'SY', 'GE', 'VEND', "ITEM", "F", 'D', "PR"]
m_df= pd.DataFrame.from_items(m_df_items, columns=column_headers, orient='index')
m_df

Unnamed: 0,WEEK,IRI_KEY,SY,GE,VEND,ITEM,F,D,PR
0,1498,9999879,0,1,88345,10053,,,
1,1662,9999879,0,1,72890,158,,,
2,1673,9999690,0,1,80660,95937,,,
3,1482,9999690,0,1,84173,31130,,,
4,1480,9999691,0,1,18200,53168,,,
5,1584,257871,0,1,34100,57306,NONE,1,1
6,1654,1085053,0,1,71990,48,NONE,0,0
7,1519,9999879,7,5,4563,13,,,
8,1664,9999661,0,1,84173,37,,,
9,1605,9999879,0,1,87692,97102,,,


In [201]:
# if Nan found in column for F, D, PR => assume that these stores have no promotions
# NOTE: investigation was made into the use of chains info and masked chain info (see Section 3.6 in the PDF)
# however, after evaluation, it was determined that a promotion done in one store would not apply for one
# in a different Geographic location

columns= ['F','D','PR']
replace= ['NONE','0','0']
for i in xrange(3):
    c= columns[i]
    r= replace[i]
    m_mask= pd.isnull(m_df[c])
    m_df[c][m_mask]= r

m_df

Unnamed: 0,WEEK,IRI_KEY,SY,GE,VEND,ITEM,F,D,PR
0,1498,9999879,0,1,88345,10053,NONE,0,0
1,1662,9999879,0,1,72890,158,NONE,0,0
2,1673,9999690,0,1,80660,95937,NONE,0,0
3,1482,9999690,0,1,84173,31130,NONE,0,0
4,1480,9999691,0,1,18200,53168,NONE,0,0
5,1584,257871,0,1,34100,57306,NONE,1,1
6,1654,1085053,0,1,71990,48,NONE,0,0
7,1519,9999879,7,5,4563,13,NONE,0,0
8,1664,9999661,0,1,84173,37,NONE,0,0
9,1605,9999879,0,1,87692,97102,NONE,0,0


In [207]:
m_df['SY'] = m_df['SY'].astype(int)
m_df['GE'] = m_df['GE'].astype(int)
m_df['VEND'] = m_df['VEND'].astype(int)
m_df['ITEM'] = m_df['ITEM'].astype(int)
m_df['D'] = m_df['D'].astype(int)
m_df['PR'] = m_df['PR'].astype(int)
m_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47711 entries, 0 to 47710
Data columns (total 9 columns):
WEEK       47711 non-null object
IRI_KEY    47711 non-null object
SY         47711 non-null int64
GE         47711 non-null int64
VEND       47711 non-null int64
ITEM       47711 non-null int64
F          47711 non-null object
D          47711 non-null int64
PR         47711 non-null int64
dtypes: int64(6), object(3)
memory usage: 3.6+ MB


In [209]:
master_merged_df = pd.merge(master_panel_df, m_df, on=['WEEK', 'IRI_KEY','SY', 'GE', 'VEND','ITEM'])
master_merged_df

Unnamed: 0,PANID,WEEK,UNITS,OUTLET,DOLLARS,IRI_KEY,START_TIME_EPOCH_S,END_TIME_EPOCH_S,TRANSACTION_TIME_EPOCH_S,MONTH,...,HOUR_OF_DAY,MINUTE,DAYOFWEEK,SY,GE,VEND,ITEM,F,D,PR
0,3369595,1479,4,0,10,257871,1199059200,1199577600,1199092980,12,...,9,23,0,0,1,84173,75121,NONE,0,0
1,3369595,1479,3,0,7.5,257871,1199059200,1199577600,1199197080,1,...,14,18,1,0,1,84173,75121,NONE,0,0
2,3369595,1479,4,0,10,257871,1199059200,1199577600,1199464740,1,...,16,39,4,0,1,84173,75121,NONE,0,0
3,3834697,1479,1,0,8.99,257871,1199059200,1199577600,1199094480,12,...,9,48,0,0,1,34100,17636,NONE,0,0
4,3308189,1479,1,0,14.68,9999690,1199059200,1199577600,1199095200,12,...,10,0,0,0,1,18200,53168,NONE,0,0
5,3176552,1479,1,0,14.68,9999690,1199059200,1199577600,1199461680,1,...,15,48,4,0,1,18200,53168,NONE,0,0
6,3176552,1479,1,0,14.68,9999690,1199059200,1199577600,1199540940,1,...,13,49,5,0,1,18200,53168,NONE,0,0
7,3308189,1479,1,0,14.68,9999690,1199059200,1199577600,1199611620,1,...,9,27,6,0,1,18200,53168,NONE,0,0
8,3812222,1479,1,0,10.98,9999691,1199059200,1199577600,1199095440,12,...,10,4,0,0,1,34100,57340,NONE,0,0
9,3356188,1479,1,0,8.89,9999691,1199059200,1199577600,1199096700,12,...,10,25,0,0,1,71990,30069,NONE,0,0


In [210]:
master_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67965 entries, 0 to 67964
Data columns (total 22 columns):
PANID                       67965 non-null object
WEEK                        67965 non-null object
UNITS                       67965 non-null object
OUTLET                      67965 non-null int64
DOLLARS                     67965 non-null object
IRI_KEY                     67965 non-null object
START_TIME_EPOCH_S          67965 non-null int64
END_TIME_EPOCH_S            67965 non-null int64
TRANSACTION_TIME_EPOCH_S    67965 non-null int64
MONTH                       67965 non-null int64
DAY                         67965 non-null int64
YEAR                        67965 non-null int64
HOUR_OF_DAY                 67965 non-null int64
MINUTE                      67965 non-null int64
DAYOFWEEK                   67965 non-null int64
SY                          67965 non-null int64
GE                          67965 non-null int64
VEND                        67965 non-null int64
ITEM