# Data Analysis and Summarization using Pandas

# 1. DOCUMENTATION

#In this project we are going to perform data analysis and summarization using pandas on the given six datasets
---> First of all we need to import all the necessary modules.
    In this we require only pandas. If it is not installed ,one can install using this command !pip install pandas
    Then import pandas as pd
     
---> To perform any operation on the dataset ,we load the dataset using pd.read_csv()     # reading csv file
     
The various methods used in this project are:
>> head()
           It is used for getting a quick overview of the DataFrame.
           #Note: if the number of rows is not specified, the head() method will return the top 5 rows.
>> isnull.sum()
          Count the number of missing values in each column.
>> dataframe.info()
          It is used to get a brief summary of the dataframe. 
>> dataframe.shape
          It returns the tuple of(rows,colums) of the dataset
          
>> dataframe.dropna()
          It removes the rows which contains Nan values(NULL values)
          We use  dataframe.dropna(inplace=True) so that the changes reflect in the original dataframe.
>> intersection()
          It is used to get the common values in the dataframe
>> set()
          It returns then unique values in a dataframe
>> duplicated()
          It returns a Boolean values for each row.
          Returns True for every row that is a duplicate, othwerwise False.
>> df.merge(df1, on condition )
          It marges the content of df1 in df on basis of the specified condition
>> df.describe()
           It returns description of the data in the DataFrame.

# 2. CODE

# Importing libraries 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def get_df_info(df, include_unique_values=False):
    col_name_list = list(df.columns)
    col_type_list = [type(df[col][0]) for col in col_name_list]
    col_null_count_list = [df[col].isnull().sum() for col in col_name_list]
    col_unique_count_list = [df[col].nunique() for col in col_name_list]
    col_memory_usage_list = [df[col].memory_usage(deep=True) for col in col_name_list]
    df_total_memory_usage = sum(col_memory_usage_list) / 1048576
    if include_unique_values:
        col_unique_list = [df[col].unique() for col in col_name_list]
        df_info = pd.DataFrame({'column_name': col_name_list, 'type': col_type_list, 
                                'null_count': col_null_count_list, 'nunique': col_unique_count_list, 
                                'unique_values': col_unique_list})
    else:
        df_info = pd.DataFrame({'column_name': col_name_list, 'type': col_type_list, 
                                'null_count': col_null_count_list, 'nunique': col_unique_count_list})
    return df_info, df_total_memory_usage

# IDA of datasets individually

# (a) Load the datasets

In [3]:
df_train_raw_data = pd.read_csv('train.csv')
df_item_raw_data = pd.read_csv('item_data.csv')
df_ct_raw_data = pd.read_csv('customer_transaction_data.csv')
df_cd_raw_data = pd.read_csv('customer_demographics.csv')
df_cim_raw_data = pd.read_csv('coupon_item_mapping.csv')
df_campaign_raw_data = pd.read_csv('campaign_data.csv')

# (b) Display the dataframes

In [4]:
df_train_raw_data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [5]:
df_item_raw_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [6]:
df_ct_raw_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [7]:
df_cd_raw_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [8]:
df_cim_raw_data.head()

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [9]:
df_campaign_raw_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,21/10/13,20/12/13
1,25,Y,21/10/13,22/11/13
2,20,Y,07/09/13,16/11/13
3,23,Y,08/10/13,15/11/13
4,21,Y,16/09/13,18/10/13


# (c) Data Cleaning and Transformations

Now we will analyze the datasets individually and then remove the meaningless data to increse the efficiency. It includes overview of data, finding the missing values and handling them.

1. df_train_raw_data dataframe:

In [10]:
df_train_raw_data_info, df_train_raw_data_mem = get_df_info(df_train_raw_data, True)
print(f'train dataset has {df_train_raw_data.shape[0]} rows and {df_train_raw_data.shape[1]} cols, uses approx. {df_train_raw_data_mem:.2f} MB')
df_train_raw_data_info

train dataset has 78369 rows and 5 cols, uses approx. 2.99 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,id,<class 'numpy.int64'>,0,78369,"[1, 2, 6, 7, 9, 11, 14, 15, 17, 19, 20, 21, 22..."
1,campaign_id,<class 'numpy.int64'>,0,18,"[13, 9, 8, 11, 29, 30, 2, 5, 12, 26, 3, 4, 10,..."
2,coupon_id,<class 'numpy.int64'>,0,866,"[27, 116, 635, 644, 1017, 795, 444, 538, 857, ..."
3,customer_id,<class 'numpy.int64'>,0,1428,"[1053, 48, 205, 1050, 1489, 793, 590, 368, 523..."
4,redemption_status,<class 'numpy.int64'>,0,2,"[0, 1]"


In [11]:
df_train_raw_data.describe()   #Overview of data

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
count,78369.0,78369.0,78369.0,78369.0,78369.0
mean,64347.975449,13.974441,566.363243,787.451888,0.009302
std,37126.440855,8.019215,329.966054,456.811339,0.095999
min,1.0,1.0,1.0,1.0,0.0
25%,32260.0,8.0,280.0,399.0,0.0
50%,64318.0,13.0,597.0,781.0,0.0
75%,96577.0,13.0,857.0,1190.0,0.0
max,128595.0,30.0,1115.0,1582.0,1.0


In [12]:
df_train_raw_data.isnull().sum()    #Checking the missing values in each column

id                   0
campaign_id          0
coupon_id            0
customer_id          0
redemption_status    0
dtype: int64

Conclusion: There are no missing values in the attributes of train.csv dataset.We can see that there are no outliers that could affect the accuracy of result. Thus no transformation is needed in this dataset.

Analysis: 'train.csv' file contains unique and not null values. The columns in this file acts as foreign key for reference to other files.

2. df_item_raw_data dataframe:

In [13]:
df_item_raw_data_info, df_item_raw_data_mem = get_df_info(df_item_raw_data, True)
print(f'item data dataset has {df_train_raw_data.shape[0]} rows and {df_item_raw_data.shape[1]} cols, uses approx. {df_item_raw_data_mem:.2f} MB')
df_item_raw_data_info

item data dataset has 78369 rows and 4 cols, uses approx. 10.65 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,item_id,<class 'numpy.int64'>,0,74066,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,brand,<class 'numpy.int64'>,0,5528,"[1, 56, 11, 21, 162, 209, 278, 105, 487, 25, 1..."
2,brand_type,<class 'str'>,0,2,"[Established, Local]"
3,category,<class 'str'>,0,19,"[Grocery, Miscellaneous, Bakery, Pharmaceutica..."


In [14]:
df_item_raw_data.describe() #Overview of data

Unnamed: 0,item_id,brand
count,74066.0,74066.0
mean,37033.5,1485.560055
std,21381.156856,1537.385673
min,1.0,1.0
25%,18517.25,278.0
50%,37033.5,978.0
75%,55549.75,2013.0
max,74066.0,5528.0


In [15]:
df_item_raw_data.isnull().sum() #Checking the missing values in each column

item_id       0
brand         0
brand_type    0
category      0
dtype: int64

Conclusion: There are no missing values in the attributes of item_data.csv dataset.We can see that there are no outliers that could affect the accuracy of result. Thus no transformation is needed in this dataset.

Analysis: 'item_data.csv' file contains the items available for sale in the campaign. It conatins the description of items. This file can be referenced by other files through 'item_id' column.

3. df_ct_raw_data dataframe

In [16]:
df_ct_raw_data_info, df_ct_raw_data_mem = get_df_info(df_ct_raw_data, True)
print(f'customer transaction dataset has {df_ct_raw_data.shape[0]} rows and {df_ct_raw_data.shape[1]} cols, uses approx. {df_ct_raw_data_mem:.2f} MB')
df_ct_raw_data_info

customer transaction dataset has 1324566 rows and 7 cols, uses approx. 145.27 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,date,<class 'str'>,0,549,"[2012-01-02, 2012-01-03, 2012-01-04, 2012-01-0..."
1,customer_id,<class 'numpy.int64'>,0,1582,"[1501, 857, 67, 751, 679, 135, 464, 1457, 1191..."
2,item_id,<class 'numpy.int64'>,0,74063,"[26830, 54253, 31962, 33647, 48199, 57397, 124..."
3,quantity,<class 'numpy.int64'>,0,9252,"[1, 3, 2, 4, 5, 10, 9, 8, 6, 7, 20, 995, 1461,..."
4,selling_price,<class 'numpy.float64'>,0,4923,"[35.26, 53.43, 106.5, 67.32, 71.24, 110.07, 89..."
5,other_discount,<class 'numpy.float64'>,0,1418,"[-10.69, -13.89, -14.25, 0.0, -28.14, -35.26, ..."
6,coupon_discount,<class 'numpy.float64'>,0,232,"[0.0, -35.62, -14.25, -26.71, -21.02, -19.59, ..."


In [17]:
df_ct_raw_data.describe() #Overview of data

Unnamed: 0,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
count,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0
mean,804.002,29519.03,130.6633,114.6036,-17.76871,-0.5948983
std,457.3363,17908.06,1311.545,152.9053,37.88867,7.069367
min,1.0,1.0,1.0,0.36,-3120.31,-1992.23
25%,418.0,14684.0,1.0,49.16,-23.15,0.0
50%,801.0,26597.0,1.0,78.01,-1.78,0.0
75%,1198.0,42405.75,1.0,124.31,0.0,0.0
max,1582.0,74066.0,89638.0,17809.64,0.0,0.0


In [18]:
df_ct_raw_data.isnull().sum() #Checking for missing values

date               0
customer_id        0
item_id            0
quantity           0
selling_price      0
other_discount     0
coupon_discount    0
dtype: int64

Conclusion:There are no missing values in the attributes of customer_transaction_data.csv dataset.We can see that there are no outliers that could affect the accuracy of result. Thus no transformation is needed in this dataset.

Analysis: 'customer_transaction_data.csv' file contains data of the transaction of customers i.e. which itme a customer has bought and their quantity. It contains the coupon used by the customer and the discount they got. This file can be accessed by other files through customer_id column.

4. df_cd_raw_data dataframe

In [19]:
df_cd_raw_data_info, df_cd_raw_data_mem = get_df_info(df_cd_raw_data, True)
print(f'customer demographics dataset has {df_cd_raw_data.shape[0]} rows and {df_cd_raw_data.shape[1]} cols, uses approx. {df_cd_raw_data_mem:.2f} MB')
df_cd_raw_data_info

customer demographics dataset has 760 rows and 7 cols, uses approx. 0.17 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,customer_id,<class 'numpy.int64'>,0,760,"[1, 6, 7, 8, 10, 11, 12, 13, 14, 15, 17, 19, 2..."
1,age_range,<class 'str'>,0,6,"[70+, 46-55, 26-35, 36-45, 18-25, 56-70]"
2,marital_status,<class 'str'>,329,2,"[Married, nan, Single]"
3,rented,<class 'numpy.int64'>,0,2,"[0, 1]"
4,family_size,<class 'str'>,0,5,"[2, 3, 4, 1, 5+]"
5,no_of_children,<class 'float'>,538,3,"[nan, 1, 2, 3+]"
6,income_bracket,<class 'numpy.int64'>,0,12,"[4, 5, 3, 6, 1, 7, 2, 8, 9, 12, 10, 11]"


In [20]:
df_cd_raw_data.describe()  #Overview of data

Unnamed: 0,customer_id,rented,income_bracket
count,760.0,760.0,760.0
mean,779.201316,0.053947,4.715789
std,459.754429,0.226063,2.258817
min,1.0,0.0,1.0
25%,382.75,0.0,3.0
50%,774.5,0.0,5.0
75%,1187.25,0.0,6.0
max,1581.0,1.0,12.0


In [21]:
df_cd_raw_data.isnull().sum()  #Checking for missing values

customer_id         0
age_range           0
marital_status    329
rented              0
family_size         0
no_of_children    538
income_bracket      0
dtype: int64

Here we can see that 'marital_status' and 'no_of_children' contains null values. We have to handle the missing values in order to perform actions on this dataset.

In [22]:
df_cd_raw_data.marital_status.unique() #Here we are checking the unique values in the marital_status attribute

array(['Married', nan, 'Single'], dtype=object)

We can see there are only two unique values in marital_status attribute: single and married. We consider a person to be single, by default. Therefore single can be inserted in place of missing values.

In [23]:
df_cd_raw_data.marital_status.fillna('Single',inplace=True)   #inplace modifies the values in the original dataset

In [24]:
df_cd_raw_data.no_of_children.unique() #Here we are checking the unique values in the marital_status attribute

array([nan, '1', '2', '3+'], dtype=object)

We can see there are three unique values in no_of_children attribute: 1,2 and 3+.By default a person can have 0 children. Therefore 0 can be inserted in place of missing values.

In [25]:
df_cd_raw_data.no_of_children.fillna(0,inplace=True)

In [26]:
df_cd_raw_data.isnull().sum() #Checking for missing values after the modifications

customer_id       0
age_range         0
marital_status    0
rented            0
family_size       0
no_of_children    0
income_bracket    0
dtype: int64

In [27]:
df_cd_raw_data.head()  #display the df_cd_raw_data dataframe after the modifications.

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,0,4
1,6,46-55,Married,0,2,0,5
2,7,26-35,Single,0,3,1,3
3,8,26-35,Single,0,4,2,6
4,10,46-55,Single,0,1,0,5


Conclusion : 'marital_status' and 'no_of_children' contained misaing values which are handled successfully in order to increase the accuracy. 

Analysis: 'customer_demographics.csv' file contains the personal information of the customers i.e., their age, family size , number of children and so on. This file can be referenced by customer_id attribute.

5. df_cim_raw_data dataframe

In [28]:
df_cim_raw_data_info, df_cim_raw_data_mem = get_df_info(df_cim_raw_data, True)
print(f'coupon item mapping dataset has {df_cim_raw_data.shape[0]} rows and {df_cim_raw_data.shape[1]} cols, uses approx. {df_cim_raw_data_mem:.2f} MB')
df_cim_raw_data_info

coupon item mapping dataset has 92663 rows and 2 cols, uses approx. 1.41 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,coupon_id,<class 'numpy.int64'>,0,1116,"[105, 107, 494, 522, 518, 520, 529, 524, 378, ..."
1,item_id,<class 'numpy.int64'>,0,36289,"[37, 75, 76, 77, 81, 90, 98, 101, 105, 111, 11..."


In [29]:
df_cim_raw_data.describe() #Overview of data

Unnamed: 0,coupon_id,item_id
count,92663.0,92663.0
mean,155.967387,36508.613071
std,282.99172,21131.312716
min,1.0,1.0
25%,22.0,18255.5
50%,30.0,37955.0
75%,42.0,54191.5
max,1116.0,74061.0


In [30]:
df_cim_raw_data.isnull().sum()  #Checking for the missing values

coupon_id    0
item_id      0
dtype: int64

Conclusion: There are no missing values in the coupon_item_mapping.csv dataset. Thus no transformation is needed in this dataset.

Analysis: 'coupon_item_mapping.csv' contains the details of the coupons that can be used by the customers in order to avail the discount while buying the items. It contains the coupon id of the coupon and item id of the items on which the respective coupon can be used.

6. df_campaign_raw_data dataframe

In [31]:
df_campaign_raw_data_info, df_campaign_raw_data_mem = get_df_info(df_campaign_raw_data, True)
print(f'campaign data dataset has {df_campaign_raw_data.shape[0]} rows and {df_campaign_raw_data.shape[1]} cols, uses approx. {df_campaign_raw_data_mem:.2f} MB')
df_campaign_raw_data_info

campaign data dataset has 28 rows and 4 cols, uses approx. 0.01 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,campaign_id,<class 'numpy.int64'>,0,28,"[24, 25, 20, 23, 21, 22, 18, 19, 17, 16, 13, 1..."
1,campaign_type,<class 'str'>,0,2,"[Y, X]"
2,start_date,<class 'str'>,0,25,"[21/10/13, 07/09/13, 08/10/13, 16/09/13, 10/08..."
3,end_date,<class 'str'>,0,26,"[20/12/13, 22/11/13, 16/11/13, 15/11/13, 18/10..."


In [32]:
df_campaign_raw_data.describe()

Unnamed: 0,campaign_id
count,28.0
mean,15.571429
std,9.118271
min,1.0
25%,7.75
50%,16.5
75%,23.25
max,30.0


In [33]:
df_campaign_raw_data.isnull().sum()

campaign_id      0
campaign_type    0
start_date       0
end_date         0
dtype: int64

Conclusion: campaign_data.csv contains no null values. There are no outliers that could affect the accuracy of the result. Therefore no transformation is needed in the dataset.

Analysis: 'campaign_data.csv' contains the information of the campaigns that are to be held i.e., the start date ,end date, campaign type and so on. Other files can reference this file through campaign_id attribute.

Exploratory analysis is executed individually of all the datasets. They are cleaned and transformed for further operations.

#  Merging all the datasets 

1. Merging df_train_raw_data with df_cd_raw_data via 'customer_id'

In [34]:
df_customer_id_merged = df_train_raw_data.merge(df_cd_raw_data, on='customer_id')
df_customer_id_merged.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,13,27,1053,0,46-55,Single,0,1,0,5
1,2444,13,513,1053,0,46-55,Single,0,1,0,5
2,3651,13,166,1053,0,46-55,Single,0,1,0,5
3,7712,13,766,1053,0,46-55,Single,0,1,0,5
4,15317,13,165,1053,0,46-55,Single,0,1,0,5


In [35]:
df_customer_id_merged_info, df_customer_id_merged_mem = get_df_info(df_customer_id_merged, True)
print(f'customer_id_merged dataset has {df_customer_id_merged.shape[0]} rows and {df_customer_id_merged.shape[1]} cols, uses approx. {df_customer_id_merged_mem:.2f} MB')
df_customer_id_merged_info

customer_id_merged dataset has 43661 rows and 11 cols, uses approx. 26.40 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,id,<class 'numpy.int64'>,0,43661,"[1, 2444, 3651, 7712, 15317, 19329, 21689, 217..."
1,campaign_id,<class 'numpy.int64'>,0,18,"[13, 9, 8, 4, 5, 11, 10, 7, 28, 3, 30, 29, 26,..."
2,coupon_id,<class 'numpy.int64'>,0,866,"[27, 513, 166, 766, 165, 155, 23, 143, 25, 124..."
3,customer_id,<class 'numpy.int64'>,0,703,"[1053, 48, 205, 1489, 793, 590, 368, 679, 108,..."
4,redemption_status,<class 'numpy.int64'>,0,2,"[0, 1]"
5,age_range,<class 'str'>,0,6,"[46-55, 36-45, 18-25, 26-35, 56-70, 70+]"
6,marital_status,<class 'str'>,0,2,"[Single, Married]"
7,rented,<class 'numpy.int64'>,0,2,"[0, 1]"
8,family_size,<class 'str'>,0,5,"[1, 2, 3, 4, 5+]"
9,no_of_children,<class 'int'>,0,4,"[0, 1, 2, 3+]"


2. Merging df_train_raw_data with df_campaign_raw_data via 'campaign_id'

In [36]:
df_campaign_id_merged = df_train_raw_data.merge(df_campaign_raw_data,on='campaign_id')
df_campaign_id_merged.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_date,end_date
0,1,13,27,1053,0,X,19/05/13,05/07/13
1,2,13,116,48,0,X,19/05/13,05/07/13
2,7,13,644,1050,0,X,19/05/13,05/07/13
3,21,13,1028,89,0,X,19/05/13,05/07/13
4,23,13,517,1067,0,X,19/05/13,05/07/13


In [37]:
df_campaign_id_merged_info, df_campaign_id_merged_mem = get_df_info(df_campaign_id_merged, True)
print(f'campaign_id_merged dataset has {df_campaign_id_merged.shape[0]} rows and {df_campaign_id_merged.shape[1]} cols, uses approx. {df_campaign_id_merged_mem:.2f} MB')
df_campaign_id_merged_info

campaign_id_merged dataset has 78369 rows and 8 cols, uses approx. 37.95 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,id,<class 'numpy.int64'>,0,78369,"[1, 2, 7, 21, 23, 25, 28, 35, 46, 51, 63, 65, ..."
1,campaign_id,<class 'numpy.int64'>,0,18,"[13, 9, 8, 11, 29, 30, 2, 5, 12, 26, 3, 4, 10,..."
2,coupon_id,<class 'numpy.int64'>,0,866,"[27, 116, 644, 1028, 517, 796, 268, 1031, 482,..."
3,customer_id,<class 'numpy.int64'>,0,1428,"[1053, 48, 1050, 89, 1067, 248, 1152, 1031, 10..."
4,redemption_status,<class 'numpy.int64'>,0,2,"[0, 1]"
5,campaign_type,<class 'str'>,0,2,"[X, Y]"
6,start_date,<class 'str'>,0,17,"[19/05/13, 11/03/13, 16/02/13, 22/04/13, 08/10..."
7,end_date,<class 'str'>,0,17,"[05/07/13, 12/04/13, 05/04/13, 07/06/13, 30/11..."


3. Merging df_train_raw_data with df_cim_raw_data via 'coupon_id'

In [38]:
df_coupon_id_merged = df_train_raw_data.merge(df_cim_raw_data, on='coupon_id')
df_coupon_id_merged.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,item_id
0,1,13,27,1053,0,24775
1,1,13,27,1053,0,14958
2,1,13,27,1053,0,40431
3,1,13,27,1053,0,20749
4,1,13,27,1053,0,56860


In [39]:
df_coupon_id_merged_info, df_coupon_id_merged_mem = get_df_info(df_coupon_id_merged, True)
print(f'coupon_id_merged dataset has {df_coupon_id_merged.shape[0]} rows and {df_coupon_id_merged.shape[1]} cols, uses approx. {df_coupon_id_merged_mem:.2f} MB')
df_coupon_id_merged_info

coupon_id_merged dataset has 6420694 rows and 6 cols, uses approx. 587.83 MB


Unnamed: 0,column_name,type,null_count,nunique,unique_values
0,id,<class 'numpy.int64'>,0,78369,"[1, 370, 554, 1958, 2075, 2496, 3062, 3801, 44..."
1,campaign_id,<class 'numpy.int64'>,0,18,"[13, 9, 8, 11, 29, 30, 2, 12, 5, 10, 28, 26, 3..."
2,coupon_id,<class 'numpy.int64'>,0,866,"[27, 116, 635, 644, 1017, 795, 444, 538, 857, ..."
3,customer_id,<class 'numpy.int64'>,0,1428,"[1053, 1168, 1061, 1240, 351, 1531, 810, 1507,..."
4,redemption_status,<class 'numpy.int64'>,0,2,"[0, 1]"
5,item_id,<class 'numpy.int64'>,0,32800,"[24775, 14958, 40431, 20749, 56860, 58066, 569..."


4. Merging df_train_raw_data with df_ct_raw_data via 'customer_id'

In [None]:
df_customer_idi_merged = df_train_raw_data.merge(df_ct_raw_data, on='customer_id')
df_customer_idi_merged.head()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

