<a href="https://colab.research.google.com/github/cleysonl/ML_Bootcamp_CLL/blob/master/Data_Processing_and_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Processing and Analysis**

In [0]:
import datetime
import random
from random import randrange
import numpy as np
import pandas as pd

In [0]:
def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1
        
        

def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """

    # sentinels
    startDate = datetime.datetime(2016, 1, 1, 13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    # base list of attributes
    data_dict = {
        'Serial No':
        np.arange(row_count) + serial_number_sentinel,
        'Date':
        np.random.permutation(
            pd.to_datetime([
                x.strftime("%d-%m-%Y")
                for x in _random_date(startDate, row_count)
            ]).date),
        'User ID':
        np.random.permutation(
            np.random.randint(0, row_count, size=int(row_count / 10)) +
            user_id_sentinel).tolist() * 10,
        'Product ID':
        np.random.permutation(
            np.random.randint(0, row_count, size=int(row_count / 10)) +
            product_id_sentinel).tolist() * 10,
        'Quantity Purchased':
        np.random.permutation(np.random.randint(1, 42, size=row_count)),
        'Price':
        np.round(
            np.abs(np.random.randn(row_count) + 1) * price_sentinel,
            decimals=2),
        'User Type':
        np.random.permutation(
            [chr(random.randrange(97, 97 + 3 + 1)) for i in range(row_count)])
    }

    # introduce missing values
    for index in range(int(np.sqrt(row_count))):
        data_dict['Price'][np.argmax(
            data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(
            data_dict['User Type'] == random.choice(
                data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(
            data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.
                                          choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.
                                         choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(
            data_dict['User ID']))] = -101

    # create data frame
    df = pd.DataFrame(data_dict)

    return df

## **Import dependencies**

In [0]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import preprocessing

pd.options.mode.chained_assignment = None

## **Generate dataset**

In [7]:
# Generate a dataset with 1000 rows
df = generate_sample_data(row_count=1000)
df.shape

(1000, 7)

### **Analyze generated Dataset**

In [8]:
df.head()

Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,1000,,-101,0,12,1379.22,n
1,1001,2016-01-15,5288,304,5,4749.63,n
2,1002,2016-01-02,5563,478,41,1374.87,n
3,1003,2016-01-16,5772,999,2,4936.43,n
4,1004,2016-04-02,5928,769,9,651.2,n


**Dataframe stats**

In [9]:
print('Number of rows:', df.shape[0])

Number of rows: 1000


In [10]:
print('Number of columns:', df.shape[1])

Number of columns: 7


In [11]:
print('Column data types: \n', df.dtypes)

Column data types: 
 Serial No               int64
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int64
Price                 float64
User Type              object
dtype: object


In [12]:
print('Columns with missing values:', df.columns[df.isnull().any()].tolist())

Columns with missing values: ['Date', 'Price']


In [13]:
print('Number of rows with Missing values:', len(pd.isnull(df).any(1).nonzero()[0].tolist()))

Number of rows with Missing values: 61


  """Entry point for launching an IPython kernel.


**General Stats**

In [14]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Serial No             1000 non-null int64
Date                  969 non-null object
User ID               1000 non-null int64
Product ID            1000 non-null int64
Quantity Purchased    1000 non-null int64
Price                 969 non-null float64
User Type             1000 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 54.8+ KB
None


In [15]:
print(df.describe())

         Serial No      User ID   Product ID  Quantity Purchased        Price
count  1000.000000  1000.000000  1000.000000         1000.000000   969.000000
mean   1451.083000  5451.383000   551.211000           21.220000  2334.919876
std     385.701581   355.125691   268.790924           12.051757  1596.970973
min      -1.000000  -101.000000     0.000000            1.000000     0.280000
25%    1221.750000  5214.000000   325.000000           10.750000  1097.220000
50%    1482.000000  5430.000000   515.000000           22.000000  2119.880000
75%    1741.250000  5735.000000   775.250000           32.000000  3371.640000
max    1999.000000  6000.000000  1095.000000           41.000000  8541.130000


 ### **Standardize Columns**

In [16]:
# list all columns
print("Dataframe columns>\n{}".format(df.columns.tolist()))

Dataframe columns>
['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


In [0]:
# change all columns names to lowercase-snakecased column names in python
def cleanup_column_names(df, rename_dict={}, do_inplace=True):
  if not rename_dict:
    return df.rename(columns={col:col.lower().replace(' ','_') for col in df.columns.values.tolist()}, inplace=True)
  else:
    return df.rename(columns = rename_dict, inplace=do_inplace)

In [0]:
cleanup_column_names(df)

In [19]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))

Dataframe columns:
['serial_no', 'date', 'user_id', 'product_id', 'quantity_purchased', 'price', 'user_type']


### **Basic Manipulation**

**Sort basic specific atributes**

In [20]:
# Ascending for serial_no and descending for price
display(df.sort_values(['serial_no','price'], ascending=[True, False]).head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
738,-1,2016-01-18,5398,247,22,8541.13,c
539,-1,2016-01-26,5375,795,3,6960.59,b
563,-1,2016-01-16,5606,298,15,6005.74,b
658,-1,2016-01-14,5009,330,17,5422.24,c
975,-1,2016-01-15,5863,608,9,4811.86,b


**Reorder columns**

In [21]:
display(df[['serial_no','date','user_id','user_type',
              'product_id','quantity_purchased','price']].head())

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
0,1000,,-101,n,0,12,1379.22
1,1001,2016-01-15,5288,n,304,5,4749.63
2,1002,2016-01-02,5563,n,478,41,1374.87
3,1003,2016-01-16,5772,n,999,2,4936.43
4,1004,2016-04-02,5928,n,769,9,651.2


**Select Attributes**

In [22]:
# Using column index print 10 values from column at index 3
print(df.iloc[:,3].values[0:10])

[  0 304 478 999 769 852 738 792 523 876]


In [23]:
# Using column name print 10 values of quantity_purchased
print(df.quantity_purchased.values[0:10])

[12  5 41  2  9 16 38 22 41 36]


In [24]:
# Using datatype, print 10 values of columns with data type float
print(df.select_dtypes(include=['float64']).values[:10,0])

[1379.22 4749.63 1374.87 4936.43  651.2  1654.07 2345.68  350.46 3240.16
 1401.46]


**Select rows**

In [25]:
# using row index
display(df.iloc[[10,501,20]])

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
10,1010,,5870,186,31,4999.07,a
501,1501,2016-03-02,5288,304,20,2668.83,d
20,1020,,5899,423,1,977.11,n


In [26]:
# excluding specific rows
display(df.drop([0,24,51], axis=0).head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
1,1001,2016-01-15,5288,304,5,4749.63,n
2,1002,2016-01-02,5563,478,41,1374.87,n
3,1003,2016-01-16,5772,999,2,4936.43,n
4,1004,2016-04-02,5928,769,9,651.2,n
5,1005,2016-06-02,5171,852,16,1654.07,n


In [27]:
# Conditional filtering
# Quantity_purchased > 25
display(df[df.quantity_purchased>25].head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
2,1002,2016-01-02,5563,478,41,1374.87,n
6,1006,,5684,738,38,2345.68,n
8,1008,,5220,523,41,3240.16,n
9,1009,,5011,876,36,1401.46,a
10,1010,,5870,186,31,4999.07,a


In [28]:
# Offset from top
display(df[100:].head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
100,1100,2016-01-19,5806,429,28,2715.85,c
101,1101,2016-01-28,5288,304,28,3484.24,c
102,1102,2016-02-02,5563,478,32,1262.4,d
103,1103,2016-09-01,5772,999,27,49.54,b
104,1104,2016-02-02,5928,769,27,4365.0,c


In [29]:
#offset from Bottom
display(df[-10:].head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
990,1990,2016-06-02,5370,448,2,2023.46,b
991,1991,2016-01-19,5324,422,4,712.27,c
992,1992,2016-01-21,5873,825,11,1222.19,a
993,1993,2016-03-01,5515,167,6,2360.77,b
994,1994,2016-03-01,5407,886,17,1097.22,a


**Type casting**

In [30]:
# existing datatypes
df.dtypes

serial_no               int64
date                   object
user_id                 int64
product_id              int64
quantity_purchased      int64
price                 float64
user_type              object
dtype: object

In [31]:
# set datatime as dtype for date column
df['date'] = pd.to_datetime(df.date)
print(df.dtypes)

serial_no                      int64
date                  datetime64[ns]
user_id                        int64
product_id                     int64
quantity_purchased             int64
price                        float64
user_type                     object
dtype: object


**Map/Apply Functionality**

In [0]:
def expand_user_type(u_type):
  if u_type in ['a','b']:
    return 'new'
  elif u_type == 'c':
    return 'existing'
  elif u_type == 'd':
    return 'loyal_existing'
  else:
    return 'error'

In [38]:
# Map user Type to user class
df['user_class'] = df['user_type'].map(expand_user_type)
display(df.tail())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
995,1995,2016-05-01,5488,325,40,2869.8,a,new,17
996,1996,2016-01-29,5258,390,9,2636.31,b,new,4
997,1997,2016-02-02,5927,730,10,6485.87,c,existing,5
998,1998,2016-01-17,5376,717,34,,c,existing,2
999,1999,2016-10-01,5030,129,22,164.45,a,new,39


In [34]:
# Apply: Using apply to get attribute ranges
display(df.select_dtypes(include=[np.number]).apply(lambda x: x.max()-x.min()))

serial_no             2000.00
user_id               6101.00
product_id            1095.00
quantity_purchased      40.00
price                 8540.85
dtype: float64

In [0]:
# Apply-Map: Extract week from date
df['purchase_week'] = df[['date']].applymap(lambda dt: dt.week if not pd.isnull(dt.week) else 0)

In [36]:
display(df.head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
0,1000,NaT,-101,0,12,1379.22,n,error,0
1,1001,2016-01-15,5288,304,5,4749.63,n,error,2
2,1002,2016-01-02,5563,478,41,1374.87,n,error,53
3,1003,2016-01-16,5772,999,2,4936.43,n,error,2
4,1004,2016-04-02,5928,769,9,651.2,n,error,13


### **Handle Missing Values**

In [40]:
# Drop rows with missing values
df_dropped = df.dropna(subset=['date'])
display(df_dropped.head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
1,1001,2016-01-15,5288,304,5,4749.63,n,error,2
2,1002,2016-01-02,5563,478,41,1374.87,n,error,53
3,1003,2016-01-16,5772,999,2,4936.43,n,error,2
4,1004,2016-04-02,5928,769,9,651.2,n,error,13
5,1005,2016-06-02,5171,852,16,1654.07,n,error,22


In [0]:
# filling missing price with mean price
df_dropped['price'].fillna(value=np.round(df.price.mean(), decimals=2), inplace=True)

In [0]:
# filling missing user types using values from previous row
df_dropped['user_type'].fillna(method='ffill', inplace=True)

### **Handle Duplicates**

In [43]:
# sample duplicates. Identify for serial_no
display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())
print('Shape of df={}'.format(df_dropped.shape))

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
137,-1,2016-08-02,5518,521,32,2402.26,b,new,31
214,-1,2016-01-21,5017,667,33,402.88,d,loyal_existing,3
272,-1,2016-05-02,5049,442,34,4535.64,d,loyal_existing,18
285,-1,2016-01-30,5234,877,16,1436.32,a,new,4
298,-1,2016-09-01,5376,717,20,74.21,c,existing,35


Shape of df=(969, 9)


In [44]:
# Drop duplicates
df_dropped.drop_duplicates(subset=['serial_no'], inplace=True)
display(df_dropped.head())
print('Shape of df={}'.format(df_dropped.shape))

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
1,1001,2016-01-15,5288,304,5,4749.63,n,error,2
2,1002,2016-01-02,5563,478,41,1374.87,n,error,53
3,1003,2016-01-16,5772,999,2,4936.43,n,error,2
4,1004,2016-04-02,5928,769,9,651.2,n,error,13
5,1005,2016-06-02,5171,852,16,1654.07,n,error,22


Shape of df=(939, 9)


In [45]:
# Remove rows which have less than 3 attributes with non-missing data
display(df.dropna(thresh=3).head())
print('Shape of df={}'.format(df.dropna(thresh=3).shape))

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
0,1000,NaT,-101,0,12,1379.22,n,error,0
1,1001,2016-01-15,5288,304,5,4749.63,n,error,2
2,1002,2016-01-02,5563,478,41,1374.87,n,error,53
3,1003,2016-01-16,5772,999,2,4936.43,n,error,2
4,1004,2016-04-02,5928,769,9,651.2,n,error,13


Shape of df=(1000, 9)


### **Handle Categoricals**

**One Hot Encoding**

In [46]:
display(pd.get_dummies(df, columns=['user_type']).head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_class,purchase_week,user_type_a,user_type_b,user_type_c,user_type_d,user_type_n
0,1000,NaT,-101,0,12,1379.22,error,0,0,0,0,0,1
1,1001,2016-01-15,5288,304,5,4749.63,error,2,0,0,0,0,1
2,1002,2016-01-02,5563,478,41,1374.87,error,53,0,0,0,0,1
3,1003,2016-01-16,5772,999,2,4936.43,error,2,0,0,0,0,1
4,1004,2016-04-02,5928,769,9,651.2,error,13,0,0,0,0,1


**Label Encoding**

In [47]:
type_map = {'a': 0, 'b': 1, 'c': 2, 'd': 3, np.NAN: -1}
df['encoded_user_type'] = df.user_type.map(type_map)
display((df.tail()))

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
995,1995,2016-05-01,5488,325,40,2869.8,a,new,17,0.0
996,1996,2016-01-29,5258,390,9,2636.31,b,new,4,1.0
997,1997,2016-02-02,5927,730,10,6485.87,c,existing,5,2.0
998,1998,2016-01-17,5376,717,34,,c,existing,2,2.0
999,1999,2016-10-01,5030,129,22,164.45,a,new,39,0.0


### **Handle Numerical Attributes**

**Min-Max Scalar**

In [0]:
df_normalized = df.dropna().copy()
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))
df_normalized['price'] = np_scaled.reshape(-1,1)

In [51]:
display(df_normalized.head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
19,1019,2016-11-01,5747,844,4,0.221145,a,new,44,0.0
22,1022,2016-11-02,5794,1095,8,0.267564,d,loyal_existing,44,3.0
24,1024,2016-01-28,5047,466,2,0.173251,d,loyal_existing,4,3.0
29,1029,2016-03-02,5711,292,12,0.038883,d,loyal_existing,9,3.0
35,1035,2016-08-01,5659,148,1,0.237769,d,loyal_existing,31,3.0


**Robust Scaler**

In [0]:
df_normalized = df.dropna().copy()
robust_scaler = preprocessing.RobustScaler()
rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))
df_normalized['quantity_purchased'] = rs_scaled.reshape(-1,1)

In [53]:
display(df_normalized.head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
19,1019,2016-11-01,5747,844,-0.827586,1889.05,a,new,44,0.0
22,1022,2016-11-02,5794,1095,-0.643678,2285.5,d,loyal_existing,44,3.0
24,1024,2016-01-28,5047,466,-0.91954,1479.99,d,loyal_existing,4,3.0
29,1029,2016-03-02,5711,292,-0.45977,332.37,d,loyal_existing,9,3.0
35,1035,2016-08-01,5659,148,-0.965517,2031.03,d,loyal_existing,31,3.0


### **Group by**

In [54]:
# Group by attributes user_class and get sum of quantity_purchased
print(df.groupby(['user_class'])['quantity_purchased'].sum())

user_class
error               679
existing           4537
loyal_existing     5524
new               10480
Name: quantity_purchased, dtype: int64


In [55]:
# Aggregate functions
#Sum, mean and Non Zero Count row count
display(df.groupby(['user_class']).agg([np.sum, np.mean, np.count_nonzero]))

Unnamed: 0_level_0,serial_no,serial_no,serial_no,user_id,user_id,user_id,product_id,product_id,product_id,quantity_purchased,quantity_purchased,quantity_purchased,price,price,price,purchase_week,purchase_week,purchase_week,encoded_user_type,encoded_user_type,encoded_user_type
Unnamed: 0_level_1,sum,mean,count_nonzero,sum,mean,count_nonzero,sum,mean,count_nonzero,sum,mean,count_nonzero,sum,mean,count_nonzero,sum,mean,count_nonzero,sum,mean,count_nonzero
user_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
error,31661,1021.322581,31,163626,5278.258065,31,18635,601.129032,30,679,21.903226,31,78412.93,2613.764333,31.0,318,10.258065,18,0.0,,31.0
existing,332068,1516.292237,219,1188833,5428.461187,219,115906,529.251142,219,4537,20.716895,219,512284.21,2416.434953,219.0,3266,14.913242,218,438.0,2.0,219.0
loyal_existing,388755,1445.185874,269,1465549,5448.137546,269,150738,560.364312,269,5524,20.535316,269,590612.29,2262.882337,269.0,4455,16.561338,264,807.0,3.0,269.0
new,698599,1452.388773,481,2633375,5474.7921,481,265932,552.873181,481,10480,21.787942,481,1081227.93,2320.231609,481.0,7347,15.274428,469,233.0,0.484407,233.0


In [56]:
# Aggregate functions specific to columns
display(df.groupby(['user_class', 'user_type']).agg({'price':np.mean, 'quantity_purchased':np.max}))

Unnamed: 0_level_0,Unnamed: 1_level_0,price,quantity_purchased
user_class,user_type,Unnamed: 2_level_1,Unnamed: 3_level_1
error,n,2613.764333,41
existing,c,2416.434953,41
loyal_existing,d,2262.882337,41
new,a,2104.658299,41
new,b,2551.134578,41


In [57]:
# Multiple aggregate functions
display(df.groupby(['user_class', 'user_type']).agg({'price': {'total_price': np.sum,
                                                               'mean_price': np.mean,
                                                               'variance_price': np.std,
                                                               'count': np.count_nonzero},
                                                     'quantity_purchased': np.sum}))

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,quantity_purchased
Unnamed: 0_level_1,Unnamed: 1_level_1,total_price,mean_price,variance_price,count,sum
user_class,user_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
error,n,78412.93,2613.764333,2277.354978,31.0,679
existing,c,512284.21,2416.434953,1671.281233,219.0,4537
loyal_existing,d,590612.29,2262.882337,1521.002596,269.0,5524
new,a,507222.65,2104.658299,1468.650332,248.0,5474
new,b,574005.28,2551.134578,1611.050013,233.0,5006


### **Pivot Tables**

In [58]:
display(df.pivot_table(index='date', columns='user_type', values='price', aggfunc=np.mean))

user_type,a,b,c,d,n
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,2360.73,2495.096,2552.75,1003.74,
2016-01-02,,2585.63,3380.715,3221.525,1049.763333
2016-01-13,3643.733636,3164.7375,2465.8975,2150.992857,
2016-01-14,1898.148182,1628.5,3405.648889,2623.214,
2016-01-15,1759.955,4074.144286,440.345,2342.03,2725.425
2016-01-16,2032.1,3303.661667,1722.183333,1585.26,2654.22
2016-01-17,1762.17,1627.9975,3467.6575,1234.581429,
2016-01-18,1939.475,2754.012,3461.6325,2650.68,
2016-01-19,1714.3975,960.695,2193.397778,1973.841111,
2016-01-20,688.38,1686.525,2072.63,1433.358333,


### **Stacking**

In [59]:
print(df.stack())

0    serial_no                1000
     user_id                  -101
     product_id                  0
     quantity_purchased         12
     price                 1379.22
                            ...   
999  price                  164.45
     user_type                   a
     user_class                new
     purchase_week              39
     encoded_user_type           0
Length: 9907, dtype: object
