## Imports and Data Initialization

In [5]:
import pandas as pd
import darc_core
import datetime as dt
from darc_core.preprocessing import round1_preprocessing
from darc_core.utils import check_format_trans_file

In [6]:
df = pd.read_csv("../Master/data/ground_truth.csv", parse_dates=["date"])
# Need to find a way to remove the additional date 1900-01-01 while keeping datetime format. 
df["hours"]= pd.to_datetime(df["hours"], format = '%H:%M%S')
df.head(10)

Unnamed: 0,id_user,date,hours,id_item,price,qty
0,17850,2010-12-01,1900-01-01 08:02:06,85123A,2.55,6
1,17850,2010-12-01,1900-01-01 08:02:06,71053,3.39,6
2,17850,2010-12-01,1900-01-01 08:02:06,84406B,2.75,8
3,17850,2010-12-01,1900-01-01 08:02:06,84029G,3.39,6
4,17850,2010-12-01,1900-01-01 08:02:06,84029E,3.39,6
5,17850,2010-12-01,1900-01-01 08:02:06,22752,7.65,2
6,17850,2010-12-01,1900-01-01 08:02:06,21730,4.25,6
7,17850,2010-12-01,1900-01-01 08:02:08,22633,1.85,6
8,17850,2010-12-01,1900-01-01 08:02:08,22632,1.85,6
9,13047,2010-12-01,1900-01-01 08:03:04,84879,1.69,32


In [86]:
df["id_item"]=df["id_item"].astype("category")
df["date"]=df["date"].astype("category")
categorical = set ({'date','id_item'})

In [87]:
# Initial MU : 14.1+ MB
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307054 entries, 0 to 307053
Data columns (total 6 columns):
id_user    307054 non-null int64
date       307054 non-null category
hours      307054 non-null datetime64[ns]
id_item    307054 non-null category
price      307054 non-null float64
qty        307054 non-null int64
dtypes: category(2), datetime64[ns](1), float64(1), int64(2)
memory usage: 10.7 MB


In [24]:
#Quasi identification sur base de date et d'heure
df[(df["date"]=="2010-12-01") & (df["hours"]=="1900-01-01 08:02:06")]

Unnamed: 0,id_user,date,hours,id_item,price,qty
0,17850,2010-12-01,1900-01-01 08:02:06,85123A,2.55,6
1,17850,2010-12-01,1900-01-01 08:02:06,71053,3.39,6
2,17850,2010-12-01,1900-01-01 08:02:06,84406B,2.75,8
3,17850,2010-12-01,1900-01-01 08:02:06,84029G,3.39,6
4,17850,2010-12-01,1900-01-01 08:02:06,84029E,3.39,6
5,17850,2010-12-01,1900-01-01 08:02:06,22752,7.65,2
6,17850,2010-12-01,1900-01-01 08:02:06,21730,4.25,6


In [25]:
# Number of unique users

df["id_user"].nunique()

4034

In [26]:
# The quantity bought by user 17850 is very often 6 (215/297=72%)

#print(df[df["id_user"]==12680].head(1000).to_string())
df[df["id_user"]==17850]["qty"].value_counts()

6     215
4      31
2      24
12     15
8      11
3       1
Name: qty, dtype: int64

In [27]:
# The item 84406B is most bought by user 17850 (15/176 = 8% vs 0.9% mean per other user so more than x8 times )
df[df["id_item"]=="84406B"]["id_user"].value_counts()
#df[df["id_item"]=="84406B"]["id_user"].value_counts().mean()

17850    15
17858     6
17191     5
15708     4
12775     4
13593     4
17419     4
16907     3
17812     3
12477     3
17287     3
14525     3
17730     3
14530     3
17049     3
15059     3
15291     2
16693     2
17082     2
17450     2
17625     2
17865     2
17704     2
15044     2
13458     2
15727     2
15648     2
14113     2
16458     2
16771     2
         ..
13322     1
17629     1
16350     1
13803     1
12836     1
14329     1
16634     1
14217     1
13268     1
16744     1
15493     1
14967     1
16007     1
16244     1
12904     1
17604     1
16426     1
13954     1
13187     1
17146     1
15021     1
16268     1
14221     1
16271     1
12949     1
12956     1
13474     1
14499     1
17580     1
14687     1
Name: id_user, Length: 106, dtype: int64

In [28]:
# The user 12688 is the one who most bought at the date 2011-08-18
df[df["date"]=="2011-08-18"]["id_user"].value_counts()

12688    171
15472     76
15867     54
16813     38
16767     34
14221     32
13381     31
18225     31
14565     31
17750     31
13048     29
16743     28
16187     28
14189     27
14064     26
12729     26
15301     25
17736     23
15615     22
18272     22
12839     20
16261     20
17720     20
12680     20
17576     18
17243     18
14132     18
15189     18
13319     17
16945     16
17045     16
13273     16
15743     16
15024     15
13617     15
15152     14
15144     14
13113     14
15232     13
13014     13
17001     13
15125     12
12962     11
14741      9
15505      7
16928      7
16582      6
13027      5
16626      4
14051      4
13576      3
13784      3
17386      2
17742      2
15400      1
15797      1
14305      1
Name: id_user, dtype: int64

In [29]:
{df["qty"].min(),df["qty"].max()}

{1, 4800}

# Implementation of k-anonymity

In [3]:
def get_spans(df,partition, scale = None):
    spans={}
    for column in df.columns:
        if column in categorical:
            span=len(df[column][partition].unique())
        else:
            span=df[column][partition].max()-df[column][partition].min()
        if scale is not None:
            span = span/scale[column]
        spans[column]=span
    return spans

In [89]:
full_spans = get_spans(df,df.index)
full_spans

{'id_user': 5940,
 'date': 305,
 'hours': Timedelta('0 days 13:59:06'),
 'id_item': 3612,
 'price': 8142.749,
 'qty': 4799}

In [4]:
def split(df, partition, column):
    
    """""
    :param     df: The dataframe from which we want to split a partition
    :param     partition: The range of indexes (rows) we want to select from df 
    :param     column: The column to select from df, and operate the split with
    
    Example : 
    
    index         age(numerical)   gender(categorical)
      0            19              female
      1            25              female
      2            14              male
      3            51              female

      
      If column = age 
      Median = (S[2]+S[1])/2 = (14+25)/2 = 19.5 ; Sorted age : {14,19,25,51} = {S[2],S[0],S[1],S[3]}
      return (Range{2,0},Range{1,3})
      
      If column=gender (It is precisely because of even cardinals that we can't calculate medians for non numerical)
      values = {gender,female}
      return (Range{2},Range{0,1,3}) 
      
      As shown above, cases where the partition is split on a non numerical column, can result in a weird situation
      where left and right partition don't really have the same number of values.
      And this is only a showcase situation, it is virtually possible to have 0 value at left, and all values at right
      (which also happens when the quasi identifier has only one value in the partition : card(values)=1)
      or 75% at left and 25% at right etc... 
      
      So really for categorical values, the notion of median (which is supposed to split the set in 50%) doesn't make
      any sens here.

    """""
    
    
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                                            dfp = df[column][partition]
                                            
                          df[column] is a set (like an array one dimension, index : value)
                          df[column][partition] is a subset of df[column] (selection of rows)
                          It is also treated as a set
                          
                            Example: 
                            
                            df[age] = 0 19 = df[df.index]
                                      1 25
                                      2 14
                                      3 51
                            
                          df[column][0] = 19 
                          df[column][[0,1]]= 19
                                             25
                          [0,1] can be viewed as the range of indexes from 0 to 1 included
                          df.index is the range of all indexes from 0 to df.len-1
                          partition is nothing more than the set of indexes of a given number of rows
                            
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
   
    dfp = df[column][partition]
    
    if ((column in categorical) | (column == "date") | (column == "hours")):
        values = dfp.unique()
        lv=set(values[:len(values)//2])
        rv=set(values[len(values)//2:])
        return dfp.index[dfp.isin(lv)],dfp.index[dfp.isin(rv)]
    else:
        median=dfp.median()
        #print(dfp, median)
        dfl=dfp.index[dfp<median]
        dfr=dfp.index[dfp>=median]
        #print(df["qty"][dfl])
        return(dfl,dfr)

In [33]:
39/2

19.5

In [91]:
dfl,dfr = split(df,df.index,"id_item")
dfl,dfr

(Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                  8,      9,
             ...
             307040, 307041, 307042, 307043, 307044, 307045, 307046, 307047,
             307049, 307050],
            dtype='int64', length=217230),
 Int64Index([  9985,  10015,  10062,  10074,  10078,  10110,  10137,  10144,
              10169,  10173,
             ...
             307019, 307023, 307030, 307032, 307033, 307036, 307048, 307051,
             307052, 307053],
            dtype='int64', length=89824))

In [5]:
# If partition has less members than k then it's not a valid one, so we can't further divide it
def is_k_anonymous (df, partition, sensitive_column, k=5):
    #print(pd.to_datetime(df[sensitive_column][partition]).max()-pd.to_datetime(df[sensitive_column][partition]).min())
    #Length of Interval date for each equivalence class
    d=pd.to_datetime(df[sensitive_column][partition]).max()-pd.to_datetime(df[sensitive_column][partition]).min()
    d=d.days
    if ((len(partition)<k)):
        #print(partition)
        #print(pd.to_datetime(df[sensitive_column][partition]).max())
        return False
    return True

In [6]:
def partition_dataset(df, feature_columns,sensitive_column, scale, is_valid):  
    """""
    
    :param     df: The dataframe to partition
    :param     feature_columns: The Quasi-Identifier columns 
    :param     sensitive_column : The Sensitive Data we wish to protect
    :param     scale : original full_spans of df before the first split
    
    
    """""
    finished_partitions=[]
    
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                                            [df.index]
                                            
                    df.index is a range of indexes, it can be seen as an array of indexes while it's not 
                    technically an array.
                    [df.index] in between brackets simply means you're initiliazing a list of index ranges, 
                    you can see it as an array of arrays.
                            
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    partitions=[df.index]
    
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                                            while partitions:
                                            
                    The main loop here works like a chef, think of partitions like a cucumber, 
                    each time we're going through the while, chef cuts it in two equal parts (more or less **)
                                    (i1) <----> => (i2) <--><--> =>  (i3) <-><-><-><->
                                        PS: Sorry for the poor drawing skills
                        ** Refer to the split method to understand this remarque.
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    while partitions:
        
        # Pop the oldest partition from the list of partitions, so that we can split it further (or at least try).
        # Remember partition is the same type as df.index so it's a range of indexes (an "array" of indexes)
        
        partition = partitions.pop(0)
        
        # Update the spans (number of unique values for each quasi-identifier column) for this iteration
        
        spans = get_spans(df[feature_columns],partition, scale)
        
        """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                    for column, span in sorted(spans.items(), key = lambda x:-x[1]):
                    
                    for {column,span} in {column1 : numberOfUniqueItemInColumn_1    
                                          column2 : numberOfUniqueItemInColumn_2   
                                          ....
                                          columnN : numberOfUniqueItemInColumn_N   
                                          } 
                    where numberOfUniqueItemInColumn_i is a drecreasing sequence (suite décroissante) 
                    
                    Here we loop through the quasi identifiers (since span stores the quasi identifier column
                    names and values) ordered in decreasing order of multiplicity (number of # values).
                    
                    
                    Details :
                    
                    spans.items() returns a hashmap of spans column names as keys, and span column values as values
                    
                    key = lambda x:-x[1] tells to sorted(), hey I want you to sort this hashmap in decreasing order 
                    of the elements that are in [1] so in decreasing values in this case. 
                    (key = labmda x:x[0] would've returned a sorted hashmap in increasing order of indexes which 
                    is the default)
             
                    
        """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    
        for column, span in sorted(spans.items(), key = lambda x:-x[1]):
            
            # lp and rp are like partition : they are a range of indexes (an "array" of indexes)
            # lp is the ranges of indexes for whome df[column][lp] < median
            # rp is the ranges of indexes for whome df[column][rp] >= median

            lp, rp = split(df,partition,column)
            
            """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                    if not is_valid(df,lp,sensitive_column) or not is_valid(df,rp,sensitive_column):
                                            
                    As long as one of both split partitions is still valid (len(partition)>k) we need
                    to break from the for loop (don't even look for next quasi identifier) and add lp and rp 
                    to the list of partitions we want to cut even more.
                    
                    Otherwise we enter the if, and execute continue : 
                    What continue does, is forget the rest of the loop and skip to the next iteration,
                    so select the next quasi identifier to divide the partition.
                    
                    If we tried with all quasi identifiers, and none worked, then our partition is done (as
                    small as possible) and when we execute continue it will have no more quasi to explore. 
                    So executes the else and append the finished partition to the set of finished_partitions.
        
                            
            """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
            if not is_valid(df,lp,"date") or not is_valid(df,rp,"date"):
                continue
            partitions.extend((lp,rp))
            break
        else:
            finished_partitions.append(partition)
            #print(partition)
    return finished_partitions

In [199]:
feature_columns=["date","hours"]
sensitive_column="price"
finished_partitions=partition_dataset(df,feature_columns,sensitive_column, full_spans, is_k_anonymous)

In [203]:
len(finished_partitions)
#df["date"][finished_partitions[0]]
#finished_partitions[0]
#finished_partitions

305

In [None]:
len(df)/305
df["qty"][finished_partitions[5]].value_counts()

In [182]:
"""
Various handlers for .agg() method to call on each column of df.loc[partition] depending on the column type

:param   series: Column Set (from df.loc[partition]) to be processed 
"""
def agg_categorical_column(series):
    return[','.join(set(series))]

def agg_numerical_column(series):
    return[series.mean()]

def agg_date_column(series):
    # remove the hh:mm:ss
    #series=series.dt.date
    if(isinstance(series,pd.Series)):
        print(series[series.idxmin], series[series.idxmax])
        return[[series[series.idxmin], series[series.idxmax]]]
    #return [[series.max(),series.min()]]
    return ["something"]


In [183]:
def build_anonymized_dataset(df,partitions,feature_columns,sensitive_column,max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if column in categorical:
            if column == "date":
                aggregations[column]=agg_date_column
            else:
                aggregations[column]=agg_categorical_column
            
        else: 
            aggregations[column]=agg_numerical_column
            
    rows=[]
    for i, partition in enumerate(partitions):
        if i%100==1:
            print("Finished {} partitions ! ".format(i) )
        if max_partitions is not None and i > max_partitions:
            break
        #df.agg({column : method_to_apply})
        #df.agg({numerical_column : agg_numerical_column
        #         categorical_column : agg_categorical_column
        #        })
        # Result 1 line multiple column, each cell=aggreg result
        #print(aggregations)
        #print(df.loc[partition].agg(aggregations,squeeze=False))
        #print(type(df.loc[partition]["date"]))
        grouped_columns=df.loc[partition].agg(aggregations,squeeze=False)
        # Count spans of sensitive column in a partition
        sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({
            sensitive_column : 'count'
        })
        values = grouped_columns.iloc[0].to_dict()
        #print(values)
        for sensitive_value, count in sensitive_counts[sensitive_column].items():
            if count==0:
                continue
            values.update({
                sensitive_column : sensitive_value,
                'count' : count,
            })
            rows.append(values.copy())
    return pd.DataFrame(rows)


In [184]:
dfn=build_anonymized_dataset(df,finished_partitions,feature_columns,sensitive_column)

2010-12-01 00:00:00 2010-12-05 00:00:00
Finished 1 partitions ! 
2010-12-06 00:00:00 2010-12-10 00:00:00
2010-12-12 00:00:00 2010-12-16 00:00:00
2010-12-23 00:00:00 2011-01-06 00:00:00
2011-01-13 00:00:00 2011-01-18 00:00:00
2011-01-19 00:00:00 2011-01-24 00:00:00
2011-01-25 00:00:00 2011-01-28 00:00:00
2011-01-30 00:00:00 2011-02-03 00:00:00
2011-02-10 00:00:00 2011-02-15 00:00:00
2011-02-10 00:00:00 2011-02-15 00:00:00
2011-02-16 00:00:00 2011-02-20 00:00:00
2011-02-21 00:00:00 2011-02-25 00:00:00
2011-02-27 00:00:00 2011-03-03 00:00:00
2011-03-10 00:00:00 2011-03-14 00:00:00
2011-03-15 00:00:00 2011-03-20 00:00:00
2011-03-21 00:00:00 2011-03-25 00:00:00
2011-03-27 00:00:00 2011-03-31 00:00:00
2011-04-01 00:00:00 2011-04-05 00:00:00
2011-04-06 00:00:00 2011-04-11 00:00:00
2011-04-12 00:00:00 2011-04-17 00:00:00
2011-04-18 00:00:00 2011-04-26 00:00:00
2011-04-27 00:00:00 2011-05-03 00:00:00
2011-05-04 00:00:00 2011-05-09 00:00:00
2011-05-10 00:00:00 2011-05-15 00:00:00
2011-05-16 00:0

2011-09-12 00:00:00 2011-09-15 00:00:00
2011-10-14 00:00:00 2011-10-16 00:00:00
2011-10-17 00:00:00 2011-10-19 00:00:00
2011-11-06 00:00:00 2011-11-10 00:00:00
2011-11-11 00:00:00 2011-11-13 00:00:00
2011-11-17 00:00:00 2011-11-22 00:00:00
2011-11-29 00:00:00 2011-12-04 00:00:00
2011-12-05 00:00:00 2011-12-09 00:00:00
2011-06-14 00:00:00 2011-06-17 00:00:00
2011-06-19 00:00:00 2011-06-23 00:00:00
2011-06-30 00:00:00 2011-07-05 00:00:00
2011-07-06 00:00:00 2011-07-10 00:00:00
2011-07-11 00:00:00 2011-07-15 00:00:00
2011-07-17 00:00:00 2011-07-21 00:00:00
2011-07-28 00:00:00 2011-08-01 00:00:00
2011-08-02 00:00:00 2011-08-07 00:00:00
2011-08-08 00:00:00 2011-08-12 00:00:00
2011-08-14 00:00:00 2011-08-18 00:00:00
2011-08-19 00:00:00 2011-08-23 00:00:00
2011-08-24 00:00:00 2011-08-30 00:00:00
2011-08-31 00:00:00 2011-09-05 00:00:00
2011-09-06 00:00:00 2011-09-11 00:00:00
2011-09-12 00:00:00 2011-09-15 00:00:00
2011-09-22 00:00:00 2011-09-27 00:00:00
2011-09-28 00:00:00 2011-10-03 00:00:00


2011-12-05 00:00:00 2011-12-09 00:00:00
2011-12-05 00:00:00 2011-12-09 00:00:00
2011-06-24 00:00:00 2011-06-26 00:00:00
2011-06-27 00:00:00 2011-06-29 00:00:00
2011-07-22 00:00:00 2011-07-24 00:00:00
2011-07-25 00:00:00 2011-07-27 00:00:00
2011-09-16 00:00:00 2011-09-18 00:00:00
2011-09-19 00:00:00 2011-09-21 00:00:00
2011-10-14 00:00:00 2011-10-16 00:00:00
2011-10-17 00:00:00 2011-10-19 00:00:00
2011-11-11 00:00:00 2011-11-13 00:00:00
2011-11-14 00:00:00 2011-11-16 00:00:00
2011-06-19 00:00:00 2011-06-23 00:00:00
2010-12-17 00:00:00 2010-12-19 00:00:00
2010-12-17 00:00:00 2010-12-19 00:00:00
2010-12-20 00:00:00 2010-12-22 00:00:00
2010-12-20 00:00:00 2010-12-22 00:00:00
2011-01-07 00:00:00 2011-01-09 00:00:00
2011-01-07 00:00:00 2011-01-09 00:00:00
2011-02-04 00:00:00 2011-02-06 00:00:00
2011-02-04 00:00:00 2011-02-06 00:00:00
2011-03-04 00:00:00 2011-03-06 00:00:00
2011-03-04 00:00:00 2011-03-06 00:00:00
2011-03-07 00:00:00 2011-03-09 00:00:00
2011-03-07 00:00:00 2011-03-09 00:00:00


2011-08-24 00:00:00 2011-08-30 00:00:00
2011-08-24 00:00:00 2011-08-30 00:00:00
2011-08-31 00:00:00 2011-09-05 00:00:00
2011-08-31 00:00:00 2011-09-05 00:00:00
2011-09-06 00:00:00 2011-09-11 00:00:00
2011-09-06 00:00:00 2011-09-11 00:00:00
2011-09-16 00:00:00 2011-09-18 00:00:00
2011-09-16 00:00:00 2011-09-18 00:00:00
2011-09-19 00:00:00 2011-09-21 00:00:00
2011-09-19 00:00:00 2011-09-21 00:00:00
2011-09-22 00:00:00 2011-09-27 00:00:00
2011-09-22 00:00:00 2011-09-27 00:00:00
2011-09-28 00:00:00 2011-10-03 00:00:00
2011-09-28 00:00:00 2011-10-03 00:00:00
2011-10-04 00:00:00 2011-10-07 00:00:00
2011-10-04 00:00:00 2011-10-07 00:00:00
2011-10-09 00:00:00 2011-10-13 00:00:00
2011-10-09 00:00:00 2011-10-13 00:00:00
2011-10-14 00:00:00 2011-10-16 00:00:00
2011-10-17 00:00:00 2011-10-19 00:00:00
2011-10-17 00:00:00 2011-10-19 00:00:00
2011-10-26 00:00:00 2011-10-30 00:00:00
2011-10-26 00:00:00 2011-10-30 00:00:00
2011-10-31 00:00:00 2011-11-04 00:00:00
2011-10-31 00:00:00 2011-11-04 00:00:00


2010-12-06 00:00:00 2010-12-08 00:00:00
2010-12-09 00:00:00 2010-12-13 00:00:00
2010-12-14 00:00:00 2010-12-17 00:00:00
2011-01-05 00:00:00 2011-01-07 00:00:00
2011-01-10 00:00:00 2011-01-13 00:00:00
2010-12-01 00:00:00 2010-12-08 00:00:00
2010-12-01 00:00:00 2010-12-09 00:00:00
2010-12-14 00:00:00 2010-12-17 00:00:00
2010-12-22 00:00:00 2011-01-17 00:00:00
2010-12-01 00:00:00 2010-12-16 00:00:00
2011-03-10 00:00:00 2011-03-14 00:00:00
2011-03-10 00:00:00 2011-03-14 00:00:00
2011-03-25 00:00:00 2011-03-31 00:00:00
2011-03-28 00:00:00 2011-03-31 00:00:00
2011-03-21 00:00:00 2011-03-24 00:00:00
2011-03-25 00:00:00 2011-03-31 00:00:00
2011-03-21 00:00:00 2011-03-25 00:00:00
2011-03-29 00:00:00 2011-03-31 00:00:00
2011-04-12 00:00:00 2011-04-14 00:00:00
2011-04-12 00:00:00 2011-04-15 00:00:00
2011-04-18 00:00:00 2011-04-20 00:00:00
2011-04-21 00:00:00 2011-04-26 00:00:00
2011-04-13 00:00:00 2011-04-21 00:00:00
2011-04-13 00:00:00 2011-04-18 00:00:00
2011-04-13 00:00:00 2011-04-26 00:00:00


In [None]:
sectors=dfn.groupby("date")
#dfn["date"].value_counts().head(60)

In [None]:
dfn.to_csv("k_anon:date+qty:price.csv")

In [None]:
dfn

In [None]:
df["date"]

In [None]:
#df["date"].min()-df["date"].max()

In [None]:
df["date"]=pd.DatetimeIndex(df["date"]).month

In [None]:
df[df["date"]=="2010-12-01"]["id_user"].value_counts()

In [None]:
df["id_item"].nunique()

In [76]:
df["date"].max()

Timestamp('2011-12-09 00:00:00')

In [77]:
type(df["date"][0])

pandas._libs.tslibs.timestamps.Timestamp

In [187]:
dfn.head(200)

Unnamed: 0,count,date,price,qty
0,2,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.14,1.0
1,5,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.21,1.0
2,14,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.29,1.0
3,22,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.42,1.0
4,16,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.55,1.0
5,30,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.65,1.0
6,93,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.85,1.0
7,11,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.95,1.0
8,210,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",1.25,1.0
9,39,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",1.45,1.0


In [188]:
dfn

Unnamed: 0,count,date,price,qty
0,2,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.14,1.000000
1,5,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.21,1.000000
2,14,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.29,1.000000
3,22,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.42,1.000000
4,16,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.55,1.000000
5,30,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.65,1.000000
6,93,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.85,1.000000
7,11,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",0.95,1.000000
8,210,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",1.25,1.000000
9,39,"[2010-12-01 00:00:00, 2010-12-05 00:00:00]",1.45,1.000000


In [192]:
df

Unnamed: 0,id_user,date,hours,id_item,price,qty
0,17850,2010-12-01,1900-01-01 08:02:06,85123A,2.55,6
1,17850,2010-12-01,1900-01-01 08:02:06,71053,3.39,6
2,17850,2010-12-01,1900-01-01 08:02:06,84406B,2.75,8
3,17850,2010-12-01,1900-01-01 08:02:06,84029G,3.39,6
4,17850,2010-12-01,1900-01-01 08:02:06,84029E,3.39,6
5,17850,2010-12-01,1900-01-01 08:02:06,22752,7.65,2
6,17850,2010-12-01,1900-01-01 08:02:06,21730,4.25,6
7,17850,2010-12-01,1900-01-01 08:02:08,22633,1.85,6
8,17850,2010-12-01,1900-01-01 08:02:08,22632,1.85,6
9,13047,2010-12-01,1900-01-01 08:03:04,84879,1.69,32


In [197]:
df["date"].nunique()

305

In [202]:
df["date"].value_counts()

2011-11-06    2821
2011-10-06    2492
2011-11-20    2414
2011-10-30    2384
2011-11-17    2286
2011-11-22    2268
2011-11-13    2180
2011-09-29    2153
2011-11-16    2124
2011-11-10    2046
2011-11-04    2024
2011-10-10    2024
2010-12-05    2020
2011-11-24    2018
2011-12-06    2014
2011-09-22    1987
2011-11-27    1978
2010-12-02    1961
2011-11-11    1948
2011-11-14    1940
2011-11-15    1931
2011-12-05    1879
2011-09-25    1838
2011-10-27    1831
2011-10-13    1831
2011-11-09    1817
2011-11-28    1808
2011-11-03    1806
2011-11-21    1803
2011-11-29    1795
              ... 
2011-08-12     569
2011-07-04     564
2011-06-01     562
2011-04-03     558
2011-02-18     551
2011-07-01     539
2011-01-04     511
2011-01-19     497
2011-04-06     493
2011-08-14     476
2011-01-20     471
2011-07-03     461
2011-02-08     441
2011-02-14     436
2011-01-21     424
2011-05-29     416
2011-08-07     405
2011-02-09     403
2011-06-27     402
2011-06-29     393
2011-02-11     382
2011-05-01  

In [16]:
dfn = pd.read_csv("./oto.csv")
true=pd.read_csv("./ground_truth.csv")
dfn
ground_truth, submission = round1_preprocessing(
    "./ground_truth.csv", "./oto2.csv"
)
check_format_trans_file(ground_truth, submission)

ValueError: Length mismatch: Expected axis has 7 elements, new values have 6 elements

In [14]:
dfn["qty"]=df["qty"]

In [15]:
dfn.to_csv("oto2.csv")

In [27]:
false=pd.read_csv("./oto2.csv")
false

Unnamed: 0.1,Unnamed: 0,id_user,date,hours,id_item,price,qty
0,0,13090,2010/12/02,"[06:20, 08:04]",21421,1.25,6
1,1,13090,2010/12/02,"[06:20, 08:04]",21422,0.85,6
2,2,13538,2011/03/10,"[06:20, 08:04]",48185,6.75,8
3,3,13538,2011/03/10,"[06:20, 08:04]",48187,6.75,6
4,4,13538,2011/03/10,"[06:20, 08:04]",21524,6.75,6
5,5,13538,2011/03/10,"[06:20, 08:04]",21523,6.75,2
6,6,13538,2011/03/10,"[06:20, 08:04]",21955,6.75,6
7,7,12435,2011/03/17,"[06:20, 08:04]",22605,12.75,6
8,8,12435,2011/03/17,"[06:20, 08:04]",22606,12.75,6
9,9,14619,2011/04/08,"[06:20, 08:04]",22767,9.95,32


In [28]:
dfn["qty"]=df["qty"]
dfn
dfn.to_csv("./oto2.csv")
dfn

Unnamed: 0,id_user,date,hours,id_item,price,qty
0,13090,2010/12/02,"[06:20, 08:04]",21421,1.25,6
1,13090,2010/12/02,"[06:20, 08:04]",21422,0.85,6
2,13538,2011/03/10,"[06:20, 08:04]",48185,6.75,8
3,13538,2011/03/10,"[06:20, 08:04]",48187,6.75,6
4,13538,2011/03/10,"[06:20, 08:04]",21524,6.75,6
5,13538,2011/03/10,"[06:20, 08:04]",21523,6.75,2
6,13538,2011/03/10,"[06:20, 08:04]",21955,6.75,6
7,12435,2011/03/17,"[06:20, 08:04]",22605,12.75,6
8,12435,2011/03/17,"[06:20, 08:04]",22606,12.75,6
9,14619,2011/04/08,"[06:20, 08:04]",22767,9.95,32


In [23]:
df["hours"].dt.hour

0          8
1          8
2          8
3          8
4          8
5          8
6          8
7          8
8          8
9          8
10         8
11         8
12         8
13         8
14         8
15         8
16         8
17         8
18         8
19         8
20         8
21         8
22         8
23         8
24         8
25         8
26         8
27         8
28         8
29         8
          ..
307024    12
307025    12
307026    12
307027    12
307028    12
307029    12
307030    12
307031    12
307032    12
307033    12
307034    12
307035    12
307036    12
307037    12
307038    12
307039    12
307040    12
307041    12
307042    12
307043    12
307044    12
307045    12
307046    12
307047    12
307048    12
307049    12
307050    12
307051    12
307052    12
307053    12
Name: hours, Length: 307054, dtype: int64

In [4]:
df["date"].value_counts()

2011-11-06    2821
2011-10-06    2492
2011-11-20    2414
2011-10-30    2384
2011-11-17    2286
2011-11-22    2268
2011-11-13    2180
2011-09-29    2153
2011-11-16    2124
2011-11-10    2046
2011-10-10    2024
2011-11-04    2024
2010-12-05    2020
2011-11-24    2018
2011-12-06    2014
2011-09-22    1987
2011-11-27    1978
2010-12-02    1961
2011-11-11    1948
2011-11-14    1940
2011-11-15    1931
2011-12-05    1879
2011-09-25    1838
2011-10-27    1831
2011-10-13    1831
2011-11-09    1817
2011-11-28    1808
2011-11-03    1806
2011-11-21    1803
2011-11-29    1795
              ... 
2011-02-02     569
2011-07-04     564
2011-06-01     562
2011-04-03     558
2011-02-18     551
2011-07-01     539
2011-01-04     511
2011-01-19     497
2011-04-06     493
2011-08-14     476
2011-01-20     471
2011-07-03     461
2011-02-08     441
2011-02-14     436
2011-01-21     424
2011-05-29     416
2011-08-07     405
2011-02-09     403
2011-06-27     402
2011-06-29     393
2011-02-11     382
2011-05-01  