In [None]:
import json
import pandas as pd
import re
import numpy as np 

In [None]:
# Load the size records; lines=True reads the file as one JSON object per line.
data = pd.read_json('sizes.json', lines=True)


In [None]:
data.head(10)

In [None]:
data.info()

In [None]:
# Column labels of the frame, in the order they were loaded.
column_names = data.columns.tolist()

In [None]:
# Python type of the first cell of every column, taken in the frame's own column
# order so that the i-th type always belongs to the i-th column label.
column_types = [type(data[name].iloc[0]) for name in data.columns]

In [None]:
# Show each column label next to the type of its first cell.
for position, name in enumerate(column_names):
    print(name, column_types[position])

In [None]:
# Number of missing cells per column.
data.isna().sum()

**Observations**
- dataframe has 7 columns/ 51210 lines
- there are practically no missing values, except in column 'filter_label' where most of the values are missing. All other columns have no missing data
- after an initial glance at the dataframe, we can make the following assumption/hypothesis: 
    - in columns 'created_at' and 'updated_at' the dictionary key holds reference to the information provided, and dictionary value holds the information (date / time of data entry)
    - in columns 'created_at' and 'updated_at' dictionary key holds only one value 
    - entries in columns 'filter_label', 'label' and 'old_label' are lists which hold 1 or more values, with no particular pattern 
    

**Potential Issues**
- dataframe entries are objects which is not the most suitable type for data manipulation
- as cells contain more than one value, it also makes descriptive analysis more complicated 
- columns 'filter_label', 'label' and 'old_label' hold inconsistent data: for instance, we observe inconsistencies relating to data type, spelling, length of entry 
- as a result, the data presented in this format does not give an opportunity to control and maintain business processes relating to stock control/forecasting, sales analysis/forecasting and planning for elevated customer experience (smooth order process, attractive and relevant merchandise offering, personalised recommendations etc)

**Data Cleansing Strategy**  

* **STEP 1: columns 'created_at' and 'updated_at'**

- flatten the columns out: explore whether dictionary keys should become additional columns (depending on the information they present, and how many different key values we find), and assign dictionary values as cell values accordingly  

* **STEP2: label columns** 

*Column 'old_label' appears to contain the original values/information submitted by the boutiques and poses an issue of not only varied spelling, but also the type of data (information) provided.* 

*Dealing with this column should be done iteratively, taking small steps, and focusing on attempting to cluster the data and find similarities/connections between the data entries, removing duplicates and finally identifying what information is presented: for instance, colour of the item, size of the item etc.* 

*Consequently, we may need to create additional columns which would hold different types of information, such as: 'size', 'colour', 'shape' of the item*   

*Columns 'filter_label' and 'label' seem to hold the information resulting after some cleaning operations have been completed on 'old_label' column*

---

**STEP 1**

**Plan**
- separate dictionary keys and values 
- count the number of unique keys, and the number of unique values within a column

In [None]:
def flatten_dic(df,col_name):
    """Collect the dictionary keys and values of one column into two flat lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to inspect.
    col_name : str
        Label of a column whose cells are dictionaries.

    Returns
    -------
    tuple of (list, list)
        ``(keys, values)``: every key and every value found in the column,
        in row order.
    """
    keys = []
    values = []
    # Walk the frame that was passed in; the function must not depend on a
    # notebook-level variable being present.
    for cell in df[col_name]:
        keys.extend(cell.keys())
        values.extend(cell.values())
    return keys, values

In [None]:
def check_unique(item):
    """Return the number of distinct elements found in the iterable ``item``."""
    distinct = {element for element in item}
    return len(distinct)

In [None]:
def clean_col(data,col_name):
    """Replace every dictionary cell of a column with its first value, as a string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to update. It is modified in place.
    col_name : str
        Label of a column whose cells are non-empty dictionaries.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``col_name`` rewritten.

    Raises
    ------
    IndexError
        If a cell holds an empty dictionary.
    """
    # Build the new column in one pass and assign it as a whole. This does not
    # rely on the index being 0..n-1 and does not write into the frame while
    # iterating over it.
    data[col_name] = [str(list(cell.values())[0]) for cell in data[col_name]]
    return data
 

In [None]:
def into_string(data):
    """Turn each list in column 'old_label' into one cleaned, comma-separated string.

    The items of a cell are joined with ', '. Every run of characters other
    than ASCII letters, digits, commas and hyphens is then replaced by a
    single space.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'old_label' column whose cells are lists of strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with 'old_label' rewritten.
    """
    disallowed = re.compile('[^A-Za-z0-9,-]+')
    # Assign the whole column at once so the result does not depend on the
    # index being 0..n-1 and the frame is not written to while being iterated.
    data['old_label'] = [
        disallowed.sub(' ', ', '.join(items)) for items in data['old_label']
    ]
    return data


In [None]:


def _classify_label(text):
    """Return '1' if every comma-separated part is digits, '2' if every part is letters, else '3'."""
    # Spaces are dropped first, so 'M L' is treated as the single part 'ML'.
    parts = text.replace(' ', '').split(',')
    if all(part.isdigit() for part in parts):
        return '1'
    if all(part.isalpha() for part in parts):
        return '2'
    # A part that mixes letters and digits, an empty part, or a cell that mixes
    # numeric and alphabetic parts all need further cleaning.
    return '3'


def add_infotype(data):
    """Fill column 'info_type' with the category of each 'old_label' string.

    Categories:
      * '1' - every comma-separated part consists of digits only
      * '2' - every comma-separated part consists of letters only
      * '3' - anything else (mixed within a part or across parts)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'old_label' column of strings. It is modified in place;
        the 'info_type' column is created or overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    # One column-level assignment replaces the per-row .loc writes, which put
    # strings into a float (NaN) column one cell at a time.
    data['info_type'] = data['old_label'].map(_classify_label)
    return data
            

In [None]:
# Labels of the three columns whose cells are single-key dictionaries.
col_name, col_name2, col_name3 = '_id', 'created_at', 'updated_at'

In [None]:
# Gather all dictionary keys and values of each dictionary-valued column so
# their unique counts can be compared below.
keys_id, values_id= flatten_dic(data, col_name)
keys_cr_at, values_cr_at=flatten_dic(data,col_name2)
keys_updt_at, values_updt_at=flatten_dic(data,col_name3)


In [None]:
print('col _id unique keys:', check_unique(keys_id), ';' , 'col _id unique values:', check_unique(values_id))

In [None]:
print('col created_at unique keys:', check_unique(keys_cr_at), ';' , 'col created_at unique values:', check_unique(values_cr_at))

In [None]:
print('col updt_at unique keys:', check_unique(keys_updt_at), ';' , 'col updt_at unique values:', check_unique(values_updt_at))

**Observations**
- Only 1 unique key value across all lines/per column
- We can clean up each cell/per column by removing the dictionary key. The key relates to the current name of the column. 


In [None]:
clean_col(data, '_id')


In [None]:
clean_col(data, col_name2)

In [None]:
clean_col(data, col_name3)

In [None]:
data.head()

**Observations**
- 'created_at' and 'updated_at' columns point to the date of the entry and should be converted to datetime type


In [None]:
# Parse both timestamp columns into pandas datetime values.
for date_col in ('created_at', 'updated_at'):
    data[date_col] = pd.to_datetime(data[date_col])

In [None]:
data.head()

---

**STEP 2** 

**Plan**
- we observe, that each cell contains a list of items.
- convert the list into a string, separated by ','
- once the list is converted to a string, we can clean special characters as part of the first iterations of the cleaning exercise.
- following the initial exploration of the data set, I suggest keeping numerical and alphabetical characters, and "-", which are common characteristics of what would appear on a merchandise label. 

---

- identify if a cell contains numbers only, strings only, combination of both (could be within a string, could be within a cell) 

- based on that, assign each cell to a category:
    * #1: numbers only  
    * #2: alpha characters only
    * #3 combination of both: either within a string, or, within the cell 
    
- once the categories have been assigned, I suggest the following: 
    * #1: numbers only: through exploratory analysis identify if the data is interval/ratio 
    * #1: explore distribution, outliers, median/mean values, if applicable
    * #1: info above could form further hypothesis what information this data may present. For instance, clothing                 sizes/shoe size?
    
    * #2: alpha characters only: explore distribution, unique values 
    * #2: form a word corpus, apply NLP techniques in order to: explore potential word clusters? identify word groups?          (verb, adjective, noun?) predict information type? (colour? pattern? location? etc?)             
    * #2: experiment with NLP in order to clean the spelling looping through the dataframe line-by-line 
    
- further cleaning iterations needed for category 3:
    
    * #3 further iterations may include: alternative splitting strategies (split by comma?/ space? ) 
    * #3 further iterations may include: after each splitting strategy check for the number of unique values (do any          of the words repeat throughout the dataframe?)  
    * #3 further iterations may include identifying lines where a combination of alpha / numerical characters is within one string and/or within one cell. Explore unique values / distribution 
    * #3 further iterations may include: explore how strings are distributed across the dataframe before any splitting          strategies(ie, analyse all lines as they are, just after assigning them to category 3)


---

In [None]:
data=into_string(data)

In [None]:
data.head()

In [None]:
# Check that every 'old_label' cell has been converted to a string.
assert data['old_label'].map(lambda cell: isinstance(cell, str)).all(), \
    "non-string entries remain in 'old_label'"
    

- create an additional column ['info_type'] which will indicate what type of information is presented in column old_label


In [None]:
# Placeholder column for the category of information held in 'old_label'.
# (The earlier "" assignment was dead: it was overwritten on the next line.)
data['info_type'] = np.nan

In [None]:
data.head()

In [None]:
add_infotype(data)

In [None]:
import matplotlib.pyplot as plt

In [37]:
pd.to_numeric(data['info_type'])

0        1
1        1
2        1
3        1
4        1
5        3
6        3
7        3
8        1
9        1
10       3
11       2
12       2
13       2
14       2
15       2
16       3
17       2
18       3
19       3
20       3
21       2
22       3
23       3
24       3
25       3
26       3
27       2
28       3
29       3
        ..
51180    3
51181    3
51182    3
51183    3
51184    3
51185    3
51186    3
51187    3
51188    3
51189    3
51190    3
51191    3
51192    3
51193    3
51194    3
51195    3
51196    3
51197    3
51198    3
51199    3
51200    3
51201    3
51202    3
51203    3
51204    3
51205    3
51206    3
51207    3
51208    3
51209    3
Name: info_type, Length: 51210, dtype: int64

TypeError: len() of unsized object