# Data Checks on: departments.csv

#### Contents. 
- Missing Data Check
- Duplicate Check
- Mixed-type Data Check
- Other inconsistency, if needed, check
- Summary if changes to the original dataframe

## Importing libraries and data

In [1]:
#importing the libraries

import pandas as pd
import numpy as np
import os

In [2]:
#importing datasets:
path = r'C:\Users\chris\Documents\Instacart Basket Analysis'
df_deps = pd.read_csv(os.path.join(path,'02 Data', 'Original Data', 'departments.csv'))

## Viewing the dataframe

In [3]:
# size:
df_deps.shape

(1, 22)

In [4]:
# columns:
df_deps.columns

Index(['department_id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21'],
      dtype='object')

In [5]:
# looks:
df_deps

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [6]:
# data types:
df_deps.dtypes

department_id    object
1                object
2                object
3                object
4                object
5                object
6                object
7                object
8                object
9                object
10               object
11               object
12               object
13               object
14               object
15               object
16               object
17               object
18               object
19               object
20               object
21               object
dtype: object

In [7]:
# basic stats:
df_deps.describe().round(1)

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
count,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
unique,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
top,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


##### This data needs to be transposed. 

### Missing Data Check

In [8]:
# checking for any missing values:
df_deps.isna().sum()

department_id    0
1                0
2                0
3                0
4                0
5                0
6                0
7                0
8                0
9                0
10               0
11               0
12               0
13               0
14               0
15               0
16               0
17               0
18               0
19               0
20               0
21               0
dtype: int64

### Duplicated Data Check

In [9]:
# looking for duplicates:
df_dups = df_deps[df_deps.duplicated()]
df_dups

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21


##### Since it's only one row, there are no duplicated rows.

### Mixed-typed Data Check

In [10]:
# checking for mixed-typed data:
for col in df_deps.columns.tolist():
  weird = (df_deps[[col]].applymap(type) != df_deps[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_deps[weird]) > 0:
    print (col)

##### no mix-typed data

## Summary: What needs to be addressed:

#### The data is in a wide format, when a vertical format would be best. I transpose it. It is actually a data dictionary, explaining what those department numbers refer to.

# Changing and Exporting the dataframe >> departments_wrangled.csv

In [11]:
# transposing the whole dataframe:
df_deps.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [12]:
# assigning new dataframe to this and calling the df:
df_deps_t=df_deps.T
df_deps_t.head

<bound method NDFrame.head of                              0
department_id       department
1                       frozen
2                        other
3                       bakery
4                      produce
5                      alcohol
6                international
7                    beverages
8                         pets
9              dry goods pasta
10                        bulk
11               personal care
12                meat seafood
13                      pantry
14                   breakfast
15                canned goods
16                  dairy eggs
17                   household
18                      babies
19                      snacks
20                        deli
21                     missing>

In [13]:
#adding an index, because we need it in the next steps:
df_deps_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [14]:
# create a new header and call it:
new_header = df_deps_t.iloc[0]
new_header

0    department
Name: department_id, dtype: object

In [15]:
# remove the first row in the dataframe by creating an new dataframe without it:
df_deps_t_new = df_deps_t[1:]
df_deps_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [16]:
# set the header row as the actual new header:
df_deps_t_new.columns = new_header

In [17]:
# calling the final dataframe:
df_deps_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


##### Last check only reading the contents. Data looks fine so far - what could be meant with 21 = 'missing' we'll check out when merging the dataframes.

### Exporting the dataframe

In [22]:
# exporting the dataframe:
df_deps_t_new.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.pkl'))