# 2 IC Data Consistency Checks - departments

### This notebook contains the following sections
#### 01 Import Libraries
#### 02 Import Data
#### 03 First Look at Data
#### 04 Consistency Checks
#### 05 Export Data

# 01 Import Libraries

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [2]:
# Set the data path
#
# The project root defaults to the original local folder, but it can be
# overridden without editing the notebook by setting the
# INSTACART_PROJECT_PATH environment variable before the kernel starts.

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\Tina\Desktop\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis',
)

In [8]:
# Load "wrangled_departments.csv" from the "Prepared Data" folder into "deps_wr"

deps_wr = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'wrangled_departments.csv'),
    index_col=False,
)

# 03 First Look at Data

In [9]:
# Shape of "deps_wr" as a (rows, columns) tuple

deps_wr.shape

(21, 2)

In [10]:
# First few rows of "deps_wr" (head() returns the first five rows by default)

deps_wr.head()

Unnamed: 0.1,Unnamed: 0,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [11]:
# Give the "Unnamed: 0" column the descriptive name "department_id"

deps_wr = deps_wr.rename(columns={'Unnamed: 0': 'department_id'})

In [12]:
# Re-display the first rows to confirm the column is now named "department_id"
deps_wr.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [13]:
# Data type of each column in "deps_wr"

deps_wr.dtypes

department_id     int64
department       object
dtype: object

# 04 Consistency Checks

In [14]:
# Descriptive statistics of "deps_wr" (by default describe() summarizes only the numeric columns)

deps_wr.describe()

Unnamed: 0,department_id
count,21.0
mean,11.0
std,6.204837
min,1.0
25%,6.0
50%,11.0
75%,16.0
max,21.0


No obvious problems were detected.

### 01 Mixed-Type Data

In [15]:
# Check for mixed-type data
#
# A column is mixed-type when its values are not all of the same Python type.
# Counting the distinct value types per column gives the same verdict as
# comparing every value with the first row, but it also works on an empty
# frame (where looking up the first row raises IndexError) and only relies on
# Series.map, which does not require a recent pandas release.
# The name of every mixed-type column is printed; no output means none found.

for col in deps_wr.columns:
    value_types = deps_wr[col].map(type)
    if value_types.nunique() > 1:
        print (col)

No mixed-type data could be found.

### 02 Missing Values

In [16]:
# Count the missing values in each column of "deps_wr"

deps_wr.isna().sum()

department_id    0
department       0
dtype: int64

No missing values could be found.

### 03 Duplicate Data

In [18]:
# Collect every row that is a full duplicate of an earlier row into its own subset

deps_wr_dups = deps_wr.loc[deps_wr.duplicated()]

In [19]:
# Display the duplicate rows (an empty table means there are none)
deps_wr_dups

Unnamed: 0,department_id,department


There is no duplicate data in this data set.

In [20]:
# Number of rows and columns after the consistency checks

deps_wr.shape

(21, 2)

# 05 Export Data

In [21]:
# Export "deps_wr"
#
# index=False keeps the automatically generated row index out of the file.
# Writing it would add an unnamed first column that shows up as "Unnamed: 0"
# the next time the file is read.

deps_wr.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'checked_departments.csv'),
    index=False,
)