# 1 IC Data Wrangling - customers

### This script contains the following points
#### 01 Import Libraries
#### 02 Import Data
#### 03 First Look at Data
#### 04 Data Wrangling
#### 05 Export Data

# 01 Import Libraries

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [2]:
# Set a path
#
# The project root defaults to the original author's local folder. Set the
# INSTACART_PROJECT_DIR environment variable to run this notebook on another
# machine without editing the cell.

path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\Tina\Desktop\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis',
)

In [3]:
# Load the raw "customers" CSV into the "cust" dataframe.
# index_col=False keeps pandas from using the first column as the index.

customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
cust = pd.read_csv(customers_csv, index_col=False)

# 03 First Look at Data

In [4]:
# Shape of "cust" as a (rows, columns) tuple

cust.shape

(206209, 10)

In [5]:
# Preview the first five rows of "cust" (head() defaults to n=5)

cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Data type of each column in "cust"

cust.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

# 04 Data Wrangling

### 01 Drop Columns

In [7]:
# Remove "First Name" and "Surnam" (personal names) for security reasons,
# and "date_joined" because it's irrelevant for this project

columns_to_drop = ['First Name', 'Surnam', 'date_joined']
cust = cust.drop(columns_to_drop, axis=1)

In [8]:
# Confirm the three columns are no longer present

cust.head()

Unnamed: 0,user_id,Gender,STATE,Age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


### 02 Rename Columns

In [9]:
# Rename columns in "cust" so that all column names follow the same lowercase naming convention

In [10]:
# Create a variable mapping each inconsistently cased column name
# to its lowercase replacement

column_names = {
    original: original.lower()
    for original in ('Gender', 'STATE', 'Age')
}

In [11]:
# Rename columns using the "column_names" mapping.
# The result is reassigned instead of using inplace=True, matching the
# "cust = cust.drop(...)" pattern used in the drop step.

cust = cust.rename(columns=column_names)

In [12]:
# Confirm the columns now carry their lowercase names

cust.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


### 03 Change Data Types

No data types are being changed for this dataframe.

# 05 Export Data

In [13]:
# Write the wrangled "cust" dataframe to the "Prepared Data" folder.
# NOTE(review): to_csv() writes the row index as an unnamed first column by
# default; confirm downstream readers expect it, otherwise pass index=False.

wrangled_csv = os.path.join(path, '02 Data', 'Prepared Data', 'wrangled_customers.csv')
cust.to_csv(wrangled_csv)