# Customer and store location data analysis

In [10]:
import pandas as pd
from src.utils.db_engine import db_engine

### Why city and country do most and least customers live in? How global is the BluckBoster business?

In [11]:
# create a sql engine
engine = db_engine()

# write a query
query = ''' 
            SELECT 
                COUNT(customer.customer_id) AS total_customers, 
                city.city, 
                country.country
            FROM 
                customer
            LEFT JOIN 
                address
            ON 
                customer.address_id = address.address_id
            LEFT JOIN
                city
            ON
                address.city_id = city.city_id
            LEFT JOIN
                country
            ON
                city.country_id = country.country_id
            GROUP BY
                city.city, country.country
            ORDER BY
                total_customers DESC,
                country,
                city
            ;         
        '''
# create a pandas dataframe using the engine and query
df = pd.read_sql(query, engine)

# print df
df

Unnamed: 0,total_customers,city,country
0,3,,
1,2,London,United Kingdom
2,2,Aurora,United States
3,1,Kabul,Afghanistan
4,1,Batna,Algeria
...,...,...,...
590,1,Sanaa,Yemen
591,1,Taizz,Yemen
592,1,Kragujevac,Yugoslavia
593,1,Novi Sad,Yugoslavia


In [12]:
# which country appears the most on the list? this means we operate in the most cities in this country out of any other country

df['country'].mode()[0]

'India'

In [13]:
# which country has the most and least customers?

df.groupby('country')['total_customers'].sum().sort_values(ascending=False)

country
India                   60
China                   52
United States           35
Japan                   30
Mexico                  30
                        ..
Tunisia                  1
Turkmenistan             1
Tuvalu                   1
Virgin Islands, U.S.     1
Zambia                   1
Name: total_customers, Length: 108, dtype: int64

In [14]:
# how many countries does BluckBoster operate in?

len(df['country'].unique())

109

## Investigate null values

In [15]:
# write a query
query = ''' 
            SELECT 
                *
            FROM 
                customer
            ;         
        '''
# create a pandas dataframe using the engine and query
df = pd.read_sql(query, engine)

# print df
df.isnull().sum()

customer_id    0
store_id       0
first_name     0
last_name      0
email          0
address_id     3
activebool     0
create_date    0
last_update    0
active         0
dtype: int64

In [16]:
# write a query
query = ''' 
            SELECT 
                *
            FROM 
                address
            ;         
        '''
# create a pandas dataframe using the engine and query
df = pd.read_sql(query, engine)

# print df
df.isnull().sum()

address_id     0
address        0
address2       4
district       0
city_id        0
postal_code    4
phone          0
last_update    0
dtype: int64

In [17]:
# write a query
query = ''' 
            SELECT 
                *
            FROM 
                city
            ;         
        '''
# create a pandas dataframe using the engine and query
df = pd.read_sql(query, engine)

# print df
df.isnull().sum()

city_id        0
city           0
country_id     0
last_update    0
dtype: int64

In [18]:
# write a query
query = ''' 
            SELECT 
                *
            FROM 
                country
            ;         
        '''
# create a pandas dataframe using the engine and query
df = pd.read_sql(query, engine)

# print df
df.isnull().sum()

country_id     0
country        0
last_update    0
dtype: int64