# 1.0 - IMPORTS

## 1.1 - LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import inflection

## 1.2 - LOAD DATA

In [2]:
# Read the Zomato CSV, decoding it as ISO-8859-1. low_memory=False makes
# pandas parse the file in one pass instead of in internal chunks.
df_raw = pd.read_csv(
    '../dataset/zomato.csv',
    encoding='ISO-8859-1',
    low_memory=False,
)

## 1.3 - HELPER FUNCTIONS

### 1.3.1 - RENAME COLUMNS FUNCTION

In [3]:
def rename_columns(column_name):
    """Turn a whitespace-separated column label into lower snake_case.

    The label is split on any run of whitespace (leading and trailing
    whitespace is dropped), the pieces are glued back together with a single
    underscore, and the result is lower-cased.

    Example: 'Average Cost for two' -> 'average_cost_for_two'
    """
    snake = '_'.join(column_name.split())
    return snake.lower()

### 1.3.2 - NA VOLUMETRY

In [9]:
def na_volumetry(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Any column layout is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order, default integer index)
        with three columns:

        * ``attributes`` - the column label
        * ``quantity``   - number of missing (NaN/None/NaT) entries
        * ``percentual`` - ``quantity`` as a percentage of the row count
    """
    n_rows = len(df)

    # Vectorised count; avoids the per-cell chained assignment
    # (``frame.col[i] = value``), which is unreliable in pandas and a silent
    # no-op under Copy-on-Write.
    na_count = df.isna().sum()

    if n_rows:
        # The denominator is the frame's row count rather than the non-null
        # count of one specific column, so the helper works on any frame.
        na_pct = na_count / n_rows * 100
    else:
        # An empty frame has nothing missing; report 0% instead of dividing by 0.
        na_pct = na_count.astype(float)

    return pd.DataFrame({
        'attributes': list(df.columns),
        'quantity': na_count.to_numpy(),
        'percentual': na_pct.to_numpy(),
    })

# 2.0 - DATA CLEANING

In [4]:
# Work on an independent deep copy so df_raw is left as loaded.
df2 = df_raw.copy(deep=True)

## 2.1 - DATA DIMENSION

In [5]:
# Frame shape, then the number of distinct non-null values in each column.
rows, cols = df2.shape
unique_data = df2.nunique()
print('The Dataset have {} columns and {} rows'.format(cols, rows))
print('\nAmount of unique data for data feature:\n\n{}'.format(unique_data))

The Dataset have 21 columns and 9551 rows

Amount of unique data for data feature:

Restaurant ID           9551
Restaurant Name         7446
Country Code              15
City                     141
Address                 8918
Locality                1208
Locality Verbose        1265
Longitude               8120
Latitude                8677
Cuisines                1825
Average Cost for two     140
Currency                  12
Has Table booking          2
Has Online delivery        2
Is delivering now          2
Switch to order menu       1
Price range                4
Aggregate rating          33
Rating color               6
Rating text                6
Votes                   1012
dtype: int64


## 2.2 - RENAME COLUMNS

In [6]:

# Normalise every column label with rename_columns, then show the result.
df2.columns = list(map(rename_columns, df2.columns))
df2.columns


Index(['restaurant_id', 'restaurant_name', 'country_code', 'city', 'address',
       'locality', 'locality_verbose', 'longitude', 'latitude', 'cuisines',
       'average_cost_for_two', 'currency', 'has_table_booking',
       'has_online_delivery', 'is_delivering_now', 'switch_to_order_menu',
       'price_range', 'aggregate_rating', 'rating_color', 'rating_text',
       'votes'],
      dtype='object')

## 2.3 - DATA TYPES

In [7]:
# Show the dtype of each column of df2.
df2.dtypes

restaurant_id             int64
restaurant_name          object
country_code              int64
city                     object
address                  object
locality                 object
locality_verbose         object
longitude               float64
latitude                float64
cuisines                 object
average_cost_for_two      int64
currency                 object
has_table_booking        object
has_online_delivery      object
is_delivering_now        object
switch_to_order_menu     object
price_range               int64
aggregate_rating        float64
rating_color             object
rating_text              object
votes                     int64
dtype: object

## 2.4 - NA VOLUMETRY

In [13]:
# Count the missing values in each column of df2.
df2.isna().sum()

restaurant_id           0
restaurant_name         0
country_code            0
city                    0
address                 0
locality                0
locality_verbose        0
longitude               0
latitude                0
cuisines                9
average_cost_for_two    0
currency                0
has_table_booking       0
has_online_delivery     0
is_delivering_now       0
switch_to_order_menu    0
price_range             0
aggregate_rating        0
rating_color            0
rating_text             0
votes                   0
dtype: int64

In [14]:
# Per-column missing-value table (count and percentage) from the helper in section 1.3.2.
na_volumetry(df2)

Unnamed: 0,attributes,quantity,percentual
0,restaurant_id,0,0.0
1,restaurant_name,0,0.0
2,country_code,0,0.0
3,city,0,0.0
4,address,0,0.0
5,locality,0,0.0
6,locality_verbose,0,0.0
7,longitude,0,0.0
8,latitude,0,0.0
9,cuisines,9,0.094231
