<a href="https://colab.research.google.com/github/dal7collab/google_collab_py/blob/main/indian_food.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Google User Authentication
# Runs Colab's user sign-in. Presumably this is what allows the
# application-default credentials lookup in the next cell to succeed -- confirm.
from google.colab import auth
auth.authenticate_user()

In [25]:
# Google Credentials Authentication
# Look up the application-default credentials, then hand them to gspread to
# obtain the client object (`gc`) used by the following cells.
# NOTE(review): oauth2client is deprecated upstream -- confirm this still works
# with the installed gspread version; google-auth may be required instead.
import gspread
from oauth2client.client import GoogleCredentials

credentials = GoogleCredentials.get_application_default()
gc = gspread.authorize(credentials)

In [26]:
# Import Google Sheet into a Google Colab data object
# `cn` is the first worksheet of the 'indian_food' spreadsheet and `sh` holds
# all of its cell values as a list of rows.
cn = gc.open('indian_food').sheet1
sh = cn.get_all_values()
print(sh[:1])  # show only the first row (the column names)

[['name', 'ingredients', 'diet', 'prep_time', 'cook_time', 'flavor_profile', 'course', 'state', 'region']]


In [27]:
# Convert Google Sheet into Pandas Data Frame
# Each inner list of `sh` becomes one row; columns get default integer labels
# (the header row is still an ordinary data row at this point).
import pandas as pd
df_sh = pd.DataFrame(sh)

In [28]:
# Convert First Row into Header
# The frame is rebuilt from the raw sheet rows (`sh`) instead of re-slicing
# `df_sh` itself, so re-running this cell cannot drop another data row or
# promote a data row to the header.
sheet_frame = pd.DataFrame.from_records(sh)
header = sheet_frame.iloc[0]            # Grab the first row for the header
df_sh = sheet_frame.iloc[1:].copy()     # Take the data less the header row (index keeps starting at 1)
df_sh.columns = header                  # Set the header row as the df header

In [29]:
# Copy Data Frame
# Work on an independent (deep) copy so `df_sh` keeps the values as loaded.
df = df_sh.copy()

In [30]:
# Obtain Data Frame General Info
# Column names, missing values, data types.
# In the recorded output every column has dtype `object`, so the numeric
# columns need to be cast before numeric statistics can be computed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 1 to 255
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            255 non-null    object
 1   ingredients     255 non-null    object
 2   diet            255 non-null    object
 3   prep_time       255 non-null    object
 4   cook_time       255 non-null    object
 5   flavor_profile  255 non-null    object
 6   course          255 non-null    object
 7   state           255 non-null    object
 8   region          255 non-null    object
dtypes: object(9)
memory usage: 18.1+ KB


In [31]:
# Retrieve First n (5) Rows
n = 5
df.head(n)
# (df.tail(n) would show the last n rows instead)

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
1,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
2,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
3,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
4,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
5,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East


In [32]:
# Summarise the prep_time column before casting: while it is still dtype
# object, describe() reports count/unique/top/freq rather than numeric stats.
df['prep_time'].describe()

count     255
unique     22
top        10
freq       98
Name: prep_time, dtype: object

In [33]:
# Cast Column Type
# prep_time is converted to 32-bit integers; the result is written back into df.
df['prep_time'] = df['prep_time'].astype('int32')

In [58]:
# Function Casting Column Type
def cast_type(data_frame, column_name, type_name):
    """Cast one column of ``data_frame`` to ``type_name`` in place.

    The converted column replaces the original one inside ``data_frame``
    (the caller's frame is modified) and ``"<column_name>:ok"`` is printed
    once the cast has succeeded. Nothing is returned.
    """
    converted = data_frame[column_name].astype(type_name)
    data_frame[column_name] = converted
    message = column_name + ":ok"
    print(message)

In [59]:
# Function Launch
# Cast cook_time through the helper, then summarise the converted column.
column_name = 'cook_time'
cast_type(df, column_name, 'int32')
df[column_name].describe()

cook_time:ok


count    255.000000
mean      34.529412
std       48.265650
min       -1.000000
25%       20.000000
50%       30.000000
75%       40.000000
max      720.000000
Name: cook_time, dtype: float64

In [36]:
# Data Numerical Columns 'Simple' Stats
# describe() without arguments summarises only the numeric columns, so
# prep_time and cook_time show up here only after both were cast to int32.
df.describe()

Unnamed: 0,prep_time,cook_time
count,255.0,255.0
mean,31.105882,34.529412
std,72.554409,48.26565
min,-1.0,-1.0
25%,10.0,20.0
50%,10.0,30.0
75%,20.0,40.0
max,500.0,720.0


In [37]:
# Data Categorical Columns 'Simple' Stats
# include=['O'] restricts the summary to the object-dtype columns.
df.describe(include=['O'])

Unnamed: 0,name,ingredients,diet,flavor_profile,course,state,region
count,255,255,255,255,255,255,255
unique,255,252,2,5,4,25,8
top,Red Rice,"Arbi ke patte, sesame seeds, gur, bengal gram ...",vegetarian,spicy,main course,Gujarat,West
freq,1,2,226,133,129,35,74


In [38]:
# Replace -1 values with np.nan in one specific column
# The result goes into a new column (cook_time_none); cook_time itself is kept.
import numpy as np
df_rp = df.copy()
df_rp["cook_time_none"] = df_rp["cook_time"].replace(-1, np.nan)

In [39]:
# Inspect the frame after adding cook_time_none: its non-null count shows how
# many cook_time values were left once -1 had been replaced with NaN.
df_rp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 1 to 255
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            255 non-null    object 
 1   ingredients     255 non-null    object 
 2   diet            255 non-null    object 
 3   prep_time       255 non-null    int32  
 4   cook_time       255 non-null    int32  
 5   flavor_profile  255 non-null    object 
 6   course          255 non-null    object 
 7   state           255 non-null    object 
 8   region          255 non-null    object 
 9   cook_time_none  227 non-null    float64
dtypes: float64(1), int32(2), object(7)
memory usage: 18.1+ KB


In [40]:
# Replace -1 values with np.nan across the whole data frame
# replace() returns a new frame, so `df` itself is left unchanged.
# NOTE(review): only the integer -1 is matched; a string '-1' in the object
# columns would stay as-is -- confirm whether those columns use it as well.
import numpy as np
df_rp = df.replace(to_replace=-1, value=np.nan)

In [41]:
# Inspect the frame after the frame-wide replacement: in the recorded output
# prep_time and cook_time are float64 and have fewer than 255 non-null values.
df_rp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 1 to 255
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            255 non-null    object 
 1   ingredients     255 non-null    object 
 2   diet            255 non-null    object 
 3   prep_time       225 non-null    float64
 4   cook_time       227 non-null    float64
 5   flavor_profile  255 non-null    object 
 6   course          255 non-null    object 
 7   state           255 non-null    object 
 8   region          255 non-null    object 
dtypes: float64(2), object(7)
memory usage: 18.1+ KB


In [42]:
# Function replaces np.nan values in all data frame with relevant statistics (mean, mode)
def rep_missings(data_frame):
    """Return a copy of ``data_frame`` with missing values imputed per column.

    Numeric columns are filled with the mean of their non-missing values;
    every other column is filled with its most frequent non-missing value
    (the first mode). Columns without missing values are copied unchanged,
    and columns that contain no non-missing value at all are left as they are
    because no statistic can be derived for them.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``data_frame`` with the missing values filled.
    """
    d = data_frame.copy(deep=True)
    for c in data_frame:
        column = data_frame[c]
        if not column.isnull().any():
            continue
        observed = column.dropna()
        if observed.empty:
            # Nothing to compute a mean/mode from; keep the column untouched.
            continue
        # Decide by "is numeric" rather than by one exact dtype: a column that
        # holds NaN is float64 (never int32), so an int32-only check would
        # send every numeric column down the mode branch.
        is_numeric = (pd.api.types.is_numeric_dtype(column.dtype)
                      and not pd.api.types.is_bool_dtype(column.dtype))
        if is_numeric:
            m = observed.mean()
        else:
            m = observed.mode()[0]
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `d[c]`, which may operate on a temporary and not update `d`.
        d[c] = column.fillna(m)
    return d

In [43]:
# Replace np.nan values in all data frame with relevant statistics (mean, mode)
# rep_missings returns a new frame; df_rp is rebound to that result.
df_rp = rep_missings(df_rp)

In [44]:
# Check the non-null counts after imputation (the recorded output shows 255
# non-null values in every column).
df_rp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 1 to 255
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            255 non-null    object 
 1   ingredients     255 non-null    object 
 2   diet            255 non-null    object 
 3   prep_time       255 non-null    float64
 4   cook_time       255 non-null    float64
 5   flavor_profile  255 non-null    object 
 6   course          255 non-null    object 
 7   state           255 non-null    object 
 8   region          255 non-null    object 
dtypes: float64(2), object(7)
memory usage: 18.1+ KB


In [45]:
# Mount Google Drive in Google Colab
# The mount point is /drive, which the CSV export path below starts with.
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [46]:
# Export the data frame to Google Drive in comma-separated format
# The target lives under the /drive mount point; to_csv is called with its
# defaults, so the row index is written as the first column.
export_path = '/drive/My Drive/indian_food_nulls.csv'
df_rp.to_csv(export_path)