##### ------- INSTALL LIBRARIES  -------- #####

In [None]:
# INSTALL PANDAS AND THE SUPPORTING LIBRARIES (NumPy, openpyxl, PyArrow)
!pip3 install --upgrade pandas
!pip3 install --upgrade numpy
!pip3 install --upgrade openpyxl
!pip3 install --upgrade pyarrow

##### ---------- About Pandas ----------- #####

In [None]:
# What is Pandas...
""" 
Pandas is an open-source Python library that provides high-performance, easy-to-use data structures and data analysis tools.
It is useful for handling different types of data and performing operations on that data.
"""

# Features of Pandas...
"""
* It is well suited for tabular data (as in SQL tables or Excel sheets), ordered/unordered series data and arbitrary matrix data.
* It is used for processing large amounts of data and for analysing relational data.
* It has two primary data structures, Series and DataFrame, which handle a vast variety of use cases in finance, statistics and many more fields.
* pandas is built on top of NumPy and is intended to integrate well within a scientific computing environment with many other
  3rd party libraries.
"""

# About the Series and Dataframe...
"""
1. Series : A Series is a 1-dimensional labeled array whose values all share one data type (similar to a list).

2. DataFrame : A DataFrame is a 2-dimensional labeled table (like a matrix or a spreadsheet). It is a collection of Series,
              one per column, and stores tabular data as rows and columns.
"""

# Here are just a few of the things that pandas does well...
"""
* Easy handling of missing data (represented as NaN) in floating point as well as non-floating point data
* Size mutability: columns can be inserted and deleted from DataFrame and higher dimensional objects
* Automatic and explicit data alignment: objects can be explicitly aligned to a set of labels, or the user can simply
  ignore the labels and let Series, DataFrame, etc. automatically align the data for you in computations
* Powerful, flexible group by functionality to perform split-apply-combine operations on data sets, for both aggregating
  and transforming data
* Make it easy to convert ragged, differently-indexed data in other Python and NumPy data structures into DataFrame objects
* Intelligent label-based slicing, fancy indexing, and subsetting of large data sets
* Intuitive merging and joining data sets
* Flexible reshaping and pivoting of data sets
* Hierarchical labeling of axes (possible to have multiple labels per tick)
* Robust IO tools for loading data from flat files (CSV and delimited), Excel files, databases, and saving / loading data
  from the ultrafast HDF5 format
* Time series-specific functionality: date range generation and frequency conversion, moving window statistics,
  date shifting, and lagging. 
"""

# Mutability and copying of data...
"""
All pandas data structures are value-mutable (the values they contain can be altered) but not always size-mutable. The length of a Series
cannot be changed, but, for example, columns can be inserted into a DataFrame. However, the vast majority of methods produce new objects
and leave the input data untouched. In general we like to favor immutability where sensible.
"""



##### ---------- Import Libraries ---------- ######

In [31]:
import pandas as pd
import numpy as np
from io import StringIO
import openpyxl
import json
import pyarrow.parquet as pq

##### ----------- Dataframe And Series ------------- #####

In [None]:
# Build a small passenger table: every dict key becomes one column of the DataFrame.
passenger_columns = {
    "Name": [
        "Braund, Mr. Owen Harris",
        "Allen, Mr. William Henry",
        "Bonnell, Miss. Elizabeth",
    ],
    "Age": [22, 35, 58],
    "Sex": ["male", "male", "female"],
}
df = pd.DataFrame(passenger_columns)
print("--------> Full DataFrame : \n", df, '\n')

# head(n) returns the first n rows, tail(n) the last n rows.
print("--------> First 2 rows : \n", df.head(n=2), '\n')
print("--------> Last 1 row : \n", df.tail(n=1))


# A Series is a single labelled column of values; `name` is its label.
ages = pd.Series(data=[12, 44, 25, 56], name="Age")
print("\n---------> Series is : \n", ages)

# Every DataFrame column is itself a Series.
# It can be fetched with df["Age"] or, equivalently, with attribute access df.Age.
print("\n---------> Get series from df : \n", df["Age"])


##### ---------- READ AND WRITE DATA ----------- ##### 

In [None]:
"""_READ AND WRITE THE TABULAR DATA IN DIFFERENT FILE FORMATS_

REFERENCE : https://pandas.pydata.org/docs/user_guide/io.html
"""

# CSV / delimited text: read_csv() loads a file and to_csv() writes one.
# Frequently used arguments:
#   path      - location to read from / write to
#   sep       - field delimiter (",", ";", "\t", ...)
#   header    - row number that holds the column names
#   index     - whether to_csv() also writes the row labels
#   index_col - column(s) that read_csv() should use as the row labels
csv_df = pd.read_csv(
    'data/csv_business_price.csv',
    header=0,
    index_col=None,
    low_memory=False,
)
print("\n--------> read csv data is : \n", csv_df.head(2))

# Write the same table out as a comma-separated .txt file, leaving the row labels out.
csv_df.to_csv('data/txt_business_price.txt', index=False, sep=',')


# A delimited .txt file is read with read_csv() as well.
txt_df = pd.read_csv(
    'data/txt_business_price.txt',
    low_memory=False,
    sep=',',
)
print("\n--------> text data is :\n", txt_df.head(3))


# Excel workbook: read_excel(); here the first sheet column becomes the row labels.
excel_df = pd.read_excel(
    'data/excel_business_operations.xlsx',
    index_col=0,
    header=0,
)
print("\n--------> excel data is :\n", excel_df.head(5))


# Read JSON data...
# JSON files come in two common layouts, and each one is loaded differently:
# 1. Nested JSON : one object (or a list of objects) that contains further nested objects.
#    Load it with the json module first, then flatten it with pd.json_normalize().
# 2. Line-delimited JSON : several JSON objects, each on its own line.
#    Read it directly with pd.read_json(..., lines=True).
# Note : when reading/writing JSON, the `orient` argument describes how the table is laid out in
#        the file, e.g. split, index, columns, records, table, values.
# --------> Nested data. max_level limits how many nesting levels are flattened (0, 1, and so on).
# JSON text is UTF-8 by specification, so the encoding is stated explicitly instead of relying on
# the platform default used by open().
with open('data/json_nested_data.json', encoding='utf-8') as f:
    data = json.load(f)
json_nested_df = pd.json_normalize(data, max_level=1)
print("\n--------> json nested data is :\n", json_nested_df.head(10))
# --------> Line-delimited data.
json_df = pd.read_json('data/json_line_delimited_flights.json', lines=True, orient=None)
print("\n--------> json line delimited data is :\n", json_df.head(5))
"""
Ref : https://pandas.pydata.org/docs/reference/api/pandas.read_json.html
"""

# Read/write Parquet files with read_parquet()/to_parquet(), or read them directly with
# pyarrow.parquet (which requires the pyarrow module to be imported)...
# NOTE: the original author reported problems reading Parquet inside IPython/Jupyter;
#       if that happens, run this part from a plain Python file instead.
parquet_df = pd.read_parquet('data/titanic.parquet', engine='auto')
print("\n----------> parquet file data is :\n", parquet_df.head(5))
parquet_df.to_parquet('data/new_titanic.parquet')
# Read the same Parquet file with the pyarrow library and convert the table to a DataFrame.
parq_df = pq.read_table('data/titanic.parquet').to_pandas()
print("\n-----------> read parquet data using pyarrow is :\n", parq_df.head(10))


# Check the basic concepts..
print("\n---------> select the single column :\n", parq_df['Name'])
print("\n---------> select the multiple columns :\n", parq_df[['PassengerId', 'Pclass', 'Name', 'Sex', 'Age']])
# .loc[row_label, column_label] addresses one cell in a single indexing step (row label 0 here).
print("\n--------> get the single field val : \n", parq_df.loc[0, 'PassengerId'])        # get the field value..
# Assign through .loc as well. The chained form parq_df['PassengerId'][0] = 5 first builds an
# intermediate Series and then writes into it: pandas warns about it, and with Copy-on-Write
# enabled the original DataFrame is not updated at all.
parq_df.loc[0, 'PassengerId'] = 5
print("\n--------> change the field value : \n", parq_df.head(5))  # the first PassengerId is now 5




##### ---------- Uses The Dataframe And Series Functions/Attributes ------------ #####

In [None]:
"""_THIS IS ABOUT SOME COMMONLY USED PANDAS FUNCTIONS_

   REFERENCE : https://pandas.pydata.org/docs/reference/frame.html
"""

# Use the built-in type() function to identify the data structure...
print("\n-------> Identify the dataframe structure : ", type(df), "\n")
print("\n-------> Identify the series structure : ", type(ages), "\n")

# Use the shape attribute to get the number of rows and columns...
print("\n-------> Find the shape/size of the df is : ", df.shape, "\n")     # ex-(3, 3) : means there are 3 rows and 3 columns..

# Use max()/min() to find the maximum/minimum values.
# On a DataFrame they are computed column by column; on a Series they return a single value.
print("\n--------> Max and Min age from df : \n", df.max(), '\n\n', df.min())
print("\n--------> Max and Min age from series : ", ages.max(), ages.min())


# Use describe() to get summary statistics (count, mean, std, min, quartiles, max)
# of the numerical columns of the dataframe..
print("\n-------> Describe the statistic data is : \n", df.describe())

# Use the info() method to display a summary of the dataframe..
# info() prints its report itself and returns None, so it must not be wrapped in print()
# (doing so would print an extra "None" line after the report).
print("\n-------> Display the info of df is :\n")
df.info()

# Use dtypes (DataFrame) / dtype (Series) to display the data types...
print("\n-------> Types of the df : \n", df.dtypes)
print("\n-------> Types of the series : \n", ages.dtype)

# Use memory_usage() to show the memory used by each column (and by the index)..
# With deep=True, the actual contents of object columns (e.g. strings) are measured as well.
print("\n-------> Memory usage of the df columns : \n", df.memory_usage(deep=True))


# .index holds the row labels and .columns holds the column labels of the dataframe...
print("\n--------> Get df indexs :\n", df.index)

# Assigning a list with one label per row replaces the row labels in place.
df.index = ['first', 'second', 'third']
print("\n--------> change the index value : \n", df)                 # rows are now labelled first/second/third
print("\n--------> Get df columns :\n", df.columns)

# transpose() (also available as the .T attribute) swaps rows and columns...
print("\n--------> Transpose the df :\n", df.transpose())


# astype() casts one column (pass a {column: dtype} dict) or every column (pass a single dtype).
# It returns a new object; temp_df itself keeps its original dtypes.
temp_df = pd.DataFrame(
    data={
        'col1': [1, 2, 4, 6, 7, 3, 5, 7],
        'col2': [3, 4, 6, 8, 9, 5, 7, 9],
        'col3': [23, 16, 27, 34, 98, 37, 22, 29],
    }
)
print("\n--------> Current datatype of temp data frame :\n", temp_df.dtypes)
print(
    "\n--------> Cast single column is : \n",
    temp_df.astype({'col1': 'int32'}).dtypes,
)
print(
    "\n--------> Cast all the columns are : \n",
    temp_df.astype('int32').dtypes,
)

# Reload the business-price CSV into "df", then keep 15 randomly sampled rows of a few
# selected columns in "sample_df" (no random_state is set, so the rows differ on every run).
df = pd.read_csv('data/csv_business_price.csv', low_memory=False)
sample_df = df[
    [
        'Series_reference',
        'Period',
        'STATUS',
        'UNITS',
        'Subject',
        'Series_title_1',
        'Series_title_2',
    ]
].sample(n=15)
print("\n---------> Print the sample data frame :\n", sample_df)

# sort_index() orders the dataframe by its labels: axis=0 sorts by row labels,
# axis=1 sorts by column labels.
# Syntax: DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort',
#                              na_position='last', sort_remaining=True)
print(
    "\n--------> Sort the data index based on row axis :\n",
    sample_df.sort_index(
        axis=0,
        ascending=False,
        inplace=False,
        kind='quicksort',
        na_position='first',
    ),
)
"""More About : https://www.geeksforgeeks.org/python-pandas-dataframe-sort_index/ """

# sort_values() orders the dataframe by the values of one or more columns (given in `by`)...
# Syntax: DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
print(
    "\n--------> Sort the data values based on row axis :\n",
    sample_df.sort_values(
        by=['Series_reference', 'Period'],
        axis=0,
        ascending=False,
        inplace=False,
        kind='quicksort',
        na_position='last',
    ),
)
"""More About : https://www.geeksforgeeks.org/python-pandas-dataframe-sort_values-set-1/ """



# Reference (alias) versus copy of a dataframe....
# Plain assignment does NOT copy anything: df1 and temp_df are two names for the same object,
# so a change made through df1 is visible through temp_df as well.
df1 = temp_df
print("\n---------> df1 use as view of temp_df : \n", df1)
df1.loc[0, 'col1'] = 6
print("\n---------> Actual temp_df after change df1 view : \n", temp_df)

# copy() creates an independent object: changes made to df1 no longer reach temp_df.
df1 = temp_df.copy()
df1.loc[0, 'col1'] = 3
print("\n---------> Actual temp_df after no change df1 copy : \n", temp_df)
print("\n---------> df1 use as copy of temp_df : \n", df1)



# Why the assignments above use .loc instead of df1['col1'][0] = value:
# the chained form indexes twice (first the column, then the row). pandas cannot always tell
# whether the intermediate object is a view or a copy of the dataframe, so it raises a warning,
# and with Copy-on-Write enabled the dataframe is not modified at all.
# loc and iloc address rows and columns in ONE indexing step, which makes assignment reliable.
#
# loc and iloc are used for filtering, accessing and manipulating the data of a dataframe:
#   loc  - works with the actual row and column labels (names)
#   iloc - works with integer positions, regardless of the row and column labels
# See the documentation:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
temp_df.loc[0, 'col1'] = 19                                           # use with actual row/col name
print("\n---------> Update value with loc : \n", temp_df)
temp_df1 = temp_df.loc[[0, 1], ['col1', 'col3']]                       # subset: given rows and given columns
print("\n---------> Get subset of df : \n", temp_df1)
temp_df2 = temp_df.loc[:, ['col1', 'col3']]                          # subset: all rows, given columns
print("\n---------> Get subset of df with all rows : \n", temp_df2)
temp_df3 = temp_df.loc[[0, 1], :]                                       # subset: given rows, all columns
print("\n---------> Get subset of df with all columns : \n", temp_df3)

temp_df4 = temp_df.iloc[0, 0]                                     # single value at row position 0, column position 0
print("\n---------> Get value with iloc : \n", temp_df4)
temp_df5 = temp_df.iloc[[0], :]                                   # one-row dataframe: a list of positions keeps it 2-D
print("\n---------> Get value with iloc : \n", temp_df5)

# drop() removes unwanted rows or columns...
# Assigning through .loc to a column label that does not exist yet (here the label 0)
# creates that column; only the addressed row receives the value.
temp_df.loc[0, 0] = 123
print("\n---------> Df after Added/Update unnacessary col val : \n", temp_df)
# columns=[...] drops by column label; inplace=True modifies temp_df itself.
temp_df.drop(columns=[0], inplace=True)
print("\n---------> Df after Drop unnacessary col : \n", temp_df)


# reset_index() renumbers the rows after some of them were removed..
print("\n---------> Df before reset : \n", temp_df)
# index=[...] drops by row label and returns a new dataframe; its labels now have gaps.
temp_df6 = temp_df.drop(index=[3, 1])
print("\n---------> Df before reset after drop : \n", temp_df6)
# drop=True discards the old labels instead of keeping them as an extra column.
temp_df6 = temp_df6.reset_index(drop=True)
print("\n---------> Df after reset : \n", temp_df6)


# Add new columns filled with None or NaN values..
# There are several ways to do this; all of them work on a copy so temp_df stays untouched.
temp_df7 = temp_df.copy()

# 1) Plain assignment of Python's None: a generic "no value" marker usable for any data type.
temp_df7['col4'] = None
print("\n--------> Df col with none value :\n", temp_df7)

# 2) Plain assignment of NumPy's NaN: the more appropriate marker for numerical columns.
temp_df7['col5'] = np.nan
print("\n--------> Df col with np.nan value :\n", temp_df7)

# 3) insert() places the new column at a chosen position (here position 4) and works in place.
temp_df7.insert(loc=4, column='col6', value=np.nan)
print("\n--------> Df col with insert method is :\n", temp_df7)

# 4) assign() is the functional variant: it returns a NEW dataframe containing the extra
#    column and leaves the original dataframe unchanged.
temp_df8 = temp_df7.assign(col7=np.nan)
print("\n-------> Df col with assign method without changes is :\n", temp_df7)
print("\n-------> Df col with assign method with changes is :\n", temp_df8)


# Detect missing and non-missing values in a dataframe..
# Missing values are represented as NaN (numerical columns), None (Python objects)
# and NaT (datetime columns).
# isna()/isnull() return a same-shaped boolean dataframe that is True where a value is missing;
# notna()/notnull() return the opposite, True where a value is present.
# isnull() is an alias of isna(), and notnull() is an alias of notna().
print(
    "\n---------> Df indicate missing or nan values is :\n",
    temp_df7.isna(),
)
print(
    "\n---------> Df indicate the missing or non values is :\n",
    temp_df7.isnull(),
)
print(
    "\n---------> Df indicate the non-missing value is :\n",
    temp_df7.notna(),
)
print(
    "\n---------> Df indicate the non-missing values is :\n",
    temp_df7.notnull(),
)


# dropna() removes the rows or columns that contain missing (NA) values...
# syntax : DataFrame.dropna(*, axis=0, how=<no_default>, thresh=<no_default>, subset=None, inplace=False, ignore_index=False)
# where..
# axis         - 0 drops rows that contain missing values, 1 drops columns that contain missing values.
# how          - 'any' (default): drop the row/column if at least one value is NA
#                'all'          : drop the row/column only if every value is NA
# thresh       - keep only the rows/columns that have at least this many non-NA values.
# subset       - labels along the other axis to inspect, e.g. the columns to check when dropping rows.
# ignore_index - if True, the result is relabelled 0, 1, ..., n - 1.
df = pd.DataFrame(
    {
        "name": ['Alfred', 'Batman', 'Catwoman'],
        "toy": [np.nan, 'Batmobile', 'Bullwhip'],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)
print("\n--------> drop all row with any NA value : \n", df.dropna())

# Add a column that holds nothing but NA values to demonstrate the column-wise options.
df.loc[:, 'sax'] = [np.nan, np.nan, np.nan]
print(
    "\n--------> drop column with any NA value : \n",
    df.dropna(axis=1),
)
print(
    "\n--------> drop row if all field in row conatins NA value : \n",
    df.dropna(how='all'),
)
print(
    "\n--------> drop column if all field in col conatins NA value : \n",
    df.dropna(axis=1, how='all'),
)
print(
    "\n--------> drop row and keep only at least 2 non-NA values rows : \n",
    df.dropna(thresh=2),
)
print(
    "\n--------> based on given column check the NA value and drop if found NA : \n",
    df.dropna(subset=['name', 'toy']),
)


# drop_duplicates() removes duplicate records from a dataframe..
# syntax : DataFrame.drop_duplicates(subset=None, *, keep='first', inplace=False, ignore_index=False)
# where..
# subset - only these columns are compared to identify duplicates; by default all columns are used.
# keep   - which record of a group of duplicates survives:
#          'first' (default) keeps the first one, 'last' keeps the last one,
#          False drops every record of the group.
df = pd.DataFrame(
    {
        'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
        'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
        'rating': [4, 4, 3.5, 15, 5],
    }
)
print(
    "\n--------> drop duplicates based on all cols : \n",
    df.drop_duplicates(),
)
print(
    "\n--------> drop duplicates based on specified cols : \n",
    df.drop_duplicates(subset=['brand', 'style']),
)
print(
    "\n--------> drop based on specify cols and keep last record : \n",
    df.drop_duplicates(subset=['brand', 'style'], keep='last'),
)


# fillna() replaces the NaN/None values of a dataframe with a given value..
# syntax : DataFrame.fillna(value=None, *, method=None, axis=None, inplace=False, limit=None, downcast=<no_default>)
# where..
# value  - a single scalar used for every missing value, or a dict/Series/DataFrame that
#          supplies a separate fill value per column (or per cell for a DataFrame).
# method - 'backfill'/'bfill' or 'ffill'; default is None. Deprecated in recent pandas
#          releases in favour of the DataFrame.ffill()/DataFrame.bfill() methods.
# axis   - the axis along which the missing values are filled.
# limit  - the maximum number of missing values to fill.
df = pd.read_excel('data/excel_business_operations.xlsx', header=0, index_col=0)
# Take an explicit copy of the first 10 rows: df1 is modified below, and writing into a bare
# slice of df triggers SettingWithCopyWarning (it is unclear whether df is affected too).
df1 = df.iloc[0:10, :].copy()
# Blank out a few cells of the first two columns so that there is something to fill.
df1.iloc[[1, 2, 8, 9], [0, 1]] = np.nan
# NOTE(review): renaming to exactly two labels assumes the sheet has exactly two data columns
# after index_col=0 - confirm against the workbook.
df1.columns = ['A', 'B']
print("\n--------> fill nan by default single val : \n", df1.fillna(0))       # one scalar fills every missing value
print("\n--------> fill nan with dict by each col val : \n",
        df1.fillna({'A': "UNKNOWN", 'B': "UNKNOWN"}))    # dict: a separate fill value per column
print("\n--------> fill nan with dict val with up to limit no. : \n",
        df1.fillna({'A': "UNKNOWN", 'B': "UNKNOWN"}, limit=3))   # fills at most `limit` missing values
# A DataFrame used as fill value is matched on row AND column labels.
# NOTE(review): t_df1 gets the default 0..n row labels; if the workbook's first column (used
# as the index of df1) holds other labels, nothing will be filled - confirm against the data.
t_df1 = pd.DataFrame(np.zeros((df1.shape[0]+1, df1.shape[1])), columns=df1.columns)
print("\n--------> fill nan with dataframe values : \n", df1.fillna(t_df1))   # fill from the matching cell of another dataframe


# isin() tests, element by element, whether each value is contained in `values`..
# syntax : DataFrame.isin(values)
# values - iterable  : a cell is True if its value occurs anywhere in the iterable.
#          Series    : values are compared where the Series index matches the row labels.
#          dict      : the keys must be column names; each column is checked against its own list.
#          DataFrame : both the row labels and the column labels must match.
df = pd.DataFrame(
    {'num_legs': [2, 4], 'num_wings': [2, 0]},
    index=['falcon', 'dog'],
)

# Iterable against a single column (a Series).
print("\n--------> isin use iterator val with col : \n", df['num_legs'].isin([2, 1]))
# Iterable against the whole dataframe: every cell is checked.
print("\n--------> isin use iterator val with df : \n", df.isin([2, 0]))
# Dict: only the named column can match.
print("\n--------> isin use dict val with df : \n", df.isin({'num_wings': [1, 2]}))

# Series: note that its index uses the label 'dogs' while df uses 'dog'.
ser = pd.Series([2, 4], name='num_legs', index=['falcon', 'dogs'])
print("\n--------> isin use series val with df : \n", df.isin(ser))

# DataFrame with the same labels and the same values as df.
other_df = pd.DataFrame(
    {'num_legs': [2, 4], 'num_wings': [2, 0]},
    index=['falcon', 'dog'],
)
print("\n--------> isin use df val with df : \n", df.isin(other_df))


# query() selects the rows of a dataframe for which a boolean expression is true..
# syntax : DataFrame.query(expr, *, inplace=False, **kwargs)
# where..
# expr (str) : the query string to evaluate.
#              Variables of the surrounding environment are referenced with an '@' prefix, e.g. @a + b.
#              Columns are referenced by name; a name that contains spaces or punctuation
#              must be wrapped in backticks, e.g. `num legs`.
# **kwargs   : extra keyword arguments that are passed on to the expression evaluation (eval()).
df = pd.DataFrame(
    {
        'A': range(1, 6),
        'B': range(10, 0, -2),
        'C C': range(10, 5, -1),
    }
)
print("\n--------> query dataframe : \n", df)

# Same result as the boolean-mask form df[df.A < df.B].
print(
    "\n--------> use query string on df : \n",
    df.query('A < B'),
)
# Same result as df[df.B == df['C C']]; the backticks are needed because of the space in the name.
print(
    "\n--------> use query string on df space column : \n",
    df.query('B == `C C`'),
)


# String methods through the .str accessor..
# The .str accessor exposes a collection of string methods that are applied element-wise to
# the string data of a Series/Index. Some of the available groups:
#   Transformation: lower(), upper(), capitalize(), title(), strip(), replace(), etc.
#   Searching and Matching: contains(), startswith(), endswith(), find(), match(), etc.
#   Splitting and Joining: split(), join().
#   Extraction: get(), extract().
import re   # provides the re.IGNORECASE flag used with str.contains() below

df = pd.DataFrame(
    {
        'str_val': ['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe', np.nan],
        'age': [23, 45, 67, 20, 0],
    }
)
print("\n------> df : \n", df)
ser = df['str_val']

# --- Case conversion ---
print("------> ser in lower case : \n", ser.str.lower())         # everything in lower case
print("------> ser in upper case : \n", ser.str.upper())         # everything in upper case
print("------> ser in capitalized : \n", ser.str.capitalize())   # first character upper case, the rest lower case
print("------> ser as title : \n", ser.str.title())              # every word starts with a capital letter
print("------> ser is in swapcase : \n", ser.str.swapcase())     # lower case becomes upper case and vice versa

# --- Concatenation with str.cat ---
# syntax : Series.str.cat(others=None, sep=None, na_rep=None, join='left')
# `others` can be a Series, DataFrame, np.ndarray or list-like.
# Without `others`, all values are joined into one single string.
print("\n------> cancat series : \n", ser.str.cat(sep=' '))
# na_rep is the text that stands in for missing values.
print("------> concat with seperation : \n", ser.str.cat(sep=' ', na_rep='?'))
# With `others`, the values are joined element by element.
print(
    "------> concat with other series : \n",
    ser.str.cat(others=['case', 'letter', '!', 'letter', 'yes'], sep='_'),
)

# --- Padding / alignment ---
# syntax : Series.str.center(width, fillchar=' ')
print("\n------> center series string : \n", ser.str.center(20, fillchar='.'))
print("------> left justify string : \n", ser.str.ljust(20, fillchar='-'))
print("------> right justify string : \n", ser.str.rjust(20, fillchar='-'))

# --- Searching with str.contains ---
# syntax : Series.str.contains(pat, case=True, flags=0, na=<no_default>, regex=True)
# pat   - character sequence or regular expression
# case  - True (default) makes the match case sensitive
# flags - flags passed through to the re module, e.g. re.IGNORECASE
# na    - value to report for missing entries (True/False)
# regex - True treats pat as a regular expression, False as a literal string
s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
print("\n------> check literal in series : \n", s1.str.contains('og', regex=False))                   # pat used as a literal string
print("------> check regex expr in case sens : \n", s1.str.contains('oG', case=True, regex=True))   # case-sensitive match
print("------> check regex with set nan false : \n", s1.str.contains('og', na=False, regex=True))   # missing entries reported as False
print("------> check regex with igone case: \n", s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True))

# --- Prefix / suffix tests with str.startswith(), str.endswith() ---
# syntax : Series.str.endswith(pat, na=<no_default>)
# pat may be one string or a tuple of alternatives.
s = pd.Series(['bat', 'bear', 'caT', np.nan])
print("\n------> check string in series end with given pattern : \n", s.str.endswith('t'))
print("------> check string in series end with given multiple pattern : \n", s.str.endswith(('t', 'T')))
print("------> check string in series that start with given pattern : \n", s.str.startswith(('b', 'B')))

# --- Counting occurrences with str.count ---
# syntax : Series.str.count(pat, flags=0)
s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
print("\n------> count the number of occurance of pattern in strings : \n", s.str.count('a'))



"""_For more pandas functions/attributes, go through the link below_
Ref : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
"""



##### ------------ USE OF SELECTION (Rows/Columns), APPLY CONDITION AND FILTER (On Rows/Columns) ---------- #####

In [None]:
"""_USE FOR SELECT THE SPECIFIC COLUMNS/ROWS AND APPLIED SOME SET OF CONDITIONS AND FILTERS OR MANY MORE_
"""

# Load the titanic data, keep the rows at positions 10-29 and renumber them from 0...
ttc_df = (
    pd.read_parquet('data/titanic.parquet', engine='auto')
    .iloc[10:30, :]
    .reset_index(drop=True)
)
print("-----> print titanic df : \n", ttc_df)


# Selecting columns/rows, with and without loc/iloc...
# One column name gives a Series.
print(
    "\n-----> select single col from df as series : \n",
    ttc_df['PassengerId'],
)
# A list of column names gives a smaller dataframe.
print(
    "\n-----> select multiple col from df as subset df : \n",
    ttc_df[['PassengerId', 'Pclass', 'Name', 'Age']],
)
# loc selects by label; ":" stands for all rows.
print(
    "\n-----> select single col using loc with entire rows : \n",
    ttc_df.loc[:, 'Pclass'],
)
# iloc selects by integer position: the listed row positions and column positions.
print(
    "\n-----> select subset df using iloc with specific rows and cols : \n",
    ttc_df.iloc[[1, 2, 4, 6, 10, 15], [1, 2, 4]],
)


# Filter a dataframe with single/combined/method-based conditions (without loc/iloc)...
# A comparison on a column yields a boolean Series (one True/False per row); indexing the
# dataframe with such a mask keeps only the rows where the mask is True.
adult_mask = ttc_df.Age > 18
print("\n------> Any Comparison/Condition on series/col, return Boolean True/False list : \n", adult_mask)
print("\n------> filter all ttc passenger using single condition where age greater than 18 : \n", ttc_df[adult_mask])

# Masks are combined element-wise with & (and) / | (or).
no_cabin_mask = ttc_df['Cabin'].isnull()
print("\n-----> filter with combined condition with predefined_method : \n", ttc_df[adult_mask & no_cabin_mask])

# Rows whose age is missing.
age_missing_mask = ttc_df['Age'].isna()
print("\n-----> filer record where age is nan : \n", ttc_df[age_missing_mask])


# Filter a dataframe with single/combined/method-based conditions (with loc/iloc)...
# loc accepts a boolean mask for the rows and/or for the columns.
print("\n-----> filter rows where 'Sex' is 'male' and select all columns \n", ttc_df.loc[(ttc_df.Sex=='male'), :])
print("\n-----> filter cols where match col is 'Sex' and select all rows \n", ttc_df.loc[:, (ttc_df.columns=='Sex')])
print("\n-----> filter row where col value present is in given specific list/dict/series/df values : \n",
        ttc_df.loc[ttc_df['Parch'].isin([1, 2, 5])])
# String tests return NaN for missing entries unless `na` is given, and a mask that contains
# NaN cannot be used for boolean indexing. na=False treats missing entries as "no match".
print("\n------> filter males recors where use regex pattern 'Mr.' for checking : \n",
        ttc_df.loc[ttc_df['Name'].str.contains('Mr\\.', case=True, regex=True, na=False),
                   ['PassengerId', 'Name', 'Age', 'Fare']])
# iloc is position based, so the mask is handed over as a plain boolean array (.values)
# rather than as a labelled Series.
print("\n-----> use iloc which supports interger based indexing So only filter using list of True/False values on condition :\n", 
        ttc_df.iloc[ttc_df['Embarked'].str.startswith(('Q', 'C'), na=False).values, :])


# Filter a dataframe with query/where/mask..
# query() takes an SQL-like expression string, which is handy for complex conditions.
# NOTE(review): method calls inside the expression (Cabin.notnull()) may require
# engine='python' depending on the installed evaluation engine - confirm if this line raises.
print(
    "\n-----> filter record using query like-SQL and useful for complex condition : \n",
    ttc_df.query("Fare > 10 and Cabin.notnull()"),
)

# where() keeps the shape of the dataframe: rows that do NOT satisfy the condition are
# replaced with NaN (or with a value given through its `other` argument).
print(
    "\n-----> use for replaces values that do not satisfy a condition with NaN or a specified value. : \n",
    ttc_df.where(ttc_df.Age > 18),
)

# mask() is the inverse of where(): rows that DO satisfy the condition are replaced.
print(
    "\n-----> use for replaces values that do satisfy a condition with NaN or a specified value. : \n",
    ttc_df.mask(ttc_df.Age > 18),
)




"""_For more reference use below link_
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.filter.html
"""

##### --------- REFERENCES -------- #####

In [None]:
"""_MORE REFERENCES_

* https://pandas.pydata.org/docs/getting_started/index.html
* https://pandas.pydata.org/docs/user_guide/index.html
* https://pandas.pydata.org/docs/reference/#

"""