In [30]:
'''
Pandas in Python is a powerful data manipulation library used for data analysis and data cleaning.
It provides two primary data structures: Series and DataFrame. 
    - Series is a one-dimensional, array-like object. It is similar to a single column in a table.
    - DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous
      tabular data structure with labeled axes (rows and columns).
'''
import pandas as pd
import numpy as np

###--------------------------
### 1. Series
###--------------------------

## Series from a plain list: pandas assigns the default integer index 0..n-1
data = list(range(1, 6))
series = pd.Series(data=data)
# To inspect: print(type(series)); print("Series: \n", series)

## Series from a dictionary: the keys become the index labels
data = dict(a=1, b=2, c=3)
series_dict = pd.Series(data=data)
# To inspect: print(type(series_dict)); print("Series: \n", series_dict)

## Series with explicitly supplied index labels (one label per value)
data = list(range(1, 6))
index = list('abcde')
series_ind = pd.Series(data=data, index=index)
# To inspect: print(type(series_ind)); print(series_ind)

###--------------------------
### 2. Dataframe
###--------------------------

## DataFrame from a dictionary: each key is a column name, each list holds that column's values
data = dict(
    Name=['Vaibhav', 'Aniket', 'Sandeep'],
    Age=[36, 25, 40],
    City=['Mumbai', 'Devgiri', 'Sangli'],
)
df = pd.DataFrame(data=data)
# To inspect: print(df); print(type(df))

## A DataFrame can be converted into a NumPy array
npArr = np.array(df)
# To inspect: print(npArr)

## DataFrame from a list of dictionaries: each dictionary describes one row
data = [
    dict(Name='Vaibhav', Age=36, City='Mumbai'),
    dict(Name='Aniket', Age=25, City='Devgiri'),
    dict(Name='Sandeep', Age=40, City='Sangli'),
]
df1 = pd.DataFrame(data=data)
# To inspect: print(df1); print(type(df1))

## DataFrame loaded from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='customers-100.csv')
# To preview the data:
#   df.head(5)   -> first 5 rows
#   df.tail(5)   -> last 5 rows

## Accessing data from a DataFrame
data = {
        'Name':['Vaibhav','Aniket','Sandeep'],
        'Age':[36,25,40],
        'City':['Mumbai','Devgiri','Sangli']}
df = pd.DataFrame(data)
# NOTE: Jupyter displays only the LAST expression of a cell. The bare accessor
# expressions below are evaluated but print nothing; wrap them in print() or
# display() to see their values.
#df['Name']          # all values of the 'Name' column; the result is a Series
#type(df['Name'])
#df.loc[0]           # first row, selected by its index LABEL 0
#df.iloc[0]          # first row, selected by its integer POSITION 0

# loc  → label-based:    df.loc[row_label, column_label]         (cell by name, like "A2")
# iloc → position-based: df.iloc[row_position, column_position]  (cell by position, like (row1,col1))
# Both take a row selector first and a column selector second. With the default
# integer index used here, row labels and row positions happen to coincide.

df.loc[1,'Name']     # row labeled 1, column labeled 'Name'      -> 'Aniket'
df.iloc[0,1]         # row at position 0, column at position 1   -> 36

## Accessing a single element by labels using at
df.at[1,'City']      # -> 'Devgiri'

## Accessing a single element by integer positions using iat
df.iat[2,2]          # -> 'Sangli'

###--------------------------
### Data Manipulation with Dataframe
###--------------------------

# Adding a column to the DataFrame (one value per existing row)
df['Skill']=['Guitarist','Flutist','Accordionist']

# Removing a column
# df.drop('Skill') alone raises KeyError: axis defaults to 0 (rows) and no row
# is labeled 'Skill'. axis=0 ==> rows, axis=1 ==> columns.
df.drop('Skill', axis=1)          # returns a NEW frame; the result is discarded here
assert 'Skill' in df.columns      # ...so df itself still has the column

# inplace=True modifies the frame it is called on instead of returning a new one.
# It is demonstrated on a copy so that df keeps the 'Skill' column and all rows,
# which the groupby below needs (dropping from df itself made it raise KeyError).
df_trimmed = df.copy()
df_trimmed.drop('Skill', axis=1, inplace=True)
assert 'Skill' not in df_trimmed.columns

# All values of a column can be modified at once, e.g.
#df['Age'] = df['Age'] + 1

# A row can be dropped by its index label (axis defaults to 0 ==> rows)
df_trimmed.drop(0, inplace=True)
assert 0 not in df_trimmed.index

# Display the datatype of each column
print("Data Types:\n", df.dtypes)

# Describe the DataFrame (summary statistics of the numeric columns)
print("Stattistical Summary:\n", df.describe())

# Group by a column and perform aggregation: mean Age per Skill
grouped = df.groupby('Skill')['Age'].mean()
print("Mean value by category:\n", grouped)
df


Unnamed: 0,Name,Age,City,Skill
0,Vaibhav,36,Mumbai,Guitarist
1,Aniket,25,Devgiri,Flutist
2,Sandeep,40,Sangli,Accordionist
