# Pandas #

- pandas is a fast, powerful, flexible and easy-to-use open-source data analysis and manipulation tool,

- built on top of the Python programming language.


In [1]:
# imports!
import pandas as pd 
import numpy as np 

## Series ##

In [2]:
# Build a Series straight from a plain Python list.
l1 = pd.Series(data=[1, 2, 3])
l1

0    1
1    2
2    3
dtype: int64

In [26]:
# Convert the Series back to a NumPy array.
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute; for this integer Series the result is the same array.
l1.to_numpy()

array([1, 2, 3])

In [11]:
# Build a Series from a NumPy array holding the integers 1..8.
l2 = pd.Series(np.arange(1, 9))
l2

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

In [12]:
# Confirm that wrapping a NumPy array in pd.Series yields a pandas Series object.
type(l2)

pandas.core.series.Series

In [13]:
# np.array coerces this mixed list to a single string dtype, so every element
# of l3 is a str and the resulting Series has dtype object.
l3 = pd.Series(np.array([1,3,5,7,'',9,11]))
print('pandas l3:\n',l3)
# NOTE: the empty string is NOT treated as zero. The recorded mean
# (193987.2857...) equals 1357911 / 7: the strings were concatenated into
# '1357911' and that number was divided by the 7 elements.
# NOTE(review): newer pandas releases may raise TypeError here instead - confirm.
print('mean:', l3.mean())
# Strings compare lexicographically, hence the recorded max of 9 rather than 11.
print('max:', l3.max())

# Per the original note, the example below raises an error and is not allowed
# (kept commented out so the notebook still runs top to bottom).
#l4 = pd.Series(np.array([1,'Sandeep',4]))
#l4.mean()

pandas l3:
 0     1
1     3
2     5
3     7
4      
5     9
6    11
dtype: object
mean: 193987.2857142857
max: 9


In [14]:
# Series offers many helper functions such as replace: here every "1" becomes "2".
# The result is a new Series; l3 itself is left unchanged.
l3.replace(to_replace="1", value="2")

0     2
1     3
2     5
3     7
4      
5     9
6    11
dtype: object

In [15]:
# Single-element access on the default RangeIndex, plus a look at the index itself.
print('#indexing allowed to access the data:', l3[0])
print('#datatype post basic indexing:', type(l3[0]))
print('#DEFAULT series index defined as:', l3.index)

# The slicing demos all have the same shape, so drive them from a
# (caption, slice) table; slice(a, b, c) is exactly what l3[a:b:c] builds.
slice_demos = [
    ('#post defined indexing #1:', slice(0, 3)),
    ('#post defined indexing #2:', slice(2, None)),
    ('#post defined indexing #3:', slice(None, -1)),
    ('#post defined indexing #4:', slice(1, -1, 2)),
    ('#post defined indexing #5:', slice(None, None, 2)),
]
for caption, window in slice_demos:
    print(caption, l3[window])

#indexing allowed to access the data: 1
#datatype post basic indexing: <class 'str'>
#DEFAULT series index defined as: RangeIndex(start=0, stop=7, step=1)
#post defined indexing #1: 0    1
1    3
2    5
dtype: object
#post defined indexing #2: 2     5
3     7
4      
5     9
6    11
dtype: object
#post defined indexing #3: 0    1
1    3
2    5
3    7
4     
5    9
dtype: object
#post defined indexing #4: 1    3
3    7
5    9
dtype: object
#post defined indexing #5: 0     1
2     5
4      
6    11
dtype: object


In [16]:
# Series with a CUSTOM string index instead of the default 0..n-1 labels.
l4 = pd.Series([1,3,5,7,9], index=list('ABCDE'))
# Note: Python concatenates adjacent string literals, so 'l4[''A'']:' is the
# text l4[A]: - the captions print without quotes around the label.
# Labels are case-sensitive: l4['a'] is NOT allowed.
print('l4[''A'']:', l4.loc['A'])
print('l4[''B'']:', l4.loc['B'])
print('l4[''C'']:', l4.loc['C'])
# Range indexing on a custom index is allowed; the end label 'D' is included.
print('l4[''B'':''D'']:', l4.loc['B':'D'])

l4[A]: 1
l4[B]: 3
l4[C]: 5
l4[B:D]: B    3
C    5
D    7
dtype: int64


## Dataframes ##

***DataFrame from a list of tuples***

In [5]:
# Two parallel lists: person names and their ages.
name = ['Sandeep','Murari','Niya','John','Swarnima']
age = [35, 25, 30, 33, 23]

# Pair each name with its age: one (name, age) tuple per future row.
people = [(person, years) for person, years in zip(name, age)]
print(people)
# Each tuple becomes a row; `columns` names the two fields.
df = pd.DataFrame(people, columns=['Name','Age'])
df

[('Sandeep', 35), ('Murari', 25), ('Niya', 30), ('John', 33), ('Swarnima', 23)]


Unnamed: 0,Name,Age
0,Sandeep,35
1,Murari,25
2,Niya,30
3,John,33
4,Swarnima,23


In [6]:
# Rank the values within each column (1 through n). The recorded output shows
# the string column 'Name' is ranked too, in alphabetical order.
df.rank()

Unnamed: 0,Name,Age
0,4.0,5.0
1,2.0,2.0
2,3.0,3.0
3,1.0,4.0
4,5.0,1.0


***Data retrieval from DataFrames***

In [18]:
# Rows can be retrieved by integer position with .iloc; position 0 is the first row.
df.iloc[0]

Name    Sandeep
Age          35
Name: 0, dtype: object

In [8]:
# A column is retrieved by its name; the result is a Series.
df["Name"]

0     Sandeep
1      Murari
2        Niya
3        John
4    Swarnima
Name: Name, dtype: object

In [19]:
# Every DataFrame column is itself a Series.
age_column = df["Age"]
print(type(age_column))
# A column can therefore be indexed like any other Series.
name_column = df["Name"]
print("Index access in a Series:", name_column[0])

<class 'pandas.core.series.Series'>
Index access in a Series: Sandeep


In [22]:
# Index labels need not be unique: a label lookup returns every matching row.
findme = pd.Series(["LEARN","BY","INSIGTH"])
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in 2.0. ignore_index=True renumbers the combined rows 0..5.
findme = pd.concat([findme, pd.Series(["INSIGHT","BY","LEARN"])], ignore_index=True)
# Without ignore_index the index is NOT reset: the added Series keeps its own
# label 0, so label 0 now occurs twice.
findme = pd.concat([findme, pd.Series(["LEARNBYINSIGHT"])])
print(findme)
# Label lookup: returns both rows labelled 0.
print("Row access of Series using index directly:\n",findme[0])
# Positional lookup: .iloc[6] is the 7th row, whatever its label is.
print("Row access of Series using iloc:\n", findme.iloc[6])

0             LEARN
1                BY
2           INSIGTH
3           INSIGHT
4                BY
5             LEARN
0    LEARNBYINSIGHT
dtype: object
Row access of Series using index directly:
 0             LEARN
0    LEARNBYINSIGHT
dtype: object
Row access of Series using iloc:
 LEARNBYINSIGHT


In [23]:
# Summary of the frame: index range, column names, non-null counts, dtypes and
# memory usage. Generally the first thing to check on a new DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [28]:
# No column names are supplied, so pandas labels the columns 0, 1, ...
df2 = pd.DataFrame(people)
# Integer column lookup works only because of those default labels;
# df[0] would throw an error, since df's columns are named 'Name' and 'Age'.
df2[0]

0     Sandeep
1      Murari
2        Niya
3        John
4    Swarnima
Name: 0, dtype: object

In [29]:
# Any column can be promoted to the index.
# Rebinding `df` (instead of inplace=True) together with the guard keeps this
# cell re-runnable: a second run would otherwise raise KeyError, because
# 'Name' is no longer a column once it has become the index.
if df.index.name != 'Name':
    df = df.set_index('Name')
df

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Sandeep,35
Murari,25
Niya,30
John,33
Swarnima,23


In [33]:
# df['Sandeep'] is not possible: a single label inside [] is looked up as a
# column name, and 'Sandeep' is an index label.
# A label range does select rows, though; .loc spells out the row slice that
# the shorthand df['Sandeep':'Niya'] performs. The end label is included.
df.loc['Sandeep':'Niya']

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Sandeep,35
Murari,25
Niya,30


In [35]:
# With 'Name' as the index, a single row is looked up by its label via .loc
# (note: .loc, not .iloc - the index labels are no longer numbers).
df.loc['Sandeep']

Age    35
Name: Sandeep, dtype: int64

In [39]:
# df['Name'] is NOT allowed any more: 'Name' is now the index, not a column.
df['Age']  # still a column, so this is allowed

Name
Sandeep     35
Murari      25
Niya        30
John        33
Swarnima    23
Name: Age, dtype: int64

***DataFrame from a dictionary***

In [40]:
# Column-oriented data: each key is a column name, each value the column's entries.
educated_dict = dict(
    Country=['India', 'China', 'United States', 'Malaysia'],
    Educated=[200001234, 300001234, 100001234, 20001234],
)

# Show the raw mapping before handing it to pandas.
for k, v in educated_dict.items():
    print(k, v)

# Dictionary keys become the column labels of the frame.
df2 = pd.DataFrame(data=educated_dict)
df2

Country ['India', 'China', 'United States', 'Malaysia']
Educated [200001234, 300001234, 100001234, 20001234]


Unnamed: 0,Country,Educated
0,India,200001234
1,China,300001234
2,United States,100001234
3,Malaysia,20001234


In [41]:
# Structure check for the dictionary-built frame: 4 rows, one object column and one int64 column.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Country   4 non-null      object
 1   Educated  4 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 192.0+ bytes


In [43]:
# head() defaults to the first 5 rows, so on this 4-row frame a bare
# df2.head() would give the entire data back; ask for just the first 2.
df2.head(n=2)

Unnamed: 0,Country,Educated
0,India,200001234
1,China,300001234


In [44]:
# Any 2 rows, picked at random. No random_state is set, so the rows shown
# can differ from run to run.
df2.sample(n=2)

Unnamed: 0,Country,Educated
3,Malaysia,20001234
1,China,300001234


In [45]:
# Select a single column by name; the result is a Series named 'Country'.
df2["Country"]

0            India
1            China
2    United States
3         Malaysia
Name: Country, dtype: object

***DataFrame from CSV***

In [47]:
# sep="," states the field separator explicitly.
# The file name can be replaced with a URL.
population_csv = './data-files/panda/population.csv'
df3 = pd.read_csv(population_csv, sep=",")
# Peek at the first rows instead of dumping the whole file.
df3.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1960,92490932.0
1,Arab World,ARB,1961,95044497.0
2,Arab World,ARB,1962,97682294.0
3,Arab World,ARB,1963,100411076.0
4,Arab World,ARB,1964,103239902.0


In [48]:
# Get the gist of the data: 14885 rows, 4 columns, no null values.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14885 entries, 0 to 14884
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  14885 non-null  object 
 1   Country Code  14885 non-null  object 
 2   Year          14885 non-null  int64  
 3   Value         14885 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 465.3+ KB


***DataFrame from a built-in dataset***

Packages like sklearn and seaborn come with practice datasets


In [3]:
# Pandas dataframe from iris dataset
# NOTE(review): this import sits mid-notebook while the other imports live in
# the first cell; consider moving it there.
# NOTE(review): no DataFrame is built in this cell - presumably a later cell does; confirm.
from sklearn.datasets import load_iris

iris = load_iris()  # load the bundled iris practice dataset
iris.feature_names  # names of the four measurement columns

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']