# Pandas

Pandas is an open source Python library for numerical data analysis that is built on top of NumPy. Why learning Pandas is important:


*   Pandas allows you to do fast analysis as well as data cleaning and preparation
*   Pandas works well with data from a wide variety of sources, such as an Excel sheet, a CSV file, a SQL file or even a webpage



In [1]:
#import numpy and pandas
import numpy as np
import pandas as pd

**Pandas Series**

A Pandas Series is a one-dimensional labeled array that is capable of holding data of any type

In [2]:
#build a pandas Series out of a plain Python list
ls = [
    'a', 'b', 'c', 'd', 'e',
]
ser_1 = pd.Series(data=ls)
print(ser_1)

0    a
1    b
2    c
3    d
4    e
dtype: object


In [3]:
#build a pandas Series out of a NumPy array (10, 20, ..., 50)
arr = np.arange(10, 60, 10)
ser_2 = pd.Series(data=arr)
print(ser_2)

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [4]:
#label every value with a name instead of the default 0..n-1 positions
ser_3_labels = ['sarah', 'bob', 'alex', 'den', 'nancy']
ser_3 = pd.Series(data=arr, index=ser_3_labels)
print(ser_3)

sarah    10
bob      20
alex     30
den      40
nancy    50
dtype: int64


In [0]:
#accessing elements by their integer position
#ser_3 is labelled with names, so positions have to go through .iloc: plain []
#with integer keys on such a Series only works via a deprecated positional fallback
print(ser_3.iloc[3], '\n\n')            #single element at position 3
print(ser_3.iloc[[0, 2, 4]], '\n\n')    #several positions at once
print(ser_3.iloc[:3])                   #slice: the first three elements

40 


sarah    10
alex     30
nancy    50
dtype: int64 


sarah    10
bob      20
alex     30
dtype: int64


In [0]:
#look values up through their index labels rather than their positions
print(ser_3.loc['den'], '\n\n')
print(ser_3.loc[['den', 'alex', 'sarah']])

40 


den      40
alex     30
sarah    10
dtype: int64


**Pandas DataFrame**

Pandas DataFrame is a two-dimensional labeled data structure.

In [0]:
#build a DataFrame from a dictionary that maps column names to pandas Series;
#rows are aligned on the index labels, so a label missing from one Series shows up as NaN
first_column = pd.Series({'sarah': 10, 'bob': 20, 'alex': 30, 'den': 40})
second_column = pd.Series({'sarah': 11, 'bob': 22, 'den': 44, 'nancy': 55})
d = {'first': first_column, 'second': second_column}

df = pd.DataFrame(data=d)
print(df)

       first  second
alex    30.0     NaN
bob     20.0    22.0
den     40.0    44.0
nancy    NaN    55.0
sarah   10.0    11.0


In [0]:
#show the row labels (index) and the column labels of the DataFrame
for caption, labels in (('Rows:    ', df.index), ('Columns: ', df.columns)):
    print(caption, labels)

Rows:     Index(['alex', 'bob', 'den', 'nancy', 'sarah'], dtype='object')
Columns:  Index(['first', 'second'], dtype='object')


In [0]:
#index= and columns= choose which row / column labels end up in the DataFrame
wanted_rows = ['sarah', 'alex', 'bob']
df = pd.DataFrame(data=d, index=wanted_rows)
print(df, '\n')

wanted_columns = ['second']
df = pd.DataFrame(data=d, columns=wanted_columns)
print(df)

       first  second
sarah     10    11.0
alex      30     NaN
bob       20    22.0 

       second
sarah      11
bob        22
den        44
nancy      55


In [0]:
#build a DataFrame from a list of dictionaries:
#one row per dictionary, one column per distinct key
ld = [
    {'sarah': 10, 'alex': 30},
    {'claire': 30, 'bob': 40, 'den': 50},
    {'chris': 20, 'sarah': 40},
]
df = pd.DataFrame(data=ld)
df

Unnamed: 0,alex,bob,chris,claire,den,sarah
0,30.0,,,,,10.0
1,,40.0,,30.0,50.0,
2,,,20.0,,,40.0


In [0]:
#columns= and index= work for a list of dictionaries as well
chosen_columns = ['alex', 'claire', 'chris']
df = pd.DataFrame(data=ld, columns=chosen_columns)
print(df, '\n')

row_names = ['first', 'second', 'third']
df = pd.DataFrame(data=ld, index=row_names)
print(df)

   alex  claire  chris
0  30.0     NaN    NaN
1   NaN    30.0    NaN
2   NaN     NaN   20.0 

        alex   bob  chris  claire   den  sarah
first   30.0   NaN    NaN     NaN   NaN   10.0
second   NaN  40.0    NaN    30.0  50.0    NaN
third    NaN   NaN   20.0     NaN   NaN   40.0


**DataFrame operations**

In [0]:
#rebuild the row-labelled DataFrame from the list of dictionaries
row_labels = ['first', 'second', 'third']
df = pd.DataFrame(data=ld, index=row_labels)
df

Unnamed: 0,alex,bob,chris,claire,den,sarah
first,30.0,,,,,10.0
second,,40.0,,30.0,50.0,
third,,,20.0,,,40.0


In [0]:
#a single column is selected by putting its name in square brackets
column_name = 'alex'
df[column_name]

first     30.0
second     NaN
third      NaN
Name: alex, dtype: float64

In [0]:
#a single row is selected by passing its index label to .loc
row_label = 'first'
df.loc[row_label]

alex      30.0
bob        NaN
chris      NaN
claire     NaN
den        NaN
sarah     10.0
Name: first, dtype: float64

In [0]:
#retrieve multiple rows or columns
#sep='\n' puts each caption on its own line; passing '\n' as a separate argument
#would make print insert a space before the table and shift its header row
#out of line with the data rows
print('multiple columns', df[['bob', 'claire', 'den']], sep='\n', end='\n\n\n')
print('multiple rows', df.loc[['first', 'third']], sep='\n', end='\n\n\n')
print('multiple rows and columns', df.loc[['first', 'third'], ['bob', 'claire', 'den']], sep='\n')

multiple columns 
          bob  claire   den
first    NaN     NaN   NaN
second  40.0    30.0  50.0
third    NaN     NaN   NaN 


multiple rows 
        alex  bob  chris  claire  den  sarah
first  30.0  NaN    NaN     NaN  NaN   10.0
third   NaN  NaN   20.0     NaN  NaN   40.0 


multiple rows and columns 
        bob  claire  den
first  NaN     NaN  NaN
third  NaN     NaN  NaN


In [0]:
#.iloc is the positional counterpart of .loc: rows and columns are picked by integer position
row_positions = [1, 2]
column_positions = [0, 2, 3]
df.iloc[row_positions, column_positions]

Unnamed: 0,alex,chris,claire
second,,,30.0
third,,20.0,


In [0]:
#show the DataFrame as it currently stands
df

Unnamed: 0,alex,bob,chris,claire,den,sarah
first,30.0,,,,,10.0
second,,40.0,,30.0,50.0,
third,,,20.0,,,40.0


In [0]:
#add a new column computed element-wise from two existing columns
alex_minus_sarah = df['alex'].sub(df['sarah'])
df['lily'] = alex_minus_sarah
df

Unnamed: 0,alex,bob,chris,claire,den,sarah,lily
first,30.0,,,,,10.0,20.0
second,,40.0,,30.0,50.0,,
third,,,20.0,,,40.0,


In [0]:
#add a boolean column: True where 'chris' is below 30 (a NaN entry compares as False)
is_chris_below_30 = df['chris'].lt(30)
df['sam'] = is_chris_below_30
df

Unnamed: 0,alex,bob,chris,claire,den,sarah,lily,sam
first,30.0,,,,,10.0,20.0,False
second,,40.0,,30.0,50.0,,,False
third,,,20.0,,,40.0,,True


In [0]:
#pop removes the column from df and hands it back as a Series
last = df.pop(item='claire')
print(last, '\n')
df

first      NaN
second    30.0
third      NaN
Name: claire, dtype: float64 



Unnamed: 0,alex,bob,chris,den,sarah,lily,sam
first,30.0,,,,10.0,20.0,False
second,,40.0,,50.0,,,False
third,,,20.0,,40.0,,True


In [0]:
#del removes the column from df in place; del is a statement, so no parentheses are needed
del df['chris']
df

Unnamed: 0,alex,bob,den,sarah,lily,sam
first,30.0,,,10.0,20.0,False
second,,40.0,50.0,,,False
third,,,,40.0,,True


In [0]:
#insert places a new column at a chosen position (index 2 makes it the third column)
alex_plus_bob = df['alex'].add(df['bob'])
df.insert(loc=2, column='alex+bob', value=alex_plus_bob)
df

Unnamed: 0,alex,bob,alex+bob,den,sarah,lily,sam
first,30.0,,,,10.0,20.0,False
second,,40.0,,50.0,,,False
third,,,,,40.0,,True


**Use pandas to read the dataset**

In [0]:
#to get files from Google drive to Colab, mount the Google drive to Colab session
from google.colab import drive


In [0]:
#this will prompt for authorization
#once authorized, Google Drive is mounted under /content/drive
#NOTE(review): the saved output of this cell still holds the authorization URL of the
#original session - consider clearing it before sharing the notebook
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#after executing the cell above, the Drive files will be present in "/content/drive/My Drive"
#list the course folder to check that the mount worked and to see which files are available
!ls '/content/drive/My Drive/Python for Data Analysis Data Science'

'BlackFriday copy.csv'
'Interview Questions.gdoc'
 lesson1_numpy.ipynb
'Lesson 1 - Pandas and Numpy.gdoc'
 lesson1_pandas.ipynb
'Lesson 2: Data Visualization.gdoc'
 lesson2.ipynb
 Planning.gdoc
'Python For Data Analysis Flyer.gslides'
'Python For Data Analysis Host Request.gdoc'
 ted-talks


In [0]:
#read the TED talks dataset (ted_main.csv) from the mounted Google Drive into ted_talks
#the path is kept in one named place so it is easy to adapt to another Drive layout
TED_MAIN_CSV = '/content/drive/My Drive/Python for Data Analysis Data Science/ted-talks/ted_main.csv'
ted_talks = pd.read_csv(TED_MAIN_CSV, sep = ',')
#let's see what each row and column represents
ted_talks.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


**Descriptive Statistics**

In [0]:
#shape is a (number of rows, number of columns) tuple
n_rows, n_columns = ted_talks.shape
(n_rows, n_columns)

(2550, 17)

In [0]:
#list the data type pandas assigned to every column
column_types = ted_talks.dtypes
column_types

comments               int64
description           object
duration               int64
event                 object
film_date              int64
languages              int64
main_speaker          object
name                  object
num_speaker            int64
published_date         int64
ratings               object
related_talks         object
speaker_occupation    object
tags                  object
title                 object
url                   object
views                  int64
dtype: object

In [0]:
#flag every column that holds at least one missing value
has_missing_values = ted_talks.isna().any()
has_missing_values

comments              False
description           False
duration              False
event                 False
film_date             False
languages             False
main_speaker          False
name                  False
num_speaker           False
published_date        False
ratings               False
related_talks         False
speaker_occupation     True
tags                  False
title                 False
url                   False
views                 False
dtype: bool

In [0]:
#remove the column that contains missing values
#the result is reassigned instead of dropping in place, and an already-removed column is
#ignored, so this cell can be re-run without raising a KeyError
ted_talks = ted_talks.drop(columns = ["speaker_occupation"], errors = "ignore")
ted_talks.shape

(2550, 16)

**Descriptive Statistics**

In [0]:
#summary statistics show how the number of comments per talk is distributed
comments_summary = ted_talks["comments"].describe()
comments_summary

count    2550.000000
mean      191.562353
std       282.315223
min         2.000000
25%        63.000000
50%       118.000000
75%       221.750000
max      6404.000000
Name: comments, dtype: float64

In [0]:
#each of these values can also be computed on its own, starting with the mean
comments_mean = ted_talks["comments"].mean()
comments_mean

191.56235294117647

In [0]:
#standard deviation of the number of comments
comments_std = ted_talks["comments"].std()
comments_std

282.3152232572839

In [0]:
#most frequent number of comments; mode() returns a Series
comments_mode = ted_talks["comments"].mode()
comments_mode

0    45
dtype: int64

In [0]:
#largest number of comments on a single talk
comments_max = ted_talks["comments"].max()
comments_max

6404

In [0]:
#boolean filters: is there any talk with fewer than 1000 views?
filter_1 = ted_talks["views"].lt(1000)
filter_1.any()

False

In [0]:
#the complementary check: does every talk have at least 1000 views?
filter_2 = ted_talks["views"].ge(1000)
filter_2.all()

True