# Pandas tutorial

Pandas is a powerful tool to for data analysis manipulation

## Creating Pandas Series

In [1]:
import pandas as pd

In [5]:
#lest create a series containing grosery items
#for indices we will use food names as index labels and the quatities we need to buy as our data.
groceries = pd.Series(data=[30,6, 'Yes', 'No'], index=['eggs', 'apples', 'milk', 'bread']) #data and indices
#panda series is a 1D array, that can hold many types, such numbers or string
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [6]:
groceries.shape #dimesion shape of the data

(4,)

In [7]:
groceries.ndim #number of dimesion of the data

1

In [8]:
groceries.size #total number of values in the array

4

In [9]:
#print the index label on the data separately
groceries.index #this give as the index labels of the series object

Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')

In [10]:
groceries.values #give us the data into series object

array([30, 6, 'Yes', 'No'], dtype=object)

In [12]:
#if we are working with long data we can make a search to find the item
'banana' in groceries

False

In [13]:
'bread' in groceries

True

### Accessing and Deleting Elements in Pandas Series

In [54]:
groceries = pd.Series(data=[30,6, 'Yes', 'No'], index=['eggs', 'apples', 'milk', 'bread'])
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [40]:
groceries['eggs']

30

In [41]:
groceries[['eggs', 'milk']]

eggs     30
milk    Yes
dtype: object

In [42]:
groceries[0]

30

In [43]:
groceries[-1]

'No'

In [44]:
groceries[[0,1,2]]

eggs       30
apples      6
milk      Yes
dtype: object

### changing and removing 

In [45]:
#(loc and iloc)

In [46]:
groceries.loc[['eggs', 'apples']] #stand for a location, it´s used to explicity state thar we are using a lebelled index

eggs      30
apples     6
dtype: object

In [47]:
groceries.iloc[[2,3]]#stands for integer location, is used to explicity state that we are using a numerical index


milk     Yes
bread     No
dtype: object

In [48]:
#panda series is also mutable like NumPy arrays
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [49]:
#let´s change the number of eggs thant we need
groceries['eggs'] = 2
groceries

eggs        2
apples      6
milk      Yes
bread      No
dtype: object

In [55]:
#let´s remove
print(groceries.drop('apples'))
print('\nhere it just returned the modified series and it did not change the original one')

eggs      30
milk     Yes
bread     No
dtype: object

here it just returned the modified series and it did not change the original one


In [51]:
groceries

eggs        2
apples      6
milk      Yes
bread      No
dtype: object

In [56]:
#let´s remove
groceries.drop('apples', inplace=True)

In [59]:
print(groceries)
print('\nhere it returned the modified series and IT DID change the original one')

eggs      30
milk     Yes
bread     No
dtype: object

here it returned the modified series and IT DID change the original one


### Arithmetic Operations on Pandas Series

In [60]:
import pandas as pd

In [62]:
fruit = pd.Series([10,6,3], ['apple', 'orange', 'bananas'])
fruit

apple      10
orange      6
bananas     3
dtype: int64

In [63]:
fruit + 2

apple      12
orange      8
bananas     5
dtype: int64

In [64]:
fruit - 3

apple      7
orange     3
bananas    0
dtype: int64

In [65]:
fruit * 2

apple      20
orange     12
bananas     6
dtype: int64

In [66]:
fruit / 2

apple      5.0
orange     3.0
bananas    1.5
dtype: float64

In [69]:
import numpy as np

fruit

apple      10
orange      6
bananas     3
dtype: int64

In [71]:
#get the square root
np.sqrt(fruit)

apple      3.162278
orange     2.449490
bananas    1.732051
dtype: float64

In [72]:
#the exponential of each element
np.exp(fruit)

apple      22026.465795
orange       403.428793
bananas       20.085537
dtype: float64

In [74]:
#each element to the power of two
np.power(fruit, 2)

apple      100
orange      36
bananas      9
dtype: int64

In [75]:
fruit

apple      10
orange      6
bananas     3
dtype: int64

In [77]:
#add 2 just to banana item
fruit['bananas'] + 2

5

In [79]:
#subtract 2 from apples
fruit['apple'] - 2

8

In [80]:
fruit.iloc[0] - 2

8

In [81]:
#multiply two items by 2
fruit[['apple', 'orange']] * 2

apple     20
orange    12
dtype: int64

In [87]:
#divide two items by 2
fruit.loc[['apple', 'oranges']] / 2

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  from ipykernel import kernelapp as app


apple      5.0
oranges    NaN
dtype: float64

In [88]:
groceries = pd.Series(data=[30,6, 'Yes', 'No'], index=['eggs', 'apples', 'milk', 'bread'])
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [90]:
groceries * 2 
#we can multiply for any type element, but, we can not divide. We have to be sure that own data
#have the same type

eggs          60
apples        12
milk      YesYes
bread       NoNo
dtype: object

###### Quiz

Create a Pandas Series that contains the distance of some planets from the Sun.
Use the name of the planets as the index to your Pandas Series, and the distance from the Sun as your data. The distance from the Sun is in units of 10^6 km


In [96]:
distance = [149.6, 1433.5, 227.9, 108.2, 778.6]

planets = ['Earth','Saturn', 'Mars','Venus', 'Jupiter']

In [97]:
# Create a Pandas Series using the above data, with the name of the planets as
# the index and the distance from the Sun as your data.

dist_planets = pd.Series(data = distance, index = planets)

In [98]:
# Calculate the number of minutes it takes sunlight to reach each planet. You can
# do this by dividing the distance from the Sun for each planet by the speed of light.
# Since in the data above the distance from the Sun is in units of 10^6 km, you can
# use a value for the speed of light of c = 18, since light travels 18 x 10^6 km/minute.

time_light = dist_planets / 18

In [99]:
# Use Boolean indexing to select only those planets for which sunlight takes less
# than 40 minutes to reach them.

close_planets = time_light[time_light < 40]

In [100]:
close_planets

Earth     8.311111
Mars     12.661111
Venus     6.011111
dtype: float64

### Creating Pandas DataFrames

DataFrame: is a two dimensional object with labeled rows and columns,it can also hold multiple data types. 
We can created Pandas DataFrame manually or by loading data from file.

We are going to create a DataFrame manually from a diccionary containig several pandas series. Let´s create this diccionary and then pass in into DataFrame function.

In [101]:
import pandas as pd

In [105]:
items = {'Bob': pd.Series([245, 25, 55], index = ['bike', 'pants', 'watch']),'Alice': pd.Series([40, 110,500, 45], index = ['book', 'glasses', 'bike', 'pants'])}

type(items)

dict

In [120]:
shoppin_carts = pd.DataFrame(items)
shoppin_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [107]:
#if we want to work in a deep learning algoritihm we have to remove the NaN elements.

In [113]:
data = {'Bob': pd.Series([245, 25, 55]),'Alice': pd.Series([40, 110,500, 45])}

df = pd.DataFrame(data)
df

Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [121]:
shoppin_carts.index

Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

In [122]:
shoppin_carts.columns

Index(['Bob', 'Alice'], dtype='object')

In [123]:
shoppin_cats.values

array([[245., 500.],
       [ nan,  40.],
       [ nan, 110.],
       [ 25.,  45.],
       [ 55.,  nan]])

In [124]:
shoppin_carts.shape

(5, 2)

In [125]:
shoppin_carts.ndim

2

In [126]:
shoppin_carts.size

10

In [128]:
bob_shopping_cart = pd.DataFrame(items, columns=['Bob']) #DataFrame that only loads Bob´s shopping cart.
bob_shopping_cart

Unnamed: 0,Bob
bike,245
pants,25
watch,55


In [131]:
sel_shoppin_cart = pd.DataFrame(items, index = ['pants', 'book'])
sel_shoppin_cart #dataframe that only has selected items for both Alice and Bob.

Unnamed: 0,Bob,Alice
pants,25.0,45
book,,40


In [133]:
alice_sel_shopping_cart = pd.DataFrame(items, index=['glasses', 'bike'], columns=['Alice'])
alice_sel_shopping_cart

Unnamed: 0,Alice
glasses,110
bike,500


In [137]:
data = {'Integers': [1,2,3], 'floats': [4.5, 8.2, 9.6]}
df = pd.DataFrame(data, index=['label 1', 'label 2', 'label 3'])
df

Unnamed: 0,Integers,floats
label 1,1,4.5
label 2,2,8.2
label 3,3,9.6


Let`s create a Pandas DataFrames using a list of Python dictionary:

In [142]:
items = [{'bikes': 20, 'pants': 30, 'watches': 35}, {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]
store_items = pd.DataFrame(items, index=['store 1', 'store 2'])
store_items

Unnamed: 0,bikes,glasses,pants,watches
store 1,20,,30,35
store 2,15,50.0,5,10


In [143]:
store_items[['bikes']] #access to a column using columns index lebel

Unnamed: 0,bikes
store 1,20
store 2,15


In [144]:
store_items[['pants', 'watches']] #access to columns using columns index lebel

Unnamed: 0,pants,watches
store 1,30,35
store 2,5,10


In [146]:
store_items.loc[['store 1']] #access a row ussinf row index functions

Unnamed: 0,bikes,glasses,pants,watches
store 1,20,,30,35


In [152]:
store_items['bikes']['store 2']

15

In [155]:
#add a column

store_items['shirts'] = [15,2]
store_items

Unnamed: 0,bikes,glasses,pants,watches,shirt,shirts
store 1,20,,30,35,15,15
store 2,15,50.0,5,10,2,2


In [157]:
#add a column with the value of other column´s value

store_items['suits'] = store_items['shirts'] + store_items['pants']
store_items  

Unnamed: 0,bikes,glasses,pants,watches,shirt,shirts,suits
store 1,20,,30,35,15,15,45
store 2,15,50.0,5,10,2,2,7


In [158]:
new_items = [{'bikes':20, 'pants':30, 'watches':35, 'glasses':4}]
new_store = pd.DataFrame(new_items, index=['store 3'])
new_store

Unnamed: 0,bikes,glasses,pants,watches
store 3,20,4,30,35


In [159]:
#we can add the last DataFrame to the oldest by:

store_items = store_items.append(new_store)
store_items

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,bikes,glasses,pants,shirt,shirts,suits,watches
store 1,20,,30,15.0,15.0,45.0,35
store 2,15,50.0,5,2.0,2.0,7.0,10
store 3,20,4.0,30,,,,35


In [161]:
#we can add a new item in a new column with the same quantity that one of the item than we already have.

store_items['new_watches'] = store_items['watches'][1:]
store_items

Unnamed: 0,bikes,glasses,pants,shirt,shirts,suits,watches,new_watches
store 1,20,,30,15.0,15.0,45.0,35,
store 2,15,50.0,5,2.0,2.0,7.0,10,10.0
store 3,20,4.0,30,,,,35,35.0


In [163]:
#we are add a new column right to the suits column

store_items.insert(5,'shoes', [8,5,0])
store_items

Unnamed: 0,bikes,glasses,pants,shirt,shirts,shoes,suits,watches,new_watches
store 1,20,,30,15.0,15.0,8,45.0,35,
store 2,15,50.0,5,2.0,2.0,5,7.0,10,10.0
store 3,20,4.0,30,,,0,,35,35.0


Delete columns and rows by using pop and drop method:
-Pop is using to delete columns
-Drop method used to delete rows and columns by using the axis keyword.

In [164]:
#delete new watches columnd using pop

store_items.pop('new_watches')
store_items

Unnamed: 0,bikes,glasses,pants,shirt,shirts,shoes,suits,watches
store 1,20,,30,15.0,15.0,8,45.0,35
store 2,15,50.0,5,2.0,2.0,5,7.0,10
store 3,20,4.0,30,,,0,,35


In [169]:
#delete watches ans choes with drop

store_items = store_items.drop(['watches', 'shoes'],axis=1)
store_items

Unnamed: 0,bikes,glasses,pants,shirt,shirts,suits
store 1,20,,30,15.0,15.0,45.0
store 2,15,50.0,5,2.0,2.0,7.0
store 3,20,4.0,30,,,


In [170]:
#delete rows store 1 and store 2

store_items = store_items.drop(['store 1', 'store 2'],axis=0)
store_items

Unnamed: 0,bikes,glasses,pants,shirt,shirts,suits
store 3,20,4.0,30,,,


In [172]:
#change a lebel
store_items = store_items.rename(columns={'bikes': 'hats'})
store_items

Unnamed: 0,hats,glasses,pants,shirt,shirts,suits
store 3,20,4.0,30,,,


In [173]:
store_items = store_items.rename(index={'store 3': 'last store'})
store_items

Unnamed: 0,hats,glasses,pants,shirt,shirts,suits
last store,20,4.0,30,,,


In [175]:
store_items = store_items.set_index('pants')
store_items

Unnamed: 0_level_0,hats,glasses,shirt,shirts,suits
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,20,4.0,,,


### Dealing with NaN

Before we can begin analyzing data or usint it to train our learning algorithms, we need to clean it. Thats mean tha we need a way to detect and correct errors in our data.

In [180]:
store_items #here we have 2 Nan

Unnamed: 0,bikes,glasses,pants,shirts,watches
store 1,20,,30,15.0,35
store 2,15,50.0,5,2.0,10
store 3,20,4.0,30,,35


When we are working with a million of item, we can not see easily how many NaN are in the DataFrame, we can do:

In [181]:
x = store_items.isnull().sum().sum()
print(x)

2


In [182]:
x = store_items.isnull()#.sum().sum()
print(x)

         bikes  glasses  pants  shirts  watches
store 1  False     True  False   False    False
store 2  False    False  False   False    False
store 3  False    False  False    True    False


In [183]:
x = store_items.isnull().sum()#.sum()
print(x)

bikes      0
glasses    1
pants      0
shirts     1
watches    0
dtype: int64


In [185]:
#we can do the opposite and count the numbres of NON NaN values

x = store_items.count()
print(x)

bikes      3
glasses    2
pants      3
shirts     2
watches    3
dtype: int64


In [187]:
#removed

store_items.dropna(axis=0) #to elimintae any rows with NaN values

Unnamed: 0,bikes,glasses,pants,shirts,watches
store 2,15,50.0,5,2.0,10


In [188]:
store_items.dropna(axis=1) #to elimintae any column with NaN values

Unnamed: 0,bikes,pants,watches
store 1,20,30,35
store 2,15,5,10
store 3,20,30,35


Te original DataFrame in not mofidicated, if you want it, you must do:

In [191]:
store_items.dropna(axis=1, inplace=True)

In [192]:
store_items #now the original has been modificated

Unnamed: 0,bikes,pants,watches
store 1,20,30,35
store 2,15,5,10
store 3,20,30,35


In [193]:
items = [{'bikes': 20, 'pants': 30, 'watches': 35}, {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]
store_items = pd.DataFrame(items, index=['store 1', 'store 2'])
store_items['shirts'] = [15,2]
new_items = [{'bikes':20, 'pants':30, 'watches':35, 'glasses':4}]
new_store = pd.DataFrame(new_items, index=['store 3'])
store_items = store_items.append(new_store)
store_items

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,bikes,glasses,pants,shirts,watches
store 1,20,,30,15.0,35
store 2,15,50.0,5,2.0,10
store 3,20,4.0,30,,35


In action to eliminate NaN values, let´s replace them with actual values. 

Example: we could choose to replace all NaNs values with the value of zero. The *fillna* method can be used for this.

In [194]:
store_items.fillna(0) 

Unnamed: 0,bikes,glasses,pants,shirts,watches
store 1,20,0.0,30,15.0,35
store 2,15,50.0,5,2.0,10
store 3,20,4.0,30,0.0,35


In [197]:
#in this case we are replace the original value by the previous value. 
#In case of glasses, still being NaN bacause there is not a previous value in this column
store_items.fillna(method='ffill', axis=0)

Unnamed: 0,bikes,glasses,pants,shirts,watches
store 1,20,,30,15.0,35
store 2,15,50.0,5,2.0,10
store 3,20,4.0,30,2.0,35


In [198]:
store_items.fillna(method='ffill', axis=1) #in this case, all the NaN values have been replaced.

Unnamed: 0,bikes,glasses,pants,shirts,watches
store 1,20.0,20.0,30.0,15.0,35.0
store 2,15.0,50.0,5.0,2.0,10.0
store 3,20.0,4.0,30.0,30.0,35.0


In [199]:
#we can choose to replace the NaNs with the value that go after them in the DataFream

store_items.fillna(method='backfill', axis=0) #this replace each NaN witht the nect value in that column.

Unnamed: 0,bikes,glasses,pants,shirts,watches
store 1,20,50.0,30,15.0,35
store 2,15,50.0,5,2.0,10
store 3,20,4.0,30,,35


The NaN value in "store 1" has been replaced by the next value in it is column. But the two NaN values in "store 3" did not, since they are the last values in their columns.The fillna method fills the NaN values out of place.

##### Quiz

In [205]:
import pandas as pd
import numpy as np

# Since we will be working with ratings, we will set the precision of our 
# dataframes to one decimal place.
pd.set_option('precision', 1)

# Create a Pandas DataFrame that contains the ratings some users have given to a
# series of books. The ratings given are in the range from 1 to 5, with 5 being
# the best score. The names of the books, the authors, and the ratings of each user
# are given below:

books = pd.Series(data = ['Great Expectations', 'Of Mice and Men', 'Romeo and Juliet', 'The Time Machine', 'Alice in Wonderland' ])
authors = pd.Series(data = ['Charles Dickens', 'John Steinbeck', 'William Shakespeare', ' H. G. Wells', 'Lewis Carroll' ])

user_1 = pd.Series(data = [3.2, np.nan ,2.5])
user_2 = pd.Series(data = [5., 1.3, 4.0, 3.8])
user_3 = pd.Series(data = [2.0, 2.3, np.nan, 4])
user_4 = pd.Series(data = [4, 3.5, 4, 5, 4.2])

# Users that have np.nan values means that the user has not yet rated that book.
# Use the data above to create a Pandas DataFrame that has the following column
# labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3', 'User 4'. Let Pandas
# automatically assign numerical row indices to the DataFrame. 

# Create a dictionary with the data given above
dat = {'Book Title' : books,
       'Author' : authors,
       'User 1' : user_1,
       'User 2' : user_2,
       'User 3' : user_3,
       'User 4' : user_4}

# Use the dictionary to create a Pandas DataFrame
book_ratings = pd.DataFrame(dat)
print(book_ratings)
# If you created the dictionary correctly you should have a Pandas DataFrame
# that has column labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3',
# 'User 4' and row indices 0 through 4.

# Now replace all the NaN values in your DataFrame with the average rating in
# each column. Replace the NaN values in place. HINT: you can use the fillna()
# function with the keyword inplace = True, to do this. Write your code below:

book_ratings.fillna(book_ratings.mean(), inplace = True)
book_ratings

            Book Title               Author  User 1  User 2  User 3  User 4
0   Great Expectations      Charles Dickens     3.2     5.0     2.0     4.0
1      Of Mice and Men       John Steinbeck     NaN     1.3     2.3     3.5
2     Romeo and Juliet  William Shakespeare     2.5     4.0     NaN     4.0
3     The Time Machine          H. G. Wells     NaN     3.8     4.0     5.0
4  Alice in Wonderland        Lewis Carroll     NaN     NaN     NaN     4.2


Unnamed: 0,Book Title,Author,User 1,User 2,User 3,User 4
0,Great Expectations,Charles Dickens,3.2,5.0,2.0,4.0
1,Of Mice and Men,John Steinbeck,2.9,1.3,2.3,3.5
2,Romeo and Juliet,William Shakespeare,2.5,4.0,2.8,4.0
3,The Time Machine,H. G. Wells,2.9,3.8,4.0,5.0
4,Alice in Wonderland,Lewis Carroll,2.9,3.5,2.8,4.2


### Loading Data into a Pandas DataFrame

When we are working with a long data set could be complicated to have a general vision of the DataFrame.
To load a document:

In [None]:
import pandas as pd

loading_file = pd.read.csv('/nombre_documento.csv')
print(type(loading_file))
print(loading_file.shape)

Ones we are load de file, probably we will see millions of values, we can take a look to firt five arrows of data using:

In [None]:
loading_file.head()

Or the five last arrows:

In [None]:
loading_file.tail()

Or the las eight:

In [None]:
loading_file.tail(8)

Or the first two rows:

In [None]:
loading_file.head(2)

Let´s do check to see whether we have any one values in this data set. To do so we will use "is not" method followed by the "any" method to check whether any of the columns conrain none values.

In [None]:
loadin_file.isnull().any()

Descriptive statictics on each column of the DataSet:

In [None]:
loadin_file.describe()
#we can use the desribe method in a single column like this:
loadin_file.['column_name'].describe()

In [None]:
loadin_file.max() #maximum value in each column

In [None]:
loadin_file.min() #minimun value in each column
loadin_file.['column_name'].min() #minimun value in a specific column

In [None]:
#DATA CORRELATION: to get the correlation between different columns

loadin_file.corr()