# Intro To Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Three ages held in a NumPy array, then wrapped in a Series whose
# index labels each value with a person's name.
ages = np.array([13, 25, 19])
series1 = pd.Series(data=ages, index=['Emma', 'Swetha', 'Serajh'])
print(series1)

Emma      13
Swetha    25
Serajh    19
dtype: int64


Create a DataFrame

In [3]:
# One inner list per person; the column labels are supplied separately.
# With no index given, pandas numbers the rows 0, 1, 2.
records = [
    ['John Smith', '123 Main St', 34],
    ['Jane Doe', '456 Maple Ave', 28],
    ['Joe Schmo', '789 Broadway', 51],
]
df = pd.DataFrame(data=records, columns=['Name', 'Address', 'Age'])
print(df)

         Name        Address  Age
0  John Smith    123 Main St   34
1    Jane Doe  456 Maple Ave   28
2   Joe Schmo   789 Broadway   51


Set Index

In [4]:
# Build the people table, then promote the 'Name' column to be the row
# index so rows can be looked up by name.
people = pd.DataFrame(
    data=[
        ['John Smith', '123 Main St', 34],
        ['Jane Doe', '456 Maple Ave', 28],
        ['Joe Schmo', '789 Broadway', 51],
    ],
    columns=['Name', 'Address', 'Age'],
)
df = people.set_index('Name')
print(df)

                  Address  Age
Name                          
John Smith    123 Main St   34
Jane Doe    456 Maple Ave   28
Joe Schmo    789 Broadway   51


New Dataframe

In [5]:
# Book catalogue: one [title, author, genre] row per book.
book_rows = [
    ['To Kill a Mockingbird', 'Harper Lee', 'Fiction'],
    ['1984', 'George Orwell', 'Dystopian'],
    ['Moby Dick', 'Herman Melville', 'Adventure'],
    ['Pride and Prejudice', 'Jane Austen', 'Romance'],
    ['The Great Gatsby', 'F. Scott Fitzgerald', 'Fiction'],
    ['Brave New World', 'Aldous Huxley', 'Dystopian'],
]
# Explicit string row labels (rather than the default 0..5) make the
# difference between label-based .loc and position-based .iloc visible
# when rows are selected.
book_labels = ['book1', 'book2', 'book3', 'book4', 'book5', 'book6']
library = pd.DataFrame(
    data=book_rows,
    index=book_labels,
    columns=['title', 'author', 'genre'],
)
print("Original DataFrame:")
print(library)

Original DataFrame:
                       title               author      genre
book1  To Kill a Mockingbird           Harper Lee    Fiction
book2                   1984        George Orwell  Dystopian
book3              Moby Dick      Herman Melville  Adventure
book4    Pride and Prejudice          Jane Austen    Romance
book5       The Great Gatsby  F. Scott Fitzgerald    Fiction
book6        Brave New World        Aldous Huxley  Dystopian


# Selecting Columns

### Selecting One Column

In [6]:
# Bracket notation: indexing the DataFrame with a single column label returns
# that column as a Series, which keeps the row labels (book1..book6).
title_column = library['title']
print("\nTitle Column using bracket notation:")
print(title_column)


Title Column using bracket notation:
book1    To Kill a Mockingbird
book2                     1984
book3                Moby Dick
book4      Pride and Prejudice
book5         The Great Gatsby
book6          Brave New World
Name: title, dtype: object


In [7]:
# Dot (attribute) notation selects the same 'title' column as library['title'].
# NOTE: attribute access only works when the column name is a valid Python
# identifier and does not clash with an existing DataFrame attribute or method;
# bracket notation works for any column name.
title_column = library.title
print("\nTitle Column using dot notation:")
print(title_column)


Title Column using dot notation:
book1    To Kill a Mockingbird
book2                     1984
book3                Moby Dick
book4      Pride and Prejudice
book5         The Great Gatsby
book6          Brave New World
Name: title, dtype: object


### Selecting Multiple Columns

In [8]:
# Passing a list of labels inside the brackets (hence the double brackets)
# selects several columns at once; the result is a DataFrame, not a Series.
selected_columns = library[['title', 'author']]
print("\nSelected Columns (title and author) using bracket notation with a list:")
print(selected_columns)



Selected Columns (title and author) using bracket notation with a list:
                       title               author
book1  To Kill a Mockingbird           Harper Lee
book2                   1984        George Orwell
book3              Moby Dick      Herman Melville
book4    Pride and Prejudice          Jane Austen
book5       The Great Gatsby  F. Scott Fitzgerald
book6        Brave New World        Aldous Huxley


# Selecting Rows

### Selecting One Row

In [9]:
# .loc selects by label: a single row label ('book3') returns that row as a
# Series whose index is the column names.
row_by_label = library.loc['book3']
print("\nRow with label 'book3' using loc:")
print(row_by_label)




Row with label 'book3' using loc:
title           Moby Dick
author    Herman Melville
genre           Adventure
Name: book3, dtype: object


In [10]:
# .iloc selects by zero-based integer position: position 2 is the third row,
# which is the row labelled 'book3'.
row_by_position = library.iloc[2]
print("\nRow with integer position 2 using iloc:")
print(row_by_position)



Row with integer position 2 using iloc:
title           Moby Dick
author    Herman Melville
genre           Adventure
Name: book3, dtype: object


### Selecting Multiple Rows

In [11]:
# A list of row labels with .loc returns those rows as a DataFrame
# ('book2', 'book4', and 'book6' here), in the order listed.
rows_by_label = library.loc[['book2', 'book4', 'book6']]
print("\nRows with labels 'book2', 'book4', and 'book6' using loc:")
print(rows_by_label)


Rows with labels 'book2', 'book4', and 'book6' using loc:
                     title         author      genre
book2                 1984  George Orwell  Dystopian
book4  Pride and Prejudice    Jane Austen    Romance
book6      Brave New World  Aldous Huxley  Dystopian


In [12]:
# Slice rows by label with .loc. Unlike ordinary Python slices, a label slice
# includes BOTH endpoints, so 'book5' is part of the result.
rows_by_label_range = library.loc['book2':'book5']
print("\nRows from 'book2' to 'book5' using loc:")
print(rows_by_label_range)



Rows from 'book2' to 'book5' using loc:
                     title               author      genre
book2                 1984        George Orwell  Dystopian
book3            Moby Dick      Herman Melville  Adventure
book4  Pride and Prejudice          Jane Austen    Romance
book5     The Great Gatsby  F. Scott Fitzgerald    Fiction


In [13]:
# A list of integer positions with .iloc: zero-based positions 1, 3, and 5
# are the second, fourth, and sixth rows.
rows_by_position = library.iloc[[1, 3, 5]]
print("\nRows with integer positions 1, 3, and 5 using iloc:")
print(rows_by_position)



Rows with integer positions 1, 3, and 5 using iloc:
                     title         author      genre
book2                 1984  George Orwell  Dystopian
book4  Pride and Prejudice    Jane Austen    Romance
book6      Brave New World  Aldous Huxley  Dystopian


In [14]:
# Slice rows by integer position with .iloc. Positional slices follow normal
# Python rules: the stop value is excluded, so 1:5 yields positions 1 through 4.
rows_by_position_range = library.iloc[1:5]
print("\nRows from integer position 1 to 4 using iloc:")
print(rows_by_position_range)



Rows from integer position 1 to 4 using iloc:
                     title               author      genre
book2                 1984        George Orwell  Dystopian
book3            Moby Dick      Herman Melville  Adventure
book4  Pride and Prejudice          Jane Austen    Romance
book5     The Great Gatsby  F. Scott Fitzgerald    Fiction


# Selecting Rows and Columns

### Selecting a Row and a Column

In [15]:
# .loc[row_label, column_label] returns the single value at that intersection;
# the row selector always comes first, then the column selector.
value_loc = library.loc['book3', 'author']
print("\nAuthor of book3 using loc:")
print(value_loc)



Author of book3 using loc:
Herman Melville


In [16]:
# .iloc[row_position, column_position], both zero-based: row 2 is 'book3' and
# column 1 is 'author', so this reads the same cell as .loc['book3', 'author'].
value_iloc = library.iloc[2, 1]
print("\nAuthor of book3 using iloc:")
print(value_iloc)



Author of book3 using iloc:
Herman Melville


### Multiple Rows and Multiple Columns

In [17]:
# Lists of labels work on both axes of .loc: rows 'book2', 'book4', 'book6'
# crossed with columns 'title' and 'author' give a 3-row, 2-column DataFrame.
subset_loc = library.loc[['book2', 'book4', 'book6'], ['title', 'author']]
print("\nSubset using loc with labels:")
print(subset_loc)



Subset using loc with labels:
                     title         author
book2                 1984  George Orwell
book4  Pride and Prejudice    Jane Austen
book6      Brave New World  Aldous Huxley


In [18]:
# Lists of integer positions work on both axes of .iloc: row positions 1, 3, 5
# and column positions 0, 1 (the 'title' and 'author' columns).
subset_iloc = library.iloc[[1, 3, 5], [0, 1]]
print("\nSubset using iloc with integer positions:")
print(subset_iloc)



Subset using iloc with integer positions:
                     title         author
book2                 1984  George Orwell
book4  Pride and Prejudice    Jane Austen
book6      Brave New World  Aldous Huxley


# Adding and Modifying Columns

### Adding a New Column

In [19]:
# Assigning a list to a new column label adds that column to the DataFrame
# in place. The list must supply one value per row (6 here), in row order.
library['publication_year'] = [1960, 1949, 1851, 1813, 1925, 1932]
print("\nDataFrame after adding a new column 'publication_year':")
print(library)



DataFrame after adding a new column 'publication_year':
                       title               author      genre  publication_year
book1  To Kill a Mockingbird           Harper Lee    Fiction              1960
book2                   1984        George Orwell  Dystopian              1949
book3              Moby Dick      Herman Melville  Adventure              1851
book4    Pride and Prejudice          Jane Austen    Romance              1813
book5       The Great Gatsby  F. Scott Fitzgerald    Fiction              1925
book6        Brave New World        Aldous Huxley  Dystopian              1932


### Modifying an Existing Column


In [20]:
# Overwrite an existing column with a value computed from itself: the + 10 is
# applied element-wise to every row.
# NOTE: this cell is not idempotent - every re-run shifts the years by another
# 10. The shifted values exist only to demonstrate column modification and are
# no longer the books' real publication years.
library['publication_year'] = library['publication_year'] + 10
print("\nDataFrame after modifying 'publication_year':")
print(library)


DataFrame after modifying 'publication_year':
                       title               author      genre  publication_year
book1  To Kill a Mockingbird           Harper Lee    Fiction              1970
book2                   1984        George Orwell  Dystopian              1959
book3              Moby Dick      Herman Melville  Adventure              1861
book4    Pride and Prejudice          Jane Austen    Romance              1823
book5       The Great Gatsby  F. Scott Fitzgerald    Fiction              1935
book6        Brave New World        Aldous Huxley  Dystopian              1942


# Filtering Data

### Filtering Rows Based on Column Values


In [21]:
# Boolean-mask filtering: the comparison produces a True/False Series, and
# indexing the DataFrame with it keeps only the rows marked True.
# NOTE: 'publication_year' was shifted by +10 in the column-modification step,
# so the 1950 threshold is compared against the shifted years.
books_after_1950 = library[library['publication_year'] > 1950]
print("\nBooks published after 1950:")
print(books_after_1950)


Books published after 1950:
                       title         author      genre  publication_year
book1  To Kill a Mockingbird     Harper Lee    Fiction              1970
book2                   1984  George Orwell  Dystopian              1959


### Filtering with Multiple Conditions


In [22]:
# Combine two boolean masks with | (element-wise OR): keep books whose genre is
# either 'Fiction' or 'Dystopian'. Each comparison needs its own parentheses
# because | binds more tightly than == in Python.
fiction_dystopian_books = library[(library['genre'] == 'Fiction') | (library['genre'] == 'Dystopian')]
print(fiction_dystopian_books)

                       title               author      genre  publication_year
book1  To Kill a Mockingbird           Harper Lee    Fiction              1970
book2                   1984        George Orwell  Dystopian              1959
book5       The Great Gatsby  F. Scott Fitzgerald    Fiction              1935
book6        Brave New World        Aldous Huxley  Dystopian              1942


# Aggregating Data

### Grouping and Aggregating

In [23]:
# Group the rows by 'genre'; .size() returns the number of rows in each group
# as a Series indexed by genre.
genre_counts = library.groupby('genre').size()
print("\nCount of books in each genre:")
print(genre_counts)


Count of books in each genre:
genre
Adventure    1
Dystopian    2
Fiction      2
Romance      1
dtype: int64


### Aggregating with Multiple Functions


In [24]:
# Select 'publication_year' within each genre group, then pass a list of
# function names to .agg: each is computed per group, giving a DataFrame with
# one column per function ('mean' and 'max').
publication_stats = library.groupby('genre')['publication_year'].agg(['mean', 'max'])
print("\nMean and maximum publication year for each genre:")
print(publication_stats)



Mean and maximum publication year for each genre:
             mean   max
genre                  
Adventure  1861.0  1861
Dystopian  1950.5  1959
Fiction    1952.5  1970
Romance    1823.0  1823


# Lambda Functions

### Basic Syntax

In [25]:
# Syntax reference only. The bare string is an expression, so it is evaluated
# and echoed back as this cell's output; nothing is defined here.
''' 
lambda arguments: expression
'''

' \nlambda arguments: expression\n'

In [26]:
# A lambda is an anonymous, single-expression function. Binding it to a name
# (add_ten) lets it be called like a regular function; this one adds 10.
add_ten = lambda x: x + 10
print(add_ten(5))  # Output: 15


15


### Using the Apply Method with Lambda Functions

In [27]:
# Add a new column 'is_classic' derived from 'publication_year'.
# .apply calls the lambda once per element: x is each value in
# library['publication_year'], mapped to 'Yes' when below 1950 and 'No' otherwise.
library['is_classic'] = library['publication_year'].apply(lambda x: 'Yes' if x < 1950 else 'No')
print(library)


                       title               author      genre  \
book1  To Kill a Mockingbird           Harper Lee    Fiction   
book2                   1984        George Orwell  Dystopian   
book3              Moby Dick      Herman Melville  Adventure   
book4    Pride and Prejudice          Jane Austen    Romance   
book5       The Great Gatsby  F. Scott Fitzgerald    Fiction   
book6        Brave New World        Aldous Huxley  Dystopian   

       publication_year is_classic  
book1              1970         No  
book2              1959         No  
book3              1861        Yes  
book4              1823        Yes  
book5              1935        Yes  
book6              1942        Yes  


Example 1: Using a lambda function

In [28]:
# Sample Series (default integer index 0..4)
s = pd.Series([1, 2, 3, 4, 5])

# Series.apply calls the lambda once per element and returns a new Series;
# here each value is squared.
squared = s.apply(lambda x: x ** 2)
print(squared)


0     1
1     4
2     9
3    16
4    25
dtype: int64


Example 2: Using a named function

In [29]:
def add_five(x):
    """Return ``x`` plus 5."""
    return x + 5

# Series.apply accepts any callable, so a named function can be passed in
# place of a lambda; it is called once per element of s.
added_five = s.apply(add_five)
print(added_five)

0     6
1     7
2     8
3     9
4    10
dtype: int64


Example 3: Applying a function to each column


In [30]:
# Sample DataFrame with two numeric columns.
# NOTE: this rebinds `df`, replacing the people table assigned to `df` earlier
# in the notebook.
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})
print(f'Dataframe\n\n{df}\n\n')
# DataFrame.apply works column by column by default (axis=0): the lambda
# receives each column as a Series, so the result holds one sum per column.
column_sum = df.apply(lambda x: x.sum())
print(f'Column Sum\n\n{column_sum}\n\n')


Dataframe

   A  B
0  1  4
1  2  5
2  3  6


Column Sum

A     6
B    15
dtype: int64




Example 4: Applying a function to each row


In [31]:
# axis=1 makes DataFrame.apply pass each row (as a Series) to the lambda,
# so the result holds one sum per row, indexed by the row labels.
row_sum = df.apply(lambda x: x.sum(), axis=1)
print(row_sum)


0    5
1    7
2    9
dtype: int64
