In [2]:
import pandas as pd
import numpy as np

## Pandas DataFrames

### Filtering

In [78]:
# Toy roster used by the filtering exercises: one row per person.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen'],
    Age=[24, 27, 22, 32, 29, 24, 26, 31],
    Score=[85, 92, 88, 75, 93, 67, 77, 91],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego'],
)

# Column order follows the insertion order of the mapping above.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Score,City
0,Alice,24,85,New York
1,Bob,27,92,Los Angeles
2,Charlie,22,88,Chicago
3,David,32,75,Houston
4,Eva,29,93,Phoenix
5,Frank,24,67,Philadelphia
6,Grace,26,77,San Antonio
7,Helen,31,91,San Diego


Exercise:

1. Filter the DataFrame to show only the rows where the Age is greater than 25 and the Score is greater than 80.

2. Filter the DataFrame to show only the rows where the Name is either 'Alice', 'Eva', or 'Grace'.

3. Filter the DataFrame to show only the rows where the City contains the letter 'o'.

4. Filter the DataFrame to show only the rows where Age is less than 30 and Score is between 70 and 90. Only display the Name and City columns.

### Combining multiple columns

Problem: You are given a DataFrame that contains the preferred travel destinations of two friends. Each row represents the travel preferences of a pair of friends—Friend A and Friend B.

Task 1: Construct a column that shows the destinations common to each pair of friends.

Task 2: Construct a column that shows the destinations that are unique to each friend.

In [79]:
# Each row is one pair of friends; every cell holds that friend's list of
# preferred destinations.
friends = pd.DataFrame(dict(
    Friend_A=[
        ['Paris', 'Tokyo', 'New York'],
        ['Rome', 'Barcelona', 'Sydney'],
        ['Istanbul', 'Dubai', 'New York'],
        ['London', 'Paris', 'Berlin'],
        ['Tokyo', 'Seoul', 'Bangkok'],
    ],
    Friend_B=[
        ['New York', 'London', 'Paris'],
        ['Sydney', 'Rome', 'Venice'],
        ['Dubai', 'New York', 'Istanbul'],
        ['Berlin', 'Moscow', 'Paris'],
        ['Bangkok', 'Seoul', 'Tokyo'],
    ],
))

friends

Unnamed: 0,Friend_A,Friend_B
0,"[Paris, Tokyo, New York]","[New York, London, Paris]"
1,"[Rome, Barcelona, Sydney]","[Sydney, Rome, Venice]"
2,"[Istanbul, Dubai, New York]","[Dubai, New York, Istanbul]"
3,"[London, Paris, Berlin]","[Berlin, Moscow, Paris]"
4,"[Tokyo, Seoul, Bangkok]","[Bangkok, Seoul, Tokyo]"


In [80]:
# Task 1: for every pair, keep Friend A's destinations (in Friend A's order)
# that also appear in Friend B's list.
friends["common"] = friends.apply(
    lambda pair: [place for place in pair["Friend_A"] if place in pair["Friend_B"]],
    axis=1,
)

In [81]:
### Solve task 2 here

In [82]:
friends

Unnamed: 0,Friend_A,Friend_B,common
0,"[Paris, Tokyo, New York]","[New York, London, Paris]","[Paris, New York]"
1,"[Rome, Barcelona, Sydney]","[Sydney, Rome, Venice]","[Rome, Sydney]"
2,"[Istanbul, Dubai, New York]","[Dubai, New York, Istanbul]","[Istanbul, Dubai, New York]"
3,"[London, Paris, Berlin]","[Berlin, Moscow, Paris]","[Paris, Berlin]"
4,"[Tokyo, Seoul, Bangkok]","[Bangkok, Seoul, Tokyo]","[Tokyo, Seoul, Bangkok]"


### Aggregating information 
**Problem:** Library Book Inventory Management
You manage a collection of libraries, and you want to evaluate the availability of books across different locations. Given a DataFrame containing columns for the library name, book title, and the number of copies available:

Task 1: Determine how many libraries carry each book.

Task 2: Determine how many of those libraries have run out of copies.

Task 3: Determine the total number of copies available for each book.

In [83]:
# One row per (library, book) listing with its current stock; a stock of 0
# means the library carries the title but has no copies left.
library_books = pd.DataFrame(dict(
    library=['Central', 'Central', 'North', 'North', 'South', 'South', 'East'],
    book=['Harry Potter', 'The Hobbit', 'The Hobbit', 'Dune', 'Dune', 'Harry Potter', 'The Catcher in the Rye'],
    copies_available=[3, 0, 5, 0, 2, 0, 4],
))

library_books

Unnamed: 0,library,book,copies_available
0,Central,Harry Potter,3
1,Central,The Hobbit,0
2,North,The Hobbit,5
3,North,Dune,0
4,South,Dune,2
5,South,Harry Potter,0
6,East,The Catcher in the Rye,4


In [84]:
# Task 1: number of library listings per book (count of non-null `library`
# entries within each `book` group).
library_books.groupby("book")[["library"]].count()

Unnamed: 0_level_0,library
book,Unnamed: 1_level_1
Dune,2
Harry Potter,2
The Catcher in the Rye,1
The Hobbit,2


In [85]:
def count_copies(x):
    """Return the sum of the ``copies_available`` column of ``x``.

    ``x`` is the sub-frame that ``groupby(...).apply`` passes in for one group.
    """
    copies = x["copies_available"]
    return copies.sum()

In [86]:
# Task 3: total copies per book, computed by applying count_copies to each group.
# include_groups=False is used to avoid some warning messages.
(
    library_books
    .groupby("book")
    .apply(count_copies, include_groups=False)
    .reset_index()
)

Unnamed: 0,book,0
0,Dune,2
1,Harry Potter,3
2,The Catcher in the Rye,4
3,The Hobbit,5


In [87]:
# Rewrite the previous solution with a lambda function

In [88]:
# Task 2: per book, count the listings whose stock is exactly zero.
(
    library_books
    .groupby("book")
    .apply(lambda rows: rows["copies_available"].eq(0).sum(), include_groups=False)
    .reset_index()
)

Unnamed: 0,book,0
0,Dune,1
1,Harry Potter,1
2,The Catcher in the Rye,0
3,The Hobbit,1


In [89]:
# Iterating over a GroupBy yields (key, sub-frame) pairs; print each pair
# as a tuple to show what `apply` receives for every book.
for book_title, book_rows in library_books.groupby("book"):
    print((book_title, book_rows))

('Dune',   library  book  copies_available
3   North  Dune                 0
4   South  Dune                 2)
('Harry Potter',    library          book  copies_available
0  Central  Harry Potter                 3
5    South  Harry Potter                 0)
('The Catcher in the Rye',   library                    book  copies_available
6    East  The Catcher in the Rye                 4)
('The Hobbit',    library        book  copies_available
1  Central  The Hobbit                 0
2    North  The Hobbit                 5)


### Solving Problems

**Problem:** Employee Work Hours Analysis

You have a DataFrame representing employee work hours. Each row corresponds to an employee and their reported work hours for different days of the week. However, some data entries are missing.

Given this DataFrame of employee work hours, you need to:

Task 1: Discard rows where more than half of the values are NaN.
Task 2: Impute the missing values (NaN):
> For numeric columns (hours), replace NaN with the average value for that column. <br>
> For non-numeric columns (department), replace NaN with the mode of that column.

In [90]:
# Reported hours per employee for three weekdays plus their department.
# np.nan marks a missing entry; `department` uses the nullable 'string' dtype.
employee_hours = pd.DataFrame(dict(
    employee_id=[101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    monday_hours=[8, np.nan, 6, 9, np.nan, 7, np.nan, 5, np.nan, 8],
    tuesday_hours=[np.nan, 7, np.nan, np.nan, 8, np.nan, 6, 7, 7, np.nan],
    wednesday_hours=[9, 8, 7, 8, np.nan, np.nan, 6, np.nan, np.nan, 8],
    department=pd.Series(['HR', 'Finance', 'IT', np.nan, 'Finance', 'HR', 'IT', 'Finance', np.nan, 'HR'], dtype='string'),
))

employee_hours

Unnamed: 0,employee_id,monday_hours,tuesday_hours,wednesday_hours,department
0,101,8.0,,9.0,HR
1,102,,7.0,8.0,Finance
2,103,6.0,,7.0,IT
3,104,9.0,,8.0,
4,105,,8.0,,Finance
5,106,7.0,,,HR
6,107,,6.0,6.0,IT
7,108,5.0,7.0,,Finance
8,109,,7.0,,
9,110,8.0,,8.0,HR


In [91]:
# Task 1:
# Compute the number of NAs in each row
# Create a mask (True/False) that is True if the number of NAs per row is < 3
# Use the mask to filter
## Your code here

In [92]:
# Task 2:
# Positional split of the columns: positions 1-3 are the per-day hour columns,
# the last position is the department label (position 0, employee_id, is skipped).
num_cols, cat_cols = employee_hours.columns[1:4], employee_hours.columns[-1:]
num_cols, cat_cols

# Write a function that can handle numerical columns and one that can handle categorical columns
# Apply the function to each col.
# Note that mode can give multiple answers.

(Index(['monday_hours', 'tuesday_hours', 'wednesday_hours'], dtype='object'),
 Index(['department'], dtype='object'))

In [94]:
def fill_na_num(df, col):
    """Exercise stub for Task 2 (numeric columns).

    Intended to fill the missing values of the numeric column ``col`` of
    ``df`` with that column's average, as described in the task statement.
    As written it performs no imputation and returns ``df`` unchanged.
    """
    # Your code here
    return df

**Problem:** Analyze Text Sentiment Predictions

You are given a DataFrame that represents sentiment predictions for different pieces of text. Each row corresponds to the predictions for a specific text, with the predictions stored in a dictionary where the keys represent the sentiment and the values represent the probability associated with each sentiment.

Your task is to: Create a new column called prob_positive that calculates the probability that each piece of text has a "positive" sentiment. Use the following list to identify the "positive" sentiments:

In [130]:
# Sentiment labels that count as "positive" when computing prob_positive.
positive_sentiments = ['happy', 'joyful', 'excited', 'content', 'pleased', 'satisfied']

In [131]:
# One row per text id (the index); the single `preds` column holds a
# {sentiment: probability} mapping for that text.
predictions = pd.DataFrame.from_dict(dict(
    preds={
        101: dict(happy=0.4, sad=0.3, neutral=0.3),
        102: dict(angry=0.6, content=0.2, fearful=0.2),
        103: dict(joyful=0.5, worried=0.25, anxious=0.25),
        104: dict(excited=0.7, tired=0.2, calm=0.1),
        105: dict(satisfied=0.35, frustrated=0.5, indifferent=0.15),
        106: dict(pleased=0.45, annoyed=0.3, bored=0.25),
        107: dict(disappointed=0.6, happy=0.3, surprised=0.1),
    },
))

predictions

Unnamed: 0,preds
101,"{'happy': 0.4, 'sad': 0.3, 'neutral': 0.3}"
102,"{'angry': 0.6, 'content': 0.2, 'fearful': 0.2}"
103,"{'joyful': 0.5, 'worried': 0.25, 'anxious': 0.25}"
104,"{'excited': 0.7, 'tired': 0.2, 'calm': 0.1}"
105,"{'satisfied': 0.35, 'frustrated': 0.5, 'indiff..."
106,"{'pleased': 0.45, 'annoyed': 0.3, 'bored': 0.25}"
107,"{'disappointed': 0.6, 'happy': 0.3, 'surprised..."


### Example of a non-trivial way of using groupby

In [19]:
# A Series of five ones: its cumulative sum (next cell) is simply 1, 2, ..., 5,
# which makes the behaviour of cumsum easy to see.
x = pd.Series([1]*5)
x

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [20]:
x.cumsum()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# Sample DataFrame with sales transactions.
# NOTE: this cell rebinds `data` and `df`, replacing the roster objects that
# were created under those names in the Filtering section.
data = dict(
    date=['2024-01-01', '2024-01-05', '2024-01-07', '2024-01-03', '2024-01-06', '2024-01-08', '2024-01-02', '2024-01-04'],
    region=['North', 'North', 'North', 'South', 'South', 'South', 'East', 'East'],
    sales=[200, 150, 300, 400, 250, 100, 500, 300],
)

# Parse the date strings into timestamps so that sorting by date is chronological.
df = pd.DataFrame(data).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Define a custom function to compute cumulative sales
def compute_cumulative_sales(group):
    """Return the group's rows ordered by date with a running sales total.

    The input frame is left untouched: sorting produces a new frame, and the
    ``cumulative_sales`` column is added to that sorted copy.
    """
    # Order chronologically first so the running total follows the timeline.
    ordered = group.sort_values(by='date')
    return ordered.assign(cumulative_sales=ordered['sales'].cumsum())

# Run compute_cumulative_sales once per region. The result is indexed by
# (region, original row label), so reset_index() turns both levels into columns.
df_cumulative_sales = (
    df
    .groupby('region')
    .apply(compute_cumulative_sales, include_groups=False)
)

df_cumulative_sales.reset_index()


Unnamed: 0,region,level_1,date,sales,cumulative_sales
0,East,6,2024-01-02,500,500
1,East,7,2024-01-04,300,800
2,North,0,2024-01-01,200,200
3,North,1,2024-01-05,150,350
4,North,2,2024-01-07,300,650
5,South,3,2024-01-03,400,400
6,South,4,2024-01-06,250,650
7,South,5,2024-01-08,100,750
