In [1]:
import pandas as pd

In [117]:
# Sample people, one (name, age, city) record per row.
people = [
    ('Alice', 25, 'New York'),
    ('Bob', 30, 'Los Angeles'),
    ('Charlie', 35, 'Chicago'),
    ('David', 40, 'Houston'),
    ('Eva', 22, 'Phoenix'),
]

# Column-oriented view of the same records, kept under the name `data`.
data = {
    column: [person[position] for person in people]
    for position, column in enumerate(['Name', 'Age', 'City'])
}
df = pd.DataFrame(data)

# Show the starting table that the following cells work on.
print("Sample DataFrame:", df, sep="\n")

Sample DataFrame:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
4      Eva   22      Phoenix


## Accessing Data

In [118]:
# Two lookups of Bob's city that print the same result.
# First form: filter the rows with a boolean mask, then select the column
# (two separate indexing steps).
print(df[df['Name'] == 'Bob']['City'])

# Second form: give .loc the row mask and the column label in a single step.
print(df.loc[df['Name'] == 'Bob', 'City'])

1    Los Angeles
Name: City, dtype: object
1    Los Angeles
Name: City, dtype: object


In [119]:
# Single-cell access: .at takes the row label and column label,
# .iat takes integer positions (row 3, column 2 is 'City' here).
print(df.at[3, 'City'])
print(df.iat[3, 2])

Houston
Houston


In [120]:
# Combine two boolean masks with & (each condition in its own parentheses)
# to keep only rows where Age is below 30 and City is 'New York'.
print(df[(df['Age'] < 30) & (df['City'] == 'New York')])

    Name  Age      City
0  Alice   25  New York


## Adding New Row

In [121]:
# Wrap the new record in a one-row DataFrame that uses the same column names as df.
new_row = pd.DataFrame({'Name':['Eric'], 'Age':[34], 'City':['Pittsburgh']})

# Stack it under df along the row axis; ignore_index=True renumbers the result's index.
df = pd.concat([df, new_row], axis=0, ignore_index=True)

print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
4      Eva   22      Phoenix
5     Eric   34   Pittsburgh


In [122]:
# Append a row by assigning a dict to a row label that does not exist yet.
# df.shape[0] (the current row count) is used as that label.
# NOTE(review): presumably only safe while the index is the default 0..n-1 range;
# on a filtered frame the label could already exist and the row would be overwritten.
another_row = {'Name':'Richard', 'Age':31, 'City':'Cleveland'}
df.loc[df.shape[0]] = another_row
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
4      Eva   22      Phoenix
5     Eric   34   Pittsburgh
6  Richard   31    Cleveland


## Adding New Column

In [123]:
# New column counting down from the number of rows to 1 (first row gets the largest value).
df['Bet'] = list(range(len(df), 0, -1))

print(df)

      Name  Age         City  Bet
0    Alice   25     New York    7
1      Bob   30  Los Angeles    6
2  Charlie   35      Chicago    5
3    David   40      Houston    4
4      Eva   22      Phoenix    3
5     Eric   34   Pittsburgh    2
6  Richard   31    Cleveland    1


## Remove (Drop) Columns and Rows

In [124]:
# drop(columns=...) returns a new frame without 'Bet'; reassign to keep the result.
df = df.drop(columns=['Bet'])
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
4      Eva   22      Phoenix
5     Eric   34   Pittsburgh
6  Richard   31    Cleveland


In [125]:
# drop(index=...) returns a new DataFrame without the row labelled 5.
# The result is only displayed here, not assigned, so df itself is unchanged.
df.drop(index=5)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
4      Eva   22      Phoenix
6  Richard   31    Cleveland


In [126]:
# Collect the index labels of everyone older than 30, then remove those rows.
indices_to_drop = df.index[df['Age'] > 30]
df = df.drop(index=indices_to_drop)
print(df)

    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
4    Eva   22      Phoenix


In [127]:
# Renumber the remaining rows from 0; drop=True discards the old index
# instead of keeping it as an extra column.
df = df.reset_index(drop=True)
print(df)

    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
2    Eva   22      Phoenix


## Modify Data

In [128]:
# The boolean mask selects Eva's row and 'City' selects the column;
# assigning through .loc updates df itself.
df.loc[df['Name'] == 'Eva', 'City'] = 'Saint Paul'
print(df)

    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
2    Eva   22   Saint Paul


## Practice
Write a function add_person that takes a DataFrame, a person's name, age, and city, and returns a new DataFrame with the person's details added as a new row. Ensure that your function uses pd.concat() for adding the new row. Also, after adding the new row, sort the DataFrame by the 'Age' column in ascending order and reset the index.

In [129]:
def add_person(df, name, age, city):
    """Return a new DataFrame with one extra person, ordered by age.

    Args:
        df: Frame with 'Name', 'Age' and 'City' columns. It is not modified.
        name: Value for the new row's 'Name' column.
        age: Value for the new row's 'Age' column.
        city: Value for the new row's 'City' column.

    Returns:
        A frame holding the rows of ``df`` plus the new person, sorted by
        'Age' in ascending order and re-indexed from 0.
    """
    extra = pd.DataFrame({'Name': [name], 'Age': [age], 'City': [city]})
    combined = pd.concat([df, extra], axis=0, ignore_index=True)
    by_age = combined.sort_values(by=['Age'], ascending=True)
    return by_age.reset_index(drop=True)

# Pena (34) is the oldest, so the new row ends up last after sorting by Age.
print(add_person(df, 'Pena', 34, 'Pittsburgh'))

    Name  Age         City
0    Eva   22   Saint Paul
1  Alice   25     New York
2    Bob   30  Los Angeles
3   Pena   34   Pittsburgh


In [130]:
# The previous call's result was only printed, so Pena is absent here;
# Carlos (20) is the youngest and sorts first.
print(add_person(df, 'Carlos', 20, 'Pittsburgh')) 

     Name  Age         City
0  Carlos   20   Pittsburgh
1     Eva   22   Saint Paul
2   Alice   25     New York
3     Bob   30  Los Angeles


## Practice
Write a function filter_by_city_and_age that takes a DataFrame, a city name, and an age limit, and returns a new DataFrame containing only the rows where the 'City' matches the given city name and the 'Age' is greater than the given age limit. The resulting DataFrame should only include the 'Name' and 'Age' columns.

In [131]:
def filter_by_city_and_age(df, city, age_lim):
    """Select the names and ages of people in ``city`` older than ``age_lim``.

    Args:
        df: Frame with 'Name', 'Age' and 'City' columns.
        city: City value that a row's 'City' must equal.
        age_lim: Exclusive lower bound on 'Age'.

    Returns:
        A frame with only the 'Name' and 'Age' columns of the matching rows,
        re-indexed from 0.
    """
    in_city = df['City'] == city
    over_limit = df['Age'] > age_lim
    selected = df.loc[in_city & over_limit, ['Name', 'Age']]
    return selected.reset_index(drop=True)

In [132]:
# Sample frame with three people in Chicago to exercise the filter.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 22],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Chicago', 'Chicago']
}
df2 = pd.DataFrame(data)

# Eva lives in Chicago too but is 22, so only Charlie and David pass the age limit of 30.
print(filter_by_city_and_age(df2, 'Chicago', 30))

      Name  Age
0  Charlie   35
1    David   40


## Practice
Write a function compute_statistics that takes a DataFrame and a city name, and returns a dictionary containing the following statistics for the people in the specified city:

1. The average age.
2. The average salary.
3. The name(s) of the person(s) with the highest salary in that city.

In [133]:
def compute_statistics(df, city) -> dict:
    """Summarise age and salary for the people living in ``city``.

    Args:
        df: Frame with 'Name', 'Age', 'City' and 'Salary' columns.
        city: City value that a row's 'City' must equal.

    Returns:
        A dict with three keys:
        'average_age' and 'average_salary' are the means over the city's rows
        (NaN when no row matches); 'top_earners' is the list of names whose
        salary equals the highest salary *within that city* (empty when no
        row matches).
    """
    in_city = df['City'] == city
    city_salaries = df.loc[in_city, 'Salary']

    # Compare against the city's own maximum. Using the maximum of the whole
    # frame would return no top earner for any city that does not contain the
    # overall best-paid person.
    is_top_earner = in_city & (df['Salary'] == city_salaries.max())

    return {'average_age' : df.loc[in_city, 'Age'].mean(),
            'average_salary' : city_salaries.mean(),
            'top_earners' : list(df.loc[is_top_earner, 'Name'])}

Could also be written as:

In [134]:
def compute_statistics(df, city):
    """Summarise age and salary for the people living in ``city``.

    Args:
        df: Frame with 'Name', 'Age', 'City' and 'Salary' columns.
        city: City value that a row's 'City' must equal.

    Returns:
        A dict with 'average_age', 'average_salary' (means over the city's
        rows) and 'top_earners' (names whose salary equals the city's highest
        salary).
    """
    # Work only with the rows of the requested city from here on.
    residents = df.loc[df['City'] == city]
    salaries = residents['Salary']

    # Everyone tied at the city's highest salary counts as a top earner.
    best_paid = residents.loc[salaries == salaries.max(), 'Name']

    return {
        'average_age': residents['Age'].mean(),
        'average_salary': salaries.mean(),
        'top_earners': best_paid.tolist(),
    }

In [135]:
# Sample frame with a Salary column; Charlie and David share the highest Chicago salary.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 22],
    'City': ['Chicago', 'Chicago', 'Chicago', 'Chicago', 'Phoenix'],
    'Salary': [70000, 80000, 120000, 120000, 85000]
}

df3 = pd.DataFrame(data)

# Statistics over the four Chicago rows only.
print(compute_statistics(df3, 'Chicago'))

{'average_age': 32.5, 'average_salary': 97500.0, 'top_earners': ['Charlie', 'David']}


## Practice
Write a function department_salary_summary that takes a DataFrame and returns a new DataFrame containing the following information for each department:

1. The total number of employees in the department.
2. The average salary in the department.
3. The minimum and maximum salaries in the department.

The resulting DataFrame should have the department names as the index and the columns named Total_Employees, Average_Salary, Min_Salary, and Max_Salary.

In [159]:
def department_salary_summary(df) -> pd.DataFrame:
    """Summarise head count and salary range per department.

    Args:
        df: Frame with 'Department', 'Name' and 'Salary' columns.

    Returns:
        One row per department with the columns 'Department',
        'Total_Employees', 'Average_Salary', 'Min_Salary' and 'Max_Salary'.
        Because of the final ``reset_index()`` the department is returned as a
        regular column; call ``.set_index('Department')`` on the result if the
        department names are wanted as the index.
    """
    return (
        df.groupby(by=['Department'])
        .agg(
            # 'size' counts rows. 'count' would skip employees whose Name is
            # missing and under-report the head count.
            Total_Employees=('Name', 'size'),
            Average_Salary=('Salary', 'mean'),
            Min_Salary=('Salary', 'min'),
            Max_Salary=('Salary', 'max'),
        )
        .reset_index()
    )

In [160]:
# Sample frame with ten employees split evenly between 'HR' and 'Engineering'.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, 30, 35, 40, 22, 28, 33, 26, 24, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'San Francisco', 'New York', 'Chicago', 'Houston', 'Los Angeles'],
    'Salary': [70000, 80000, 120000, 90000, 85000, 95000, 75000, 105000, 62000, 70000],
    'Department': ['HR', 'Engineering', 'Engineering', 'HR', 'HR', 'Engineering', 'HR', 'Engineering', 'HR', 'Engineering']
}

df4 = pd.DataFrame(data)

# One summary row per department.
print(department_salary_summary(df4))

    Department  Total_Employees  Average_Salary  Min_Salary  Max_Salary
0  Engineering                5         94000.0       70000      120000
1           HR                5         76400.0       62000       90000


## Practice
Write a function tenure_and_salary_increase that takes a DataFrame and performs the following operations:

1. Calculate the tenure (in years) of each employee based on the JoiningDate column.
2. Create a new column Tenure in the DataFrame to store the calculated tenure.
3. Classify employees into three categories based on their tenure: "Junior" (<2 years), "Mid-level" (2-5 years), and "Senior" (>5 years).
4. For "Senior" employees, increase their salary by 10% and update the Salary column accordingly.
5. Return the updated DataFrame.

In [169]:
def tenure_and_salary_increase(df, current_date=None) -> pd.DataFrame:
    """Add tenure and seniority level, and raise senior salaries by 10%.

    The input frame is left untouched; all changes are made on a copy, so
    running the function again on the same frame does not compound the raise.

    Args:
        df: Frame with a datetime 'JoiningDate' column and a numeric 'Salary'
            column.
        current_date: Reference date used to measure tenure. Anything
            ``pd.to_datetime`` accepts; defaults to today.

    Returns:
        A copy of ``df`` with two extra columns and an updated 'Salary':
        'Tenure' is the number of whole days since joining divided by 365;
        'Level' is 'Junior' (tenure < 2), 'Mid-level' (2 to 5 inclusive),
        'Senior' (> 5) or None when the tenure is missing;
        'Salary' is converted to float and multiplied by 1.1 for 'Senior' rows.
    """
    current_date = pd.to_datetime('today' if current_date is None else current_date)

    def category(y):
        # A missing joining date yields NaN, which fails every comparison and
        # would otherwise fall through to 'Senior'.
        if pd.isna(y):
            return None
        if y < 2:
            return 'Junior'
        elif y <= 5:
            return 'Mid-level'
        else:
            return 'Senior'

    result = df.copy()
    result['Tenure'] = (current_date - result['JoiningDate']).dt.days / 365
    result['Level'] = result['Tenure'].apply(category)

    # Convert up front: writing fractional values into an integer column
    # through .loc relies on implicit upcasting, which pandas has deprecated.
    result['Salary'] = result['Salary'].astype(float)
    result.loc[result['Level'] == 'Senior', 'Salary'] *= 1.1

    return result

In [178]:
# Sample employee frame; JoiningDate starts out as ISO-formatted strings.
data = {
    'EmployeeID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, 30, 35, 40, 22, 28, 33, 26, 24, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'San Francisco', 'New York', 'Chicago', 'Houston', 'Los Angeles'],
    'Salary': [70000, 80000, 120000, 90000, 85000, 95000, 75000, 105000, 62000, 70000],
    'Department': ['HR', 'Engineering', 'Engineering', 'HR', 'HR', 'Engineering', 'HR', 'Engineering', 'HR', 'Engineering'],
    'JoiningDate': ['2020-01-15', '2019-02-20', '2018-03-30', '2017-04-25', '2021-05-10', '2016-06-15', '2015-07-20', '2014-08-25', '2013-09-30', '2022-10-05']
}

df5 = pd.DataFrame(data)
# Parse the strings into datetimes so they can be subtracted from the current date.
df5['JoiningDate'] = pd.to_datetime(df5['JoiningDate'])

print(tenure_and_salary_increase(df5))

   EmployeeID     Name  Age           City    Salary   Department JoiningDate  \
0           1    Alice   25       New York   70000.0           HR  2020-01-15   
1           2      Bob   30    Los Angeles   88000.0  Engineering  2019-02-20   
2           3  Charlie   35        Chicago  132000.0  Engineering  2018-03-30   
3           4    David   40        Houston   99000.0           HR  2017-04-25   
4           5      Eva   22        Phoenix   85000.0           HR  2021-05-10   
5           6    Frank   28  San Francisco  104500.0  Engineering  2016-06-15   
6           7    Grace   33       New York   82500.0           HR  2015-07-20   
7           8   Hannah   26        Chicago  115500.0  Engineering  2014-08-25   
8           9      Ivy   24        Houston   68200.0           HR  2013-09-30   
9          10     Jack   29    Los Angeles   70000.0  Engineering  2022-10-05   

      Tenure      Level  
0   4.416438  Mid-level  
1   5.317808     Senior  
2   6.213699     Senior  
3   

Another way to write this that may be a bit cleaner is:

In [187]:
# Rebuild the sample employee frame from scratch for the second implementation.
# (David's joining date is 2022-06-15 in this copy.)
data = {
    'EmployeeID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ivy', 'Jack'],
    'Age': [25, 30, 35, 40, 22, 28, 33, 26, 24, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'San Francisco', 'New York', 'Chicago', 'Houston', 'Los Angeles'],
    'Salary': [70000, 80000, 120000, 90000, 85000, 95000, 75000, 105000, 62000, 70000],
    'Department': ['HR', 'Engineering', 'Engineering', 'HR', 'HR', 'Engineering', 'HR', 'Engineering', 'HR', 'Engineering'],
    'JoiningDate': ['2020-01-15', '2019-02-20', '2018-03-30', '2022-06-15', '2021-05-10', '2016-06-15', '2015-07-20', '2014-08-25', '2013-09-30', '2022-10-05']
}

df5 = pd.DataFrame(data)
# Parse the strings into datetimes so they can be subtracted from the current date.
df5['JoiningDate'] = pd.to_datetime(df5['JoiningDate'])

def tenure_and_salary_increase(df, current_date=None):
    """Add tenure and seniority level, and raise senior salaries by 10%.

    The input frame is left untouched; all changes are made on a copy, so
    running the function again on the same frame does not compound the raise.

    Args:
        df: Frame with a datetime 'JoiningDate' column and a numeric 'Salary'
            column.
        current_date: Reference date used to measure tenure. Anything
            ``pd.to_datetime`` accepts; defaults to today.

    Returns:
        A copy of ``df`` with two extra columns and an updated 'Salary':
        'Tenure' is the number of whole days since joining divided by 365;
        'Level' is a categorical with 'Junior' (tenure < 2), 'Mid-level'
        (2 to 5 inclusive) and 'Senior' (> 5), missing when tenure is missing;
        'Salary' is converted to float and multiplied by 1.10 for 'Senior' rows.
    """
    current_date = pd.to_datetime('today' if current_date is None else current_date)

    result = df.copy()

    # Calculate the tenure in years from the whole days employed
    days_employed = (current_date - result['JoiningDate']).dt.days
    result['Tenure'] = days_employed / 365

    # Classify employees based on tenure. pd.cut intervals are closed on the
    # right, so cutting the year values at 2 would put a tenure of exactly
    # 2 years into 'Junior'. Cutting the whole-day counts at 729 and 1825
    # gives the intended "< 2", "2-5", "> 5" year boundaries.
    result['Level'] = pd.cut(
        days_employed,
        bins=[-float('inf'), 2 * 365 - 1, 5 * 365, float('inf')],
        labels=['Junior', 'Mid-level', 'Senior'],
    )

    # Increase the salary of "Senior" employees by 10%. Convert to float first:
    # writing fractional values into an integer column through .loc relies on
    # implicit upcasting, which pandas has deprecated.
    result['Salary'] = result['Salary'].astype(float)
    result.loc[result['Level'] == 'Senior', 'Salary'] *= 1.10

    return result

# Example usage: keep the returned frame under its own name and print it.
updated_df = tenure_and_salary_increase(df5)
print(updated_df)

   EmployeeID     Name  Age           City    Salary   Department JoiningDate  \
0           1    Alice   25       New York   70000.0           HR  2020-01-15   
1           2      Bob   30    Los Angeles   88000.0  Engineering  2019-02-20   
2           3  Charlie   35        Chicago  132000.0  Engineering  2018-03-30   
3           4    David   40        Houston   90000.0           HR  2022-06-15   
4           5      Eva   22        Phoenix   85000.0           HR  2021-05-10   
5           6    Frank   28  San Francisco  104500.0  Engineering  2016-06-15   
6           7    Grace   33       New York   82500.0           HR  2015-07-20   
7           8   Hannah   26        Chicago  115500.0  Engineering  2014-08-25   
8           9      Ivy   24        Houston   68200.0           HR  2013-09-30   
9          10     Jack   29    Los Angeles   70000.0  Engineering  2022-10-05   

      Tenure      Level  
0   4.416438  Mid-level  
1   5.317808     Senior  
2   6.213699     Senior  
3   

## Using `pd.cut`

1. Equal-width Bins: If you want to divide the data into bins of equal width, you can specify the number of bins.

In [180]:
# Passing an integer as `bins` asks pd.cut for that many equal-width intervals
# spanning the range of the data.
data = [1, 7, 5, 4, 6, 3, 2, 8, 9, 10]
bins = 3
categorized_data = pd.cut(data, bins)
print(categorized_data)

[(0.991, 4.0], (4.0, 7.0], (4.0, 7.0], (0.991, 4.0], (4.0, 7.0], (0.991, 4.0], (0.991, 4.0], (7.0, 10.0], (7.0, 10.0], (7.0, 10.0]]
Categories (3, interval[float64, right]): [(0.991, 4.0] < (4.0, 7.0] < (7.0, 10.0]]


2. Custom Bin Edges: You can define your own bin edges to categorize the data.

In [181]:
# A list of edges defines the intervals explicitly: (0, 3], (3, 6], (6, 9], (9, 12].
bins = [0, 3, 6, 9, 12]
categorized_data = pd.cut(data, bins)
print(categorized_data)

[(0, 3], (6, 9], (3, 6], (3, 6], (3, 6], (0, 3], (0, 3], (6, 9], (6, 9], (9, 12]]
Categories (4, interval[int64, right]): [(0, 3] < (3, 6] < (6, 9] < (9, 12]]


3. Labels for Bins: You can provide labels for each bin.

In [182]:
# One label per interval (four intervals from five edges) replaces the
# interval notation in the result.
labels = ['Low', 'Medium', 'High', 'Very High']
categorized_data = pd.cut(data, bins, labels=labels)
print(categorized_data)

['Low', 'High', 'Medium', 'Medium', 'Medium', 'Low', 'Low', 'High', 'High', 'Very High']
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']


4. Customizing Inclusion: If you want the bins to be left-inclusive (i.e., the left edge is included in the bin), you can adjust the right parameter in pd.cut:

In [None]:
# Left-inclusive bins: with right=False each interval is [left, right), so a
# tenure of exactly 2 lands in 'Mid-level' and exactly 5 in 'Senior'.
# This works on updated_df (the frame returned by tenure_and_salary_increase),
# which has the 'Tenure' column; the notebook-level `df` does not.
updated_df['Level'] = pd.cut(updated_df['Tenure'], bins=[-float('inf'), 2, 5, float('inf')], labels=['Junior', 'Mid-level', 'Senior'], right=False)

## Iterating through DataFrame

In [None]:
def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
    """List the customers who never placed an order, by iterating over the rows.

    Args:
        customers: Frame with 'id' and 'name' columns.
        orders: Frame with a 'customerId' column referring to ``customers['id']``.

    Returns:
        A frame with a single 'Customers' column holding the names of the
        customers whose id does not appear in ``orders['customerId']``, in
        their original order and indexed from 0.
    """
    # A set gives constant-time membership checks and removes duplicates.
    buying_customers = set(orders['customerId'])

    # Walk the two columns in parallel instead of looking rows up by
    # .loc[0..n-1], which fails when the index is not the default range.
    return pd.DataFrame({
        'Customers': [
            name
            for customer_id, name in zip(customers['id'], customers['name'])
            if customer_id not in buying_customers
        ]
    })

def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
    """List the customers who never placed an order, using a left merge.

    Args:
        customers: Frame with 'id' and 'name' columns.
        orders: Frame with a 'customerId' column referring to ``customers['id']``.

    Returns:
        A frame with a single 'Customers' column holding the names of the
        customers without a matching order. The index is that of the merged
        frame; it is not reset.
    """
    # Every customer is kept; customers without an order get NaN in 'customerId'.
    merged = customers.merge(orders, how='left', left_on='id', right_on='customerId')
    without_order = merged['customerId'].isna()
    return merged.loc[without_order, ['name']].rename(columns={'name': 'Customers'})

def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
    """List the customers who never placed an order, using ``isin``.

    Args:
        customers: Frame with 'id' and 'name' columns.
        orders: Frame with a 'customerId' column referring to ``customers['id']``.

    Returns:
        A frame with a single 'Customers' column holding the names of the
        customers whose id is absent from ``orders['customerId']``. The rows
        keep their index labels from ``customers``.
    """
    has_order = customers['id'].isin(orders['customerId'])
    never_ordered = customers.loc[~has_order, ['name']]
    return never_ordered.rename(columns={'name': 'Customers'})