Q1. List any five functions of the pandas library with execution.

In [1]:
import pandas as pd

# Small in-memory frame that every demonstration below operates on
df = pd.DataFrame(
    {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 22]}
)

# 1. head() - preview the leading rows of the frame
first_rows = df.head()
print("1. head():\n", first_rows, "\n")

# 2. describe() - count/mean/std/quartiles for the numeric column(s)
summary_stats = df.describe()
print("2. describe():\n", summary_stats, "\n")

# 3. drop() - return a new frame without the 'Age' column (df itself is untouched)
df_dropped = df.drop(columns='Age')
print("3. drop('Age'):\n", df_dropped, "\n")

# 4. sort_values() - return a new frame ordered by ascending 'Age'
sorted_df = df.sort_values('Age')
print("4. sort_values(by='Age'):\n", sorted_df, "\n")

# 5. read_csv() - load a CSV file into a DataFrame.
# Left commented out because this notebook ships without a CSV file:
# df_from_csv = pd.read_csv('data.csv')
print("5. read_csv(): Demonstrated but not executed (requires actual CSV file)\n")


1. head():
       Name  Age
0    Alice   25
1      Bob   30
2  Charlie   22 

2. describe():
              Age
count   3.000000
mean   25.666667
std     4.041452
min    22.000000
25%    23.500000
50%    25.000000
75%    27.500000
max    30.000000 

3. drop('Age'):
       Name
0    Alice
1      Bob
2  Charlie 

4. sort_values(by='Age'):
       Name  Age
2  Charlie   22
0    Alice   25
1      Bob   30 

5. read_csv(): Demonstrated but not executed (requires actual CSV file)



Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [2]:
import pandas as pd

# Sample DataFrame with columns A, B, and C
df = pd.DataFrame({
    'A': [10, 20, 30],
    'B': [40, 50, 60],
    'C': [70, 80, 90]
})


def reindex_custom(df):
    """Return a copy of ``df`` whose index is 1, 3, 5, ... (start 1, step 2).

    Parameters:
    df (pd.DataFrame): Any DataFrame; its own index is left untouched.

    Returns:
    pd.DataFrame: A new DataFrame with the same rows and columns and an
    index that starts at 1 and increases by 2 for each row.
    """
    # Work on a copy so the caller's frame keeps its original index and the
    # cell gives the same result no matter how many times it is re-run.
    reindexed = df.copy()
    # For n rows the labels are 1, 3, ..., 2n - 1; the stop bound is exclusive.
    reindexed.index = range(1, 2 * len(reindexed) + 1, 2)
    return reindexed


# Apply the function
df_reindexed = reindex_custom(df)

# Display the re-indexed DataFrame
print(df_reindexed)


    A   B   C
1  10  40  70
3  20  50  80
5  30  60  90


Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
function should print the sum to the console.

In [3]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})


def sum_first_three(df):
    """Print the sum of the first three entries of ``df['Values']``.

    The column is walked one value at a time and the walk stops as soon as
    the third value has been added. With fewer than three rows, whatever
    values exist are summed. Nothing is returned; the result is printed.
    """
    running_total = 0
    # enumerate(..., start=1) makes `position` the count of values seen so far
    for position, value in enumerate(df['Values'], start=1):
        running_total += value
        if position == 3:
            break
    print("Sum of first three values:", running_total)


# Call the function
sum_first_three(df)


Sum of first three values: 60


Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
'Word_Count' that contains the number of words in each row of the 'Text' column.

In [4]:
import pandas as pd

df = pd.DataFrame({
    'Text': ["Hello world", "Data Science is fun", "Pandas is powerful"]
})


def add_word_count(df):
    """Add a 'Word_Count' column holding the number of words in each 'Text' value.

    Words are runs of non-whitespace characters, so repeated or surrounding
    spaces do not inflate the count. Missing text (None / NaN) counts as
    0 words.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Text' column. It is modified in
        place, as the exercise asks for a new column on ``df``.

    Returns:
    pd.DataFrame: The same DataFrame object, now with 'Word_Count'.
    """
    # Replace missing values with '' BEFORE converting to str; otherwise they
    # turn into the literal text "nan"/"None" and are counted as one word.
    text = df['Text'].fillna('').astype(str)
    # split() with no separator splits on any whitespace run; '' -> [] -> 0.
    df['Word_Count'] = text.str.split().str.len()
    return df


print(add_word_count(df))


                  Text  Word_Count
0          Hello world           2
1  Data Science is fun           4
2   Pandas is powerful           3


Q5. How are DataFrame.size() and DataFrame.shape() different?

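Ans. Both `shape` and `size` are attributes, not methods, so they are accessed without parentheses (`df.shape`, `df.size`).

`df.shape` returns a tuple `(rows, columns)` giving the dimensions of the DataFrame.

`df.size` returns a single integer: the total number of elements in the DataFrame (rows * columns).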

Example:



In [2]:
import pandas as pd  #  Make sure this line is at the top

# Two rows by two columns, so the expected results are easy to check by eye
df = pd.DataFrame(
    {
        'A': [1, 2],
        'B': [3, 4],
    }
)

# shape -> (rows, columns); size -> rows * columns
dimensions = df.shape
element_count = df.size
print("Shape:", dimensions)    # (2, 2): 2 rows, 2 columns
print("Size:", element_count)  # 4: 2 x 2 elements in total


Shape: (2, 2)
Size: 4


Q6. Which function of pandas do we use to read an excel file?

In [4]:
import pandas as pd

# Build a tiny table and write it out so there is a real workbook to load.
# index=False keeps the row labels out of the sheet.
sample_df = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob'],
        'Score': [85, 90],
    }
)
sample_df.to_excel('sample.xlsx', index=False)

# pd.read_excel() is the pandas function that loads an Excel file into a DataFrame
df = pd.read_excel('sample.xlsx')

# Show what was read back
print(df)


    Name  Score
0  Alice     85
1    Bob     90


Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
addresses in the format 'username@domain.com'. Write a Python function that creates a new column
'Username' in df that contains only the username part of each email address.

In [6]:
import pandas as pd

# Sample DataFrame with email addresses
df = pd.DataFrame({
    'Email': ['john.doe@example.com', 'alice.smith@domain.org', 'bob99@test.net']
})


def add_username_column(dataframe):
    """Add a 'Username' column holding the part of each 'Email' before the '@'.

    Parameters:
    dataframe (pd.DataFrame): DataFrame with an 'Email' column of strings.
        It is modified in place, as the exercise asks for a new column on
        ``df``.

    Returns:
    pd.DataFrame: The same DataFrame object, now with 'Username'. A missing
    e-mail yields a missing username instead of raising; a value without
    '@' is returned whole.
    """
    # The vectorized .str accessor skips missing values, whereas calling
    # x.split('@') on NaN raises AttributeError. n=1 splits at the first '@'
    # only, which is all that is needed to isolate the leading part.
    dataframe['Username'] = dataframe['Email'].str.split('@', n=1).str[0]
    return dataframe


# Apply the function
df = add_username_column(df)

# Display the updated DataFrame
print(df)


                    Email     Username
0    john.doe@example.com     john.doe
1  alice.smith@domain.org  alice.smith
2          bob99@test.net        bob99


Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
function should return a new DataFrame that contains only the selected rows.
For example, if df contains the following values:
A B C
0 3 5 1
1 8 2 7
2 6 9 4
3 2 3 5
4 9 1 2

In [None]:
import pandas as pd

# Given DataFrame
df = pd.DataFrame({
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2]
})


def filter_rows(dataframe, a_min=5, b_max=10):
    """Select the rows where column 'A' > ``a_min`` and column 'B' < ``b_max``.

    Parameters:
    dataframe (pd.DataFrame): DataFrame with numeric columns 'A' and 'B'.
    a_min: Exclusive lower bound for 'A' (default 5, per the exercise).
    b_max: Exclusive upper bound for 'B' (default 10, per the exercise).

    Returns:
    pd.DataFrame: A new DataFrame containing only the matching rows, with
    their original index labels. The input is not modified.
    """
    # Element-wise AND needs `&` with each comparison parenthesized,
    # because `&` binds tighter than `>` and `<`.
    mask = (dataframe['A'] > a_min) & (dataframe['B'] < b_max)
    # Explicit copy: the result is independent of the input frame.
    return dataframe.loc[mask].copy()


# Apply the function
filtered_df = filter_rows(df)

# Display the filtered DataFrame (rows 1, 2 and 4 for the sample data)
print(filtered_df)


Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean,
median, and standard deviation of the values in the 'Values' column.

In [None]:
import pandas as pd

def calculate_statistics(df):
    """
    Summarize the 'Values' column of a DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame that has a column named 'Values'.

    Returns:
    dict: Keys 'mean', 'median' and 'std_dev', each mapped to the
    corresponding statistic of the column. 'std_dev' comes from
    Series.std(), i.e. the sample standard deviation (ddof=1).
    """
    values = df['Values']
    return {
        'mean': values.mean(),
        'median': values.median(),
        'std_dev': values.std(),
    }


Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
should include the current day.

In [None]:
import pandas as pd

def add_moving_average(df, window=7):
    """
    Return a date-sorted copy of ``df`` with a 'MovingAverage' column holding
    the moving average of 'Sales' over the current row and the rows before it.

    The window is counted in rows, so "7 days" assumes one row per day.
    The first rows, which have fewer than ``window`` predecessors, are
    averaged over the rows that are available (min_periods=1).

    Parameters:
    df (pd.DataFrame): DataFrame with 'Sales' and 'Date' columns. It is not
        modified.
    window (int): Number of rows in the window, current row included
        (default 7).

    Returns:
    pd.DataFrame: New DataFrame sorted by 'Date', with 'Date' converted to
    datetime and the new 'MovingAverage' column added.
    """
    # assign() builds a new frame, so the caller's 'Date' column keeps its
    # original dtype; assigning into df['Date'] would modify the caller's frame.
    ordered = df.assign(Date=pd.to_datetime(df['Date'])).sort_values('Date')

    # Rolling is positional, so it must run on the date-ordered rows.
    ordered['MovingAverage'] = (
        ordered['Sales'].rolling(window=window, min_periods=1).mean()
    )

    return ordered


Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new
column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
Monday, Tuesday) corresponding to each date in the 'Date' column.
For example, if df contains the following values:
Date
0 2023-01-01
1 2023-01-02
2 2023-01-03
3 2023-01-04
4 2023-01-05
Your function should create the following DataFrame:

Date Weekday
0 2023-01-01 Sunday
1 2023-01-02 Monday
2 2023-01-03 Tuesday
3 2023-01-04 Wednesday
4 2023-01-05 Thursday
The function should return the modified DataFrame.

In [None]:
import pandas as pd

def add_weekday_column(df):
    """
    Attach the English day name of every 'Date' entry as a new 'Weekday' column.

    The DataFrame is modified in place: 'Date' is replaced by its datetime
    conversion and 'Weekday' (e.g. Monday, Tuesday) is added.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Date' column that
        pd.to_datetime can parse.

    Returns:
    pd.DataFrame: The same DataFrame object, with the 'Weekday' column added.
    """
    # Parse once and reuse the result for both columns
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    df['Weekday'] = parsed_dates.dt.day_name()
    return df


Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [None]:
import pandas as pd

def filter_dates_in_january(df):
    """
    Select rows whose 'Date' falls between '2023-01-01' and '2023-01-31',
    both days included in full.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Date' column containing timestamps
        (or values pd.to_datetime can parse). It is not modified.

    Returns:
    pd.DataFrame: New DataFrame with only the rows in the range; its 'Date'
    column is datetime-typed and the original index labels are kept.
    """
    # Convert on a copy so that filtering has no side effect on the caller's frame.
    dated = df.assign(Date=pd.to_datetime(df['Date']))
    # '2023-01-31' alone means midnight, so `<= '2023-01-31'` would drop every
    # timestamp later on the 31st. Using an exclusive bound at the start of
    # February keeps the whole last day.
    in_january = (dated['Date'] >= '2023-01-01') & (dated['Date'] < '2023-02-01')
    return dated.loc[in_january]


Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to
be imported?

In [None]:
import pandas as pd
