### Q1. List any five functions of the pandas library with execution.

read_csv: Used to read data from a CSV file into a DataFrame.

In [None]:
import pandas as pd

# Load 'example.csv' into a DataFrame. The path is relative, so the file is
# looked up in the current working directory.
df = pd.read_csv('example.csv')
# Preview the first five rows to confirm the file was parsed as expected.
print(df.head())

head: Displays the first n rows of a DataFrame (default is 5).

In [None]:
# Assumes 'df' is an existing DataFrame.
# head() without an argument returns the first 5 rows; pass n (e.g. df.head(10)) for a different count.
print(df.head())

info: Prints a concise summary of a DataFrame, including each column's data type and non-null count.

In [None]:
# Assumes 'df' is an existing DataFrame.
# info() writes its summary straight to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

describe: Generates descriptive statistics of a DataFrame; for numeric columns these are the count, mean, standard deviation, min, quartiles (the 50% quartile is the median) and max.

In [None]:
# Assumes 'df' is an existing DataFrame.
# For numeric columns describe() reports count, mean, std, min, 25%/50%/75% and max.
print(df.describe())

groupby: Used to split data into groups based on some criteria and then apply a function to each group independently.

In [None]:
# Assumes 'df' is an existing DataFrame that has a 'category' column.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError when any remaining column holds text.
grouped_data = df.groupby('category').mean(numeric_only=True)
print(grouped_data)

### Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row. 

In [2]:
import pandas as pd

def reindex_with_custom_index(df):
    """Return a copy of ``df`` labelled with the odd numbers 1, 3, 5, ...

    Args:
        df: DataFrame to relabel; it is not modified.

    Returns:
        A new DataFrame with the same columns and values as ``df`` whose
        index starts at 1 and grows by 2 for every row.
    """
    row_count = len(df)
    # A stop value of 2 * row_count + 1 yields exactly one odd label per row.
    odd_labels = pd.Index(range(1, 2 * row_count + 1, 2))
    return df.set_index(odd_labels)

# Example usage: a three-row DataFrame with columns 'A', 'B' and 'C'.
df = pd.DataFrame({'A': [10, 20, 30], 'B': [40, 50, 60], 'C': [70, 80, 90]})
# The function returns a new DataFrame, so the result is captured separately.
result_df = reindex_with_custom_index(df)

# Three rows, so the expected index labels are 1, 3 and 5.
print(result_df)

    A   B   C
1  10  40  70
3  20  50  80
5  30  60  90


### Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console. For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should calculate and print the sum of the first three values, which is 60.

In [3]:
import pandas as pd

def calculate_sum_of_first_three_values(df):
    """Print the sum of the leading three entries of the 'Values' column.

    Args:
        df: DataFrame containing a 'Values' column.

    Returns:
        None. The result is written to standard output. When the column has
        fewer than three rows, the rows that exist are summed.
    """
    # Positional slicing picks the same rows as head(3).
    leading_total = df['Values'].iloc[:3].sum()
    print("Sum of the first three values:", leading_total)

# Example usage: five values, of which only the first three (10 + 20 + 30) are summed.
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
# The function prints the sum itself, so the call is not wrapped in print().
calculate_sum_of_first_three_values(df)

Sum of the first three values: 60


### Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [4]:
import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column holding the number of words in each 'Text' entry.

    Words are the whitespace-separated tokens of the entry's string form.
    Missing entries (None / NaN / pd.NA) count as 0 words; previously they
    were stringified to 'None' / 'nan' and reported as one word.

    Args:
        df: DataFrame containing a 'Text' column. It is modified in place.

    Returns:
        None.
    """
    def _count_words(value):
        # Guard first: str(None) and str(float('nan')) would each look like a word.
        if pd.isna(value):
            return 0
        return len(str(value).split())

    df['Word_Count'] = df['Text'].apply(_count_words)

# Example usage: three sentences containing 4, 2 and 4 words respectively.
df = pd.DataFrame({'Text': ['This is an example.', 'Python programming', 'Data Science is awesome!']})

# The function adds 'Word_Count' to 'df' in place, so its result is not assigned.
add_word_count_column(df)

# Display the updated DataFrame
print(df)

                       Text  Word_Count
0       This is an example.           4
1        Python programming           2
2  Data Science is awesome!           4


### Q5. How are DataFrame.size() and DataFrame.shape() different?

DataFrame.size:

DataFrame.size is an attribute (accessed without parentheses) that returns the total number of elements in the DataFrame, i.e. the number of rows multiplied by the number of columns.
It is the total count of cells (entries) across all rows and columns.

In [5]:
import pandas as pd

# Three rows by two columns, so .size (an attribute, not a method) is 6.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
size_of_dataframe = df.size
print(size_of_dataframe)

6


DataFrame.shape:

DataFrame.shape is an attribute (accessed without parentheses) that returns a tuple representing the dimensions of the DataFrame.
The tuple holds two values, in order: the number of rows and the number of columns.

In [6]:
import pandas as pd

# .shape (an attribute, not a method) is the (rows, columns) tuple, here (3, 2).
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
shape_of_dataframe = df.shape
print(shape_of_dataframe)

(3, 2)


### Q6. Which function of pandas do we use to read an excel file?

In [None]:
import pandas as pd

# pd.read_excel is the pandas function for reading Excel files.
# Here it loads the sheet named 'Sheet1' from 'example.xlsx' (relative path,
# so the file is looked up in the current working directory).
# NOTE(review): reading .xlsx presumably needs an Excel engine such as openpyxl installed; confirm in the target environment.
df = pd.read_excel('example.xlsx', sheet_name='Sheet1')

# Now, 'df' is a DataFrame containing the data from the Excel file
print(df)

### Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address. The username is the part of the email address that appears before the '@' symbol. For example, if the email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your function should extract the username from each email address and store it in the new 'Username' column.

In [9]:
import pandas as pd

def extract_username(df):
    """Add a 'Username' column holding the part of each 'Email' before the '@'.

    Args:
        df: DataFrame containing an 'Email' column. It is modified in place.

    Returns:
        None.
    """
    # Split every address on '@' and keep the first piece.
    address_parts = df['Email'].str.split('@')
    df['Username'] = address_parts.str.get(0)

# Example usage: three addresses that share the domain 'example.com'.
df = pd.DataFrame({'Email': ['john.doe@example.com', 'alice.smith@example.com', 'bob.jones@example.com']})

# The function adds 'Username' to 'df' in place, so its result is not assigned.
extract_username(df)

# Display the updated DataFrame
print(df)

                     Email     Username
0     john.doe@example.com     john.doe
1  alice.smith@example.com  alice.smith
2    bob.jones@example.com    bob.jones


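### Q8. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to select the rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10.

In [10]: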
import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where 'A' is above 5 and 'B' is below 10.

    Args:
        df: DataFrame containing numeric columns 'A' and 'B'.

    Returns:
        A DataFrame with only the matching rows; they keep their original
        index labels and all columns.
    """
    a_above_five = df['A'] > 5
    b_below_ten = df['B'] < 10
    return df.loc[a_above_five & b_below_ten]

# Example usage: five rows, of which those at index 1, 2 and 4 have A > 5 and B < 10.
df = pd.DataFrame({'A': [3, 8, 6, 2, 9],
                   'B': [5, 2, 9, 3, 1],
                   'C': [1, 7, 4, 5, 2]})

# The function returns the matching rows as a new DataFrame.
selected_df = select_rows(df)

# Display the selected DataFrame
print(selected_df)

   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


### Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [11]:
import pandas as pd

def calculate_statistics(df):
    """Return the mean, median and standard deviation of the 'Values' column.

    Args:
        df: DataFrame containing a numeric 'Values' column.

    Returns:
        A ``(mean, median, std)`` tuple. The standard deviation comes from
        ``Series.std()`` with its default arguments.
    """
    values = df['Values']
    return values.mean(), values.median(), values.std()

# Example usage: five evenly spaced values.
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})

# The function returns a (mean, median, std) tuple, unpacked here into three names.
mean_val, median_val, std_dev = calculate_statistics(df)

# Display the calculated statistics
print(f"Mean: {mean_val}")
print(f"Median: {median_val}")
print(f"Standard Deviation: {std_dev}")

Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


### Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

In [12]:
import pandas as pd

def calculate_moving_average(df):
    """Add a 'MovingAverage' column: the trailing mean of 'Sales' over up to 7 rows.

    The window is counted in rows and includes the current row. The 'Date'
    column is not read, so the result only means "past 7 days" when the rows
    are already one per day in chronological order. Because ``min_periods=1``
    is used, the first six rows average however many rows exist so far.

    Args:
        df: DataFrame containing a 'Sales' column. It is modified in place.

    Returns:
        None.
    """
    trailing_window = df['Sales'].rolling(7, min_periods=1)
    df['MovingAverage'] = trailing_window.mean()

# Example usage: ten consecutive daily rows starting 2022-01-01, already in date order.
df = pd.DataFrame({
    'Date': pd.date_range(start='2022-01-01', periods=10),
    'Sales': [50, 60, 45, 70, 80, 65, 55, 75, 90, 85]
})

# The function adds 'MovingAverage' to 'df' in place, so its result is not assigned.
calculate_moving_average(df)

# Display the updated DataFrame
print(df)

        Date  Sales  MovingAverage
0 2022-01-01     50      50.000000
1 2022-01-02     60      55.000000
2 2022-01-03     45      51.666667
3 2022-01-04     70      56.250000
4 2022-01-05     80      61.000000
5 2022-01-06     65      61.666667
6 2022-01-07     55      60.714286
7 2022-01-08     75      64.285714
8 2022-01-09     90      68.571429
9 2022-01-10     85      74.285714


### Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g. Monday, Tuesday) corresponding to each date in the 'Date' column.

In [13]:
import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column holding the day name (e.g. 'Monday') of each 'Date'.

    The 'Date' values are parsed with ``pd.to_datetime`` before the day name
    is taken, so the column may hold datetimes or parseable date strings.
    The 'Date' column itself is left unchanged.

    Args:
        df: DataFrame containing a 'Date' column. It is modified in place.

    Returns:
        None.
    """
    # The .dt accessor only exists on datetime-like data; parsing first keeps
    # datetime input working as before and lets string dates work too.
    parsed_dates = pd.to_datetime(df['Date'])
    df['Weekday'] = parsed_dates.dt.day_name()

# Example usage: five consecutive days starting 2022-01-01.
df = pd.DataFrame({
    'Date': pd.date_range(start='2022-01-01', periods=5)
})

# The function adds 'Weekday' to 'df' in place, so its result is not assigned.
add_weekday_column(df)

# Display the updated DataFrame
print(df)

        Date    Weekday
0 2022-01-01   Saturday
1 2022-01-02     Sunday
2 2022-01-03     Monday
3 2022-01-04    Tuesday
4 2022-01-05  Wednesday


### Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [16]:
import pandas as pd

def select_rows_in_date_range(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Return the rows whose 'Date' lies between two dates, both inclusive.

    ``end_date`` is treated as a whole calendar day: a timestamp such as
    2023-01-31 18:45 is inside the default range. Comparing the raw
    timestamps with ``<= '2023-01-31'`` would cut off at midnight and drop
    every later timestamp on the final day.

    Args:
        df: DataFrame containing a 'Date' column of datetimes (or parseable
            date strings). It is not modified.
        start_date: Lower bound, inclusive. Defaults to '2023-01-01'.
        end_date: Last calendar day to include. Defaults to '2023-01-31'.

    Returns:
        A new DataFrame with the matching rows and a fresh 0..n-1 index.
    """
    dates = pd.to_datetime(df['Date'])
    on_or_after_start = dates >= start_date
    # Drop the time of day before the upper comparison so the whole end day counts.
    on_or_before_end = dates.dt.normalize() <= end_date
    return df[on_or_after_start & on_or_before_end].reset_index(drop=True)

# Example usage: daily dates from 2022-12-01 to 2023-02-28 (90 days in total).
date_range = pd.date_range(start='2022-12-01', end='2023-02-28', freq='D')
df = pd.DataFrame({
    'Date': date_range[:len(date_range)//2],  # keeps only the first half: 45 days, 2022-12-01 .. 2023-01-14
    'Value': range(len(date_range)//2)
})

# Only 2023-01-01 .. 2023-01-14 of this data falls inside January 2023, so 14 rows are selected.
selected_df = select_rows_in_date_range(df)

# Display the selected DataFrame
print(selected_df)

         Date  Value
0  2023-01-01     31
1  2023-01-02     32
2  2023-01-03     33
3  2023-01-04     34
4  2023-01-05     35
5  2023-01-06     36
6  2023-01-07     37
7  2023-01-08     38
8  2023-01-09     39
9  2023-01-10     40
10 2023-01-11     41
11 2023-01-12     42
12 2023-01-13     43
13 2023-01-14     44


### Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

In [18]:
import pandas as pd
# The pandas library itself must be imported before any of its functions can be used; 'pd' is the alias used throughout this notebook.