In [2]:
import pandas as pd
import numpy as np

# Demo pipeline: build a small table that contains gaps, round-trip it through
# CSV, impute or drop the missing entries, derive two columns, and save the
# transformed table.

# Age at or above which a row is labelled 'Yes' in the 'Senior Citizen' column.
SENIOR_AGE_THRESHOLD = 60
# Multiplier applied to 'Salary' to obtain 'Annual Income'
# (presumably 'Salary' is a monthly figure -- TODO confirm).
SALARY_TO_ANNUAL_FACTOR = 12

# 1. Create and Save Sample Data
# Each column contains exactly one np.nan so every cleaning step below has
# something to act on.
data = {
    'Name': ['Jayanth', 'Lokesh', 'Sasank', 'Teja', np.nan],
    'Age': [28, np.nan, 34, 29, 40],
    'Salary': [70000, 80000, np.nan, 60000, 90000],
    'Gender': ['M', 'M', 'M', 'F', np.nan]
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file (index=False keeps the row index out of the file)
df.to_csv('sample_data.csv', index=False)

# 2. Read the Data from the CSV File
df = pd.read_csv('sample_data.csv')

print("Original DataFrame:\n", df)

# 3. Handling Missing Data

# a. Fill missing Age with the mean age and missing Salary with the median salary.
#    The result is assigned back to `df` rather than calling
#    `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
#    Series, raises a FutureWarning on pandas 2.x, and leaves `df` untouched
#    when Copy-on-Write is enabled.
#    Both statistics are computed before any row is dropped, so the row with a
#    missing Name/Gender still contributes to the mean and the median.
df = df.fillna({
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
})

# b. Drop rows with missing values in 'Name' or 'Gender'
df = df.dropna(subset=['Name', 'Gender'])

print("\nDataFrame after handling missing values:\n", df)

# 4. Data Transformation

# a. Add a new column 'Annual Income' by transforming 'Salary' column
df['Annual Income'] = df['Salary'] * SALARY_TO_ANNUAL_FACTOR

# b. Create a 'Senior Citizen' column based on Age (vectorized comparison
#    instead of a per-row lambda; the labels are the same 'Yes'/'No' strings)
df['Senior Citizen'] = np.where(df['Age'] >= SENIOR_AGE_THRESHOLD, 'Yes', 'No')

print("\nDataFrame after data transformations:\n", df)

# Save the transformed DataFrame to a new CSV file
df.to_csv('transformed_data.csv', index=False)

Original DataFrame:
       Name   Age   Salary Gender
0  Jayanth  28.0  70000.0      M
1   Lokesh   NaN  80000.0      M
2   Sasank  34.0      NaN      M
3     Teja  29.0  60000.0      F
4      NaN  40.0  90000.0    NaN

DataFrame after handling missing values:
       Name    Age   Salary Gender
0  Jayanth  28.00  70000.0      M
1   Lokesh  32.75  80000.0      M
2   Sasank  34.00  75000.0      M
3     Teja  29.00  60000.0      F

DataFrame after data transformations:
       Name    Age   Salary Gender  Annual Income Senior Citizen
0  Jayanth  28.00  70000.0      M       840000.0             No
1   Lokesh  32.75  80000.0      M       960000.0             No
2   Sasank  34.00  75000.0      M       900000.0             No
3     Teja  29.00  60000.0      F       720000.0             No


In [3]:
import pandas as pd
import numpy as np

# Sample table for the demonstration: seven rows with a repeated 'Name',
# a three-level 'Category', and two integer measures.
data = dict(
    Name=['Jayanth', 'Lokesh', 'Sasank', 'Teja', 'Jayanth', 'Lokesh', 'Teja'],
    Category=['A', 'B', 'A', 'B', 'C', 'C', 'A'],
    Value=[10, 20, 30, 40, 50, 60, 70],
    Quantity=[5, 3, 6, 2, 8, 7, 4],
)
df = pd.DataFrame.from_dict(data)

print("Original DataFrame:\n", df)

# 1. Generating Summary Statistics
# include='all' makes describe() report on the text columns (count / unique /
# top / freq) alongside the numeric ones (mean, std, quartiles, ...).
summary_statistics = df.describe(include='all')
print("\nSummary Statistics:\n", summary_statistics)

Original DataFrame:
       Name Category  Value  Quantity
0  Jayanth        A     10         5
1   Lokesh        B     20         3
2   Sasank        A     30         6
3     Teja        B     40         2
4  Jayanth        C     50         8
5   Lokesh        C     60         7
6     Teja        A     70         4

Summary Statistics:
            Name Category      Value  Quantity
count         7        7   7.000000  7.000000
unique        4        3        NaN       NaN
top     Jayanth        A        NaN       NaN
freq          2        3        NaN       NaN
mean        NaN      NaN  40.000000  5.000000
std         NaN      NaN  21.602469  2.160247
min         NaN      NaN  10.000000  2.000000
25%         NaN      NaN  25.000000  3.500000
50%         NaN      NaN  40.000000  5.000000
75%         NaN      NaN  55.000000  6.500000
max         NaN      NaN  70.000000  8.000000
