In [1]:
import pandas as pd
import numpy as np
import re

# Creating the DataFrame
data = {
    'Visitor_ID': [f'V{1000 + i}' for i in range(20)],
    'Name': ['ABHISHEK SANDEEP      ZADE     ', 'ARNAV AJAY DESHPANDE.. ', '.. .. ASHWINI LALCHAND MUNDAWARE/ ', 'GAYATRI SURESH GAIKWAD', 'HARSHADA GANESH CHAUDHARI',
             'VAIBHAVI HARISHWAR PATIL', np.nan, 'VISHAKHA PUNDLIK JADHAV', 'YASH BHARAT SOLUNKE/', 'VIVEK SANTOSH KHANDWE',
             'VISHAKHA PUNDLIK JADHAV', 'Tanushree chhanwal', 'Shruti jaiswal', 'Shriyash Sulakhe', 'YASH BHARAT SOLUNKE',
             np.nan, 'ARNAV AJAY DESHPANDE', 'RUTUJA SANTOSH THOTE', 'ROHIT DILIP BILWAL', 'RITESH SHIVAJI BAIRAGI'],
    'Age': [25, 23, 22, np.nan, 21, 25, 24, 24, 28, np.nan,
            22, 23, 25, 27, 25, 30, 31, 26, 19, 19],
    'Ticket_Price': [500, 750, 500, 1000, np.nan, 500, 700, 650, 750, 1000,
                      500, 800, np.nan, 750, 500, 700, 900, 850, 750, np.nan],
    'Check_In_Time': ['10:00 AM', '10:30 AM', '11:00 AM', '11:30 AM', np.nan,
                       '10:00 AM', '12:00 PM', '12:30 PM', '01:00 PM', '01:30 PM',
                       '11:00 AM', '02:00 PM', np.nan, '02:30 PM', '10:00 AM',
                       '12:00 PM', '03:00 PM', '03:30 PM', '04:00 PM', np.nan],
    'City': ['Delhi', 'Aurangabad', 'Mumbai', 'Bombay', 'New Delhi', 'NDL', 'Chennai', 'Chenai', 'Chennaai', 'Bangalore',
             'Delhi', 'Pune', 'New Delhi', 'Bombay', 'New Delhi', 'NDL', 'Indore', 'Bangalore', 'Ujjain', 'Bangalore'],
    'State': ['Delhi', 'Maharastra', 'Maharastra', 'Maharastra', 'Delhi', 'Delhi', 'Tamilnadu', 'Tamilnadu', 'Tamilnadu', 'Karnataka',
              'Delhi', 'Maharastra', 'Delhi', 'Maharastra', 'Delhi', 'Delhi', 'Madhya Pradesh', 'Karnataka', 'Madhya Pradesh', 'Karnataka']
}

df = pd.DataFrame(data)

#Print size of the DataFrame
print("Size of DataFrame:", df.size)

#Print top 5 and last 2 rows
print("\nTop 5 Rows:")
print(df.head())
print("\nLast 2 Rows:")
print(df.tail(2))

#Print column data types
print("\nColumn Data Types:")
print(df.dtypes)

#Print statistical description
print("\nStatistical Description:")
print(df.describe(include='all'))

#Data Cleaning: Remove special characters from Names
df['Name'] = df['Name'].astype(str).apply(lambda x: re.sub(r'[^a-zA-Z ]+', '', x).strip().title() if pd.notna(x) else x)

#Handle blank data (fill missing values)
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Ticket_Price'].fillna(df['Ticket_Price'].median(), inplace=True)
df['Check_In_Time'].fillna('Unknown', inplace=True)
df['Name'].fillna('Unknown', inplace=True)

#Remove duplicates
df.drop_duplicates(inplace=True)

#Print cleaned DataFrame
print("\nCleaned DataFrame:")
print(df)


Size of DataFrame: 140

Top 5 Rows:
  Visitor_ID                                Name   Age  Ticket_Price  \
0      V1000     ABHISHEK SANDEEP      ZADE       25.0         500.0   
1      V1001             ARNAV AJAY DESHPANDE..   23.0         750.0   
2      V1002  .. .. ASHWINI LALCHAND MUNDAWARE/   22.0         500.0   
3      V1003              GAYATRI SURESH GAIKWAD   NaN        1000.0   
4      V1004           HARSHADA GANESH CHAUDHARI  21.0           NaN   

  Check_In_Time        City       State  
0      10:00 AM       Delhi       Delhi  
1      10:30 AM  Aurangabad  Maharastra  
2      11:00 AM      Mumbai  Maharastra  
3      11:30 AM      Bombay  Maharastra  
4           NaN   New Delhi       Delhi  

Last 2 Rows:
   Visitor_ID                    Name   Age  Ticket_Price Check_In_Time  \
18      V1018      ROHIT DILIP BILWAL  19.0         750.0      04:00 PM   
19      V1019  RITESH SHIVAJI BAIRAGI  19.0           NaN           NaN   

         City           State  
18     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Ticket_Price'].fillna(df['Ticket_Price'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [3]:
import pandas as pd
import numpy as np
import re

# Creating the DataFrame
data = {
    'Singer': ['Arijit Singh', 'Shreya Ghoshal', 'Sonu Nigam', 'Lata Mangeshkar', 'Kishore Kumar'],
    'Years_Active': [15, 20, 30, 70, 40],
    'Number_of_Songs': [500, 700, 600, 1000, 800],
    'Awards_Won': [50, 60, 40, 100, 80]
}

df = pd.DataFrame(data)

#Print size of the DataFrame
print("Size of DataFrame:", df.size)

#Print top 5 and last 2 rows
print("\nTop 5 Rows:")
print(df.head())
print("\nLast 2 Rows:")
print(df.tail(2))

#Print column data types
print("\nColumn Data Types:")
print(df.dtypes)

#Print statistical description
print("\nStatistical Description:")
print(df.describe(include='all'))

#Data Cleaning: Remove special characters from Names
df['Singer'] = df['Singer'].astype(str).apply(lambda x: re.sub(r'[^a-zA-Z ]+', '', x).strip().title() if pd.notna(x) else x)

#Handle blank data (fill missing values)
df.fillna({'Years_Active': df['Years_Active'].median(),
           'Number_of_Songs': df['Number_of_Songs'].median(),
           'Awards_Won': df['Awards_Won'].median()}, inplace=True)
df['Singer'].fillna('Unknown', inplace=True)

#Remove duplicates
df.drop_duplicates(inplace=True)

#Print cleaned DataFrame
print("\nCleaned DataFrame:")
print(df)


Size of DataFrame: 20

Top 5 Rows:
            Singer  Years_Active  Number_of_Songs  Awards_Won
0     Arijit Singh            15              500          50
1   Shreya Ghoshal            20              700          60
2       Sonu Nigam            30              600          40
3  Lata Mangeshkar            70             1000         100
4    Kishore Kumar            40              800          80

Last 2 Rows:
            Singer  Years_Active  Number_of_Songs  Awards_Won
3  Lata Mangeshkar            70             1000         100
4    Kishore Kumar            40              800          80

Column Data Types:
Singer             object
Years_Active        int64
Number_of_Songs     int64
Awards_Won          int64
dtype: object

Statistical Description:
              Singer  Years_Active  Number_of_Songs  Awards_Won
count              5      5.000000         5.000000    5.000000
unique             5           NaN              NaN         NaN
top     Arijit Singh           NaN   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Singer'].fillna('Unknown', inplace=True)
