In [11]:
import pandas as pd

# Column names for the UCI Auto MPG dataset; they are supplied here via
# `names=` rather than read from the file.
column_names = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model_year", "origin", "car_name"
]

# URL of Auto MPG dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Load dataset
df = pd.read_csv(
    url,
    sep=r'\s+',              # raw string: '\s' is a regex escape, not a string escape; splits on runs of whitespace
    names=column_names,      # assign column names
    na_values="?"            # missing values marked as '?'
)

# Save to local CSV file (index=False keeps the row index out of the file)
df.to_csv("auto_mpg.csv", index=False)

print("✅ Auto MPG dataset saved as 'auto_mpg.csv'")
print(df.head())

✅ Auto MPG dataset saved as 'auto_mpg.csv'
    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model_year  origin                   car_name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  


In [12]:
# Preview the first 50 rows of the dataset
df.head(50)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354.0,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312.0,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425.0,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190.0,3850.0,8.5,70,1,amc ambassador dpl


In [13]:
# Preview the last 10 rows of the dataset
df.tail(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
388,26.0,4,156.0,92.0,2585.0,14.5,82,1,chrysler lebaron medallion
389,22.0,6,232.0,112.0,2835.0,14.7,82,1,ford granada l
390,32.0,4,144.0,96.0,2665.0,13.9,82,3,toyota celica gt
391,36.0,4,135.0,84.0,2370.0,13.0,82,1,dodge charger 2.2
392,27.0,4,151.0,90.0,2950.0,17.3,82,1,chevrolet camaro
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,chevy s-10


In [14]:
# Structural overview: row count, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [16]:
# Same summary with custom percentiles (40th, 65th, 95th); the 50% row still appears in the output
df.describe(percentiles=[0.4,0.65,0.95])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
40%,20.0,4.0,122.0,88.0,2580.6,14.8,75.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
65%,26.0,6.0,231.0,105.0,3250.7,16.5,78.0,2.0
95%,37.03,8.0,400.0,180.0,4464.0,20.415,82.0,3.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [17]:
# Summary of a text column: count, number of unique values, most frequent value and its frequency
df['car_name'].describe()

count            398
unique           305
top       ford pinto
freq               6
Name: car_name, dtype: object

In [20]:
# Summarize every column at once; statistics that do not apply to a column's type are left empty
df.describe(include='all')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0,398
unique,,,,,,,,,305
top,,,,,,,,,ford pinto
freq,,,,,,,,,6
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864,
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055,
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0,
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0,
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0,
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0,


In [23]:
# Dimensions of the DataFrame: df.shape is a (rows, columns) tuple
shape = df.shape
n_rows, n_cols = shape
print('(Rows,Columns): ', shape)
print('Rows: ', n_rows)
print('Columns: ', n_cols)

(Rows,Columns):  (398, 9)
Rows:  398
Columns:  9


In [26]:
# Column labels, first as the pandas Index object and then as a plain Python list
print(df.columns)
col_list = [*df.columns]
print(col_list)

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'car_name'],
      dtype='object')
['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']


In [27]:
# Boolean Series marking rows that repeat an earlier row (all False in the output: no duplicates)
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
393    False
394    False
395    False
396    False
397    False
Length: 398, dtype: bool

In [29]:
import pandas as pd

# Small demo frame for duplicate handling: the same three records listed
# twice, so rows 3-5 are exact repeats of rows 0-2.
data = dict(
    Name=['Om', 'Guru', 'Raj'] * 2,
    Age=[21, 22, 23] * 2,
    City=['Jsr', 'Cut', 'Ang'] * 2,
)
df = pd.DataFrame(data)
print(df)


   Name  Age City
0    Om   21  Jsr
1  Guru   22  Cut
2   Raj   23  Ang
3    Om   21  Jsr
4  Guru   22  Cut
5   Raj   23  Ang


In [30]:
# True marks a row that exactly repeats an earlier row; the first occurrence stays False
df.duplicated()

0    False
1    False
2    False
3     True
4     True
5     True
dtype: bool

In [31]:
# Returns a new frame without the repeated rows; the result is not assigned, so df is left as it was
df.drop_duplicates()

Unnamed: 0,Name,Age,City
0,Om,21,Jsr
1,Guru,22,Cut
2,Raj,23,Ang


In [32]:
# Display df to confirm it still holds all six rows
df

Unnamed: 0,Name,Age,City
0,Om,21,Jsr
1,Guru,22,Cut
2,Raj,23,Ang
3,Om,21,Jsr
4,Guru,22,Cut
5,Raj,23,Ang


In [33]:
# Show only the duplicated rows by using the boolean mask from duplicated() as a row filter
df[df.duplicated()]

Unnamed: 0,Name,Age,City
3,Om,21,Jsr
4,Guru,22,Cut
5,Raj,23,Ang


In [34]:
# Keep the first occurrence of each duplicated row (keep='first' is the default, spelled out here)
df_first = df.drop_duplicates(keep='first')
df_first

Unnamed: 0,Name,Age,City
0,Om,21,Jsr
1,Guru,22,Cut
2,Raj,23,Ang


In [35]:
# Keep the last occurrence of each duplicated row instead of the first
# (the surviving rows carry index labels 3-5 in the output)
df_last = df.drop_duplicates(keep='last')
df_last

Unnamed: 0,Name,Age,City
3,Om,21,Jsr
4,Guru,22,Cut
5,Raj,23,Ang


In [36]:
# Drop duplicates judged only on the Name and Age columns; City is not compared
df_subset = df.drop_duplicates(subset=['Name','Age'])
df_subset

Unnamed: 0,Name,Age,City
0,Om,21,Jsr
1,Guru,22,Cut
2,Raj,23,Ang


In [37]:
# Drop every row that has a duplicate, keeping neither copy.
# The result is empty here because every row in df appears twice.
df_none = df.drop_duplicates(keep=False)
df_none

Unnamed: 0,Name,Age,City


In [39]:
# Number of duplicate rows: summing the boolean mask counts its True values
duplicate_count = df.duplicated().sum()
duplicate_count

3

In [40]:
# Handling duplicates in a single column (e.g. Name): keeps the first occurrence of each value
unique_names = df['Name'].drop_duplicates()
unique_names

0      Om
1    Guru
2     Raj
Name: Name, dtype: object

In [41]:
# Display df again: it still holds all six original rows
df

Unnamed: 0,Name,Age,City
0,Om,21,Jsr
1,Guru,22,Cut
2,Raj,23,Ang
3,Om,21,Jsr
4,Guru,22,Cut
5,Raj,23,Ang


In [3]:
import numpy as np
import pandas as pd

# Small demo frame for missing-value handling: Age and City each contain
# two NaN entries, placed so that every row has exactly one missing value.
data = dict(
    Name=['Om', 'Guru', 'Raj', 'Soura'],
    Age=[21, np.nan, 22, np.nan],
    City=[np.nan, 'Cut', np.nan, 'Bala'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Om,21.0,
1,Guru,,Cut
2,Raj,22.0,
3,Soura,,Bala


In [49]:
# Detect NaN values using isna(): True marks a missing cell
df.isna()

Unnamed: 0,Name,Age,City
0,False,False,True
1,False,True,False
2,False,False,True
3,False,True,False


In [48]:
# Count missing values per column
df.isna().sum()

Name    0
Age     2
City    2
dtype: int64

In [50]:
# Drop rows that contain any NaN value.
# The result is empty here because every row of df has a missing value.
df_drop_rows = df.dropna()
df_drop_rows

Unnamed: 0,Name,Age,City


In [52]:
# Drop rows only where NaN appears in specific columns (here: Age); NaN in other columns is kept
df_count_subset = df.dropna(subset=['Age'])
df_count_subset

Unnamed: 0,Name,Age,City
0,Om,21.0,
2,Raj,22.0,


In [53]:
# Fill NaN values with a fixed placeholder per column: 0 for Age, 'Unknown' for City.
# assign() builds a new frame, so df itself is not modified.
df_fill = df.assign(
    Age=df['Age'].fillna(0),
    City=df['City'].fillna('Unknown'),
)
df_fill

Unnamed: 0,Name,Age,City
0,Om,21.0,Unknown
1,Guru,0.0,Cut
2,Raj,22.0,Unknown
3,Soura,0.0,Bala


In [6]:
# Fill NaN values with the column mean (numeric columns only).
# mean() skips NaN, so the fill value is the average of the known ages.
age_mean = df['Age'].mean()
df_fill_mean = df.assign(Age=df['Age'].fillna(age_mean))
df_fill_mean

Unnamed: 0,Name,Age,City
0,Om,21.0,
1,Guru,21.5,Cut
2,Raj,22.0,
3,Soura,21.5,Bala


In [15]:
# Forward fill NaN values: each NaN takes the previous valid value in the column.
# The counterpart bfill() propagates the next valid value backward instead.
df_i = df.assign(Age=df['Age'].ffill())
df_i

Unnamed: 0,Name,Age,City
0,Om,21.0,
1,Guru,21.0,Cut
2,Raj,22.0,
3,Soura,22.0,Bala


In [13]:
# Interpolate NaN values (linear interpolation for numeric columns).
# The keyword is `method` (singular); the earlier `methods=` spelling is not an
# interpolate parameter, so linear was applied only because it is the default.
# Interior gaps are filled between their neighbours (21.0 and 22.0 -> 21.5);
# the trailing NaN has no later value, and in the output it takes the last
# valid value (22.0).
df_in = df.copy()
df_in['Age'] = df_in['Age'].interpolate(method='linear')
df_in

Unnamed: 0,Name,Age,City
0,Om,21.0,
1,Guru,21.5,Cut
2,Raj,22.0,
3,Soura,22.0,Bala
