Data manipulation and analysis are key tasks in any data science or data analysis project. Pandas provides a wide range of functions for data manipulation and analysis, making it easier to clean, transform, and extract insights from data.

In [18]:
import pandas as pd

In [19]:
# Load the dataset. 'data.csv' is a relative path, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Car,Model,Volume,Weight,CO2
0,Toyoty,Aygo,1000.0,790.0,99.0
1,Mitsubishi,Space Star,1200.0,1160.0,95.0
2,Skoda,Citigo,1000.0,929.0,95.0
3,Fiat,500,900.0,865.0,90.0
4,Mini,Cooper,1500.0,1140.0,105.0


In [20]:
# Preview the last five rows; unlike the head, these include missing values.
df.tail(n=5)

Unnamed: 0,Car,Model,Volume,Weight,CO2
31,Volvo,XC70,2000.0,1746.0,
32,Ford,B-Max,1600.0,1235.0,104.0
33,BMW,216,,1390.0,108.0
34,Opel,Zafira,1600.0,1405.0,109.0
35,Mercedes,SLK,,1395.0,120.0


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. `count` only tallies non-missing values, so differing counts across
# columns are a first hint that some entries are missing.
df.describe()

Unnamed: 0,Volume,Weight,CO2
count,33.0,34.0,33.0
mean,1566.666667,1281.882353,101.909091
std,357.654396,244.941027,7.173578
min,900.0,790.0,90.0
25%,1400.0,1113.75,98.0
50%,1600.0,1327.0,99.0
75%,2000.0,1412.5,105.0
max,2100.0,1746.0,120.0


In [22]:
# Inspect the dtype pandas inferred for each column: the text columns come
# back as `object`, the numeric measurements as `float64`.
df.dtypes

Car        object
Model      object
Volume    float64
Weight    float64
CO2       float64
dtype: object

In [23]:
## Handling Missing Values
# Element-wise mask with the same shape as `df`: True marks a missing value.
# (`isna` and `isnull` are aliases of each other.)
df.isna()

Unnamed: 0,Car,Model,Volume,Weight,CO2
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [24]:
# One flag per column: True if the column has at least one missing value.
df.isna().any(axis=0)

Car       False
Model     False
Volume     True
Weight     True
CO2        True
dtype: bool

In [25]:
# Number of missing values per column (each True in the mask counts as 1).
df.isna().sum(axis=0)

Car       0
Model     0
Volume    3
Weight    2
CO2       3
dtype: int64

In [26]:
# Replace every missing value with 0. fillna returns a new frame, so `df`
# itself still contains the original NaNs.
df_filled = df.fillna(value=0)

In [27]:
## Filling missing values with the mean of the column
# The mean is computed over the non-missing entries only (Series.mean skips
# NaN by default) and stored in a new column, leaving 'Volume' unchanged.
df['Volume_fillna'] = df['Volume'].fillna(value=df['Volume'].mean())
df

Unnamed: 0,Car,Model,Volume,Weight,CO2,Volume_fillna
0,Toyoty,Aygo,1000.0,790.0,99.0,1000.0
1,Mitsubishi,Space Star,1200.0,1160.0,95.0,1200.0
2,Skoda,Citigo,1000.0,929.0,95.0,1000.0
3,Fiat,500,900.0,865.0,90.0,900.0
4,Mini,Cooper,1500.0,1140.0,105.0,1500.0
5,VW,Up!,1000.0,929.0,105.0,1000.0
6,Skoda,Fabia,1400.0,1109.0,90.0,1400.0
7,Mercedes,A-Class,1500.0,1365.0,92.0,1500.0
8,Ford,Fiesta,1500.0,1112.0,98.0,1500.0
9,Audi,A1,1600.0,1150.0,99.0,1600.0


In [29]:
# Re-check the dtypes now that a column has been added: the mean-filled
# column is float64, like the column it was derived from.
df.dtypes

Car               object
Model             object
Volume           float64
Weight           float64
CO2              float64
Volume_fillna    float64
dtype: object

In [30]:
## Renaming Columns
# rename() returns a new frame, which is bound back to `df`; from this cell
# onwards the column has to be addressed as 'Volume_1'.
df = df.rename({'Volume': 'Volume_1'}, axis='columns')
df.head(n=5)

Unnamed: 0,Car,Model,Volume_1,Weight,CO2,Volume_fillna
0,Toyoty,Aygo,1000.0,790.0,99.0,1000.0
1,Mitsubishi,Space Star,1200.0,1160.0,95.0,1200.0
2,Skoda,Citigo,1000.0,929.0,95.0,1000.0
3,Fiat,500,900.0,865.0,90.0,900.0
4,Mini,Cooper,1500.0,1140.0,105.0,1500.0


In [37]:
## Changing the data type of a column
# The column has missing values, and NaN cannot be cast to an integer, so the
# gaps are filled with the column mean first. astype(int) then drops any
# fractional part (it truncates, it does not round).
df['Volume_new'] = (
    df['Volume_1']
    .fillna(df['Volume_1'].mean())
    .astype(int)
)
df.head(n=5)


Unnamed: 0,Car,Model,Volume_1,Weight,CO2,Volume_fillna,Volume_new
0,Toyoty,Aygo,1000.0,790.0,99.0,1000.0,1000
1,Mitsubishi,Space Star,1200.0,1160.0,95.0,1200.0,1200
2,Skoda,Citigo,1000.0,929.0,95.0,1000.0,1000
3,Fiat,500,900.0,865.0,90.0,900.0,900
4,Mini,Cooper,1500.0,1140.0,105.0,1500.0,1500


In [38]:
# Derive a column holding twice `Volume_1`. Plain Series arithmetic is
# vectorised and yields the same values as an element-wise
# `.apply(lambda x: x*2)`, without a Python-level call per row; missing
# values stay NaN.
df['New Volume'] = df['Volume_1'] * 2
df.head()

Unnamed: 0,Car,Model,Volume_1,Weight,CO2,Volume_fillna,Volume_new,New Volume
0,Toyoty,Aygo,1000.0,790.0,99.0,1000.0,1000,2000.0
1,Mitsubishi,Space Star,1200.0,1160.0,95.0,1200.0,1200,2400.0
2,Skoda,Citigo,1000.0,929.0,95.0,1000.0,1000,2000.0
3,Fiat,500,900.0,865.0,90.0,900.0,900,1800.0
4,Mini,Cooper,1500.0,1140.0,105.0,1500.0,1500,3000.0


In [39]:
# Data Aggregating and Grouping
# Quick look at the frame (first five rows) before grouping it.
df.head(n=5)

Unnamed: 0,Car,Model,Volume_1,Weight,CO2,Volume_fillna,Volume_new,New Volume
0,Toyoty,Aygo,1000.0,790.0,99.0,1000.0,1000,2000.0
1,Mitsubishi,Space Star,1200.0,1160.0,95.0,1200.0,1200,2400.0
2,Skoda,Citigo,1000.0,929.0,95.0,1000.0,1000,2000.0
3,Fiat,500,900.0,865.0,90.0,900.0,900,1800.0
4,Mini,Cooper,1500.0,1140.0,105.0,1500.0,1500,3000.0


In [41]:
# Average 'Weight' per 'Car'; the result is a Series indexed by the distinct
# 'Car' values.
# NOTE(review): the printed result lists 'Hundai' and 'Hyundai' as separate
# groups, which looks like a spelling inconsistency in the data - consider
# normalising 'Car' before grouping.
grouped_mean = df.groupby(by='Car')['Weight'].mean()
print(grouped_mean)

Car
Audi          1455.000000
BMW           1486.666667
Fiat           865.000000
Ford          1274.200000
Honda         1252.000000
Hundai        1326.000000
Hyundai        980.000000
Mazda         1280.000000
Mercedes      1439.000000
Mini          1140.000000
Mitsubishi    1160.000000
Opel          1387.666667
Skoda         1143.000000
Suzuki         990.000000
Toyoty         790.000000
VW             929.000000
Volvo         1746.000000
Name: Weight, dtype: float64


In [42]:
# Total 'Weight' per ('Car', 'Model') pair; grouping on two keys produces a
# Series with a two-level index.
group_keys = ['Car', 'Model']
grouped_sum = df.groupby(by=group_keys)['Weight'].sum()
print(grouped_sum)

Car         Model     
Audi        A1            1150.0
            A4            1490.0
            A6            1725.0
BMW         1             1365.0
            216           1390.0
            5             1705.0
Fiat        500            865.0
Ford        B-Max         1235.0
            Fiesta        2224.0
            Focus         1328.0
            Mondeo        1584.0
Honda       Civic         1252.0
Hundai      I30           1326.0
Hyundai     I20            980.0
Mazda       3             1280.0
Mercedes    A-Class       1365.0
            C-Class       1365.0
            CLA           1465.0
            E-Class       1605.0
            SLK           1395.0
Mini        Cooper        1140.0
Mitsubishi  Space Star    1160.0
Opel        Astra         1330.0
            Insignia      1428.0
            Zafira        1405.0
Skoda       Citigo         929.0
            Fabia         1109.0
            Octavia       1415.0
            Rapid         1119.0
Suzuki      Swift   

In [44]:
## Aggregating with multiple functions

# Passing a list to agg() applies every function to each group and returns a
# DataFrame with one column per function name.
grouped_agg = (
    df.groupby(by='Car')['Weight']
    .agg(['min', 'max', 'sum'])
)
print(grouped_agg)

               min     max     sum
Car                               
Audi        1150.0  1725.0  4365.0
BMW         1365.0  1705.0  4460.0
Fiat         865.0   865.0   865.0
Ford        1112.0  1584.0  6371.0
Honda       1252.0  1252.0  1252.0
Hundai      1326.0  1326.0  1326.0
Hyundai      980.0   980.0   980.0
Mazda       1280.0  1280.0  1280.0
Mercedes    1365.0  1605.0  7195.0
Mini        1140.0  1140.0  1140.0
Mitsubishi  1160.0  1160.0  1160.0
Opel        1330.0  1428.0  4163.0
Skoda        929.0  1415.0  4572.0
Suzuki       990.0   990.0   990.0
Toyoty       790.0   790.0   790.0
VW           929.0   929.0   929.0
Volvo       1746.0  1746.0  1746.0


In [45]:
## Merging and Joining DataFrames

# Two small frames sharing a 'Key' column. Keys A, B and C appear in both;
# D exists only in df1 and E only in df2, so the join types give different results.
df1 = pd.DataFrame({'Key': list('ABCD'), 'Value': list(range(1, 5))})
df2 = pd.DataFrame({'Key': list('ABCE'), 'Value': list(range(5, 9))})

In [46]:
# Display the first sample frame (used as the left side of the merges).
df1

Unnamed: 0,Key,Value
0,A,1
1,B,2
2,C,3
3,D,4


In [47]:
# Display the second sample frame (used as the right side of the merges).
df2

Unnamed: 0,Key,Value
0,A,5
1,B,6
2,C,7
3,E,8


In [49]:
## Merge the dataframes on the 'Key' column
# Inner join: only keys present in both frames survive. Both frames have a
# 'Value' column, so pandas disambiguates them with the _x / _y suffixes.
df_merged = df1.merge(df2, on='Key', how='inner')
df_merged

Unnamed: 0,Key,Value_x,Value_y
0,A,1,5
1,B,2,6
2,C,3,7


In [50]:
# Outer join: keeps every key from either frame. A key missing on one side
# gets NaN there, which is why both value columns show up as floats.
df1.merge(df2, on='Key', how='outer')

Unnamed: 0,Key,Value_x,Value_y
0,A,1.0,5.0
1,B,2.0,6.0
2,C,3.0,7.0
3,D,4.0,
4,E,,8.0


In [51]:
# Left join: keeps every key of df1; keys without a match in df2 get NaN in
# the column coming from df2.
df1.merge(df2, on='Key', how='left')

Unnamed: 0,Key,Value_x,Value_y
0,A,1,5.0
1,B,2,6.0
2,C,3,7.0
3,D,4,


In [52]:
# Right join: keeps every key of df2; keys without a match in df1 get NaN in
# the column coming from df1.
df1.merge(df2, on='Key', how='right')

Unnamed: 0,Key,Value_x,Value_y
0,A,1.0,5
1,B,2.0,6
2,C,3.0,7
3,E,,8
