# Pandas: basic processing and data cleaning

In [1]:
import pandas as pd
import numpy as np

## Basic processing exercises

* Load the dataset cantons.csv

In [2]:
# Read the Swiss cantons table from the working directory and preview
# its first five rows.
df_cant = pd.read_csv(filepath_or_buffer="cantons.csv")
df_cant.head(n=5)

Unnamed: 0,Code,Name,Population,Area
0,ZH,Zurich,1539275,1729
1,BE,Bern,1039474,5960
2,LU,Lucerne,413120,1494
3,UR,Uri,36703,1077
4,SZ,Schwyz,160480,908


* Add a column named **Density** containing the population density (Population/Area). NOTE: the variable Area is in km², so Density is expressed in inhabitants per km².

In [3]:
# Population density: Population divided by Area, computed row by row
# and stored as a new "Density" column.
df_cant["Density"] = df_cant["Population"].div(df_cant["Area"])
df_cant.head(n=5)

Unnamed: 0,Code,Name,Population,Area,Density
0,ZH,Zurich,1539275,1729,890.268942
1,BE,Bern,1039474,5960,174.408389
2,LU,Lucerne,413120,1494,276.519411
3,UR,Uri,36703,1077,34.078923
4,SZ,Schwyz,160480,908,176.740088


* Sort the Swiss cantons by decreasing density of population

In [4]:
# Order the cantons from highest to lowest population density.
# The sorted frame is rebound to df_cant instead of using inplace=True:
# sort_values then returns a new DataFrame, the original object is not
# mutated, and re-running this cell yields the same result.
df_cant = df_cant.sort_values(by="Density", ascending=False)
df_cant.head(5)

Unnamed: 0,Code,Name,Population,Area,Density
11,BS,Basel-City,201469,37,5445.108108
24,GE,Geneva,504128,282,1787.687943
0,ZH,Zurich,1539275,1729,890.268942
8,ZG,Zug,127642,239,534.066946
18,AG,Aarau,685845,1404,488.49359


* Compute the mean of Population, the maximum of Area, and the minimum of Density

In [5]:
# Three summary statistics displayed together as one tuple:
# (mean of Population, maximum of Area, minimum of Density).
(
    df_cant["Population"].mean(),
    df_cant["Area"].max(),
    df_cant["Density"].min(),
)

(320866.8076923077, 7105, 28.01140042223786)

* Which is the largest Swiss canton (i.e., the one with the maximum Area)?

In [6]:
# Keep only the row(s) whose Area equals the largest Area in the table.
df_cant.loc[df_cant["Area"].eq(df_cant["Area"].max())]

Unnamed: 0,Code,Name,Population,Area,Density
17,GR,Grisons,199021,7105,28.0114


## Basic cleaning/processing exercises

* Load the dataset auto.csv

In [7]:
# Read the cars table from the working directory and preview its first
# five rows.
df_auto = pd.read_csv(filepath_or_buffer="auto.csv")
df_auto.head(n=5)

Unnamed: 0,brand,fuel type,aspiration,body style,engine location,length,width,height,weight,engine capacity,compression ratio,horsepower,max rpm,city mpg,highway mpg,num cylinders,drive wheels
0,audi,gasoline,std,sedan,front,,168.7,137.9,1281.0,2.2286,8.0,115.0,5500.0,7.652,9.353,5,4wd
1,audi,gasoline,turbo,hatchback,front,452.6,172.5,132.1,1384.8,2.1467,7.0,160.0,5500.0,6.802,9.353,5,4wd
2,subaru,gasoline,std,hatchback,front,399.5,162.1,141.5,1016.1,1.7698,8.7,73.0,4400.0,11.053,13.179,4,4wd
3,subaru,gasoline,std,sedan,front,436.9,166.1,137.9,1081.8,1.7698,9.0,82.0,4800.0,10.203,10.628,4,4wd
4,subaru,gasoline,turbo,sedan,front,436.9,166.1,137.9,1138.5,1.7698,7.7,111.0,4800.0,10.203,12.329,4,4wd


* How many columns have missing values?

In [8]:
# Flag every column that contains at least one missing value, then tally
# the flags: the count for True is the number of columns with missing data.
df_auto.isna().any(axis="index").value_counts()

True     12
False     5
dtype: int64

* How many rows have missing values?

In [9]:
# Flag every row that contains at least one missing value, then tally
# the flags: the count for True is the number of rows with missing data.
df_auto.isna().any(axis=1).value_counts()

False    200
True       3
dtype: int64

* Imagine you need a dataset without missing values for your analysis. Would you prefer to get rid of the rows with missing values or the columns with missing values? Why?

I'd rather get rid of the 3 rows with missing values, so I can keep all the columns for the remaining 200 rows.

* Create a new dataframe ``df_auto_nomissing`` containing all the rows of df_auto without missing values

In [10]:
# Build a separate frame holding only the complete rows: any row with at
# least one missing value is dropped. df_auto itself is left unchanged.
df_auto_nomissing = df_auto.dropna(axis="index", how="any")
df_auto_nomissing.head(n=5)

Unnamed: 0,brand,fuel type,aspiration,body style,engine location,length,width,height,weight,engine capacity,compression ratio,horsepower,max rpm,city mpg,highway mpg,num cylinders,drive wheels
1,audi,gasoline,turbo,hatchback,front,452.6,172.5,132.1,1384.8,2.1467,7.0,160.0,5500.0,6.802,9.353,5,4wd
2,subaru,gasoline,std,hatchback,front,399.5,162.1,141.5,1016.1,1.7698,8.7,73.0,4400.0,11.053,13.179,4,4wd
3,subaru,gasoline,std,sedan,front,436.9,166.1,137.9,1081.8,1.7698,9.0,82.0,4800.0,10.203,10.628,4,4wd
4,subaru,gasoline,turbo,sedan,front,436.9,166.1,137.9,1138.5,1.7698,7.7,111.0,4800.0,10.203,12.329,4,4wd
5,subaru,gasoline,std,wagon,front,440.9,166.1,139.4,1097.7,1.7698,9.0,82.0,4800.0,9.778,12.329,4,4wd


* Find the different brands of cars in the dataset.

In [11]:
# Distinct values of the "brand" column, in order of first appearance.
df_auto.loc[:, "brand"].unique()

array(['audi', 'subaru', 'toyota', 'chevrolet', 'dodge', 'honda', 'isuzu',
       'mazda', 'mitsubishi', 'nissan', 'plymouth', 'saab', 'volkswagen',
       'alfa-romeo', 'bmw', 'jaguar', 'mercedes-benz', 'mercury',
       'peugeot', 'porsche', 'volvo'], dtype=object)

In [12]:
# Distinct values of the "drive wheels" column, in order of first appearance.
df_auto.loc[:, "drive wheels"].unique()

array(['4wd', 'fwd', 'rwd'], dtype=object)

* Compute common statistics (min, max, mean, std, ...) for all the numeric columns of df_auto

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of df_auto;
# the quartiles are spelled out explicitly and match describe()'s defaults.
df_auto.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,length,width,height,weight,engine capacity,compression ratio,horsepower,max rpm,city mpg,highway mpg,num cylinders
count,201.0,202.0,202.0,203.0,202.0,202.0,202.0,202.0,202.0,202.0,203.0
mean,442.339303,167.466337,136.487129,1159.362069,2.084149,10.16,104.534653,5125.49505,10.685168,13.025465,4.384236
std,31.062479,5.388569,6.221359,237.324998,0.683186,3.998689,39.613823,480.522107,2.722876,2.872592,1.085525
min,367.3,157.0,121.4,675.0,1.1471,7.0,52.0,4150.0,5.527,6.802,2.0
25%,422.4,162.8,132.1,973.0,1.5936,8.525,70.0,4800.0,8.078,10.628,4.0
50%,439.9,166.4,137.4,1095.0,1.95825,9.0,95.0,5200.0,10.203,12.754,4.0
75%,466.1,169.9,141.0,1335.15,2.359725,9.4,116.0,5500.0,12.754,14.455,4.0
max,528.6,183.6,151.9,1844.3,5.3422,23.0,288.0,6600.0,20.832,22.957,12.0
