## Generating summary statistics using pandas and scipy

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import scipy
from scipy import stats

In [3]:
# Absolute location of the mtcars CSV inside the course workspace.
address = '/workspaces/python-for-data-science-and-machine-learning-essential-training-part-1-3006708/data/mtcars.csv'

# Read the file, swap the header for short column names, and label each row
# with its car name. drop=False keeps car_names available as a regular column
# as well as the index.
cars = (
    pd.read_csv(address)
    .set_axis(
        ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat',
         'wt', 'qsec', 'vs', 'am', 'gear', 'carb'],
        axis='columns',
    )
    .set_index('car_names', drop=False)
)
cars.head()

Unnamed: 0_level_0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Mazda RX4,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Looking at summary statistics that describe a variable's numeric values

In [4]:
# Column-wise totals over every column. No numeric_only filter is applied, so
# the string column car_names is concatenated instead of added, which is why
# the result below is reported with dtype object.
cars.sum()

car_names    Mazda RX4Mazda RX4 WagDatsun 710Hornet 4 Drive...
mpg                                                      642.9
cyl                                                        198
disp                                                    7383.1
hp                                                        4694
drat                                                    115.09
wt                                                     102.952
qsec                                                    571.16
vs                                                          14
am                                                          13
gear                                                       118
carb                                                        90
dtype: object

In [5]:
# Row-wise totals: add up the numeric columns of each car, one value per row.
cars.sum(axis='columns', numeric_only=True)

car_names
Mazda RX4              328.980
Mazda RX4 Wag          329.795
Datsun 710             259.580
Hornet 4 Drive         426.135
Hornet Sportabout      590.310
Valiant                385.540
Duster 360             656.920
Merc 240D              270.980
Merc 230               299.570
Merc 280               350.460
Merc 280C              349.660
Merc 450SE             510.740
Merc 450SL             511.500
Merc 450SLC            509.850
Cadillac Fleetwood     728.560
Lincoln Continental    726.644
Chrysler Imperial      725.695
Fiat 128               213.850
Honda Civic            195.165
Toyota Corolla         206.955
Toyota Corona          273.775
Dodge Challenger       519.650
AMC Javelin            506.085
Camaro Z28             646.280
Pontiac Firebird       631.175
Fiat X1-9              208.215
Porsche 914-2          272.570
Lotus Europa           273.683
Ford Pantera L         670.690
Ferrari Dino           379.590
Maserati Bora          694.710
Volvo 142E             288.890
dtype: float64

In [6]:
# Median of each numeric column; numeric_only=True leaves out the car_names text column.
cars.median(numeric_only=True)

mpg      19.200
cyl       6.000
disp    196.300
hp      123.000
drat      3.695
wt        3.325
qsec     17.710
vs        0.000
am        0.000
gear      4.000
carb      2.000
dtype: float64

In [8]:
# Arithmetic mean of each numeric column; the car_names text column is excluded.
cars.mean(numeric_only=True)

mpg      20.090625
cyl       6.187500
disp    230.721875
hp      146.687500
drat      3.596563
wt        3.217250
qsec     17.848750
vs        0.437500
am        0.406250
gear      3.687500
carb      2.812500
dtype: float64

In [9]:
# Largest value found in each numeric column; the car_names text column is excluded.
cars.max(numeric_only=True)

mpg      33.900
cyl       8.000
disp    472.000
hp      335.000
drat      4.930
wt        5.424
qsec     22.900
vs        1.000
am        1.000
gear      5.000
carb      8.000
dtype: float64

In [10]:
# Take the mpg column on its own, then ask for the row label (a car name,
# because the frame is indexed by car_names) where that column is largest.
mpg = cars['mpg']
mpg.idxmax()

'Toyota Corolla'

### Looking at summary statistics that describe variable distribution

In [11]:
# Standard deviation of each numeric column. No ddof is passed, so pandas'
# documented default (ddof=1, the sample standard deviation) applies.
cars.std(numeric_only=True)

mpg       6.026948
cyl       1.785922
disp    123.938694
hp       68.562868
drat      0.534679
wt        0.978457
qsec      1.786943
vs        0.504016
am        0.498991
gear      0.737804
carb      1.615200
dtype: float64

In [12]:
# Variance of each numeric column (the square of the standard deviation
# reported by cars.std for the same columns).
cars.var(numeric_only=True)

mpg        36.324103
cyl         3.189516
disp    15360.799829
hp       4700.866935
drat        0.285881
wt          0.957379
qsec        3.193166
vs          0.254032
am          0.248992
gear        0.544355
carb        2.608871
dtype: float64

In [18]:
# Number of non-missing entries in each numeric column; the car_names text column is excluded.
cars.count(numeric_only=True)

mpg     32
cyl     32
disp    32
hp      32
drat    32
wt      32
qsec    32
vs      32
am      32
gear    32
carb    32
dtype: int64

In [15]:
# Take the gear column on its own and tally how many cars share each
# distinct gear value.
gear = cars['gear']
gear.value_counts()

gear
3    15
4    12
5     5
Name: count, dtype: int64

In [19]:
# One-table overview of the numeric columns: count, mean, std, min,
# the 25%/50%/75% quantiles, and max.
cars.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0
