# Statistical Intro

In [None]:
# Example datasets used below come from the seaborn-data repository:
# https://github.com/mwaskom/seaborn-data

In [1]:
import seaborn as sns

In [9]:
import pandas as pd

In [2]:
# List the names of the example datasets that seaborn can load.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [6]:
# Load the "mpg" example dataset into a DataFrame and preview its first rows.
df = sns.load_dataset("mpg")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [15]:
# (number of rows, number of columns)
df.shape

(398, 9)

In [19]:
# Column labels of the dataset.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [20]:
# Dtypes and non-null counts per column; horsepower has 392 non-null values
# out of 398 rows, i.e. 6 missing entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [21]:
# Keep only the object-dtype columns (here: origin and name).
df.select_dtypes(include=["object"])

Unnamed: 0,origin,name
0,usa,chevrolet chevelle malibu
1,usa,buick skylark 320
2,usa,plymouth satellite
3,usa,amc rebel sst
4,usa,ford torino
...,...,...
393,usa,ford mustang gl
394,europe,vw pickup
395,usa,dodge rampage
396,usa,ford ranger


In [24]:
# Number of distinct values in each column.
df.nunique(axis=0)

mpg             129
cylinders         5
displacement     82
horsepower       93
weight          351
acceleration     95
model_year       13
origin            3
name            305
dtype: int64

In [26]:
# Distinct values of the origin column.
df["origin"].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


# Population and Sample

In [28]:
import numpy as np

In [29]:
from scipy import stats

In [30]:
# Fix NumPy's global seed so the same population is generated on every run.
np.random.seed(101)

# 1000 integers produced by randint with bounds 0 and 100.
population = np.random.randint(low=0, high=100, size=1000)

In [33]:
# Inspect the generated population values.
population

array([95, 11, 81, 70, 63, 87, 75,  9, 77, 40,  4, 63, 40, 60, 92, 64,  5,
       12, 93, 40, 49, 83,  8, 29, 59, 34, 44, 72, 19, 10, 76, 95, 87,  0,
       73,  8, 62, 36, 83, 99, 28, 63,  7, 10, 52, 56, 38, 73, 52, 18, 71,
       15, 44,  0, 12, 17, 75, 79, 97, 93, 24, 36, 63, 19, 35, 30, 10, 60,
       20, 27,  8, 86, 26, 87, 46, 47, 54, 86,  9, 45,  2, 18, 58, 92, 11,
       10, 94, 35, 28,  3, 83, 84, 47, 14, 69, 60, 69, 51,  6, 88, 71, 68,
       23, 35, 79, 98, 67, 82, 57, 77, 46,  3, 46, 29, 86, 21, 21, 81, 23,
       94, 71, 20, 27, 75,  5, 49, 86, 89, 63, 82, 77,  3, 56, 14, 49, 87,
       52, 13, 47, 49, 24, 20, 64, 52, 60, 47, 29, 60, 53, 11, 40, 91, 45,
       97, 24, 36, 38,  9, 52, 67, 43,  1, 79, 68, 68, 61, 18, 51, 14, 28,
       17, 87, 46, 52, 16, 70, 71, 84, 10, 62, 96, 57, 23, 86, 85, 26, 76,
       66, 54, 17, 65, 57, 89,  2, 80, 50, 66, 88, 79, 93,  6, 92, 42, 22,
       20, 25, 97, 54, 71, 72, 80, 93, 64, 63, 80, 38, 45, 35, 25, 95, 75,
       72, 11, 76, 79, 50

In [35]:
# Re-seed so the draw below is reproducible, then pick 100 values from the population.
np.random.seed(51)
sample = np.random.choice(a=population, size=100)

In [36]:
# Inspect the 100 sampled values.
sample

array([79, 26, 84, 16, 92, 29, 96, 65, 45, 17,  9, 85, 33,  4, 47, 93, 19,
       14, 20, 68, 43, 40, 19, 92, 43,  8, 75, 65, 55, 46, 15, 74, 32, 59,
       87, 12, 87, 43, 19, 19, 26, 49, 57, 47,  5, 68, 52, 56, 78, 34, 11,
       95, 65, 55, 21, 38, 52, 17, 31,  5, 81,  5, 72,  8, 49, 20, 19, 35,
       27, 55, 44, 80, 18, 70, 82, 40, 76,  3, 48, 28, 47, 80, 69, 41, 41,
       84, 54, 60, 62, 16, 64, 41, 28, 77, 90, 39, 27, 16, 20, 28])

In [37]:
# Mean of the whole population: the reference value the sample means are compared against.
population.mean()

50.372

In [38]:
# Mean of the single 100-element sample; it differs noticeably from the population mean.
sample.mean()

45.8

In [39]:
# Draw 100 samples of size 100 from the population and record each sample's mean.
# A comprehension keeps the per-draw array local, so the `sample` variable defined
# in an earlier cell is not overwritten and that cell's displayed values stay in
# sync with the kernel state. The random draws happen in the same order as before.
sample_means = [np.random.choice(population, 100).mean() for _ in range(100)]
sample_means

[52.93,
 52.71,
 51.48,
 49.02,
 50.72,
 45.84,
 48.77,
 48.41,
 54.01,
 44.87,
 51.72,
 51.65,
 54.57,
 48.83,
 44.41,
 49.63,
 49.15,
 50.02,
 48.74,
 52.77,
 51.26,
 49.28,
 52.28,
 50.7,
 52.81,
 46.22,
 54.3,
 46.9,
 49.29,
 46.83,
 49.81,
 45.98,
 53.15,
 50.75,
 41.74,
 47.2,
 51.9,
 48.24,
 52.57,
 50.34,
 45.78,
 48.94,
 56.4,
 51.11,
 50.95,
 44.03,
 51.2,
 47.32,
 53.74,
 48.19,
 54.51,
 47.38,
 50.3,
 49.49,
 52.27,
 53.45,
 53.87,
 52.99,
 54.08,
 53.49,
 52.23,
 47.72,
 49.48,
 53.17,
 51.83,
 52.98,
 47.68,
 54.8,
 50.11,
 50.31,
 52.07,
 49.52,
 51.08,
 53.73,
 47.46,
 48.05,
 46.97,
 47.07,
 50.67,
 46.96,
 49.74,
 50.45,
 50.02,
 49.99,
 48.17,
 48.72,
 48.8,
 53.64,
 48.65,
 55.24,
 45.34,
 52.11,
 51.73,
 51.0,
 52.41,
 49.71,
 49.24,
 48.12,
 49.51,
 43.78]

In [40]:
# The mean of the 100 sample means (about 50.12) is closer to the population
# mean (50.372) than the mean of the single sample (45.8).
np.mean(sample_means)

50.11549999999999

# Mean, Median, and Mode

In [41]:
# Ages of a small group; the last value (121) is far larger than the rest.
age = [
    20, 22, 25, 25, 27, 27, 27, 29, 30, 31,
    121,
]

In [48]:
# Mean rounded to two decimals; the extreme value 121 pulls it well above most ages.
np.mean(age).round(2)

34.91

In [49]:
# Median: the middle value of the sorted ages (27), despite the extreme value 121.
np.median(age)

27.0

In [51]:
# SciPy provides the mode: 27 is the most frequent value and it occurs 3 times.
# keepdims=True is passed explicitly to keep the array-shaped result shown here
# and to avoid the FutureWarning SciPy emits when keepdims is left unspecified.
stats.mode(age, keepdims=True)

  stats.mode(age) #we use scipy for mode calculation


ModeResult(mode=array([27]), count=array([3]))

In [53]:
# First field of the result is the mode itself (without the count).
# keepdims=True keeps the array-shaped result and avoids SciPy's FutureWarning
# about the changing default.
stats.mode(age, keepdims=True)[0]

  stats.mode(age)[0] # only mode


array([27])

In [54]:
# Four small measurement series (D1-D4), five observations each.
data = dict(
    D1=[135, 137, 136, 138, 138],
    D2=[43, 42, 42, 42, 42],
    D3=[72, 73, 72, 72, 73],
    D4=[100, 102, 100, 103, 104],
)

In [55]:
# Build a DataFrame from the D1-D4 dictionary.
# NOTE: this rebinds `df`, which previously held the mpg dataset.
df = pd.DataFrame(data)

In [56]:
# Display the D1-D4 frame.
df

Unnamed: 0,D1,D2,D3,D4
0,135,43,72,100
1,137,42,73,102
2,136,42,72,100
3,138,42,72,103
4,138,42,73,104


In [59]:
# Mean of each column.
df.mean()

D1    136.8
D2     42.2
D3     72.4
D4    101.8
dtype: float64

In [61]:
# Mean of a single column.
df["D2"].mean()

42.2

In [62]:
# Mode of D1, returned as a Series; 138 is the only value that appears twice.
df["D1"].mode()

0    138
Name: D1, dtype: int64

In [63]:
# Median of D4.
df["D4"].median()

102.0

# Dispersion

In [64]:
# Ages without the extreme value 121.
age = [
    20, 22, 25, 25, 27,
    27, 27, 29, 30, 31,
]

In [67]:
# Standard deviation of the ages without the extreme value.
np.std(age)

3.2572994949804666

In [69]:
# Ages including 121, which is an outlier.
age2 = [
    20, 22, 25, 25, 27, 27, 27, 29, 30, 31,
    121,
]

In [70]:
# With the single outlier 121 included, the standard deviation jumps from
# about 3.26 to about 27.40.
np.std(age2)

27.400910885504736

In [71]:
# Range = largest value minus smallest value.
np.max(age2) - np.min(age2)

101

In [72]:
# variance
np.var(age2)

750.809917355372

In [73]:
from scipy.stats import iqr

In [74]:
# Interquartile range computed with scipy.stats.iqr.
iqr(age2)

4.5

In [75]:
# q1 = first quartile (25th percentile)
q1 = np.percentile(age2, 25)
q1

25.0

In [76]:
# q3 = third quartile (75th percentile)
q3 = np.percentile(age2, 75)
q3

29.5

In [77]:
# Manual interquartile range: distance between the third and first quartiles.
# Stored under its own name so the `iqr` function imported from scipy.stats is
# not shadowed (re-running `iqr(age2)` afterwards would otherwise fail).
iqr_manual = q3 - q1
iqr_manual

4.5

# Skewness and Kurtosis

In [78]:
# Kurtosis: a measure of how peaked or flat the distribution is.
stats.kurtosis(age2)

5.876609699981859

In [79]:
# Skewness; the positive value reflects the long right tail created by the outlier 121.
stats.skew(age2)

2.778572264050739