In [30]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.stats import zscore
from scipy.stats import iqr
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("C:/Users/go27s/OneDrive/Documents/Udemy/Statistics and Probability/Statistics/winemag-data_first150k.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [8]:
# Find Measures of Central Tendency(mean, median, mode) for columns: Points & Price
# Mean - sum of values / total no of values
# Median - middle value of the dataset(ordered). No of values is odd, median is the middle one. No of values is even, the median is the avg of two middle values.
# Mode - the value that occurs most often in the dataset

print(f"The mean of points columns is: {df['points'].mean()}") # 87 - well ranked wines
print(f"The median of points column is: {df['points'].median()}") # 88- most of the wines are highly rated(at least half)
print(f"The mode of points columns is: {df['points'].mode()}")# 87 - close to mean value

The mean of points columns is: 87.8884184721394
The median of points column is: 88.0
The mode of points columns is: 0    87
Name: points, dtype: int64


In [10]:
print(f"The mean of price columns is: {df['price'].mean()}") # 33 - avg price for a bottle of wine is $33
print(f"The median of price column is: {df['price'].median()}") #24 - at least half of my wines are sold for $24
print(f"The mode of price columns is: {df['price'].mode()}") #20 - close to median value

The mean of price columns is: 33.13148249353299
The median of price column is: 24.0
The mode of price columns is: 0    20.0
Name: price, dtype: float64


In [31]:
df[['price', 'points']].describe()

Unnamed: 0,price,points
count,137235.0,150930.0
mean,33.131482,87.888418
std,36.322536,3.222392
min,4.0,80.0
25%,16.0,86.0
50%,24.0,88.0
75%,40.0,90.0
max,2300.0,100.0


In [34]:
# Find Measures of Spread/Dispersion(Range, IQR, Variance, Standard Deviation) for columns: Points & Price
# print(f"The range of column points is: {np.ptp(df['points'])}")

# Range

df['points'].max() - df['points'].min() # 20 - it's quite spread out as the median value is 84

20

In [35]:
df['price'].max() - df['price'].min()  # 2296 -  price data is extremely spread out considering median price of $24

2296.0

In [38]:
# Standard Deviation

df['points'].std() # 3.22 - low as the data ranges from 80 -100

3.2223917589832167

In [39]:
df['price'].std() # 36.32 - much higher due to outliers

36.32253619648552

In [40]:
# Variance - how far is each of the observation from the mean

df['points'].var() # 10.38 - same pattern as std

10.38380864836295

In [41]:
df['price'].var() # 1319.32 - same pattern as std

1319.326635745001

In [23]:
# Summary statistics - Obtain Skewness & Kutrosis for column: Points
# KURTOSIS - Kurtosis is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. 
# That is, data sets with high kurtosis tend to have heavy tails, or outliers. 
# Data sets with low kurtosis tend to have light tails, or lack of outliers. A uniform distribution would be the extreme case. 

kurtosis(df['points'], fisher = False)

2.713553658335501

In [25]:
# Skewness (symmetry)

skew(df['points'])  # 0.14 - value is greater than 0 so it's left skewed. If the value is 0 -> data normaly distributed

0.14283121423675518

In [27]:
# Obtain z-score for points - distance from the mean
zscore(df['points'])

0         2.517263
1         2.517263
2         2.517263
3         2.517263
4         2.206934
            ...   
150925    0.965616
150926    0.965616
150927    0.965616
150928    0.655286
150929    0.655286
Name: points, Length: 150930, dtype: float64

In [29]:
# Obtain IQR(measure of dispersion/spread) for points. Difference between Q3(75%) and Q1(25%)
iqr(df['points'])

4.0

In [None]:
# A range is a measure of where the beginning and end are in a set, 
# an interquartile range is a measure of where the bulk of the values lie.