In [1]:
# pandas is used in this notebook to cross-check the hand-written calculations
import pandas as pd
pd.__version__  # Displayed so the reader can see which pandas version produced the outputs


'3.0.0'

Chapter 1 - Introduction to data analysis

In [2]:
# Exercise 4: Generate the data by running this cell, which creates a
# reproducible list of random numbers (salaries).
import random

SEED = 0                 # Fixed seed so every run produces the same sample
N_SALARIES = 100         # Number of observations in the sample
MAX_SALARY = 1_000_000   # Scale factor applied to random.random(), which draws from [0, 1)
ROUND_DIGITS = -3        # Negative digits round to the nearest thousand

random.seed(SEED)
salaries = [
    round(random.random() * MAX_SALARY, ROUND_DIGITS)
    for _ in range(N_SALARIES)
]

In [3]:
# Exercise 5 - Calculating statistics and verifying

In [4]:
# Mean: the arithmetic average, computed two ways so the results can be cross-checked

# Pandas version: wrap the list in a Series and let pandas average it
salary_series = pd.Series(salaries)
pandas_mean = salary_series.mean()

# Plain-Python version: the total divided by the number of observations
python_mean = sum(salaries) / len(salaries)

(python_mean, pandas_mean)

(585690.0, np.float64(585690.0))

In [5]:
# Calculating statistics - median
# The median is the middle value of the sorted data. With an odd number of
# values it is the single middle element; with an even number it is the
# average of the two middle elements.
sorted_salaries = sorted(salaries)
n = len(sorted_salaries)   # Number of observations
mid = n // 2               # Index of the (upper) middle element

# Derive the positions from n instead of hard-coding them, so the cell stays
# correct if the sample size changes.
if n % 2 == 1:
    python_median = sorted_salaries[mid]
else:
    python_median = (sorted_salaries[mid - 1] + sorted_salaries[mid]) / 2

pandas_median = salary_series.median()

python_median, pandas_median

(589000.0, np.float64(589000.0))

In [6]:
# Mode: the value(s) that occur most often in the sample

from collections import Counter

# Tally the occurrences of every distinct salary
counts = Counter(salaries)

# The largest tally is the frequency of the mode
max_count = max(counts.values())

# Keep every salary whose tally equals that frequency; ties give several modes
python_modes = [value for value in counts if counts[value] == max_count]

pandas_modes = salary_series.mode()

# Tip: counts[python_modes[0]] shows how often the mode occurs.
(python_modes, pandas_modes.tolist())


([477000.0], [477000.0])

In [7]:
# Sample variance with Bessel's correction (pure Python)
n = len(salaries)          # Sample size
mean = sum(salaries) / n   # Sample mean (average salary)

# Squared distance of every salary from the mean
squared_deviations = [(x - mean) ** 2 for x in salaries]

# Bessel's correction: dividing by (n - 1) rather than n avoids
# underestimating the variability
sample_variance_python = sum(squared_deviations) / (n - 1)

sample_variance_python

70664054444.44444

In [8]:
# Sample variance with Bessel's correction (Pandas)

# ddof=1 (Bessel's correction) is already the default of Series.var();
# it is spelled out here to make the choice visible.
#
# Both the Python and Pandas results show that the sample variance is
# approximately 70,664,054,444 squared monetary units, which quantifies
# the spread of the data around the mean.
sample_variance_pandas = salary_series.var(ddof=1)
sample_variance_pandas

np.float64(70664054444.44444)

In [9]:
# Sample standard deviation with Bessel's correction (pure Python)

import math  # Provides sqrt()

n = len(salaries)
mean = sum(salaries) / n

# Step 1: add up the squared deviations from the mean
sum_squared_deviations = sum((x - mean) ** 2 for x in salaries)

# Step 2: divide by (n - 1) (Bessel's correction), then take the square root
# to get back to the original unit of the data
sample_std_python = math.sqrt(sum_squared_deviations / (n - 1))

sample_std_python  # Typical deviation from the mean salary (in monetary units)


265827.11382484

In [10]:
# Sample standard deviation with Bessel's correction (Pandas)

# ddof=1 is the default of Series.std(); it is written out to show that the
# same (n - 1) divisor is used as in the pure-Python version.
#
# Both the Python and Pandas results show that a typical salary deviates
# from the average by approximately 266,000 monetary units.
sample_std_pandas = salary_series.std(ddof=1)
sample_std_pandas

np.float64(265827.11382484)

In [11]:
# Exercise 6 - Calculating more statistics

In [12]:
# Range with Python
#
# The range is the total spread of the data: the difference between the
# highest and the lowest salary in the sample.

# Smallest and largest value in the sample
min_salary, max_salary = min(salaries), max(salaries)

range_python = max_salary - min_salary
range_python

995000.0

In [13]:
# Range with Pandas
#
# Both results show that the range is 995,000 monetary units.
# Note: the range is easy to compute but is highly sensitive to extreme values.

series_min = salary_series.min()
series_max = salary_series.max()

range_pandas = series_max - series_min
range_pandas

np.float64(995000.0)

In [14]:
# Coefficient of variation with Python
#
# The coefficient of variation (CV) is the standard deviation divided by the
# mean. Here the standard deviation is approximately 45% of the mean, which
# indicates the relative variability of salaries in the sample.

sample_std = sample_std_python          # Sample standard deviation (with Bessel's correction)
mean = sum(salaries) / len(salaries)    # Sample mean

cv_python = sample_std / mean           # Expressed as a ratio, not a percentage
cv_python

0.45386998894439035

In [15]:
# Coefficient of variation with Pandas

# CV = sample standard deviation / sample mean
cv_pandas = salary_series.std() / salary_series.mean()

# Shown as a percentage here (the pure-Python cell shows the plain ratio)
cv_pandas_percent = 100 * cv_pandas
cv_pandas_percent

np.float64(45.38699889443903)

In [16]:
# Interquartile range with Python
#
# The interquartile range (IQR) is the spread of the middle 50% of the data:
# it shows how dispersed the typical salaries are while ignoring extreme values.

sorted_salaries = sorted(salaries)   # Quartiles are positions in the sorted data
n = len(sorted_salaries)             # Number of observations in the sample

# Pick the quartiles by index position (no interpolation between neighbours)
q1_index = n // 4                    # Position used for the 25th percentile
q3_index = 3 * n // 4                # Position used for the 75th percentile
q1 = sorted_salaries[q1_index]       # First quartile
q3 = sorted_salaries[q3_index]       # Third quartile

iqr_python = q3 - q1
iqr_python


420000.0

In [17]:
# Interquartile range with Pandas
#
# Explanation:
# The Python and Pandas IQR results are close but not exactly the same.
# Pandas interpolates between neighbouring values when computing quantiles
# ("linear" is the default and is written out below), while the pure-Python
# approach picks values at fixed index positions.
# Both methods are valid and illustrate different ways of estimating quartiles.

q1_pandas = salary_series.quantile(0.25, interpolation="linear")  # 25th percentile
q3_pandas = salary_series.quantile(0.75, interpolation="linear")  # 75th percentile

iqr_pandas = q3_pandas - q1_pandas
iqr_pandas


np.float64(413250.0)

In [18]:
# Note:
# Pandas uses the term "quantile" instead of "quartile" because quantiles
# are a general concept that can represent any percentile of the data,
# which makes the method more general and flexible.
# Quartiles are simply the specific quantiles corresponding to the 25th, 50th, and 75th percentiles.


In [19]:
# Quartile coefficient of dispersion with Python
#
# Defined as (Q3 - Q1) / (Q3 + Q1): the relative spread of the middle 50% of
# the data. It is a robust indicator of variability because it is less
# sensitive to extreme values.

sorted_salaries = sorted(salaries)
n = len(sorted_salaries)

# First and third quartile, picked by index position in the sorted data
q1, q3 = sorted_salaries[n // 4], sorted_salaries[3 * n // 4]

quartile_dispersion_python = (q3 - q1) / (q3 + q1)
quartile_dispersion_python

0.34146341463414637

In [20]:
# Quartile coefficient of dispersion (Pandas)

q1_pandas = salary_series.quantile(0.25)   # First quartile
q3_pandas = salary_series.quantile(0.75)   # Third quartile

quartile_spread = q3_pandas - q1_pandas    # Numerator: the interquartile range
quartile_sum = q3_pandas + q1_pandas       # Denominator: makes the measure relative

quartile_dispersion_pandas = quartile_spread / quartile_sum
quartile_dispersion_pandas

np.float64(0.338660110633067)

In [21]:
# Note:
# Both results are very close because the quartile coefficient
# of dispersion is a relative measure. Small differences in Q1 and Q3 tend to
# cancel out when both are used in the numerator and denominator.


In [22]:
# Exercise 7 - Scaling data

In [23]:
# Min-Max Scaling (Python)
#
# Min-max scaling rescales the data to a 0–1 range, preserving relative
# differences while normalizing the scale.

min_salary = min(salaries)
max_salary = max(salaries)
salary_span = max_salary - min_salary   # Same for every element, so computed once

min_max_scaled = [(x - min_salary) / salary_span for x in salaries]

min_max_scaled[:5]  # Show first 5 values

[0.8472361809045226,
 0.7608040201005025,
 0.4221105527638191,
 0.2592964824120603,
 0.5125628140703518]

In [25]:
# Min-Max Scaling (Pandas)

series_min = salary_series.min()
series_max = salary_series.max()

# Vectorized: the subtraction and division apply to every element of the Series
min_max_scaled_series = (salary_series - series_min) / (series_max - series_min)

min_max_scaled_series.head()

0    0.847236
1    0.760804
2    0.422111
3    0.259296
4    0.512563
dtype: float64

In [None]:
# After scaling, the minimum value becomes 0 and the maximum becomes 1.

In [28]:
# Standardizing (Python)
#
# Each value is expressed as its distance from the mean, measured in
# standard deviations.

std = sample_std_python                 # Sample standard deviation (Bessel)
mean = sum(salaries) / len(salaries)    # Sample mean

standardized = [(x - mean) / std for x in salaries]

standardized[:5]  # Show first 5 values

[0.9717217942267801,
 0.6482032533127501,
 -0.6195380058503674,
 -1.228956652688424,
 -0.28097209094033604]

In [30]:
# Standardizing (Pandas)

series_mean = salary_series.mean()
series_std = salary_series.std()    # Sample standard deviation (ddof=1 by default)

standardized_series = (salary_series - series_mean) / series_std
standardized_series.head()

0    0.971722
1    0.648203
2   -0.619538
3   -1.228957
4   -0.280972
dtype: float64

In [31]:
# Sanity check: the standardized values were divided by the sample standard
# deviation, so their own sample standard deviation should come out as 1.
import statistics
statistics.stdev(standardized)


1.0