In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import math

In [19]:
df = pd.read_csv('song_data.csv')
rating = df['song_popularity']
danceability = df['danceability']


Воспользуемся корреляцией Пирсона

- $X$ - рейтинг песни
- $Y$ - танцевальность

$H_0: \rho(X, Y) = 0$

$H_1: \rho(X, Y) \neq 0$

$$ r_{x y} = \frac{\sum (X_i - \overline{X}) (Y_i - \overline{Y}))}{\sqrt{\sum (X_i -  \overline{X})^2 (Y_i -  \overline{Y})^2}} $$

$$ \frac{r_{x y} \sqrt{n - 2}}{\sqrt{1 - r^2_{x y}}} \sim T(n - 2)$$

In [20]:
def pearson_correlation(x, y):
    n = len(x)
    mean_x = sum(x) / n
    mean_y = sum(y) / n

    numerator = sum((x[i] - mean_x) * (y[i] - mean_y) for i in range(n))

    denominator = math.sqrt(
        sum((x[i] - mean_x) ** 2 for i in range(n)) *
        sum((y[i] - mean_y) ** 2 for i in range(n))
    )

    r = numerator / denominator
    _df = n - 2
    t_stat = r * math.sqrt(_df / (1 - r**2))

    _p_value = 2 * stats.t.sf(abs(t_stat), _df)

    return r, p_value


In [21]:
corr, p_value = pearson_correlation(rating, danceability)
print(corr, p_value)
alpha = 0.05
if p_value > alpha:
    print("Accept")
else:
    print("Reject")

0.10429014168024182 4.251510400487118e-44
Reject


Воспользуемся корреляцией Спирмена

$H_0: \rho(X, Y) = 0$

$H_1: \rho(X, Y) \neq 0$



In [22]:
corr, p_value = stats.spearmanr(rating, danceability)
print(corr, p_value)
alpha = 0.05
if p_value > alpha:
    print("Accept")
else:
    print("Reject")

0.10123555201396331 4.251510400487118e-44
Reject
