# Загрузка и предобработка данных

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan
from tabulate import tabulate

# Load the raw dataset from the Excel workbook.
# The path has no directory component, so the workbook is looked up relative
# to the current working directory.
file_path = 'диплом (3).xlsx'
df = pd.read_excel(io=file_path)

In [None]:
# Keep only the rows flagged as usable (usable == 1).
df_usable = df.loc[df['usable'] == 1]

# One-hot encode the channel; drop_first=False keeps an indicator column for
# every channel value. The original 'channel' column is replaced by them.
dummies = pd.get_dummies(df_usable['channel'], dtype=int, drop_first=False)
df_dummies = pd.concat(
    [df_usable.drop(columns='channel'), dummies],
    axis=1,
)

# Restrict the frame to the columns used in the analysis: the two engagement
# counters, the imperative columns and the three channel indicators.
selected_columns = [
    'views', 'positive_reactions',
    'rational', 'authenticity', 'happiness', 'love', 'escapism', 'feeling',
    'fault', 'guilt', 'nostalgy', 'empathy', 'confidence', 'mindfullness',
    'careful', 'separation',
    '@yasno_live', '@psyalter_ru', '@zigmund_online',
]
df_clear = df_dummies[selected_columns].copy()

In [None]:
# Columns that still hold free text (whatever was written during coding) are
# turned into binary flags: 1 when the cell has a value, 0 when it is missing.
text_columns = df_clear.select_dtypes(include=['object']).columns
for column in text_columns:
    has_text = df_clear[column].notna()
    df_clear[column] = has_text.astype(int)

df_clear

Unnamed: 0,views,positive_reactions,rational,authenticity,happiness,love,escapism,feeling,fault,guilt,nostalgy,empathy,confidence,mindfullness,careful,separation,@yasno_live,@psyalter_ru,@zigmund_online
0,3770,61,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,9813,12,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0
3,60026,114,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,11642,13,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
6,4771,18,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,5978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
574,37425,107,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
575,7849,7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
576,4106,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Задача 1: описательная статистика

In [None]:
# Frame for the descriptive statistics: the imperative columns followed by the
# three channel indicator columns.
stats_columns = [
    'rational', 'authenticity', 'happiness', 'love', 'escapism', 'fault',
    'guilt', 'nostalgy', 'empathy', 'feeling', 'confidence', 'mindfullness',
    'careful', 'separation',
    '@yasno_live', '@psyalter_ru', '@zigmund_online',
]
df_for_stats = df_clear[stats_columns].copy()

# Print one nicely formatted frequency table per column.
for column, values in df_for_stats.items():
    print(f"\n          {column}          ")

    # Absolute counts and percentage shares; missing values are tallied as a
    # category of their own (dropna=False).
    counts = values.value_counts(dropna=False)
    percents = values.value_counts(dropna=False, normalize=True) * 100

    # Assemble the table that gets printed.
    table = pd.DataFrame({
        'Значение': counts.index,
        'Количество': counts.values,
        '%': percents.round(1).values,
    })

    print(tabulate(table, headers='keys', tablefmt='pretty'))



          rational          
+---+----------+------------+------+
|   | Значение | Количество |  %   |
+---+----------+------------+------+
| 0 |   0.0    |   354.0    | 76.0 |
| 1 |   1.0    |   112.0    | 24.0 |
+---+----------+------------+------+

          authenticity          
+---+----------+------------+------+
|   | Значение | Количество |  %   |
+---+----------+------------+------+
| 0 |   0.0    |   419.0    | 89.9 |
| 1 |   1.0    |    47.0    | 10.1 |
+---+----------+------------+------+

          happiness          
+---+----------+------------+------+
|   | Значение | Количество |  %   |
+---+----------+------------+------+
| 0 |   0.0    |   453.0    | 97.2 |
| 1 |   1.0    |    13.0    | 2.8  |
+---+----------+------------+------+

          love          
+---+----------+------------+------+
|   | Значение | Количество |  %   |
+---+----------+------------+------+
| 0 |   0.0    |   434.0    | 93.1 |
| 1 |   1.0    |    32.0    | 6.9  |
+---+----------+------------

In [None]:
# Coverage by imperatives, not counting the new imperatives: a post is covered
# (1) when at least one of the listed columns is non-zero.
#
# The columns are named explicitly instead of using the positional slice
# `iloc[:, :-5]`. That slice only skipped 'careful', 'separation' and the three
# channel indicators on the very first run; once this cell has appended the
# 'min' column, re-running it would shift the slice and silently pull
# 'careful' into the sum.
base_imperatives = [
    'rational', 'authenticity', 'happiness', 'love', 'escapism', 'fault',
    'guilt', 'nostalgy', 'empathy', 'feeling', 'confidence', 'mindfullness',
]
df_for_stats['min'] = (df_for_stats[base_imperatives].sum(axis=1) != 0).astype(int)
df_for_stats['min'].value_counts()

min
1    382
0     84
Name: count, dtype: int64

In [None]:
# Coverage by imperatives including the new ones ('careful', 'separation'):
# a post is covered (1) when at least one imperative column is non-zero.
#
# The columns are named explicitly. The previous positional slice
# `iloc[:, :-3]` was evaluated after a 'min' column had already been appended
# to df_for_stats, so the three trailing columns it skipped were
# '@psyalter_ru', '@zigmund_online' and 'min' — the '@yasno_live' indicator
# stayed inside the sum and every post of that channel was counted as covered.
# NOTE: counts recorded with the old slice are therefore likely overstated
# and should be regenerated.
all_imperatives = [
    'rational', 'authenticity', 'happiness', 'love', 'escapism', 'fault',
    'guilt', 'nostalgy', 'empathy', 'feeling', 'confidence', 'mindfullness',
    'careful', 'separation',
]
df_for_stats['min'] = (df_for_stats[all_imperatives].sum(axis=1) != 0).astype(int)
df_for_stats['min'].value_counts()

min
1    448
0     18
Name: count, dtype: int64

## Задача 2: регрессия

In [None]:
# Build an independent sub-sample (a copy) for each channel, selecting the
# rows whose channel indicator column equals 1.
yasno = df_clear.loc[df_clear['@yasno_live'].eq(1)].copy()
zigmund = df_clear.loc[df_clear['@zigmund_online'].eq(1)].copy()
alter = df_clear.loc[df_clear['@psyalter_ru'].eq(1)].copy()

In [None]:
# Independent variables: the same set of imperative columns for every channel.
imperative_columns = [
    'rational', 'authenticity', 'happiness', 'love', 'escapism',
    'fault', 'guilt', 'nostalgy', 'empathy', 'feeling',
    'confidence', 'mindfullness', 'careful', 'separation',
]
yasno_x = yasno[imperative_columns]
zigmund_x = zigmund[imperative_columns]
alter_x = alter[imperative_columns]

# Dependent variable: positive reactions as a percentage of views, per post.
yasno_ratio = yasno['positive_reactions'] / yasno['views']
zigmund_ratio = zigmund['positive_reactions'] / zigmund['views']
alter_ratio = alter['positive_reactions'] / alter['views']

yasno_y = (yasno_ratio * 100).astype(float)
zigmund_y = (zigmund_ratio * 100).astype(float)
alter_y = (alter_ratio * 100).astype(float)

In [None]:
# Show descriptive statistics (rounded to two decimals) for every dependent
# and independent dataset, each under a printed caption with its name.
datasets = dict(
    yasno_y=yasno_y, zigmund_y=zigmund_y, alter_y=alter_y,
    yasno_x=yasno_x, zigmund_x=zigmund_x, alter_x=alter_x,
)
for label, dataset in datasets.items():
    print(f'Описательные статистики для {label} \n')
    summary_table = dataset.describe().round(2)
    display(summary_table)

Описательные статистики для yasno_y 



count    174.00
mean       0.23
std        0.17
min        0.00
25%        0.11
50%        0.20
75%        0.28
max        1.01
dtype: float64

Описательные статистики для zigmund_y 



count    95.00
mean      0.51
std       0.41
min       0.00
25%       0.24
50%       0.46
75%       0.62
max       2.84
dtype: float64

Описательные статистики для alter_y 



count    197.00
mean       0.18
std        0.19
min        0.00
25%        0.08
50%        0.13
75%        0.20
max        1.23
dtype: float64

Описательные статистики для yasno_x 



Unnamed: 0,rational,authenticity,happiness,love,escapism,fault,guilt,nostalgy,empathy,feeling,confidence,mindfullness,careful,separation
count,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
mean,0.31,0.16,0.02,0.06,0.1,0.01,0.06,0.01,0.15,0.02,0.04,0.05,0.1,0.06
std,0.46,0.37,0.13,0.23,0.3,0.11,0.24,0.08,0.36,0.13,0.2,0.21,0.31,0.24
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Описательные статистики для zigmund_x 



Unnamed: 0,rational,authenticity,happiness,love,escapism,fault,guilt,nostalgy,empathy,feeling,confidence,mindfullness,careful,separation
count,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0
mean,0.14,0.06,0.09,0.03,0.05,0.01,0.03,0.01,0.18,0.04,0.16,0.16,0.13,0.05
std,0.35,0.24,0.29,0.18,0.22,0.1,0.18,0.1,0.39,0.2,0.37,0.37,0.33,0.22
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Описательные статистики для alter_x 



Unnamed: 0,rational,authenticity,happiness,love,escapism,fault,guilt,nostalgy,empathy,feeling,confidence,mindfullness,careful,separation
count,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0
mean,0.23,0.07,0.01,0.1,0.12,0.01,0.05,0.01,0.2,0.01,0.17,0.11,0.18,0.06
std,0.42,0.25,0.07,0.3,0.33,0.1,0.22,0.07,0.4,0.07,0.38,0.31,0.38,0.23
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Регрессия для "Ясно"

In [None]:
# Add the intercept column so that posts with no imperatives at all (every
# regressor equal to zero) are captured by the constant term.
# has_constant='add' adds the column even when the data already contains a
# constant one, so a 'const' left over from a previous run of this cell is
# dropped first; otherwise re-running the cell would stack a duplicate
# intercept onto yasno_x.
yasno_x = sm.add_constant(
    yasno_x.drop(columns='const', errors='ignore'),
    has_constant='add',
)

# Multicollinearity check on pairwise absolute correlations.
# - The threshold is applied to the unrounded values: rounding to two decimals
#   first could pull a borderline value (e.g. 0.803) back to 0.8 and hide it.
# - The diagonal is masked explicitly instead of filtering on `< 1.0`, so a
#   perfectly correlated pair of *different* regressors is still reported.
# The 'const' column has zero variance, so its correlations are NaN and never
# pass the threshold.
corr_matrix = yasno_x.corr().abs()
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
high_corr = (corr_matrix > 0.8) & off_diagonal
if high_corr.any().any():
    print("Высокие корреляции (> 0.8):")
    # Cells that fail the mask become NaN; drop them so only flagged pairs print.
    print(corr_matrix[high_corr].stack().dropna())
else:
    print("Сильной мультиколлинеарности не обнаружено")

corr_matrix.round(2)

Сильной мультиколлинеарности не обнаружено


Unnamed: 0,const,rational,authenticity,happiness,love,escapism,fault,guilt,nostalgy,empathy,feeling,confidence,mindfullness,careful,separation
const,,,,,,,,,,,,,,,
rational,,1.0,0.19,0.09,0.11,0.01,0.04,0.12,0.05,0.14,0.01,0.01,0.15,0.06,0.07
authenticity,,0.19,1.0,0.06,0.11,0.04,0.1,0.11,0.03,0.18,0.06,0.01,0.1,0.1,0.05
happiness,,0.09,0.06,1.0,0.16,0.04,0.01,0.03,0.01,0.07,0.02,0.03,0.03,0.04,0.03
love,,0.11,0.11,0.16,1.0,0.0,0.03,0.06,0.02,0.03,0.03,0.05,0.05,0.08,0.06
escapism,,0.01,0.04,0.04,0.0,1.0,0.04,0.01,0.03,0.08,0.04,0.07,0.07,0.11,0.01
fault,,0.04,0.1,0.01,0.03,0.04,1.0,0.03,0.01,0.05,0.01,0.02,0.02,0.04,0.03
guilt,,0.12,0.11,0.03,0.06,0.01,0.03,1.0,0.02,0.04,0.03,0.05,0.06,0.01,0.13
nostalgy,,0.05,0.03,0.01,0.02,0.03,0.01,0.02,1.0,0.03,0.01,0.02,0.02,0.22,0.02
empathy,,0.14,0.18,0.07,0.03,0.08,0.05,0.04,0.03,1.0,0.06,0.09,0.02,0.09,0.11


In [None]:
# Fit a linear regression (OLS) of yasno_y on yasno_x.
model_yasno = sm.OLS(endog=yasno_y, exog=yasno_x).fit()

# Check the residuals for normality with the Shapiro-Wilk test; the second
# element of the result is reported as the p-value.
residuals = model_yasno.resid
shapiro_test = stats.shapiro(residuals)
shapiro_pvalue = shapiro_test[1]
print(f"P-value тест Шапиро-Уилка: {shapiro_pvalue}")

# Check for heteroskedasticity with the Breusch-Pagan test; the second element
# of the returned tuple is reported as the p-value.
pval = het_breuschpagan(residuals, yasno_x)[1]
print(f"P-value теста Бреуша-Пагана: {pval}")

# Print the regression results.
# NOTE(review): fit() is called without a cov_type, so the summary's standard
# errors are the non-robust ones regardless of the two test results above.
print("\n Результаты регрессии:")
print(model_yasno.summary())

P-value тест Шапиро-Уилка: 3.3621067089649954e-10
P-value теста Бреуша-Пагана: 0.027470259489997356

 Результаты регрессии:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.122
Model:                            OLS   Adj. R-squared:                  0.044
Method:                 Least Squares   F-statistic:                     1.575
Date:                Sat, 24 May 2025   Prob (F-statistic):             0.0915
Time:                        20:15:37   Log-Likelihood:                 74.993
No. Observations:                 174   AIC:                            -120.0
Df Residuals:                     159   BIC:                            -72.60
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----

### Регрессия для Zigmund

In [None]:
# Add the intercept column so that posts with no imperatives at all (every
# regressor equal to zero) are captured by the constant term.
# has_constant='add' adds the column even when the data already contains a
# constant one, so a 'const' left over from a previous run of this cell is
# dropped first; otherwise re-running the cell would stack a duplicate
# intercept onto zigmund_x.
zigmund_x = sm.add_constant(
    zigmund_x.drop(columns='const', errors='ignore'),
    has_constant='add',
)

# Multicollinearity check on pairwise absolute correlations.
# The diagonal is masked explicitly instead of filtering on `< 1.0`, so a
# perfectly correlated pair of *different* regressors is still reported.
# The 'const' column has zero variance, so its correlations are NaN and never
# pass the threshold.
corr_matrix = zigmund_x.corr().abs()
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
high_corr = (corr_matrix > 0.8) & off_diagonal
if high_corr.any().any():
    print("Высокие корреляции (> 0.8):")
    # Cells that fail the mask become NaN; drop them so only flagged pairs print.
    print(corr_matrix[high_corr].stack().dropna())
else:
    print("Сильной мультиколлинеарности не обнаружено")

corr_matrix.round(2)

Сильной мультиколлинеарности не обнаружено


Unnamed: 0,const,rational,authenticity,happiness,love,escapism,fault,guilt,nostalgy,empathy,feeling,confidence,mindfullness,careful,separation
const,,,,,,,,,,,,,,,
rational,,1.0,0.02,0.13,0.1,0.09,0.04,0.07,0.04,0.11,0.08,0.17,0.17,0.15,0.04
authenticity,,0.02,1.0,0.08,0.05,0.06,0.03,0.05,0.03,0.12,0.05,0.01,0.11,0.1,0.06
happiness,,0.13,0.08,1.0,0.06,0.08,0.03,0.06,0.03,0.15,0.07,0.04,0.14,0.12,0.08
love,,0.1,0.05,0.06,1.0,0.04,0.02,0.03,0.02,0.08,0.04,0.08,0.08,0.07,0.04
escapism,,0.09,0.06,0.08,0.04,1.0,0.44,0.04,0.44,0.11,0.05,0.03,0.1,0.09,0.06
fault,,0.04,0.03,0.03,0.02,0.44,1.0,0.02,0.01,0.05,0.02,0.04,0.04,0.04,0.02
guilt,,0.07,0.05,0.06,0.03,0.04,0.02,1.0,0.02,0.08,0.04,0.08,0.08,0.11,0.04
nostalgy,,0.04,0.03,0.03,0.02,0.44,0.01,0.02,1.0,0.05,0.02,0.04,0.04,0.04,0.02
empathy,,0.11,0.12,0.15,0.08,0.11,0.05,0.08,0.05,1.0,0.1,0.2,0.2,0.01,0.11


In [None]:
# Fit a linear regression (OLS) of zigmund_y on zigmund_x.
model_zigmund = sm.OLS(endog=zigmund_y, exog=zigmund_x).fit()

# Check the residuals for normality with the Shapiro-Wilk test; the second
# element of the result is reported as the p-value.
residuals = model_zigmund.resid
shapiro_test = stats.shapiro(residuals)
shapiro_pvalue = shapiro_test[1]
print(f"P-value тест Шапиро-Уилка: {shapiro_pvalue}")

# Check for heteroskedasticity with the Breusch-Pagan test; the second element
# of the returned tuple is reported as the p-value.
pval = het_breuschpagan(residuals, zigmund_x)[1]
print(f"P-value теста Бреуша-Пагана: {pval}")

# Print the regression results.
# NOTE(review): fit() is called without a cov_type, so the summary's standard
# errors are the non-robust ones regardless of the two test results above.
print("\n Результаты регрессии:")
print(model_zigmund.summary())

P-value тест Шапиро-Уилка: 6.896806281498979e-09
P-value теста Бреуша-Пагана: 0.5731045894795979

 Результаты регрессии:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                 -0.090
Method:                 Least Squares   F-statistic:                    0.4442
Date:                Sat, 24 May 2025   Prob (F-statistic):              0.954
Time:                        20:15:48   Log-Likelihood:                -46.785
No. Observations:                  95   AIC:                             123.6
Df Residuals:                      80   BIC:                             161.9
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------

### Регрессия для Alter

In [None]:
# Add the intercept column so that posts with no imperatives at all (every
# regressor equal to zero) are captured by the constant term.
# has_constant='add' adds the column even when the data already contains a
# constant one, so a 'const' left over from a previous run of this cell is
# dropped first; otherwise re-running the cell would stack a duplicate
# intercept onto alter_x.
alter_x = sm.add_constant(
    alter_x.drop(columns='const', errors='ignore'),
    has_constant='add',
)

# Multicollinearity check on pairwise absolute correlations.
# The diagonal is masked explicitly instead of filtering on `< 1.0`, so a
# perfectly correlated pair of *different* regressors is still reported.
# The 'const' column has zero variance, so its correlations are NaN and never
# pass the threshold.
corr_matrix = alter_x.corr().abs()
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
high_corr = (corr_matrix > 0.8) & off_diagonal
if high_corr.any().any():
    print("Высокие корреляции (> 0.8):")
    # Cells that fail the mask become NaN; drop them so only flagged pairs print.
    print(corr_matrix[high_corr].stack().dropna())
else:
    print("Сильной мультиколлинеарности не обнаружено")

corr_matrix.round(2)

Сильной мультиколлинеарности не обнаружено


Unnamed: 0,const,rational,authenticity,happiness,love,escapism,fault,guilt,nostalgy,empathy,feeling,confidence,mindfullness,careful,separation
const,,,,,,,,,,,,,,,
rational,,1.0,0.05,0.04,0.05,0.09,0.07,0.02,0.13,0.12,0.04,0.01,0.07,0.13,0.08
authenticity,,0.05,1.0,0.02,0.02,0.1,0.03,0.06,0.02,0.03,0.02,0.12,0.03,0.02,0.02
happiness,,0.04,0.02,1.0,0.02,0.03,0.01,0.02,0.01,0.04,0.01,0.03,0.02,0.15,0.02
love,,0.05,0.02,0.02,1.0,0.12,0.03,0.08,0.22,0.05,0.02,0.15,0.06,0.11,0.08
escapism,,0.09,0.1,0.03,0.12,1.0,0.04,0.02,0.03,0.15,0.03,0.13,0.03,0.03,0.09
fault,,0.07,0.03,0.01,0.03,0.04,1.0,0.02,0.01,0.05,0.01,0.05,0.03,0.05,0.02
guilt,,0.02,0.06,0.02,0.08,0.02,0.02,1.0,0.02,0.0,0.02,0.02,0.08,0.01,0.15
nostalgy,,0.13,0.02,0.01,0.22,0.03,0.01,0.02,1.0,0.14,0.01,0.03,0.02,0.03,0.02
empathy,,0.12,0.03,0.04,0.05,0.15,0.05,0.0,0.14,1.0,0.04,0.16,0.17,0.14,0.01


In [None]:
# Fit a linear regression (OLS) of alter_y on alter_x.
model_alter = sm.OLS(endog=alter_y, exog=alter_x).fit()

# Check the residuals for normality with the Shapiro-Wilk test; the second
# element of the result is reported as the p-value.
residuals = model_alter.resid
shapiro_test = stats.shapiro(residuals)
shapiro_pvalue = shapiro_test[1]
print(f"P-value тест Шапиро-Уилка: {shapiro_pvalue}")

# Check for heteroskedasticity with the Breusch-Pagan test; the second element
# of the returned tuple is reported as the p-value.
pval = het_breuschpagan(residuals, alter_x)[1]
print(f"P-value теста Бреуша-Пагана: {pval}")

# Print the regression results.
# NOTE(review): fit() is called without a cov_type, so the summary's standard
# errors are the non-robust ones regardless of the two test results above.
print("\n Результаты регрессии:")
print(model_alter.summary())

P-value тест Шапиро-Уилка: 3.438369542992346e-16
P-value теста Бреуша-Пагана: 0.30587010877934023

 Результаты регрессии:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.063
Model:                            OLS   Adj. R-squared:                 -0.010
Method:                 Least Squares   F-statistic:                    0.8676
Date:                Sat, 24 May 2025   Prob (F-statistic):              0.595
Time:                        20:16:00   Log-Likelihood:                 52.718
No. Observations:                 197   AIC:                            -75.44
Df Residuals:                     182   BIC:                            -26.19
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-------