# Алкогольная зависимость у студентов

[датасет](https://www.kaggle.com/datasets/uciml/student-alcohol-consumption)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
pd.set_option("display.max_columns", None)

In [None]:
# Load the student dataset; the relative path is resolved against the working directory.
df = pd.read_csv("student-mat.csv")
# Show the first rows as a quick sanity check of the load.
df.head()


1. school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
1. sex - student's sex (binary: 'F' - female or 'M' - male)
1. age - student's age (numeric: from 15 to 22)
1. address - student's home address type (binary: 'U' - urban or 'R' - rural)
1. famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
1. Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
1. Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
1. Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
1. Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
1. Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
1. reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')
1. guardian - student's guardian (nominal: 'mother', 'father' or 'other')
1. traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
1. studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
1. failures - number of past class failures (numeric: n if 1<=n<3, else 4)
1. schoolsup - extra educational support (binary: yes or no)
1. famsup - family educational support (binary: yes or no)
1. paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
1. activities - extra-curricular activities (binary: yes or no)
1. nursery - attended nursery school (binary: yes or no)
1. higher - wants to take higher education (binary: yes or no)
1. internet - Internet access at home (binary: yes or no)
1. romantic - with a romantic relationship (binary: yes or no)
1. famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
1. freetime - free time after school (numeric: from 1 - very low to 5 - very high)
1. goout - going out with friends (numeric: from 1 - very low to 5 - very high)
1. Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
1. Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
1. health - current health status (numeric: from 1 - very bad to 5 - very good)
1. absences - number of school absences (numeric: from 0 to 93)


In [None]:
# Recode every "yes"/"no" cell as 1/0; any other value passes through unchanged.
yes_no_codes = {"yes": 1, "no": 0}
df = df.map(lambda cell: yes_no_codes.get(cell, cell))

# Swap the parents' cohabitation codes for readable labels. A code missing
# from the lookup becomes None, because dict.get returns None for it.
pstatus_labels = {"A": "отдельно", "T": "вместе"}
df["Pstatus"] = df["Pstatus"].apply(pstatus_labels.get)

In [None]:
# G: total grade, the three period grades added together.
df["G"] = df["G1"].add(df["G2"]).add(df["G3"])
# Salc: combined alcohol score, workday plus weekend consumption.
df["Salc"] = df["Dalc"].add(df["Walc"])

## Общая информация

In [None]:
# Print column dtypes and non-null counts.
df.info()
# Author's reading of the output: no NaNs, 395 rows in total.

In [None]:
# Category frequencies for sex, family size and parents' cohabitation status.
# The first two tables are followed by a blank separator; the last one is not.
for column in ("sex", "famsize"):
    print(df[column].value_counts(), "\n")
print(df["Pstatus"].value_counts())

In [None]:
# Numeric features to inspect, one histogram panel each. An entry may also be
# a tuple of column names, in which case the columns are overlaid in one panel.
# The original list contained "famrel" twice; the duplicate slot now shows
# "goout", which was not plotted anywhere in this grid.
columns = [
    "age", "studytime", "freetime", "traveltime",
    "Medu", "Fedu", "G", "failures",
    "famrel", "absences", "goout", "health",
]
ncols = 4
# Ceiling division: as many rows as needed to fit every entry of `columns`.
nrows = -(-len(columns) // ncols)
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows), squeeze=False)
panels = list(axes.flat)
for ax, col in zip(panels, columns):
    group = (col,) if isinstance(col, str) else tuple(col)
    ax.set_title(", ".join(group))
    for name in group:
        # discrete=True centres one bar on each integer value.
        sns.histplot(data=df, x=name, discrete=True, ax=ax)
# Hide trailing panels that received no column.
for ax in panels[len(columns):]:
    ax.set_visible(False)
plt.show()

In [None]:
# Workday vs. weekend alcohol consumption, side by side on a shared y axis.
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 5))
panels = (("Dalc", "Распитие на буднях"), ("Walc", "Распитие на выходных"))
for ax, (column, title) in zip(axes, panels):
    ax.grid(alpha=0.3)
    ax.set_title(title)
    sns.histplot(data=df, x=column, discrete=True, ax=ax)
plt.show()

In [None]:

# Distribution of the combined alcohol score.
salc_ax = sns.histplot(data=df, x="Salc", discrete=True)
salc_ax.set_title("Суммарная алкогольная зависимость")
plt.show()
# Author's note: the shape resembles an exponential distribution.

In [None]:
# Scatter of total grade (x) against combined alcohol score (y).
fig, scatter_ax = plt.subplots()
scatter_ax.scatter(df["G"], df["Salc"])
scatter_ax.set_xlabel("оценка")
scatter_ax.set_ylabel("алк.")
plt.show()

# Mean total grade for each alcohol score, drawn as bars.
g_mean = df.groupby("Salc")["G"].mean()
sns.barplot(g_mean)
plt.show()
# Author's note: the means look about the same across scores.

In [None]:
# Mean study time for students without / with extra paid classes.
sns.barplot(data=df, x="paid", y="studytime") # hypothesis: the more you pay, the more you study
plt.show()
# Author's note: the two groups look roughly the same.

In [None]:
# Mean combined alcohol score for students without / with extra paid classes.
sns.barplot(data=df, x="paid", y="Salc")
plt.show()
# Author's note: roughly the same in both groups.

In [None]:
# Mean combined alcohol score per age.
sns.barplot(data=df, x="age", y="Salc")
# Correlation of age with Salc, restricted to students aged 19 or younger.
filtered = df.loc[df["age"].le(19)]
print(filtered.loc[:, ["age", "Salc"]].corr())
plt.show()
# Author's notes: bars above age 19 are not trustworthy because only a couple
# of students are that old; overall no dependence is visible.

In [None]:
# Mean combined alcohol score for each family-relationship rating.
sns.barplot(data=df, x="famrel", y="Salc")
# Correlation matrix of the two columns (DataFrame.corr default method).
print(df[["famrel", "Salc"]].corr())
plt.show()

### Оценки распределены нормально


In [None]:
# Histogram of total grades with a KDE overlay.
hist_ax = sns.histplot(data=df, x="G", discrete=True, kde=True)
hist_ax.set_title("Распределение суммарных оценок")
plt.show()

# Box plot of the same column, in a separate figure.
sns.boxplot(data=df, x="G")
plt.show()

In [None]:
# Sample mean and standard deviation of the total grade; later cells reuse
# the names `mean` and `std`.
grades = df["G"]
mean = grades.mean()
std = grades.std()
print(mean, std)

In [None]:
# Three-sigma rule: share of total grades lying strictly within 1, 2 and 3
# standard deviations of the mean.
grades = df["G"]
for k in (1, 2, 3):
    inside = grades.between(mean - k * std, mean + k * std, inclusive="neither")
    print(inside.sum() / grades.count())
# Author's note: the rule holds.

In [None]:
# Probability plot of the total grade against a normal distribution.
stats.probplot(df["G"], dist="norm", plot=plt)
plt.show()
# Author's note: the left tail is lighter, the right tail heavier.

In [None]:
# Shapiro-Wilk normality test on the total grade.
pvalue = stats.shapiro(df["G"]).pvalue
print(pvalue)
# Author's reading: p-value > 0.05, so there are no grounds to reject the
# normality hypothesis (re-check against the printed value).

In [None]:
# D'Agostino-Pearson normality test (scipy.stats.normaltest) on the total grade.
stat, p_value = stats.normaltest(df["G"])
print(f"D'Agostino: p-value={p_value:.4f}")

#### Похоже, нормально

### Студенты из семей с разведёнными родителями пьют больше
$$
H_0 = \{E(x|T) = E(x|A)\}\\
H_1 = \{E(x|T) < E(x|A)\}\\
\alpha = 0.05
$$
Проверять будем тестом Манна–Уитни, так как распределение Salc явно не является нормальным.

In [None]:
# Split students by parents' cohabitation status: together (T) vs. apart (A).
pstatus = df["Pstatus"]
data_T = df.loc[pstatus.eq("вместе")]
data_A = df.loc[pstatus.eq("отдельно")]

In [None]:
# Alcohol-score distribution in each group, as percentages so that groups of
# different size can be compared on a shared y axis.
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 4))
for panel, title, subset in zip(ax, ("Вместе", "Отдельно"), (data_T, data_A)):
    panel.set_title(title)
    sns.histplot(data=subset, x="Salc", stat='percent', discrete=True, ax=panel)
plt.show()

In [None]:
# Box plot of the combined alcohol score for each cohabitation status.
sns.boxplot(data=df, x="Pstatus", y="Salc")
plt.show()

In [None]:
# Summary statistics of Salc for students whose parents live apart.
data_A["Salc"].describe()

In [None]:
# Summary statistics of Salc for students whose parents live together.
data_T["Salc"].describe()

In [None]:
# Mann-Whitney U test. alternative='less' tests whether Salc in the first
# sample (parents together) is stochastically smaller than in the second
# sample (parents apart).
statistic, p_value = stats.mannwhitneyu(data_T["Salc"], data_A["Salc"], alternative='less')
print(p_value)

**Не отвергаем $H_0$.**

### Влияние алкоголя на успеваемость

In [None]:
# Correlations between the three alcohol measures and the total grade.
alcohol_columns = ['Dalc', 'Walc', 'Salc']
print(df[alcohol_columns + ['G']].corr())

# One scatter panel per alcohol measure, each against the total grade.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, column in zip(axes, alcohol_columns):
    sns.scatterplot(data=df, x=column, y='G', ax=ax)
plt.show()

# Split at the median alcohol score: strictly above counts as "drinkers",
# at or below the median counts as "non-drinkers".
salc_median = df['Salc'].median()
drinkers = df[df['Salc'].gt(salc_median)]
non_drinkers = df[df['Salc'].le(salc_median)]

# Two-sided independent-samples t-test on the total grade (scipy defaults).
t_stat, p_value = stats.ttest_ind(drinkers['G'], non_drinkers['G'])
print(f"t-test p-value: {p_value:.4f}")

### Гендерные различия в потреблении алкоголя

In [None]:
# Compare the combined alcohol score (Salc) between male and female students.
sns.boxplot(data=df, x='sex', y='Salc')
plt.title('Алкоголь по полу')
plt.show()

male_alc = df[df['sex'] == 'M']['Salc']
female_alc = df[df['sex'] == 'F']['Salc']

# The two groups were previously computed but never compared. Salc is a sum
# of two 1-5 ratings, so a rank-based Mann-Whitney U test is used. The test
# is two-sided because this section states no directional hypothesis.
stat, p_value = stats.mannwhitneyu(male_alc, female_alc, alternative='two-sided')
print(f"mannwhitneyu-test (M vs F): p-value = {p_value:.4f}")

### Влияние дополнительных занятий (activities) на успеваемость

In [None]:
# Compare total grades of students with and without extracurricular activities.
# `activities` holds 1/0 after the yes/no recoding done earlier in the notebook.
active = df[df['activities'] == 1]
non_active = df[df['activities'] == 0]

print("Успеваемость с доп. занятиями:", active['G'].mean())
print("Успеваемость без доп. занятий:", non_active['G'].mean())

# Significance level. The verdict printed below is derived from the p-value
# instead of being hard-coded into the message.
alpha = 0.05

# Two-sided Student t-test assuming equal variances.
t_stat, p_value = stats.ttest_ind(active['G'], non_active['G'], equal_var=True)
verdict = "Различие значимо" if p_value < alpha else "Различие не значимо"
print(f"{verdict} (t-test): p-value = {p_value:.4f}")

# One-sided Mann-Whitney U test: are grades without activities stochastically
# lower than grades with activities?
statistic, p_value = stats.mannwhitneyu(non_active['G'], active['G'], alternative='less')
verdict = "Различие значимо" if p_value < alpha else "Различие не значимо"
print(f"{verdict} (mannwhitneyu-test): p-value = {p_value:.4f}")


In [None]:
# Standard deviation of the total grade in each group, displayed as a pair.
active["G"].std(), non_active["G"].std()

### Влияние отношений в семье на алкоголь и успеваемость

In [None]:
# Correlation matrix of family-related factors with alcohol score and grade,
# computed once and used for both the printout and the heatmap.
family_vars = ['famrel', 'Medu', 'Fedu', 'famsup', 'Salc', 'G']
family_corr = df[family_vars].corr()
print(family_corr)

heatmap_ax = sns.heatmap(family_corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Корреляции семейных факторов с алкоголем и успеваемостью')
plt.show()

### Анализ пропусков занятий

In [None]:
# Relationship of school absences with alcohol score and total grade.
print("Корреляция пропусков с алкоголем:", df[['absences', 'Salc']].corr().iloc[0,1])
print("Корреляция пропусков с оценками:", df[['absences', 'G']].corr().iloc[0,1])

# Split at the median number of absences (computed once): strictly above is
# "high absence", at or below is "low absence".
absence_median = df['absences'].median()
high_absence = df[df['absences'] > absence_median]
low_absence = df[df['absences'] <= absence_median]

print("Алкоголь у часто пропускающих:", high_absence['Salc'].mean())
print("Алкоголь у редко пропускающих:", low_absence['Salc'].mean())

# One-sided Mann-Whitney U test: is Salc stochastically greater among
# students with many absences?
stat, p_value = stats.mannwhitneyu(high_absence['Salc'], low_absence['Salc'], alternative='greater')

# Derive the verdict from the p-value instead of always printing "significant".
alpha = 0.05
verdict = "Различие значимо" if p_value < alpha else "Различие не значимо"
print(f"{verdict}: p-value = {p_value:.4f}")

### Профессия учителя у родителей

In [None]:
# Boolean row masks describing which parents work as teachers.
father_teacher = df["Fjob"].eq("teacher")
mother_teacher = df["Mjob"].eq("teacher")
# `one` means at least one parent is a teacher, so it also contains `both`.
one = father_teacher | mother_teacher
both = father_teacher & mother_teacher
# Neither parent is a teacher: the complement of `one`.
zero = ~one

In [None]:
# Row mask and panel title for each group, in plotting order.
groups = ((zero, "Нет учителя"), (one, "Есть учитель"), (both, "Оба учители"))

# Grade histograms with KDE, one panel per group.
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
for panel, (mask, title) in zip(ax, groups):
    panel.set_title(title)
    sns.histplot(data=df[mask], x="G", discrete=True, kde=True, ax=panel)
plt.show()

# Grade box plots for the same groups on a shared y axis (no titles).
fig, ax = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
for panel, (mask, _title) in zip(ax, groups):
    sns.boxplot(data=df[mask], y="G", ax=panel)
plt.show()

In [None]:
def _grade_tests(lower, upper):
    """Compare total grade G between two groups.

    Returns (U statistic, Mann-Whitney p-value for "lower < upper",
    p-value of the default two-sided t-test).
    """
    mw = stats.mannwhitneyu(lower['G'], upper['G'], alternative='less')
    _, t_p = stats.ttest_ind(lower['G'], upper['G'])
    return mw.statistic, mw.pvalue, t_p


no_teacher, teacher, both_teacher = df[zero], df[one], df[both]

# No teacher in the family vs. at least one teacher.
# Author's note on this comparison: not statistically significant.
stat, p_value, t_p_value = _grade_tests(no_teacher, teacher)
print(f"no-teacher < teacher, p-value: mw {p_value}, t {t_p_value}")

# No teacher in the family vs. both parents teachers.
stat, p_value, t_p_value = _grade_tests(no_teacher, both_teacher)
print(f"no teacher < both teacher, p-value: mw {p_value}, t {t_p_value}")