### 1- Associations: Quantitative and Categorical Variables


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, chi2_contingency

In [None]:
# Read the students CSV into a DataFrame and preview its first five rows.
students_csv_path = "../learn_pandas/csv/students.csv"
students_df = pd.read_csv(students_csv_path)
students_df.head(5)

In [None]:
# separate out scores for students who live in urban and rural locations:
# Boolean masks on the address column: "U" = urban, "R" = rural.
lives_urban = students_df["address"] == "U"
lives_rural = students_df["address"] == "R"

# G3 scores of each address group, selected by label.
students_urban = students_df.loc[lives_urban, "G3"]
students_rural = students_df.loc[lives_rural, "G3"]

#### Mean and Median Differences


In [None]:
# Mean G3 score within each address group (trailing values are the
# figures recorded from a previous run).
scores_urban_mean = students_urban.mean()  # 10.67
scores_rural_mean = students_rural.mean()  # 9.51

# Urban-minus-rural gap in mean score.
mean_diff = scores_urban_mean - scores_rural_mean
mean_diff  # output: 1.1629034646135619

In [None]:
# Median G3 score per address group, computed in (urban, rural) order.
scores_urban_median, scores_rural_median = (
    np.median(group_scores) for group_scores in (students_urban, students_rural)
)

# Urban-minus-rural gap in median score.
median_diff = scores_urban_median - scores_rural_median
median_diff  # output: 1.0

#### Side-by-Side Box Plots


In [None]:
# One box of G3 scores per address category, drawn on a shared axis so the
# groups can be compared side by side.
address_fig, address_ax = plt.subplots()
sns.boxplot(x="address", y="G3", data=students_df, ax=address_ax)
plt.show()

#### Inspecting Overlapping Histograms


In [None]:
# Overlay the G3 distributions of the two address groups.
# density=True rescales each histogram to unit area, so the shapes can be
# compared regardless of group size; alpha=0.5 keeps the overlap visible.
plt.hist(students_urban, color="blue", label="U", density=True, alpha=0.5)
plt.hist(students_rural, color="red", label="R", density=True, alpha=0.5)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("G3")
plt.ylabel("density")
plt.title("G3 by address")
plt.legend(title="address")
plt.show()

#### Exploring Non-Binary Categorical Variables


In [None]:
# Same side-by-side comparison, this time with one G3 box for every
# category found in the Fjob column.
fjob_fig, fjob_ax = plt.subplots()
sns.boxplot(x="Fjob", y="G3", data=students_df, ax=fjob_ax)
plt.show()

### 2- Associations: Two Quantitative Variables


In [None]:
# Read the housing sample CSV into a DataFrame and preview its first ten rows.
housing_csv_path = "../learn_pandas/csv/housing_sample.csv"
housing_df = pd.read_csv(housing_csv_path)
housing_df.head(n=10)

In [None]:
# Scatter plot of sqfeet (vertical axis) against beds (horizontal axis).
beds_fig, beds_ax = plt.subplots()
beds_ax.scatter(x=housing_df["beds"], y=housing_df["sqfeet"])
beds_ax.set_xlabel("number of beds")
beds_ax.set_ylabel("area of a rental")
plt.show()

#### Exploring Covariance

Covariance can range from negative infinity to positive infinity. A positive covariance indicates that a larger value of one variable is associated with a larger value of the other. A negative covariance indicates a larger value of one variable is associated with a smaller value of the other. A covariance of 0 indicates no linear relationship.


In [None]:
# Covariance matrix of sqfeet and beds: the diagonal holds each variable's
# variance and the two off-diagonal entries hold their covariance.
cov_mat_sqfeet_beds = np.cov(housing_df.sqfeet, housing_df.beds)

# Apply the compact format (no scientific notation, one decimal) only while
# printing this matrix; np.set_printoptions would change how every later
# cell in the notebook prints arrays.
with np.printoptions(suppress=True, precision=1):
    print(cov_mat_sqfeet_beds)

In [None]:
# Read the covariance from the off-diagonal of the covariance matrix instead
# of hard-coding a number copied from printed output, so the value tracks the
# data. float() keeps it a plain Python float.
cov_sqfeet_beds = float(cov_mat_sqfeet_beds[0, 1])
cov_sqfeet_beds

#### Correlation

Pearson Correlation (often referred to simply as “correlation”) is a scaled form of covariance. Like covariance, it measures the strength of a linear relationship, but it ranges from -1 to +1, making it more interpretable.

Generally, a correlation with a magnitude larger than about .3 indicates a linear association. A correlation with a magnitude greater than about .6 suggests a strong linear association.


In [None]:
# Redraw the beds-vs-sqfeet scatter plot so it sits next to the coefficient.
corr_fig, corr_ax = plt.subplots()
corr_ax.scatter(x=housing_df["beds"], y=housing_df["sqfeet"])
corr_ax.set_xlabel("number of beds")
corr_ax.set_ylabel("area of a rental")
plt.show()

# pearsonr yields the correlation coefficient followed by its p-value;
# only the coefficient is displayed.
corr_sqfeet_beds, p = pearsonr(housing_df["sqfeet"], housing_df["beds"])
corr_sqfeet_beds

It’s important to note that there are some limitations to using correlation or covariance as a way of assessing whether there is an association between two variables. Because correlation and covariance both measure the strength of linear relationships with non-zero slopes, but not other kinds of relationships, correlation can be misleading.


In [None]:
# Read the sleep/performance CSV into a DataFrame and preview its first five rows.
sleep_csv_path = "../learn_pandas/csv/sleep_performance.csv"
sleep_df = pd.read_csv(sleep_csv_path)
sleep_df.head(5)

In [None]:
# Scatter plot of performance against hours_sleep, used to inspect the shape
# of the relationship before summarizing it with a correlation coefficient.
plt.scatter(x=sleep_df.hours_sleep, y=sleep_df.performance)
# Label the axes with the plotted columns so the figure stands on its own.
plt.xlabel("hours_sleep")
plt.ylabel("performance")
plt.show()

In [None]:
# Pearson correlation between hours_sleep and performance; the result
# unpacks into the coefficient and its p-value.
sleep_corr_result = pearsonr(sleep_df["hours_sleep"], sleep_df["performance"])
corr_sleep_performance, p = sleep_corr_result
corr_sleep_performance  # output: 0.2814978189049413
# At roughly 0.28 this is a relatively small correlation.

### 3- Associations: Two Categorical Variables


In [None]:
# Narcissistic Personality Inventory (NPI) sample. Meaning of each yes/no column:
#   influence - yes: I have a natural talent for influencing people;
#               no:  I am not good at influencing people.
#   blend_in  - yes: I prefer to blend in with the crowd;
#               no:  I like to be the center of attention.
#   special   - yes: I think I am a special person;
#               no:  I am no better or worse than most people.
#   leader    - yes: I see myself as a good leader;
#               no:  I am not sure if I would make a good leader.
#   authority - yes: I like to have authority over other people;
#               no:  I don't mind following orders.
npi_csv_path = "../learn_pandas/csv/npi_sample.csv"
npi_df = pd.read_csv(npi_csv_path)
npi_df.head(5)

#### Contingency Tables: Frequencies


In [None]:
# Frequency contingency table: rows are the values of `special`, columns are
# the values of `authority`, and each cell counts the matching respondents.
special_authority_freq = pd.crosstab(
    index=npi_df["special"],
    columns=npi_df["authority"],
)
special_authority_freq

#### Contingency Tables: Proportions


In [None]:
# Turn counts into proportions by dividing every cell by the number of rows
# in npi_df.
n_respondents = npi_df.shape[0]
special_authority_prop = special_authority_freq.div(n_respondents)
special_authority_prop

#### Marginal Proportions


In [None]:
# Marginal proportions of `authority`: summing down the rows collapses the
# `special` dimension and leaves one total per `authority` column.
authority_marginal = special_authority_prop.sum(axis="index")
authority_marginal

In [None]:
# Marginal proportions of `special`: summing across the columns collapses the
# `authority` dimension and leaves one total per `special` row.
special_marginal = special_authority_prop.sum(axis="columns")
special_marginal

#### Expected Contingency Tables


In [None]:
# chi2_contingency returns, in order, the test statistic, its p-value, the
# degrees of freedom, and the table of frequencies expected if the two
# variables were independent.
chi2, pval, dof, expected = chi2_contingency(observed=special_authority_freq)

# Show the expected counts rounded to whole numbers for easy comparison with
# the observed frequency table.
print("expected contingency table (no association):")
print(expected.round())

#### The Chi-Square Statistic

In [None]:
# Run the chi-square test of independence on the observed frequency table and
# display the test statistic.
# NOTE: by default chi2_contingency applies Yates' continuity correction when
# dof == 1 (a 2x2 table); pass correction=False for the uncorrected statistic.
chi2_test_result = chi2_contingency(special_authority_freq)
chi2, pval, dof, expected = chi2_test_result
chi2