# Simple test recap and Correlations

In [None]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt

#for statistical tests
import statsmodels.api as sm
from scipy import stats
import pingouin as pg

#for fetching repositories from the UC Irvine Machine Learning Repository
from ucimlrepo import fetch_ucirepo 

## Recap

In [None]:
# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)

# Build the raw-content GitHub URL of the survey CSV and load it.
username = "datagus"
repository = "statstutorial2025"
directory = "week4/survey25cleaned.csv"  # path of the CSV inside the repository
github_url = (
    "https://raw.githubusercontent.com/"
    f"{username}/{repository}/main/{directory}"
)
df = pd.read_csv(github_url)

**Dataset Description**
This dataset comes from a survey given to Leuphana Bachelor students in their second semester. Their study programs are Environmental Sciences (UWI) and Global Environmental and Sustainability Studies (GESS).

- **timestamp** — time of survey completion  
- **happy1** — How happy are you?  
- **ID** — ID: First 5 letters of the street you grew up in and house number  
- **tutorial** — Tutorial: which tutorial slot are you currently in?  
- **travel** — Distance: How far did you travel today to get here (km)?  
- **direction** — Direction: Where did you come from? The direction from your home to this classroom?  
- **born** — Distance 2: How far is the distance to the place you were born (km)?  
- **sports** — How many hours of sports do you do each week?  
- **aware** — Awake: How are you now?  
- **morning_drink** — Morning_Drink: What is your favourite morning drink? (coffee, tea, etc.)  
- **breakfast** — Breakfast: Do you usually have breakfast?  
- **sleep** — Sleep: Your average hours of sleep in the past month  
- **siblings** — Siblings_Any: Do you have any siblings?  
- **siblings_number** — Siblings_Number: Number of siblings (brothers and sisters)  
- **pets** — Pets: How many pets have you had growing up?  
- **beer** — Order_Beer: In how many foreign languages could you order a beer without hesitating?  
- **coffee** — Order_Coffee: In how many foreign languages could you order a coffee without hesitating?  
- **Pineapple_Pizza** — Pineapple_Pizza: Do you think pineapple belongs on a pizza?  
- **Pinky** — Pinky: How long is your pinky finger (in cm)?  
- **OS** — Mac_or_PC: Apple or Microsoft?  
- **hand** — Hand: What is the distance between your thumb and pinky finger (in cm)? (distance 4 in the picture)  
- **energy** — Energy: How much energy do you think you still have for today?  
- **shoes** — Shoes: How many pairs of shoes do you own?  
- **phone** — Phone: How big is your phone's display (diagonal measurement in cm)?  
- **Apartment_Size** — Apartment_Size: How big is your current apartment (in m²)? Entire apartment, including all rooms.  
- **rooms** — How many roommates do you have?  
- **apps** — How many apps have you installed yourself on your phone?  
- **countries** — How many countries have you traveled to?  
- **CB** — Do you like the central building at Leuphana?  
- **Food_Money** — Food_Money: How much do you spend on food per month (estimated, in €)?  
- **season** — Season: Which season do you like the best?  
- **Drink_Coffee** — Drink_Coffee: How do you drink your coffee?  
- **height** — Height: How tall are you (in cm)?  
- **bother** — Bother: On a scale of 1 to 10, how much do these pictures bother you?  
- **happy2** — How happy are you now?  
- **tutorial_batch** — which year they took the tutorial  
- **time** — timeslot of the seminar  
- **major** — UWI or GESS  

In [None]:
# Summary of the text (object-dtype) columns only; numeric columns are excluded.
df.describe(include="object")

## Testing normality

<code>sports — How many hours of sports do you do each week?</code>

In [None]:
# First a histogram, to judge the shape of the distribution by eye
# before running a formal normality test.
sns.histplot(df["sports"], color="purple")

In [None]:
# Shapiro-Wilk test of normality.
# H0: the data are normally distributed (a small p-value rejects normality).
# Missing answers are dropped first: scipy does not remove them itself, and a
# NaN in the input would make the returned statistic and p-value unusable.
stats.shapiro(df["sports"].dropna())

In [None]:
# Now the same normality check with pingouin, which returns the result as a table.
pg.normality(df["sports"])

In [None]:
# What happens if we transform the data?
# Square-root transformation: shrinks large values more than small ones.
sports_sqrt = np.sqrt(df["sports"])

In [None]:
# Natural log transformation.
# log1p(x) = ln(1 + x) is used instead of ln(x) because a student who reports
# 0 hours of sport would otherwise become -inf and fall out of the histogram.
# This matches the log(x + 1) convention used for rooms/apps further below.
sports_log = np.log1p(df["sports"])

In [None]:
# Side-by-side histograms: raw values next to the two transformed versions.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
colors = ["yellow", "purple", "green"]

# One (values, title) pair per panel, in left-to-right order.
panels = [
    (df["sports"], "no transformation"),
    (sports_sqrt, "transformed with square root"),
    (sports_log, "transformed with natural log"),
]
for ax, color, (values, title) in zip(axes, colors, panels):
    sns.histplot(values, color=color, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Normality test with pingouin, this time on the square-root-transformed values.
pg.normality(sports_sqrt)

### Task 
Check the normality for other numeric variables, especially for Apartment_Size

## Simple tests

### Checking normality

In [None]:
# Group sizes: number of respondents per major.
df["major"].value_counts()

In [None]:
# Histogram of the square root of the number of countries travelled to.
sns.histplot(np.sqrt(df["countries"]))

In [None]:
# Normality test on the untransformed "countries" values.
# NOTE(review): the histogram in the previous cell shows sqrt(countries), while
# this test uses the raw values — confirm which version is meant to be tested.
pg.normality(df["countries"])

### Checking equal variances

In [None]:
# Box plot of "countries" per major, to compare the spread of the two groups by eye.
sns.boxplot(x="major", y="countries", data=df)

In [None]:
# Running Levene's test for equal variances of "countries" across the majors.
# H0: the groups have equal variances.
pg.homoscedasticity(data=df, dv="countries", group="major", method='levene')

### Running the t-test

In [None]:
# Split "countries" into the two majors with boolean masks, then compare
# the two samples with pingouin's t-test.
is_uwi = df["major"] == "UWI"
is_gess = df["major"] == "GESS"
group1 = df["countries"][is_uwi]
group2 = df["countries"][is_gess]

pg.ttest(x=group1, y=group2)

### Task
Run a t-test to see whether there is a significant difference in the variable `born` between students who like pineapple on pizza and those who don't.

## Chi square test

Is there a pattern between the two majors and the OS preference?

In [None]:
# Creating a contingency table (cross table): OS values as rows, majors as columns.
tab = pd.crosstab(df["OS"], df["major"])
tab

In [None]:
# Chi-square test of independence between OS and major (correction disabled).
# The test table is bound to chi2_stats rather than stats, so it does not
# overwrite the scipy `stats` module imported at the top of the notebook;
# otherwise re-running stats.shapiro(...) after this cell would fail.
expected, observed, chi2_stats = pg.chi2_independence(data=df, x="OS", y="major", correction=False)
chi2_stats

In [None]:
# Repeat the test without the respondents who answered "Neither".
# Filter first and copy afterwards, so only the kept rows are copied and
# new_df is an independent frame.
new_df = df.loc[df["OS"] != "Neither"].copy()
# Bound to chi2_stats (not stats) to keep the scipy `stats` module usable.
expected, observed, chi2_stats = pg.chi2_independence(data=new_df, x="OS", y="major", correction=False)
chi2_stats

### Task
Are the variables `Drink_Coffee` and `season` independent?

## Correlations

### What not to do

In [None]:
# Pair plot of every numeric column against every other one.
# Shown as the "what not to do" example: one panel per pair of columns.
num_df = df.select_dtypes(include=['number'])
sns.pairplot(num_df, kind="scatter", diag_kind="hist")

### Choosing two numeric variables

In [None]:
# Scatter plot of the two chosen variables, using all rows.
sns.scatterplot(data=df, x="rooms", y="apps")

In [None]:
# Histograms of the two raw variables side by side (no transformation applied).
fig, axes = plt.subplots(1, 2, figsize=(15, 4))
colors = ["pink", "purple"]

# One panel per column, left to right.
for ax, color, column in zip(axes, colors, ["rooms", "apps"]):
    sns.histplot(df[column], color=color, ax=ax)
    ax.set_title(f"distribution of {column}")

### Checking outliers in rooms and apps

In [None]:
# Show the row(s) holding the largest "rooms" value.
df[df["rooms"]==df["rooms"].max()]

In [None]:
# Show the row(s) holding the largest "apps" value.
df[df["apps"]==df["apps"].max()]

### Removing outliers in rooms and apps

In [None]:
# Keep only the rows strictly below the maximum of both columns.
# Every row tied at a maximum is removed; rows with a missing value in either
# column are removed too, because a comparison with NaN is False.
condition1 = df["apps"] < df["apps"].max()
condition2 = df["rooms"] < df["rooms"].max()
# Filter the original frame with the masks built on it, then copy, so new_df
# is an independent frame that later cells can add columns to.
new_df = df.loc[condition1 & condition2].copy()

In [None]:
# (rows, columns) before and after the filtering, to see how many rows were dropped.
df.shape, new_df.shape

In [None]:
# Same two histograms as before, now drawn from the filtered data (new_df).
fig, axes = plt.subplots(1, 2, figsize=(15, 4))
colors = ["pink", "purple"]

# One panel per column, left to right.
for ax, color, column in zip(axes, colors, ["rooms", "apps"]):
    sns.histplot(new_df[column], color=color, ax=ax)
    ax.set_title(f"distribution of {column}")

### We can test the normality of these two variables

In [None]:
# Normality test for "rooms" on the filtered data.
pg.normality(new_df["rooms"])

In [None]:
# Normality test for "apps" on the filtered data.
pg.normality(new_df["apps"])

In [None]:
# Scatter plot of rooms against apps on the filtered data.
sns.scatterplot(data=new_df, x="rooms", y="apps", color="orange")

### Running the correlation test

In [None]:
# Spearman (rank) correlation on the full data, i.e. with the outliers included.
pg.corr(df["rooms"],df["apps"], method="spearman")

In [None]:
# Spearman (rank) correlation on the filtered data, i.e. without the outliers.
pg.corr(new_df["rooms"],new_df["apps"], method="spearman")

### With log transformation

In [None]:
# Add a log(x + 1) version of each column; the +1 keeps zero values finite.
for column in ("rooms", "apps"):
    new_df[f"log_{column}"] = np.log(new_df[column] + 1)

In [None]:
# Scatter plot of the two log-transformed columns.
sns.scatterplot(data=new_df, x="log_rooms", y="log_apps", color="green")

In [None]:
# Without outliers and with the log transform.
# NOTE(review): Spearman works on ranks and log(x + 1) is strictly increasing,
# so this should give the same coefficient as the untransformed run without
# outliers; the transform would only matter for method="pearson" — confirm intent.
pg.corr(new_df["log_rooms"],new_df["log_apps"], method="spearman")

### Task
Try out other numeric variables. For example, is there a correlation between sports and sleep?

## Fetching datasets from the UC Irvine Machine Learning Repository

### Fetching the dataset

In [None]:
# Fetch the dataset by its name in the repository (needs network access).
wine_quality = fetch_ucirepo(name='Wine Quality')

In [None]:
# Abstract text from the dataset's metadata.
wine_quality.metadata.abstract

In [None]:
# Variable information shipped with the dataset.
# Presumably one row per column with its role and type — verify against the ucimlrepo docs.
wine_quality.variables

In [None]:
# Keep the "original" table as the working DataFrame.
# Presumably features and targets together in one frame — verify against the ucimlrepo docs.
wine_df = wine_quality.data.original

In [None]:
# First five rows, as a quick look at the columns and values.
wine_df.head(5)

### Producing scatterplots

In [None]:
# Scatter plot of fixed acidity against pH.
# The title names the two columns that are actually plotted; it previously
# referred to citric_acid and residual sugar, which this figure does not show.
sns.scatterplot(data=wine_df, x="fixed_acidity", y="pH")
plt.title("Correlation between fixed_acidity and pH")
plt.show()

In [None]:
# Another look at the first rows, to pick column names for the next plots.
wine_df.head()

In [None]:
# Pair plot restricted to four selected columns, so the grid stays readable.
sns.pairplot(wine_df, vars=["fixed_acidity","residual_sugar","chlorides", "pH"], kind="scatter")

In [None]:
# Spearman correlation matrix of the numeric columns, drawn as an annotated heatmap.
spearman_matrix = wine_df.corr(method="spearman", numeric_only=True)
sns.heatmap(spearman_matrix, annot=True, cmap="coolwarm", linewidths=0.5)

### Task
Create a dataframe with the correlation for each pair of numeric variables. That is, run the correlation tests with the pingouin library and collect the outputs in a single dataframe.