In [None]:
# Algemeen

# Installing packages
# Each %pip line magic below upgrades one package in the environment of the running kernel.
# NOTE(review): no versions are pinned, so a re-run may pull in different releases — consider pinning.

%pip install --upgrade matplotlib
%pip install --upgrade scipy
%pip install --upgrade statsmodels
%pip install --upgrade scikit-learn

# Importing the necessary packages

import numpy as np                                  # "Scientific computing"
import scipy.stats as stats                         # Statistical tests

import pandas as pd                                 # Data Frame
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt                     # Basic visualisation
from statsmodels.graphics.mosaicplot import mosaic  # Mosaic diagram
import seaborn as sns                               # Advanced data visualisation

# Reading a file

# Location of the CSV file that is loaded into a DataFrame
csv_url = 'https://raw.githubusercontent.com/HoGentTIN/dsai-labs/main/data/Cats.csv'
# NOTE(review): despite its name, `filename` holds the loaded DataFrame, not a path.
# NOTE(review): the cells further down use a DataFrame called `titanic`, which is
#   not created by this cell — confirm where it is supposed to come from.
filename = pd.read_csv(csv_url)
# Preview the first few records of the DataFrame as the cell output
filename.head()

# Hoofdstuk 1 Theorie

## 1.1 Measurement Levels

### 1.1.1 Qualitative scales

#### Nominal  
Categories (gender, race, country, shape)  

#### Ordinal  
Order, Rank (Military Rank, level of education)  


### 1.1.2 Quantitative scales

#### Interval
No fixed zero point => no meaningful proportions (ratios) between values (°C, °F)

#### Ratio
Absolute zero point => proportions (ratios) between values are meaningful (distance, energy, weight)

## 1.2 Causal Relationships

Examples:  
- Frustration leads to aggression  
- Alcohol leads to decreased alertness  

Cause => Independent variable  
Consequence => Dependent variable

**A relationship between variables does not necessarily indicate a causal relation!**  

Examples:  
- Violent video games lead to violent behaviour  
- Vaccines can cause autism  
- Relationship between diet cola ("cola light") and obesity  

## 1.3 Sample & Population

### Sample

a subset of the population from which measurements can be taken

### Population

the collection of all objects/people that need to be investigated

### Sampling Errors

Accidental sampling errors:
- Pure coincidence

Systematic sampling errors:
- Online survey: people without internet are excluded
- Street survey: only people who happen to be there at that moment
- Voluntary survey: only interested parties

### Non Sampling Errors

Accidental non-sampling errors:
- Incorrectly ticked answers

Systematic non-sampling errors:
- Poor or non-calibrated measuring equipment
- Value can be influenced by the fact that you measure
- Respondents lie






# Hoofdstuk 1 code

In [None]:
# PROPERTIES

# Divider line that separates the sections of the printed output
divider = "*" * 50
# shape is a (rows, columns) tuple, so both counts can be unpacked from it
row_count, column_count = titanic.shape

print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print(f"The shape of the Data Frame is: {titanic.shape}")

# General information about the DataFrame (info() prints its report itself)
print(divider)
titanic.info()

# Data type of each column
print(divider)
print(titanic.dtypes)

# Number of columns per data type
#   Note: the book suggests get_dtype_counts(), but that method no longer exists
print(divider)
print(titanic.dtypes.value_counts())


In [None]:
# SET COLUMN AS INDEX (INDICES)

# set_index() returns a NEW DataFrame and leaves `titanic` unchanged, so the result
# is kept under its own name instead of being discarded. `titanic` itself is not
# overwritten: the cell stays re-runnable and the PassengerId column remains
# available to other cells.
titanic_by_id = titanic.set_index(['PassengerId'])
# Show the re-indexed frame as the cell output
titanic_by_id

In [None]:
# QUALITATIVE VARIABLES THAT ARE INCORRECTLY CONSIDERED AS QUANTITATIVE VARIABLES

# Summary of Survived as loaded -> treated as a quantitative variable
print(titanic['Survived'].describe())

# Re-type the column as categorical
titanic['Survived'] = titanic['Survived'].astype('category')

# Same summary after the conversion -> now treated as a qualitative variable
print(titanic['Survived'].describe())


In [None]:
 MARK VARIABLE AS ORDINAL

print(titanic.Embarked.unique())

embarked_type = CategoricalDtype(categories=['S', 'C', 'Q'], ordered=True)
titanic.Embarked = titanic.Embarked.astype(embarked_type)
titanic.Embarked.describe()

In [None]:
# MAKE A COUNTPLOT

# One bar per Embarked value showing how many rows have it;
# the trailing ';' keeps the Axes repr out of the cell output
sns.countplot(x='Embarked', data=titanic);

# Hoofdstuk 7
Time Series = Een sequentie van observaties van een bepaalde variabele over een bepaalde tijd

Level = de waarde op een specifiek tijdstip  
Seasonal effect = Veranderingen op frequente basis (voorbeeld: Stijging voorjaar, dipje in zomer, daarna daling in najaar)  
Random noise = onvoorspelbare veranderingen (covid 19 bijvoorbeeld)  

# Type modellen
**1. Simple model without a linear relationship**  
$X_t = b + \varepsilon_t$

$X_t$: estimate for the time series at time $t$  
$b$: the level (a constant), based on the observations $x_t$  
$\varepsilon_t$: random noise. We assume that $\varepsilon_t \sim Nor(\mu = 0; \sigma)$


**2. Simple model with linear relationship**

We could also assume that there is a linear relationship:  
$X_t = b_0 + b_1 t + \varepsilon_t$  
with level $b_0$ and trend $b_1$.  
Equations 1 and 2 are special cases of the polynomial model:  
$X_t = b_0 + b_1 t + b_2 t^2 + \dots + b_n t^n + \varepsilon_t$  

