# **PART 1: Python Basics**


## Variables and Data Types

In [1]:
# One example value for each of the basic built-in scalar types
x, pi = 10, 3.14                   # int, float
name, is_student = "Alice", True   # str, bool

In [2]:
# Show every variable next to its own name
for label, value in (("x", x), ("pi", pi), ("name", name), ("is_student", is_student)):
    print(f"{label}:", value)

x: 10
pi: 3.14
name: Alice
is_student: True


In [3]:
# Report the runtime type of each variable, in definition order
for value in (x, pi, name, is_student):
    print(type(value))

<class 'int'>
<class 'float'>
<class 'str'>
<class 'bool'>


In [5]:
# The four built-in collection types
my_list = [1, 2, 3, 4]                           # list literal: square brackets
my_tuple = (5, 6, 7, 8)                          # tuple literal: parentheses
my_dict = {"key1": "value1", "key2": "value2"}   # dict literal: key/value pairs in braces
my_set = {9, 10, 11}                             # set literal: bare values in braces

# Print each collection with a label naming its type
for label, collection in (
    ("List", my_list),
    ("Tuple", my_tuple),
    ("Dictionary", my_dict),
    ("Set", my_set),
):
    print(f"{label}:", collection)

List: [1, 2, 3, 4]
Tuple: (5, 6, 7, 8)
Dictionary: {'key1': 'value1', 'key2': 'value2'}
Set: {9, 10, 11}


## Conditional Statements (if-else)


In [6]:
# Three-way comparison of x against 10: exactly one branch runs
x = 15
if x < 10:
    print("x is less than 10")
elif x == 10:
    print("x is equal to 10")
else:
    # neither smaller nor equal, so x must be larger
    print("x is greater than 10")

x is greater than 10


## Loops

In [8]:
# For loop: range(0, 5) yields 0, 1, 2, 3, 4 (the stop value is excluded)
for i in range(0, 5):
    print(f"For loop iteration: {i}")

For loop iteration: 0
For loop iteration: 1
For loop iteration: 2
For loop iteration: 3
For loop iteration: 4


In [9]:
# While loop: repeat until the condition becomes false
counter = 0
while counter < 5:
    print(f"While loop iteration: {counter}")
    counter = counter + 1  # without this update the loop would never end

While loop iteration: 0
While loop iteration: 1
While loop iteration: 2
While loop iteration: 3
While loop iteration: 4


## Functions

In [10]:
# Define a simple function
def greet(name):
    """Return the greeting "Hello, <name>!" for the given name."""
    return "Hello, {}!".format(name)


# Call the function and show the returned string
print(greet("Alice"))

Hello, Alice!


## Run a script

In [47]:
# Example of writing a script (this part would normally go in a .py file).
# Save this content as script.py, then run it in the terminal with:
#     python script.py
# __name__ equals "__main__" only for code that is executed directly, so the
# guarded print is skipped when the file is imported as a module. A notebook
# cell also runs with __name__ == "__main__", which is why the message is
# printed below.
if __name__ == "__main__":
    print("This script is being run directly.")

This script is being run directly.


# PART 2: Python Libraries for Data Analysis

## NUMPY

In [11]:
# Importing the NumPy library
import numpy as np

### 1. Create Arrays

In [42]:
# Build a one-dimensional array from a plain Python list
array_1d = np.array([1, 2, 3, 4, 5])
print(f"1D Array: {array_1d}")
print(f"Shape:  {array_1d.shape}")          # tuple with the length of each axis
print(f"Num dimension:  {array_1d.ndim}")   # number of axes

1D Array: [1 2 3 4 5]
Shape:  (5,)
Num dimension:  1


In [43]:
# Build a two-dimensional array from a list of equally long rows
rows = [[1, 2, 3], [4, 5, 6]]
array_2d = np.array(rows)
print("\n2D Array:\n", array_2d)
print(f"Shape:  {array_2d.shape}")          # (number of rows, number of columns)
print(f"Num dimension:  {array_2d.ndim}")   # number of axes


2D Array:
 [[1 2 3]
 [4 5 6]]
Shape:  (2, 3)
Num dimension:  2


In [46]:
# Array factories: constant-filled arrays and an evenly spaced range
zeros_array = np.zeros(shape=(2, 3))   # 2 rows x 3 columns of 0.0
ones_array = np.ones(shape=(3, 2))     # 3 rows x 2 columns of 1.0
range_array = np.arange(0, 10, 2)      # 0 up to (not including) 10 in steps of 2
print("\nZeros Array:\n", zeros_array)
print("\nOnes Array:\n", ones_array)
print("\nRange Array:", range_array)

# Report shape, then number of axes, for each array under its variable name
named_arrays = {
    "zeros_array": zeros_array,
    "ones_array": ones_array,
    "range_array": range_array,
}
for label, arr in named_arrays.items():
    print(f"Shape {label}: ", arr.shape)
print("\n")

for label, arr in named_arrays.items():
    print(f"Num dimension {label}: ", arr.ndim)


Zeros Array:
 [[0. 0. 0.]
 [0. 0. 0.]]

Ones Array:
 [[1. 1.]
 [1. 1.]
 [1. 1.]]

Range Array: [0 2 4 6 8]
Shape zeros_array:  (2, 3)
Shape ones_array:  (3, 2)
Shape range_array:  (5,)


Num dimension zeros_array:  2
Num dimension ones_array:  2
Num dimension range_array:  1


In [36]:
# dtype reports the single element type shared by every entry of the array
array_1d.dtype

dtype('int64')

In [37]:
# NumPy arrays are homogeneous: inputs of mixed types are NOT kept as-is but
# coerced to one common dtype -- here every element becomes a string
np.array(['Cat', 1, True])

array(['Cat', '1', 'True'], dtype='<U21')

In [38]:
# shape is a tuple holding the length of each axis; a 1-D array has one entry
array_1d.shape

(5,)

### 2. Array Operations

In [19]:
# Arithmetic between two equally shaped arrays is applied element by element
array_a = np.array([1, 22, 3])
array_b = np.array([37, 6, 15])

elementwise_results = (
    ("Addition", array_a + array_b),
    ("Subtraction", array_a - array_b),
    ("Multiplication", array_a * array_b),
    ("Division", array_a / array_b),   # true division: the result is float
)
for label, result in elementwise_results:
    print(f"{label}:", result)

Addition: [38 28 18]
Subtraction: [-36  16 -12]
Multiplication: [ 37 132  45]
Division: [0.02702703 3.66666667 0.2       ]


In [21]:
# Arithmetic between an array and a scalar applies the scalar to every element
array = np.array([1, 22, 3])
scalar = 5

scalar_results = (
    ("Addition", array + scalar),
    ("Subtraction", array - scalar),
    ("Multiplication", array * scalar),
    ("Division", array / scalar),   # true division: the result is float
)
for label, result in scalar_results:
    print(f"{label}:", result)

Addition: [ 6 27  8]
Subtraction: [-4 17 -2]
Multiplication: [  5 110  15]
Division: [0.2 4.4 0.6]


In [None]:
# Exercise: what is the difference from the previous cell? Can I obtain the same results using an array instead of the scalar? If yes, which array?

**Dot Product**

The dot product of two vectors $a = [a_1, a_2, \dots, a_n]$ and $b = [b_1, b_2, \dots, b_n]$, specified with respect to an orthonormal basis, is defined as:

$$ a \cdot b = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n $$



In [25]:
# For 1-D arrays the @ operator computes the same inner product as np.dot
dot_product = array_a @ array_b
print("\nDot Product:", dot_product)


Dot Product: 214


### 3. Statistical Functions

**Sum and Max**

In [27]:
# Sample data reused by the statistics cells
array_stats = np.array([10, 20, 30, 40, 50])

# Reductions are also available as array methods
print("Sum:", array_stats.sum())
print("Maximum:", array_stats.max())

Sum: 150
Maximum: 50


**Mean**
$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$

In [28]:
# Arithmetic mean of all elements
print("Mean:", array_stats.mean())


Mean: 30.0


**Standard deviation**

$$\sigma = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2}$$

In [29]:
# Standard deviation with NumPy's default divisor n, matching the formula above
print("Standard Deviation:", array_stats.std())

Standard Deviation: 14.142135623730951


### 4. Indexing and Slicing

In [30]:
# Indices are zero-based; a 2-D array takes one index per axis (row, column)
second_element = array_1d[1]    # single element of the 1-D array
first_row = array_2d[0, :]      # whole first row
row1_col2 = array_2d[1, 2]      # second row, third column

print("\nElement at index 1:", second_element)
print("First row of 2D array:", first_row)
print("Element at (1, 2):", row1_col2)


Element at index 1: 2
First row of 2D array: [1 2 3]
Element at (1, 2): 6


### 5. Reshaping Arrays

In [31]:
# Reshape the 5 elements into a 2-D array with 1 row and 5 columns
reshaped_array = array_1d.reshape((1, 5))
print("\nReshaped Array:\n", reshaped_array)


Reshaped Array:
 [[1 2 3 4 5]]


In [32]:
# Reshape the 5 elements into a 2-D array with 5 rows and 1 column
reshaped_array = array_1d.reshape((5, 1))
print("\nReshaped Array:\n", reshaped_array)


Reshaped Array:
 [[1]
 [2]
 [3]
 [4]
 [5]]


In [34]:
# flatten() collapses the (5, 1) array to one dimension, giving back the
# same values as the original 1-D array
reshaped_array.flatten()

array([1, 2, 3, 4, 5])

### 6. Boolean Indexing

In [35]:
# Comparing an array with a scalar gives a boolean mask of the same shape
bool_array = np.greater(array_stats, 25)
print("\nBoolean Array:", bool_array)

# Indexing with the mask keeps only the positions where it is True
values_above_25 = array_stats[bool_array]
print("Filtered Array (values > 25):", values_above_25)


Boolean Array: [False False  True  True  True]
Filtered Array (values > 25): [30 40 50]


**Further reading: https://numpy.org/devdocs/user/quickstart.html Recommended!**

## PANDAS

In [53]:
# Build a DataFrame from equally long Python lists, one list per column
import pandas as pd

age = [20, 22, 25]
sex = ['M', 'F', 'M']
degree = ['BSc', 'MSc', 'PhD']

# Dictionary keys become the column labels
columns = dict(Age=age, Sex=sex, Degree=degree)
dataframe = pd.DataFrame(columns)
print(dataframe.head())

   Age Sex Degree
0   20   M    BSc
1   22   F    MSc
2   25   M    PhD


In [51]:
# info() writes its report (index range, columns, non-null counts, dtypes,
# memory usage) straight to stdout and returns None, so it is called on its
# own: wrapping it in print() would add a stray "None" line after the report.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Age     3 non-null      int64 
 1   Sex     3 non-null      object
 2   Degree  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None


In [52]:
# describe() summarises the numeric columns only, so just 'Age' appears here
print(dataframe.describe())

             Age
count   3.000000
mean   22.333333
std     2.516611
min    20.000000
25%    21.000000
50%    22.000000
75%    23.500000
max    25.000000


### Basic operations: filtering, sorting, indexing

In [55]:
# Filtering: a boolean mask built from a column selects the matching rows
# Keep only the rows whose 'Age' value is greater than 20
older_than_20 = dataframe['Age'] > 20
filtered_df = dataframe.loc[older_than_20]
print("\nFiltered DataFrame (Age > 20):\n", filtered_df)

# Keep only the rows whose 'Sex' value is 'F'
is_female = dataframe['Sex'] == 'F'
filtered_df = dataframe.loc[is_female]
print("\nFiltered DataFrame (Sex = F):\n", filtered_df)


Filtered DataFrame (Age > 20):
    Age Sex Degree
1   22   F    MSc
2   25   M    PhD

Filtered DataFrame (Sex = F):
    Age Sex Degree
1   22   F    MSc


In [60]:
# A boolean mask can also count rows: each True adds 1 to the sum
f_count = int((dataframe['Sex'] == 'F').sum())
print("Number of rows where sex = F: {}".format(f_count))

Number of rows where sex = F: 1


In [61]:
# Sorting: order the rows by 'Age', largest first; the original index labels are kept
sorted_df = dataframe.sort_values('Age', ascending=False)
print("\nSorted DataFrame (Age descending):\n", sorted_df)


Sorted DataFrame (Age descending):
    Age Sex Degree
2   25   M    PhD
1   22   F    MSc
0   20   M    BSc


In [72]:
# Indexing: iloc selects by integer position, loc by label or boolean mask

# First row, by position
first_row = dataframe.iloc[0]
print("First row using index 0:")
print(first_row)

# One column, by label
age_column = dataframe['Age']
print("\nAccessing the column 'Age' using its label:")
print(age_column)

# Positional slices exclude the stop value: rows 0-1 and columns 0-1
top_left = dataframe.iloc[:2, :2]
print("\nAccessing rows from index 0 up to but excluding 2, and columns from index 0 up to but excluding 2")
print(top_left)

# Boolean mask for the rows combined with a column label
older_than_20 = dataframe['Age'] > 20
print("\nAccessing rows where Age > 20 and taking the 'Sex' column:")
print(dataframe.loc[older_than_20, 'Sex'])

First row using index 0:
Age        20
Sex         M
Degree    BSc
Name: 0, dtype: object

Accessing the column 'Age' using its label:
0    20
1    22
2    25
Name: Age, dtype: int64

Accessing rows from index 0 up to but excluding 2, and columns from index 0 up to but excluding 2
   Age Sex
0   20   M
1   22   F

Accessing rows where Age > 20 and taking the 'Sex' column:
1    F
2    M
Name: Sex, dtype: object


In [None]:
# experiment here

### Grouping, aggregation, and merging datasets

In [74]:
# Split the rows by 'Sex', then average 'Age' inside each group.
# The result is a Series indexed by the group labels (F, M).
age_by_sex = dataframe.groupby('Sex')['Age']
grouped_df = age_by_sex.mean()

print("Grouped DataFrame (Mean Age by Sex):")
print(grouped_df)

Grouped DataFrame (Mean Age by Sex):
Sex
F    22.0
M    22.5
Name: Age, dtype: float64


In [75]:
# Aggregation: compute several statistics of 'Age' in one call;
# each requested statistic becomes one row of the result
age_statistics = ['min', 'max', 'mean', 'std']
aggregated_df = dataframe.agg({'Age': age_statistics})

print("Aggregated DataFrame (Statistics of Age):")
print(aggregated_df)


Aggregated DataFrame (Statistics of Age):
            Age
min   20.000000
max   25.000000
mean  22.333333
std    2.516611


In [82]:
# A bare expression on the last line of a cell is rendered as a table
dataframe

Unnamed: 0,Age,Sex,Degree
0,20,M,BSc
1,22,F,MSc
2,25,M,PhD


In [101]:
# Merging datasets: two frames describing the same three rows, linked by 'id'
data1 = {'Age': [24, 27, 20], 'Sex': ['M', 'F', 'M'], 'Degree': ['MSc', 'PhD', 'BSc']}
data2 = {'City': ['London', 'Cambridge', 'Nice'], 'University': ['UCL', 'MIT', 'Université Côte d Azur']}

# Build each frame and attach a positional 'id' key column (0, 1, 2)
df1 = pd.DataFrame(data1).assign(id=lambda frame: range(len(frame)))
df2 = pd.DataFrame(data2).assign(id=lambda frame: range(len(frame)))

print("DF1:\n ", df1)
print("\nDF2: \n ", df2)

# Inner merge on 'id': only keys present in both frames are kept
# (here every key matches, so all three rows survive)
print(
    "\nThe 'inner' join produces a DataFrame where each row has corresponding 'id', 'Age', 'Sex', 'Degree', 'City' and 'University' from both original DataFrames.\n")
merged_df = df1.merge(df2, on='id', how='inner')

# Put the key column first for display
merged_columns = ['id', 'Age', 'Sex', 'Degree', 'City', 'University']
print(merged_df[merged_columns])

DF1:
     Age Sex Degree  id
0   24   M    MSc   0
1   27   F    PhD   1
2   20   M    BSc   2

DF2: 
          City              University  id
0     London                     UCL   0
1  Cambridge                     MIT   1
2       Nice  Université Côte d Azur   2

The 'inner' join produces a DataFrame where each row has corresponding 'id', 'Age', 'Sex', 'Degree', 'City' and 'University' from both original DataFrames.

   id  Age Sex Degree       City              University
0   0   24   M    MSc     London                     UCL
1   1   27   F    PhD  Cambridge                     MIT
2   2   20   M    BSc       Nice  Université Côte d Azur


In [103]:
# The 'how' parameter in pd.merge() controls which rows are included in the merged DataFrame based on the join keys.
# Here's a breakdown of the differences between 'inner', 'left', 'right', and 'outer' joins:

# 1. Inner Join (how='inner'):
#   - Includes only the rows where the join keys exist in BOTH DataFrames.
#   - Rows with join keys present in one DataFrame but not the other are excluded from the merged result.
#   - This is the default join type if you do not specify the 'how' parameter.

# 2. Left Join (how='left'):
#   - Includes ALL rows from the left DataFrame (df1 in the example).
#   - Includes matching rows from the right DataFrame (df2).
#   - If a join key from the left DataFrame does not have a corresponding match in the right DataFrame, the columns from the right DataFrame for that row will contain NaN (Not a Number) values.

# 3. Right Join (how='right'):
#   - Includes ALL rows from the right DataFrame (df2 in the example).
#   - Includes matching rows from the left DataFrame (df1).
#   - If a join key from the right DataFrame does not have a match in the left DataFrame, the left DataFrame columns will contain NaN values for that row.

# 4. Outer Join (how='outer'):
#   - Includes ALL rows from BOTH DataFrames.
#   - If a join key exists in one DataFrame but not the other, the missing values in the other DataFrame's columns will be filled with NaN.
#   - It combines all the unique join keys from both DataFrames.

# Rebuild the two frames, this time with only partially overlapping keys:
# df1 has ids 0-2, df2 has ids 2-4, so only id 2 exists in both
data1 = {'Age': [24, 27, 20], 'Sex': ['M', 'F', 'M'], 'Degree': ['MSc', 'PhD', 'BSc']}
data2 = {'City': ['London', 'Cambridge', 'Nice'], 'University': ['UCL', 'MIT', 'Université Côte d Azur']}

df1 = pd.DataFrame(data1).assign(id=[0, 1, 2])
df2 = pd.DataFrame(data2).assign(id=[2, 3, 4])

# Explanation printed before each join type, in the order they are demonstrated
join_notes = {
    'left': "The 'left' join keeps all rows from df1 and matches those that have corresponding 'id' values in df2. Rows in df1 without a match in df2 have NaNs for City and University.\n",
    'right': "\nThe 'right' join keeps all rows from df2 and matches those that have corresponding 'id' values in df1. Rows in df2 without a match in df1 have NaNs for Age, Sex, and Degree.\n",
    'outer': "\nThe 'outer' join includes all rows from both df1 and df2. Rows unique to either df1 or df2 have NaN for columns from the other DataFrame.\n",
}
merged_columns = ['id', 'Age', 'Sex', 'Degree', 'City', 'University']

# Same merge each time; only the 'how' argument changes.
# After the loop merged_df holds the result of the last ('outer') join.
for how, note in join_notes.items():
    print(note)
    merged_df = pd.merge(df1, df2, on=['id'], how=how)
    print(merged_df[merged_columns])

The 'left' join keeps all rows from df1 and matches those that have corresponding 'id' values in df2. Rows in df1 without a match in df2 have NaNs for City and University.

   id  Age Sex Degree    City University
0   0   24   M    MSc     NaN        NaN
1   1   27   F    PhD     NaN        NaN
2   2   20   M    BSc  London        UCL

The 'right' join keeps all rows from df2 and matches those that have corresponding 'id' values in df1. Rows in df2 without a match in df1 have NaNs for Age, Sex, and Degree.

   id   Age  Sex Degree       City              University
0   2  20.0    M    BSc     London                     UCL
1   3   NaN  NaN    NaN  Cambridge                     MIT
2   4   NaN  NaN    NaN       Nice  Université Côte d Azur

The 'outer' join includes all rows from both df1 and df2. Rows unique to either df1 or df2 have NaN for columns from the other DataFrame.

   id   Age  Sex Degree       City              University
0   0  24.0    M    MSc        NaN                  