# **PART 1: Python Basics**


## Variables and Data Types

In [1]:
# Define variables
x = 10  # Integer
pi = 3.14  # Float
name = "Alice"  # String
is_student = True  # Boolean

In [2]:
# Print variables
print("x:", x)
print("pi:", pi)
print("name:", name)
print("is_student:", is_student)

x: 10
pi: 3.14
name: Alice
is_student: True


In [3]:
# Check data types
print(type(x))
print(type(pi))
print(type(name))
print(type(is_student))

<class 'int'>
<class 'float'>
<class 'str'>
<class 'bool'>


In [4]:
# List, Tuple, Dictionary, Set
my_list = [1, 2, 3, 4]
my_tuple = (5, 6, 7, 8)
my_dict = {"key1": "value1", "key2": "value2"}
my_set = {9, 10, 11}

print("List:", my_list)
print("Tuple:", my_tuple)
print("Dictionary:", my_dict)
print("Set:", my_set)

List: [1, 2, 3, 4]
Tuple: (5, 6, 7, 8)
Dictionary: {'key1': 'value1', 'key2': 'value2'}
Set: {9, 10, 11}


## Conditional Statements (if-else)


In [5]:
# Example of if-else
x = 15
if x > 10:
    print("x is greater than 10")
elif x == 10:
    print("x is equal to 10")
else:
    print("x is less than 10")

x is greater than 10


## Loops

In [6]:
# For loop example
for i in range(5):
    print("For loop iteration:", i)

For loop iteration: 0
For loop iteration: 1
For loop iteration: 2
For loop iteration: 3
For loop iteration: 4


In [7]:
# While loop example
counter = 0
while counter < 5:
    print("While loop iteration:", counter)
    counter += 1

While loop iteration: 0
While loop iteration: 1
While loop iteration: 2
While loop iteration: 3
While loop iteration: 4


## Functions

In [8]:
# Define a simple function
def greet(name):
    return f"Hello, {name}!"


# Test the function
print(greet("Alice"))

Hello, Alice!


## Run a script

In [9]:
# Example of writing a script (this part would normally go in a .py file)
# Save this content as script.py
# Then run it in the terminal with: python script.py
if __name__ == "__main__":
    print("This script is being run directly.")

This script is being run directly.


# PART 2: Python Libraries for Data Analysis

## NUMPY

In [10]:
# Importing the NumPy library
import numpy as np

### 1. Create Arrays

In [11]:
# Creating a 1D array
array_1d = np.array([1, 2, 3, 4, 5])
print("1D Array:", array_1d)
print("Shape: ", array_1d.shape)
print("Num dimension: ", array_1d.ndim)

1D Array: [1 2 3 4 5]
Shape:  (5,)
Num dimension:  1


In [12]:
# Creating a 2D array
array_2d = np.array([[1, 2, 3], [4, 5, 6]])
print("\n2D Array:\n", array_2d)
print("Shape: ", array_2d.shape)
print("Num dimension: ", array_2d.ndim)


2D Array:
 [[1 2 3]
 [4 5 6]]
Shape:  (2, 3)
Num dimension:  2


In [13]:
# Creating arrays with zeros, ones, or a range of numbers
zeros_array = np.zeros((2, 3))
ones_array = np.ones((3, 2))
range_array = np.arange(0, 10, 2)
print("\nZeros Array:\n", zeros_array)
print("\nOnes Array:\n", ones_array)
print("\nRange Array:", range_array)

print("Shape zeros_array: ", zeros_array.shape)
print("Shape ones_array: ", ones_array.shape)
print("Shape range_array: ", range_array.shape)
print("\n")

print("Num dimension zeros_array: ", zeros_array.ndim)
print("Num dimension ones_array: ", ones_array.ndim)
print("Num dimension range_array: ", range_array.ndim)


Zeros Array:
 [[0. 0. 0.]
 [0. 0. 0.]]

Ones Array:
 [[1. 1.]
 [1. 1.]
 [1. 1.]]

Range Array: [0 2 4 6 8]
Shape zeros_array:  (2, 3)
Shape ones_array:  (3, 2)
Shape range_array:  (5,)


Num dimension zeros_array:  2
Num dimension ones_array:  2
Num dimension range_array:  1


In [14]:
# type of the elements in the array
array_1d.dtype

dtype('int64')

In [15]:
np.array(['Cat', 1, True])  # arrays can be composed of different types

array(['Cat', '1', 'True'], dtype='<U21')

In [16]:
array_1d.shape

(5,)

### 2. Array Operations

In [17]:
# Element-wise addition, subtraction, multiplication, and division
array_a = np.array([1, 22, 3])
array_b = np.array([37, 6, 15])

print("Addition:", array_a + array_b)
print("Subtraction:", array_a - array_b)
print("Multiplication:", array_a * array_b)
print("Division:", array_a / array_b)

Addition: [38 28 18]
Subtraction: [-36  16 -12]
Multiplication: [ 37 132  45]
Division: [0.02702703 3.66666667 0.2       ]


In [18]:
# Addition, subtraction, multiplication, and division between array and scalar
array = np.array([1, 22, 3])
scalar = 5

print("Addition:", array + scalar)
print("Subtraction:", array - scalar)
print("Multiplication:", array * scalar)
print("Division:", array / scalar)

Addition: [ 6 27  8]
Subtraction: [-4 17 -2]
Multiplication: [  5 110  15]
Division: [0.2 4.4 0.6]


In [19]:
# Exercise: where is the difference? Can i obtain the same with an array? If yes, which one?
array_a = np.array([1, 22, 3])
array_b = np.array([5, 5, 5])

print("Addition:", array_a + array_b)
print("Subtraction:", array_a - array_b)
print("Multiplication:", array_a * array_b)
print("Division:", array_a / array_b)

Addition: [ 6 27  8]
Subtraction: [-4 17 -2]
Multiplication: [  5 110  15]
Division: [0.2 4.4 0.6]


**Dot Product**

The dot product of two vectors $$a = [a_1, a_2, a_3] $$ and $$b = [b_1, b_2, b_3] $$  specified with respect to an orthonormal basis, is defined as:

$$ a \cdot b = ∑_{i=1}^{n} a_i \cdot b_i = a_1*b_1 + a_2*b_2 + ... + a_n*b_n $$



In [None]:
dot_product = np.dot(array_a, array_b)
print("\nDot Product:", dot_product)

### 3. Statistical Functions

**Sum and Max**

In [None]:
array_stats = np.array([10, 20, 30, 40, 50])

print("Sum:", np.sum(array_stats))
print("Maximum:", np.max(array_stats))

**Mean**
$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$

In [None]:
print("Mean:", np.mean(array_stats))

**Standard deviation**

$$\sigma = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2}$$

In [None]:
print("Standard Deviation:", np.std(array_stats))

### 4. Indexing and Slicing

In [None]:
print("\nElement at index 1:", array_1d[1])  # Accessing element
print("First row of 2D array:", array_2d[0])  # Accessing a row
print("Element at (1, 2):", array_2d[1, 2])  # Accessing specific element

### 5. Reshaping Arrays

In [None]:
reshaped_array = np.reshape(array_1d, (1, 5))  # Reshape to 1 row, 5 columns
print("\nReshaped Array:\n", reshaped_array)

In [None]:
reshaped_array = np.reshape(array_1d, (5, 1))  # Reshape to 5 row, 1 columns
print("\nReshaped Array:\n", reshaped_array)

In [None]:
reshaped_array.flatten()  # flatten, return to original

### 6. Boolean Indexing

In [None]:
bool_array = array_stats > 25  # Elements greater than 25
print("\nBoolean Array:", bool_array)
print("Filtered Array (values > 25):", array_stats[bool_array])

**Further reading: https://numpy.org/devdocs/user/quickstart.html Recommended!**

## PANDAS

In [None]:
# Define a dataframe
import pandas as pd

age = [20, 22, 25]
sex = ['M', 'F', 'M']
degree = ['BSc', 'MSc', 'PhD']
dataframe = pd.DataFrame({'Age': age, 'Sex': sex, 'Degree': degree})
print(dataframe.head())

In [None]:
print(dataframe.info())

In [None]:
print(dataframe.describe())

### Basic operations: filtering, sorting, indexing

In [None]:
# Filtering
# Only retain the rows where the value of column age is more than 20
filtered_df = dataframe[dataframe['Age'] > 20]
print("\nFiltered DataFrame (Age > 20):\n", filtered_df)

filtered_df = dataframe[dataframe['Sex'] == 'F']
print("\nFiltered DataFrame (Sex = F):\n", filtered_df)

In [None]:
# We can also use it to count the values
f_count = len(dataframe[dataframe['Sex'] == 'F'])
print(f"Number of rows where sex = F: {f_count}")

In [None]:
# Sorting
sorted_df = dataframe.sort_values(by='Age', ascending=False)
print("\nSorted DataFrame (Age descending):\n", sorted_df)

In [None]:
# Indexing
# Accessing the first row
print("First row using index 0:")
print(dataframe.iloc[0])

# Accessing the column 'Age'
print("\nAccessing the column 'Age' using its label:")
print(dataframe['Age'])

# Accessing rows and columns using their positions (index-based)
print("\nAccessing rows from index 0 up to but excluding 2, and columns from index 0 up to but excluding 2")
print(dataframe.iloc[0:2, 0:2])

# Accessing rows based on a condition and selecting specific column(s) by label
print("\nAccessing rows where Age > 20 and taking the 'Sex' column:")
print(dataframe.loc[dataframe['Age'] > 20, 'Sex'])

### Grouping, aggregation, and merging datasets

In [None]:
# Grouping data by 'Sex' and calculating the mean age for each group
grouped_df = dataframe.groupby('Sex')['Age'].mean()

print("Grouped DataFrame (Mean Age by Sex):")
print(grouped_df)

In [None]:
# Aggregation: Calculate multiple statistics for the 'Age' column
aggregated_df = dataframe.agg({'Age': ['min', 'max', 'mean', 'std']})

print("Aggregated DataFrame (Statistics of Age):")
print(aggregated_df)

In [None]:
dataframe

In [None]:
# Merging datasets
# Create another DataFrame
data1 = {'Age': [24, 27, 20], 'Sex': ['M', 'F', 'M'], 'Degree': ['MSc', 'PhD', 'BSc']}
df1 = pd.DataFrame(data1)

data2 = {'City': ['London', 'Cambridge', 'Nice'], 'University': ['UCL', 'MIT', 'Université Côte d Azur']}
df2 = pd.DataFrame(data2)

# create a new column 'id'
df1['id'] = range(len(df1))
df2['id'] = range(len(df2))

print("DF1:\n ", df1)
print("\nDF2: \n ", df2)

# Merge the two DataFrames based on 'id'
print(
    "\nThe 'inner' join produces a DataFrame where each row has corresponding 'id', 'Age', 'Sex', 'Degree', 'City' and 'University' from both original DataFrames.\n")
merged_df = pd.merge(df1, df2, on=['id'], how='inner')
print(merged_df[['id', 'Age', 'Sex', 'Degree', 'City', 'University']])

In [None]:
# The 'how' parameter in pd.merge() controls which rows are included in the merged DataFrame based on the join keys.
# Here's a breakdown of the differences between 'inner', 'left', 'right', and 'outer' joins:

# 1. Inner Join (how='inner'):
#   - Includes only the rows where the join keys exist in BOTH DataFrames.
#   - Rows with join keys present in one DataFrame but not the other are excluded from the merged result.
#   - This is the default join type if you do not specify the 'how' parameter.

# 2. Left Join (how='left'):
#   - Includes ALL rows from the left DataFrame (df1 in the example).
#   - Includes matching rows from the right DataFrame (df2).
#   - If a join key from the left DataFrame does not have a corresponding match in the right DataFrame, the columns from the right DataFrame for that row will contain NaN (Not a Number) values.

# 3. Right Join (how='right'):
#   - Includes ALL rows from the right DataFrame (df2 in the example).
#   - Includes matching rows from the left DataFrame (df1).
#   - If a join key from the right DataFrame does not have a match in the left DataFrame, the left DataFrame columns will contain NaN values for that row.

# 4. Outer Join (how='outer'):
#   - Includes ALL rows from BOTH DataFrames.
#   - If a join key exists in one DataFrame but not the other, the missing values in the other DataFrame's columns will be filled with NaN.
#   - It combines all the unique join keys from both DataFrames.

data1 = {'Age': [24, 27, 20], 'Sex': ['M', 'F', 'M'], 'Degree': ['MSc', 'PhD', 'BSc']}
df1 = pd.DataFrame(data1)

data2 = {'City': ['London', 'Cambridge', 'Nice'], 'University': ['UCL', 'MIT', 'Université Côte d Azur']}
df2 = pd.DataFrame(data2)

df1['id'] = [0, 1, 2]
df2['id'] = [2, 3, 4]

print(
    "The 'left' join keeps all rows from df1 and matches those that have corresponding 'id' values in df2. Rows in df1 without a match in df2 have NaNs for City and University.\n")
merged_df = pd.merge(df1, df2, on=['id'], how='left')
print(merged_df[['id', 'Age', 'Sex', 'Degree', 'City', 'University']])

print(
    "\nThe 'right' join keeps all rows from df2 and matches those that have corresponding 'id' values in df1. Rows in df2 without a match in df1 have NaNs for Age, Sex, and Degree.\n")
merged_df = pd.merge(df1, df2, on=['id'], how='right')
print(merged_df[['id', 'Age', 'Sex', 'Degree', 'City', 'University']])

print(
    "\nThe 'outer' join includes all rows from both df1 and df2. Rows unique to either df1 or df2 have NaN for columns from the other DataFrame.\n")
merged_df = pd.merge(df1, df2, on=['id'], how='outer')
print(merged_df[['id', 'Age', 'Sex', 'Degree', 'City', 'University']])

# Exercises (Python)

In [25]:
"""
String Manipulation: Write a Python function that takes a string as input
and returns a dictionary with the counts of each vowel ('a', 'e', 'i', 'o', 'u').
"""


def return_count_vowel(s):
    vowels = "a e i o u"
    return {v: s.lower().count(v) for v in vowels}


return_count_vowel("ciao")

{'a': 1, ' ': 0, 'e': 0, 'i': 1, 'o': 1, 'u': 0}

In [27]:
"""
List Comprehension: Create a list of all even numbers from 1 to 100 using list comprehension.
"""
l = [n for n in range(100)]
l

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]

In [30]:
"""
Dictionary Operations: Given a dictionary { "Alice": 85, "Bob": 90, "Charlie": 78 },
write a function to find and return the name of the student with the highest score.
"""


def max_score(dict):
    return max(dict, key=dict.get)


dict = {"Alice": 85, "Bob": 90, "Charlie": 78}
max_score(dict)

'Bob'

# Exercises (Numpy and Pandas)

In [31]:
"""
NumPy Array Operations: Create a NumPy array of shape (3,3) filled with random
integers between 1 and 100.
Replace all even numbers with -1.
"""
array = np.random.randint(1, 100, size=(3, 3))
array[array % 2 == 0] = -1
array

array([[67, -1, 11],
       [-1,  9, 11],
       [-1, -1, 17]])

In [35]:
"""
Pandas DataFrame Basics: Create a Pandas DataFrame with 3 columns ("Name", "Age", "Score")
and at least 5 rows of data. Display the average age of all individuals.
"""
import pandas as pd

df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
                   'Age': [25, 30, 22, 28, 35],
                   'Score': [85, 67, 90, 100, 50]})
df['Age']

Unnamed: 0,Age
0,25
1,30
2,22
3,28
4,35


In [36]:
"""
Data Filtering in Pandas: Using a DataFrame of employees (columns: "Name", "Department", "Salary"),
write a function that returns all employees in the "IT" department earning more than $50,000.
"""
import pandas as pd
import random

# Sample data
names = ['John', 'Sarah', 'David', 'Emily', 'Michael']
departments = ['HR', 'IT', 'Marketing', 'Finance', 'Sales']
salaries = [random.randint(50000, 120000) for _ in range(5)]

# Create the DataFrame
df = pd.DataFrame({
    'Name': names,
    'Department': departments,
    'Salary': salaries
})


def filter_employees(df):
    return df[(df['Department'] == 'IT') & (df['Salary'] > 50000)]


filter_employees(df)

Unnamed: 0,Name,Department,Salary
1,Sarah,IT,102462


In [37]:
"""
Grouping Data in Pandas: Given a DataFrame with columns "City", "Temperature",
and "Month", write code to compute the average temperature per city.
"""
import pandas as pd
import random

# Sample data
cities = ["New York", "London", "Tokyo", "Paris", "Sydney", "Toronto", "Berlin", "Dubai", "Mumbai", "Los Angeles"]
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November",
          "December"]

data = {
    "City": random.choices(cities, k=10),
    "Temperature": [random.uniform(-10, 40) for _ in range(10)],  # Random temp between -10 and 40°C
    "Month": random.choices(months, k=10)
}

df = pd.DataFrame(data)


def compute_average_temperature(df):
    return df.groupby("City")["Temperature"].mean()


compute_average_temperature(df)

Unnamed: 0_level_0,Temperature
City,Unnamed: 1_level_1
Dubai,8.204184
London,2.036578
Mumbai,-2.894107
Paris,14.3205
Tokyo,17.018951
Toronto,17.635796


In [38]:
"""
Merging DataFrames: Create two DataFrames, one with student names and IDs,
and another with student IDs and grades. Merge them into a single DataFrame.
"""
df1 = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'],
                    'ID': [101, 102, 103]})

df2 = pd.DataFrame({'ID': [102, 103, 104],
                    'Grade': ['A', 'B', 'C']})

df_merged = pd.merge(df1, df2, on='ID', how='inner')
df_merged

Unnamed: 0,Name,ID,Grade
0,Bob,102,A
1,Charlie,103,B


In [39]:
"""
Applying Functions in Pandas: Given a column of dates in a DataFrame,
convert them to datetime format and extract the month name.
"""
import pandas as pd
import numpy as np

# Generate 15 random dates within a range
date_range = pd.date_range(start="2020-01-01", end="2025-01-01", freq="D")
random_dates = np.random.choice(date_range, size=15, replace=False)

# Create the DataFrame
df = pd.DataFrame({"dates": random_dates})
df['dates'] = pd.to_datetime(df['dates'])

# Extract the month name
df['month_name'] = df['dates'].dt.month_name()
df

Unnamed: 0,dates,month_name
0,2023-11-12,November
1,2021-02-02,February
2,2021-01-10,January
3,2024-01-15,January
4,2020-07-13,July
5,2020-12-12,December
6,2024-11-17,November
7,2024-10-27,October
8,2024-12-28,December
9,2021-03-03,March


# Hard Exercise

In [None]:
""" Complex DataFrame Manipulation:
1. Create a DataFrame with at least 100 rows containing the following columns: "CustomerID", "PurchaseDate", "ProductCategory", "PurchaseAmount".
2. Convert "PurchaseDate" to datetime format.
3. Compute the total amount spent per customer.
4. Identify the top 5 customers with the highest total spending.
5. For each product category, find the month with the highest total sales.
"""
import pandas as pd
import numpy as np

# Step 1: Create a DataFrame with 100 rows
np.random.seed(42)

customer_ids = np.random.randint(1, 21, size=100)  # 20 unique customers
purchase_dates = pd.date_range(start='2023-01-01', periods=100, freq='D')
product_categories = np.random.choice(['Electronics', 'Clothing', 'Groceries', 'Books', 'Toys'], size=100)
purchase_amounts = np.random.uniform(5, 500, size=100)  # Random amounts between 5 and 500

df = pd.DataFrame({
    'CustomerID': customer_ids,
    'PurchaseDate': purchase_dates,
    'ProductCategory': product_categories,
    'PurchaseAmount': purchase_amounts
})

# Step 2: Convert "PurchaseDate" to datetime format
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])

# Step 3: Compute the total amount spent per customer
total_spent_per_customer = df.groupby('CustomerID')['PurchaseAmount'].sum().reset_index()

# Step 4: Identify the top 5 customers with the highest total spending
top_5_customers = total_spent_per_customer.sort_values(by='PurchaseAmount', ascending=False).head(5)

# Step 5: For each product category, find the month with the highest total sales
df['Month'] = df['PurchaseDate'].dt.month
category_month_sales = df.groupby(['ProductCategory', 'Month'])['PurchaseAmount'].sum().reset_index()

# For each category, find the month with the highest sales
highest_sales_per_category = category_month_sales.loc[
    category_month_sales.groupby('ProductCategory')['PurchaseAmount'].idxmax()]

total_spent_per_customer, top_5_customers, highest_sales_per_category