# **PART 1: Python Basics**


## Variables and Data Types

In [None]:
# One example value for each of Python's basic built-in types
x: int = 10  # whole number
pi: float = 3.14  # number with a fractional part
name: str = "Alice"  # text
is_student: bool = True  # truth value

In [None]:
# Show every variable next to its label, one per line
labelled_values = [
    ("x:", x),
    ("pi:", pi),
    ("name:", name),
    ("is_student:", is_student),
]
for label, value in labelled_values:
    print(label, value)

In [None]:
# Ask Python for the type of each variable; sep="\n" puts each result on its own line
print(type(x), type(pi), type(name), type(is_student), sep="\n")

In [None]:
# The four built-in collection types
my_list = [1, 2, 3, 4]  # list: ordered and mutable
my_tuple = (5, 6, 7, 8)  # tuple: ordered and immutable
my_dict = {"key1": "value1", "key2": "value2"}  # dict: maps keys to values
my_set = {9, 10, 11}  # set: holds each element at most once

# Print each collection next to its label
for label, collection in (
    ("List:", my_list),
    ("Tuple:", my_tuple),
    ("Dictionary:", my_dict),
    ("Set:", my_set),
):
    print(label, collection)

## Conditional Statements (if-else)


In [None]:
# Example of if / elif / else: the conditions are tested top to bottom
# and only the first matching branch is executed
x = 15
if x > 10:
    message = "x is greater than 10"
elif x == 10:
    message = "x is equal to 10"
else:
    message = "x is less than 10"
print(message)

## Loops

In [None]:
# For loop example: range(5) yields 0, 1, 2, 3, 4 (the stop value is excluded)
iterations = range(5)
for i in iterations:
    print("For loop iteration:", i)

In [None]:
# While loop example: the body repeats for as long as the condition holds
limit = 5
counter = 0
while counter < limit:
    print("While loop iteration:", counter)
    counter = counter + 1  # without this update the loop would never end

## Functions

In [None]:
# Define a simple function
def greet(name: str) -> str:
    """Return a greeting addressed to ``name``."""
    greeting = f"Hello, {name}!"
    return greeting


# Call the function once and show what it returns
alice_greeting = greet("Alice")
print(alice_greeting)

## Run a script

In [None]:
# Example of writing a script (this part would normally go in a .py file)
# Save this content as script.py
# Then run it in the terminal with: python script.py
#
# The guard below runs its body only when the file is executed directly.
# When the same file is imported as a module, __name__ holds the module's
# name instead of "__main__", so the body is skipped.
if __name__ == "__main__":
    print("This script is being run directly.")

# PART 2: Python Libraries for Data Analysis

## NUMPY

In [None]:
# Importing the NumPy library
import numpy as np

### 1. Create Arrays

In [None]:
# Build a one-dimensional array from a Python list
array_1d = np.array([1, 2, 3, 4, 5])
shape_1d, ndim_1d = array_1d.shape, array_1d.ndim
print("1D Array:", array_1d)
print("Shape: ", shape_1d)  # tuple with one entry per dimension
print("Num dimension: ", ndim_1d)  # number of dimensions (axes)

In [None]:
# Build a two-dimensional array: each inner list becomes one row
rows = [[1, 2, 3], [4, 5, 6]]
array_2d = np.array(rows)
shape_2d, ndim_2d = array_2d.shape, array_2d.ndim
print("\n2D Array:\n", array_2d)
print("Shape: ", shape_2d)  # (number of rows, number of columns)
print("Num dimension: ", ndim_2d)

In [None]:
# Arrays pre-filled with zeros, with ones, or with an evenly spaced range of numbers
zeros_array = np.zeros(shape=(2, 3))  # 2 rows x 3 columns of 0.0
ones_array = np.ones(shape=(3, 2))  # 3 rows x 2 columns of 1.0
range_array = np.arange(0, 10, 2)  # from 0 up to (not including) 10 in steps of 2
print("\nZeros Array:\n", zeros_array)
print("\nOnes Array:\n", ones_array)
print("\nRange Array:", range_array)

# Shape of each array
for label, arr in (
    ("Shape zeros_array: ", zeros_array),
    ("Shape ones_array: ", ones_array),
    ("Shape range_array: ", range_array),
):
    print(label, arr.shape)
print("\n")

# Number of dimensions of each array
for label, arr in (
    ("Num dimension zeros_array: ", zeros_array),
    ("Num dimension ones_array: ", ones_array),
    ("Num dimension range_array: ", range_array),
):
    print(label, arr.ndim)

In [None]:
# type of the elements in the array
# (dtype is a property of the whole array: all of its elements share it)
array_1d.dtype

In [None]:
# A NumPy array has a single dtype. Mixed inputs are not stored as they are:
# they are converted to one common type (here every element becomes a string).
np.array(['Cat', 1, True])

In [None]:
# shape of the array: a tuple with one entry per dimension
array_1d.shape

### 2. Array Operations

In [None]:
# Arithmetic operators between two arrays of the same shape work element by element
array_a = np.array([1, 22, 3])
array_b = np.array([37, 6, 15])

elementwise_results = (
    ("Addition:", array_a + array_b),
    ("Subtraction:", array_a - array_b),
    ("Multiplication:", array_a * array_b),
    ("Division:", array_a / array_b),  # true division: the result is floating point
)
for label, result in elementwise_results:
    print(label, result)

In [None]:
# Arithmetic between an array and a single number: the number is applied to every element
array = np.array([1, 22, 3])
scalar = 5

scalar_results = (
    ("Addition:", array + scalar),
    ("Subtraction:", array - scalar),
    ("Multiplication:", array * scalar),
    ("Division:", array / scalar),  # true division: the result is floating point
)
for label, result in scalar_results:
    print(label, result)

In [None]:
# Exercise: how does this differ from the previous cell? Can I obtain the same results using an array instead of the scalar? If yes, which array?

**Dot Product**

The dot product of two vectors $a = [a_1, a_2, \dots, a_n]$ and $b = [b_1, b_2, \dots, b_n]$, specified with respect to an orthonormal basis, is defined as:

$$ a \cdot b = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n $$



In [None]:
# Dot product: the sum of the element-wise products of array_a and array_b
dot_product = array_a.dot(array_b)
print("\nDot Product:", dot_product)

### 3. Statistical Functions

**Sum and Max**

In [None]:
# Sample values reused by the statistics cells that follow
array_stats = np.array([10, 20, 30, 40, 50])

total, largest = array_stats.sum(), array_stats.max()
print("Sum:", total)
print("Maximum:", largest)

**Mean**
$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$

In [None]:
# Arithmetic mean of all elements
mean_value = array_stats.mean()
print("Mean:", mean_value)

**Standard deviation**

$$\sigma = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2}$$

In [None]:
# Standard deviation with the 1/n normalisation shown in the formula above
std_value = array_stats.std()
print("Standard Deviation:", std_value)

### 4. Indexing and Slicing

In [None]:
second_element = array_1d[1]  # positions start at 0, so index 1 is the second element
first_row = array_2d[0]  # a single index on a 2D array selects a whole row
row1_col2 = array_2d[1, 2]  # (row index, column index)

print("\nElement at index 1:", second_element)
print("First row of 2D array:", first_row)
print("Element at (1, 2):", row1_col2)

### 5. Reshaping Arrays

In [None]:
# Reshape to 1 row, 5 columns: the same five values, now arranged as a 2D array
reshaped_array = array_1d.reshape((1, 5))
print("\nReshaped Array:\n", reshaped_array)

In [None]:
# Reshape to 5 rows, 1 column
reshaped_array = array_1d.reshape((5, 1))
print("\nReshaped Array:\n", reshaped_array)

In [None]:
# flatten() returns a one-dimensional copy of the array,
# i.e. the same values laid out like the original array_1d
reshaped_array.flatten()

### 6. Boolean Indexing

In [None]:
threshold = 25
bool_array = array_stats > threshold  # True where the element is greater than 25
filtered_values = array_stats[bool_array]  # keeps only the positions marked True
print("\nBoolean Array:", bool_array)
print("Filtered Array (values > 25):", filtered_values)

**Further reading: https://numpy.org/devdocs/user/quickstart.html Recommended!**

## PANDAS

In [None]:
# Define a dataframe
import pandas as pd

# Each list becomes one column; position i across the three lists forms row i
age = [20, 22, 25]
sex = ['M', 'F', 'M']
degree = ['BSc', 'MSc', 'PhD']
columns = {'Age': age, 'Sex': sex, 'Degree': degree}
dataframe = pd.DataFrame(data=columns)

preview = dataframe.head()  # first rows of the frame (at most five)
print(preview)

In [None]:
# info() writes its summary (columns, non-null counts, dtypes, memory usage)
# to standard output itself and returns None, so it is called directly:
# wrapping it in print() would add a stray "None" line after the summary.
dataframe.info()

In [None]:
# Summary statistics of the DataFrame
summary_stats = dataframe.describe()
print(summary_stats)

### Basic operations: filtering, sorting, indexing

In [None]:
# Filtering
# A comparison on a column yields one True/False per row;
# indexing the DataFrame with it keeps only the rows marked True.

# Only retain the rows where the value of column Age is more than 20
older_than_20 = dataframe['Age'] > 20
filtered_df = dataframe[older_than_20]
print("\nFiltered DataFrame (Age > 20):\n", filtered_df)

# Only retain the rows where the value of column Sex is 'F'
is_female = dataframe['Sex'] == 'F'
filtered_df = dataframe[is_female]
print("\nFiltered DataFrame (Sex = F):\n", filtered_df)

In [None]:
# Filtering can also be used to count: the length of the filtered frame
# is the number of rows that satisfy the condition
female_rows = dataframe[dataframe['Sex'] == 'F']
f_count = len(female_rows)
print(f"Number of rows where sex = F: {f_count}")

In [None]:
# Sorting
# sort_values returns a new, sorted DataFrame; ascending=False puts the largest Age first
sorted_df = dataframe.sort_values('Age', ascending=False)
print("\nSorted DataFrame (Age descending):\n", sorted_df)

In [None]:
# Indexing
# iloc selects by integer position, loc selects by label or boolean condition

# Accessing the first row
first_row = dataframe.iloc[0]
print("First row using index 0:")
print(first_row)

# Accessing the column 'Age'
age_column = dataframe['Age']
print("\nAccessing the column 'Age' using its label:")
print(age_column)

# Accessing rows and columns using their positions (index-based)
top_left_block = dataframe.iloc[:2, :2]  # the stop position of a slice is excluded
print("\nAccessing rows from index 0 up to but excluding 2, and columns from index 0 up to but excluding 2")
print(top_left_block)

# Accessing rows based on a condition and selecting specific column(s) by label
older_than_20 = dataframe['Age'] > 20
sex_of_older = dataframe.loc[older_than_20, 'Sex']
print("\nAccessing rows where Age > 20 and taking the 'Sex' column:")
print(sex_of_older)

In [None]:
# experiment here

### Grouping, aggregation, and merging datasets

In [None]:
# Split the rows by 'Sex', then average the 'Age' values inside each group
age_by_sex = dataframe.groupby('Sex')['Age']
grouped_df = age_by_sex.mean()  # one mean per distinct value of 'Sex'

print("Grouped DataFrame (Mean Age by Sex):")
print(grouped_df)

In [None]:
# Aggregation: several statistics of the 'Age' column computed in one call
age_statistics = ['min', 'max', 'mean', 'std']
aggregated_df = dataframe.agg({'Age': age_statistics})  # one row per statistic

print("Aggregated DataFrame (Statistics of Age):")
print(aggregated_df)

In [None]:
# Show the current contents of the DataFrame
dataframe

In [None]:
# Merging datasets
# Two frames to be combined: personal data (df1) and place of study (df2)
data1 = {'Age': [24, 27, 20], 'Sex': ['M', 'F', 'M'], 'Degree': ['MSc', 'PhD', 'BSc']}
data2 = {'City': ['London', 'Cambridge', 'Nice'], 'University': ['UCL', 'MIT', 'Université Côte d Azur']}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Give both frames a shared key column 'id' (0, 1, 2) to join on
df1['id'] = range(len(df1))
df2['id'] = range(len(df2))

print("DF1:\n ", df1)
print("\nDF2: \n ", df2)

# Merge the two DataFrames based on 'id'; every id exists in both frames,
# so the inner join keeps all three rows
print(
    "\nThe 'inner' join produces a DataFrame where each row has corresponding 'id', 'Age', 'Sex', 'Degree', 'City' and 'University' from both original DataFrames.\n")
merged_df = df1.merge(df2, on=['id'], how='inner')
shown_columns = ['id', 'Age', 'Sex', 'Degree', 'City', 'University']
print(merged_df[shown_columns])

In [None]:
# The 'how' parameter of a merge decides which rows end up in the result,
# based on the values of the join keys:
#
# 1. Inner join (how='inner') - the default when 'how' is not given:
#    - keeps only the rows whose join key exists in BOTH DataFrames;
#    - rows whose key appears in just one of the DataFrames are dropped.
#
# 2. Left join (how='left'):
#    - keeps ALL rows of the left DataFrame (df1 here) plus the matching rows of the right one (df2);
#    - a left row without a match gets NaN (Not a Number) in the columns coming from the right DataFrame.
#
# 3. Right join (how='right'):
#    - keeps ALL rows of the right DataFrame (df2 here) plus the matching rows of the left one (df1);
#    - a right row without a match gets NaN in the columns coming from the left DataFrame.
#
# 4. Outer join (how='outer'):
#    - keeps ALL rows of BOTH DataFrames, i.e. every unique join key;
#    - wherever a key is missing on one side, that side's columns are filled with NaN.

data1 = {'Age': [24, 27, 20], 'Sex': ['M', 'F', 'M'], 'Degree': ['MSc', 'PhD', 'BSc']}
data2 = {'City': ['London', 'Cambridge', 'Nice'], 'University': ['UCL', 'MIT', 'Université Côte d Azur']}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Only id 2 is present in both frames, so the three join types give different results
df1['id'] = [0, 1, 2]
df2['id'] = [2, 3, 4]

shown_columns = ['id', 'Age', 'Sex', 'Degree', 'City', 'University']
join_demos = (
    ('left',
     "The 'left' join keeps all rows from df1 and matches those that have corresponding 'id' values in df2. Rows in df1 without a match in df2 have NaNs for City and University.\n"),
    ('right',
     "\nThe 'right' join keeps all rows from df2 and matches those that have corresponding 'id' values in df1. Rows in df2 without a match in df1 have NaNs for Age, Sex, and Degree.\n"),
    ('outer',
     "\nThe 'outer' join includes all rows from both df1 and df2. Rows unique to either df1 or df2 have NaN for columns from the other DataFrame.\n"),
)
for how, explanation in join_demos:
    print(explanation)
    merged_df = df1.merge(df2, on=['id'], how=how)
    print(merged_df[shown_columns])

# Exercises (Python)

In [None]:
"""
String Manipulation: Write a Python function that takes a string as input
and returns a dictionary with the counts of each vowel ('a', 'e', 'i', 'o', 'u').
"""



In [None]:
"""
List Comprehension: Create a list of all even numbers from 1 to 100 using list comprehension.
"""


In [None]:
"""
Dictionary Operations: Given a dictionary { "Alice": 85, "Bob": 90, "Charlie": 78 },
write a function to find and return the name of the student with the highest score.
"""
# Named student_scores rather than `dict`: assigning to `dict` would hide the
# built-in dict type for every cell executed afterwards in the same kernel.
student_scores = {"Alice": 85, "Bob": 90, "Charlie": 78}

# Exercises (Numpy and Pandas)

In [None]:
"""
NumPy Array Operations: Create a NumPy array of shape (3,3) filled with random
integers between 1 and 100.
Replace all even numbers with -1.
"""


In [None]:
"""
Pandas DataFrame Basics: Create a Pandas DataFrame with 3 columns ("Name", "Age", "Score")
and at least 5 rows of data. Display the average age of all individuals.
"""


In [None]:
"""
Data Filtering in Pandas: Using a DataFrame of employees (columns: "Name", "Department", "Salary"),
write a function that returns all employees in the "IT" department earning more than $50,000.
"""
import pandas as pd
import random

# Dedicated, seeded generator: the sample data is identical on every run
# and the global `random` state is left untouched
salary_rng = random.Random(42)

# Sample data
names = ['John', 'Sarah', 'David', 'Emily', 'Michael']
# Several employees work in "IT", so the department filter of the exercise can match rows
departments = ['IT', 'HR', 'IT', 'Finance', 'IT']
# The salary range straddles the 50,000 threshold, so the salary condition can also exclude rows
salaries = [salary_rng.randint(30000, 120000) for _ in range(5)]

# Create the DataFrame
df = pd.DataFrame({
    'Name': names,
    'Department': departments,
    'Salary': salaries
})

print(df)

In [None]:
"""
Grouping Data in Pandas: Given a DataFrame with columns "City", "Temperature",
and "Month", write code to compute the average temperature per city.
"""
import pandas as pd
import random

# Dedicated, seeded generator: the sample data is identical on every run
# and the global `random` state is left untouched
weather_rng = random.Random(42)

# Sample data
cities = ["New York", "London", "Tokyo", "Paris", "Sydney", "Toronto", "Berlin", "Dubai", "Mumbai", "Los Angeles"]
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November",
          "December"]

# choices() samples with replacement, so a city can appear in more than one row
data = {
    "City": weather_rng.choices(cities, k=10),
    "Temperature": [weather_rng.uniform(-10, 40) for _ in range(10)],  # Random temp between -10 and 40°C
    "Month": weather_rng.choices(months, k=10)
}

df = pd.DataFrame(data)
print(df)


In [None]:
"""
Merging DataFrames: Create two DataFrames, one with student names and IDs,
and another with student IDs and grades. Merge them into a single DataFrame.
"""

In [None]:
"""
Applying Functions in Pandas: Given a column of dates in a DataFrame,
convert them to datetime format and extract the month name.
"""
import pandas as pd
import numpy as np

# Dedicated, seeded generator: the same 15 dates are drawn on every run
date_rng = np.random.default_rng(42)

# Every calendar day of the range, written as text ("YYYY-MM-DD").
# Keeping the dates as strings makes the conversion to datetime asked for
# in the exercise a real step instead of a no-op.
date_range = pd.date_range(start="2020-01-01", end="2025-01-01", freq="D")
date_strings = date_range.strftime("%Y-%m-%d").to_numpy()

# Generate 15 random dates within the range (replace=False: no date is drawn twice)
random_dates = date_rng.choice(date_strings, size=15, replace=False)

# Create the DataFrame
df = pd.DataFrame({"dates": random_dates})
df

# Hard Exercise

In [None]:
""" Complex DataFrame Manipulation:
1. Create a DataFrame with at least 100 rows containing the following columns: "CustomerID", "PurchaseDate", "ProductCategory", "PurchaseAmount".
2. Convert "PurchaseDate" to datetime format.
3. Compute the total amount spent per customer.
4. Identify the top 5 customers with the highest total spending.
5. For each product category, find the month with the highest total sales.
"""


In [None]:
#@title Solution
import pandas as pd
import numpy as np

# Step 1: Create a DataFrame with 100 rows
np.random.seed(42)  # fixed seed: the generated data is the same on every run
n_rows = 100
categories = ['Electronics', 'Clothing', 'Groceries', 'Books', 'Toys']

customer_ids = np.random.randint(1, 21, size=n_rows)  # ids from 1 to 20 (upper bound excluded)
purchase_dates = pd.date_range(start='2023-01-01', periods=n_rows, freq='D')  # one purchase per consecutive day
product_categories = np.random.choice(categories, size=n_rows)
purchase_amounts = np.random.uniform(5, 500, size=n_rows)  # Random amounts between 5 and 500

columns = {
    'CustomerID': customer_ids,
    'PurchaseDate': purchase_dates,
    'ProductCategory': product_categories,
    'PurchaseAmount': purchase_amounts,
}
df = pd.DataFrame(columns)

# Step 2: Convert "PurchaseDate" to datetime format
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])

# Step 3: Compute the total amount spent per customer
# (as_index=False keeps 'CustomerID' as a regular column of the result)
total_spent_per_customer = df.groupby('CustomerID', as_index=False)['PurchaseAmount'].sum()

# Step 4: Identify the top 5 customers with the highest total spending
ranked_customers = total_spent_per_customer.sort_values(by='PurchaseAmount', ascending=False)
top_5_customers = ranked_customers.head(5)

# Step 5: For each product category, find the month with the highest total sales
df['Month'] = df['PurchaseDate'].dt.month  # month number taken from the purchase date
category_month_sales = df.groupby(['ProductCategory', 'Month'], as_index=False)['PurchaseAmount'].sum()

# Within each category, idxmax gives the row label of the best month;
# loc then pulls those rows out of the per-month totals
best_month_rows = category_month_sales.groupby('ProductCategory')['PurchaseAmount'].idxmax()
highest_sales_per_category = category_month_sales.loc[best_month_rows]

total_spent_per_customer, top_5_customers, highest_sales_per_category