# Python for Machine Learning (Revision)

## Learning Objectives
- Review NumPy and Pandas for data handling
- Master Matplotlib and Seaborn for visualization
- Understand Scikit-learn basics for ML workflows

## NumPy for Numerical Computing

In [None]:
import numpy as np

# --- Array creation ---
arr = np.array([1, 2, 3, 4, 5])        # 1-D vector
matrix = np.array([[1, 2], [3, 4]])    # 2x2 matrix

# Show the arrays, then the matrix's metadata
print("1D Array:", arr)
print("2D Matrix:\n", matrix)
for label, value in (("Shape:", matrix.shape), ("Data type:", matrix.dtype)):
    print(label, value)

# --- Descriptive statistics and linear algebra ---
print("\nMath Operations:")
print("Mean:", arr.mean())
print("Standard deviation:", arr.std())  # population std (NumPy default ddof=0)
# For 2-D operands the @ operator is the matrix product
print("Matrix multiplication:\n", matrix @ matrix)

## Pandas for Data Manipulation

In [None]:
import pandas as pd

# Small employee table: one list per column, five rows each
data = dict(
    age=[25, 30, 35, 40, 45],
    salary=[50000, 60000, 70000, 80000, 90000],
    department=['IT', 'HR', 'IT', 'Finance', 'HR'],
    experience=[2, 5, 8, 12, 15],
)

df = pd.DataFrame(data)
print("Dataset:", df, sep="\n")

# Structural overview: dimensions, column names, per-column dtypes
print("\nDataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Data types:\n{df.dtypes}")

# describe() summarises the numeric columns (count, mean, std, quartiles, ...)
print("\nStatistical Summary:", df.describe(), sep="\n")

In [None]:
# Filtering, grouping and derived columns on the employee table `df`
print("Data Manipulation Examples:")

# Filtering: a boolean mask keeps only rows whose salary exceeds 65,000
is_high_earner = df['salary'].gt(65000)
high_salary = df[is_high_earner]
print("\nHigh salary employees:", high_salary, sep="\n")

# Grouping: per-department mean salary and headcount, plus mean age
aggregations = {
    'salary': ['mean', 'count'],
    'age': 'mean',
}
dept_stats = df.groupby('department').agg(aggregations)
print("\nDepartment statistics:", dept_stats, sep="\n")

# Derived column: salary divided by years of experience (element-wise).
# NOTE: this mutates `df` in place, so later cells see the extra column.
df['salary_per_year_exp'] = df['salary'].div(df['experience'])
print("\nDataset with new column:")
print(df.loc[:, ['salary', 'experience', 'salary_per_year_exp']])

## Matplotlib for Basic Visualization

In [None]:
import matplotlib.pyplot as plt

# 2x2 grid of axes, unpacked by position so each panel has a readable name
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_line, ax_bar), (ax_hist, ax_scatter) = axes

# Top-left: line plot with point markers
ax_line.plot(df['experience'], df['salary'], 'o-')
ax_line.set(title='Salary vs Experience',
            xlabel='Experience (years)',
            ylabel='Salary')

# Top-right: bar chart of head-count per department
dept_counts = df['department'].value_counts()
ax_bar.bar(dept_counts.index, dept_counts.values)
ax_bar.set(title='Employees by Department', ylabel='Count')

# Bottom-left: histogram of ages
ax_hist.hist(df['age'], bins=5, alpha=0.7)
ax_hist.set(title='Age Distribution', xlabel='Age', ylabel='Frequency')

# Bottom-right: one scatter series per department so each gets its own
# colour and legend entry
colors = {'IT': 'blue', 'HR': 'red', 'Finance': 'green'}
for dept in df['department'].unique():
    in_dept = df['department'] == dept
    dept_data = df[in_dept]
    ax_scatter.scatter(dept_data['age'], dept_data['salary'],
                       c=colors[dept], label=dept, alpha=0.7)
ax_scatter.set(title='Age vs Salary by Department',
               xlabel='Age',
               ylabel='Salary')
ax_scatter.legend()

plt.tight_layout()
plt.show()

## Seaborn for Statistical Visualization

In [None]:
import seaborn as sns

# Global seaborn theme for every figure drawn after this point
sns.set_style("whitegrid")

# Synthetic dataset, seeded for reproducibility. Each column is drawn
# independently, so no real relationship between the columns is built in.
# The draw order below (age, salary, experience, department) determines the
# values produced for a given seed.
np.random.seed(42)
n_samples = 100
department_names = ['IT', 'HR', 'Finance', 'Marketing']
large_data = {
    'age': np.random.normal(35, 8, n_samples),
    'salary': np.random.normal(65000, 15000, n_samples),
    'experience': np.random.normal(8, 4, n_samples),
    'department': np.random.choice(department_names, n_samples),
}
large_df = pd.DataFrame(large_data)

# 2x2 grid of axes; name each panel after the plot it will hold
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_hist, ax_box = axes[0, 0], axes[0, 1]
ax_corr, ax_scatter = axes[1, 0], axes[1, 1]

# Histogram of salary, coloured by department
sns.histplot(data=large_df, x='salary', hue='department', ax=ax_hist)
ax_hist.set_title('Salary Distribution by Department')

# Box plot of salary per department; tilt the x labels for readability
sns.boxplot(data=large_df, x='department', y='salary', ax=ax_box)
ax_box.set_title('Salary Distribution by Department')
ax_box.tick_params(axis='x', rotation=45)

# Heatmap of pairwise correlations between the numeric columns
numeric_cols = ['age', 'salary', 'experience']
correlation = large_df[numeric_cols].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
ax_corr.set_title('Correlation Matrix')

# Scatter plot of experience against salary, coloured by department
sns.scatterplot(data=large_df, x='experience', y='salary',
                hue='department', ax=ax_scatter)
ax_scatter.set_title('Experience vs Salary')

plt.tight_layout()
plt.show()

## Scikit-learn Basics

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Synthetic binary classification problem: 1000 rows, 10 features of which
# 5 are informative; the fixed seed makes it reproducible
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_classes=2,
    random_state=42,
)

print(f"Dataset shape: {X.shape}")
# bincount gives the number of samples per class label
print(f"Target distribution: {np.bincount(y)}")

# Hold out 20% for testing; stratify=y keeps the class proportions the same
# in both splits
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Standardise features: the scaler is fitted on the training split only and
# then reused on the test split, so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Original data statistics:")
print(f"Mean: {X_train.mean():.3f}, Std: {X_train.std():.3f}")
print("\nScaled data statistics:")
print(f"Mean: {X_train_scaled.mean():.3f}, Std: {X_train_scaled.std():.3f}")

# Candidate models, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Which (train, test) feature matrices each model uses: logistic regression
# gets the scaled data; anything not listed falls back to the raw features
raw_features = (X_train, X_test)
features_by_model = {
    'Logistic Regression': (X_train_scaled, X_test_scaled),
}

# Fit every model, score it on the held-out split, and record the accuracy
results = {}
for name, model in models.items():
    train_features, test_features = features_by_model.get(name, raw_features)
    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.3f}")

# Side-by-side summary of the recorded accuracies
print("\nModel Comparison:")
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.3f}")

## Complete ML Pipeline Example

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix

# Pipeline: scaling followed by logistic regression. Because the scaler is a
# pipeline step, GridSearchCV refits it on the training part of every CV fold.
#
# solver='liblinear' is set explicitly because the grid below searches over
# both 'l1' and 'l2' penalties. The default 'lbfgs' solver does not support
# 'l1', so with the default every l1 candidate would fail to fit and be
# scored as NaN, leaving half of the grid unevaluated.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

# Hyperparameter grid; the 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2']
}

# 5-fold cross-validated search over all 6 combinations, ranked by accuracy
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

# Final evaluation: best_estimator_ is the winning pipeline refitted on the
# whole training split; the test split is only used here
best_model = grid_search.best_estimator_
y_pred_final = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_final)

print(f"\nFinal test accuracy: {final_accuracy:.3f}")

# Confusion matrix visualization (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_final)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')  # fmt='d' prints integer counts
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Detailed classification report (per-class precision, recall, F1, support)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_final))

## Key Takeaways

### NumPy
- Efficient numerical operations
- Foundation for other ML libraries
- Array manipulation and mathematical functions

### Pandas
- Data loading, cleaning, and manipulation
- Handling structured data (CSV, Excel, databases)
- Grouping, filtering, and aggregation operations

### Matplotlib
- Basic plotting and visualization
- Customizable plots for presentations
- Foundation for other visualization libraries

### Seaborn
- Statistical visualization
- Beautiful default styles
- Easy exploration of relationships in data

### Scikit-learn
- Consistent API across algorithms
- Built-in preprocessing and evaluation tools
- Pipeline support for clean workflows

## Next Steps
- Learn mathematical foundations for ML
- Explore deep learning frameworks (TensorFlow, Keras)
- Practice with real-world datasets