1)	Implement a simple linear regression model for the salary.csv dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Simple linear regression: predict salary from months of experience.
data = pd.read_csv('salary.csv')

print(data.head())

# Double brackets keep X two-dimensional, which scikit-learn estimators expect.
X = data[['months']]
y = data['salary']

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise information about the test rows leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on the training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Error metrics on the held-out test rows.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# The plots use the unscaled months on the x-axis so the axis label is
# truthful; the predictions still come from the scaled inputs.
train_months = X_train_raw['months']
test_months = X_test_raw['months']
train_fit = model.predict(X_train)

plt.figure(figsize=(10, 6))
plt.scatter(train_months, y_train, color='blue', label='Actual Salary (Training set)')
plt.plot(train_months, train_fit, color='red', label='Regression Line')
plt.title('Salary vs Experience in Months (Training set)')
plt.xlabel('Months of Experience')
plt.ylabel('Salary')
plt.legend()
plt.show()

# Test points against the same fitted line (drawn from the training rows).
plt.figure(figsize=(10, 6))
plt.scatter(test_months, y_test, color='green', label='Actual Salary (Test set)')
plt.plot(train_months, train_fit, color='red', label='Regression Line')
plt.title('Salary vs Experience in Months (Test set)')
plt.xlabel('Months of Experience')
plt.ylabel('Salary')
plt.legend()
plt.show()

2)	Implement a simple linear regression model for the rent.csv dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Simple linear regression: predict rent from the month column.
data = pd.read_csv('rent.csv')

print(data.head())

# Double brackets keep X two-dimensional, which scikit-learn estimators expect.
X = data[['month']]
y = data['rent']

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise information about the test rows leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on the training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Error metrics on the held-out test rows.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# The plots use the unscaled month values on the x-axis so the axis label is
# truthful; the predictions still come from the scaled inputs.
train_month = X_train_raw['month']
test_month = X_test_raw['month']
train_fit = model.predict(X_train)

plt.figure(figsize=(10, 6))
plt.scatter(train_month, y_train, color='blue', label='Actual Rent (Training set)')
plt.plot(train_month, train_fit, color='red', label='Regression Line')
plt.title('Rent vs Month (Training set)')
plt.xlabel('Month')
plt.ylabel('Rent')
plt.legend()
plt.show()

# Test points against the same fitted line (drawn from the training rows).
plt.figure(figsize=(10, 6))
plt.scatter(test_month, y_test, color='green', label='Actual Rent (Test set)')
plt.plot(train_month, train_fit, color='red', label='Regression Line')
plt.title('Rent vs Month (Test set)')
plt.xlabel('Month')
plt.ylabel('Rent')
plt.legend()
plt.show()


3)	Implement a simple linear regression model for the sales.csv dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Simple linear regression: predict sales from temperature in Fahrenheit.
data = pd.read_csv('sales.csv')
X = data['fahrenheit'].values.reshape(-1, 1)  # column vector, shape (n, 1)
y = data['sales'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler statistics come from the training rows only; the test rows are
# transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Error metrics on the held-out test rows.
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print("\nModel Evaluation Metrics:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {model.score(X_test_scaled, y_test):.4f}")

plt.figure(figsize=(12, 6))

# Plot training data
plt.scatter(X_train, y_train, color='blue', label='Training Data', alpha=0.5)
# Plot test data
plt.scatter(X_test, y_test, color='green', label='Test Data', alpha=0.5)

# Sort along the sample axis. X has shape (n, 1), so np.sort's default
# axis=-1 would sort each one-element row and leave the order unchanged.
X_sorted = np.sort(X, axis=0)
X_sorted_scaled = scaler.transform(X_sorted)
y_pred_sorted = model.predict(X_sorted_scaled)
plt.plot(X_sorted, y_pred_sorted, color='red', label='Regression Line')

plt.xlabel('Temperature (Fahrenheit)')
plt.ylabel('Sales')
plt.title('Linear Regression: Sales vs Temperature')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The slope is expressed per one standard deviation of temperature because
# the model was trained on standardized inputs.
print("\nModel Coefficients:")
print(f"Slope: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")

4)	Implement a multiple linear regression model for the house.csv dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Multiple linear regression: predict house price from the other columns.
# Load the data
data = pd.read_csv('house.csv')

# Normalise the column names to short identifiers. Both branches end with the
# same eight names: the first renames the parsed header in place, the second
# re-reads the file, skipping its first row and supplying the names directly.
if 'Square_Foot' in data.columns:
    data.columns = ['size', 'bedrooms', 'bathrooms', 'year', 'lot_size', 'garage_size', 'neighborhood_quality', 'price']
else:
    column_names = ['size', 'bedrooms', 'bathrooms', 'year', 'lot_size', 'garage_size', 'neighborhood_quality', 'price']
    data = pd.read_csv('house.csv', header=None, names=column_names, skiprows=1)

# Split features and target
X = data.drop('price', axis=1)
y = data['price']

# Split the data into training and testing sets BEFORE scaling, so the test
# rows never contribute to the scaler's mean/std (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

# Visualizations
# Actual vs predicted: points on the dashed diagonal are perfect predictions.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted House Prices")
plt.tight_layout()
plt.show()

# Residuals (actual - predicted) against the prediction, with a zero line.
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.plot([y_pred.min(), y_pred.max()], [0, 0], 'r--', lw=2)
plt.xlabel("Predicted Price")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.tight_layout()
plt.show()

# Coefficient magnitudes are comparable across features because every
# feature was standardized before fitting.
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': abs(model.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)
plt.figure(figsize=(10, 6))
plt.bar(feature_importance['feature'], feature_importance['importance'])
plt.xlabel("Features")
plt.ylabel("Absolute Coefficient Value")
plt.title("Feature Importance")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

5)	Implement a multiple linear regression model for the income.csv dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Multiple linear regression: predict income from age and experience.
# Load the data
data = pd.read_csv('income.csv')

# Split features and target
X = data[['age', 'experience']]
y = data['income']

# ii. Training and testing of the model
# The split happens before scaling so the test rows never contribute to the
# scaler's mean/std (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# i. Data scaling (statistics learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# iii. Create the regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# iv. Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

# v. Visualize the results

# Actual vs Predicted plot: points on the dashed diagonal are perfect predictions.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel("Actual Income")
plt.ylabel("Predicted Income")
plt.title("Actual vs Predicted Income")
plt.tight_layout()
plt.show()

# Residual plot (actual - predicted) against the prediction, with a zero line.
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.plot([y_pred.min(), y_pred.max()], [0, 0], 'r--', lw=2)
plt.xlabel("Predicted Income")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.tight_layout()
plt.show()

# Feature importance: coefficient magnitudes are comparable because both
# features were standardized before fitting.
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': abs(model.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)
plt.figure(figsize=(10, 6))
plt.bar(feature_importance['feature'], feature_importance['importance'])
plt.xlabel("Features")
plt.ylabel("Absolute Coefficient Value")
plt.title("Feature Importance")
plt.tight_layout()
plt.show()

# 3D scatter plot of the raw (unscaled) data, coloured by income.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X['age'], X['experience'], y, c=y, cmap='viridis')
ax.set_xlabel('Age')
ax.set_ylabel('Experience')
ax.set_zlabel('Income')
plt.colorbar(scatter)
plt.title('3D Scatter Plot: Age, Experience, and Income')
plt.tight_layout()
plt.show()

In [None]:
6)	Implement a logistic regression model for the built-in digits dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Imported here so this cell stays self-contained; used for cross-validation.
from sklearn.pipeline import make_pipeline

# Load the digits dataset
digits = load_digits()
X, y = digits.data, digits.target

# ii. Training and testing of the model
# The split happens before scaling so the test rows never contribute to the
# scaler's mean/std (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# i. Data scaling (statistics learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# iii. Create the regression model: one binary logistic regression per digit.
base_model = LogisticRegression(max_iter=1000)
model = OneVsRestClassifier(base_model)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# iv. Display confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Visualize the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# v. Display k-fold cross-validation score
# Scaling lives inside the pipeline so it is re-fitted on each fold's training
# part; cross_val_score clones the estimator, so `model` itself stays fitted.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("\nCross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation of CV scores:", cv_scores.std())

In [None]:
7)	Implement a Support Vector Machine model for the built-in iris dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Imported here so this cell stays self-contained; used for cross-validation.
from sklearn.pipeline import make_pipeline

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# ii. Training and testing of the model
# The split happens before scaling so the test rows never contribute to the
# scaler's mean/std (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# i. Data scaling (statistics learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole-dataset standardization is kept only for the decision-boundary
# visualisation further down, where no held-out evaluation takes place.
X_scaled = StandardScaler().fit_transform(X)

# iii. Create the SVM model
model = SVC(kernel='rbf', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# iv. Display confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Visualize the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# v. Display k-fold cross-validation score
# Scaling lives inside the pipeline so it is re-fitted on each fold's training
# part; cross_val_score clones the estimator, so `model` itself stays fitted.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("\nCross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation of CV scores:", cv_scores.std())

# Function to plot decision boundaries
def plot_decision_boundaries(X, y, ax=None, feature_names=None):
    """Fit an RBF-kernel SVM on two features and draw its decision regions.

    A fresh ``SVC`` is trained on ``X``/``y``, evaluated on a dense grid that
    covers the data range plus a margin of 1 on every side, and the predicted
    classes are drawn as filled contours with the samples scattered on top.

    Parameters
    ----------
    X : array with exactly two columns
        Column 0 is drawn on the x-axis and column 1 on the y-axis.
    y : array of class labels, one per row of ``X``.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to the current axes.
    feature_names : sequence of two str, optional
        Axis labels for column 0 and column 1. When omitted, the first two
        names of the module-level ``iris`` dataset are used, which is only
        correct if ``X`` really holds iris features 0 and 1.

    Returns
    -------
    The collection returned by ``ax.scatter`` for the sample points.
    """
    if feature_names is None:
        # Backward-compatible default: the labels the function always used.
        feature_names = iris.feature_names[:2]

    model = SVC(kernel='rbf', random_state=42)
    model.fit(X, y)

    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Classify every grid point, then restore the grid shape for contourf.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    if ax is None:
        ax = plt.gca()

    ax.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolor='black')
    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])
    return scatter

# Visualize decision boundaries for different feature pairs
# One subplot per unordered pair of the four iris features.
fig, axs = plt.subplots(3, 2, figsize=(15, 20))
feature_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
for ax, pair in zip(axs.flatten(), feature_pairs):
    X_pair = X_scaled[:, list(pair)]
    scatter = plot_decision_boundaries(X_pair, y, ax=ax)
    # Label each axis with the feature actually plotted on it; without this
    # every subplot would carry the names of features 0 and 1.
    ax.set_xlabel(iris.feature_names[pair[0]])
    ax.set_ylabel(iris.feature_names[pair[1]])
    ax.set_title(f'Decision Boundaries (Features {pair[0]} and {pair[1]})')
plt.tight_layout()
plt.show()

8)	Implement a Bagging model for the built-in wine dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Imported here so this cell stays self-contained; used for cross-validation.
from sklearn.pipeline import make_pipeline

# Load the wine dataset
wine = load_wine()
X, y = wine.data, wine.target

# ii. Training and testing of the model
# The split happens before scaling so the test rows never contribute to the
# scaler's mean/std (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# i. Data scaling (statistics learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# iii. Create the Bagging model
base_estimator = DecisionTreeClassifier(random_state=42)
try:
    # For newer versions of scikit-learn
    model = BaggingClassifier(estimator=base_estimator, n_estimators=10, random_state=42)
except TypeError:
    # For older versions of scikit-learn, which name the argument differently
    model = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# iv. Display confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=wine.target_names))

# Visualize the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=wine.target_names,
            yticklabels=wine.target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# v. Display k-fold cross-validation score
# Scaling lives inside the pipeline so it is re-fitted on each fold's training
# part; cross_val_score clones the estimator, so `model` itself stays fitted.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("\nCross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation of CV scores:", cv_scores.std())

# Feature importance
# Average the per-tree importances over the ensemble, then express each one
# as a percentage of the largest.
feature_importance = np.mean([tree.feature_importances_ for tree in model.estimators_], axis=0)
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5  # bar centres

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(wine.feature_names)[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

9) Implement a PCA to visualize the built-in wine dataset.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Fetch the wine data: feature matrix and integer class labels.
wine = load_wine()
X, y = wine.data, wine.target

# PCA is variance-driven, so put every feature on a common scale first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep every component; the plots below pick the leading two or three.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance captured by each component, and its running sum.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# Cumulative explained variance against the number of components kept.
plt.figure(figsize=(10, 6))
component_counts = np.arange(1, cumulative_variance_ratio.size + 1)
plt.plot(component_counts, cumulative_variance_ratio, 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio vs. Number of Components')
plt.grid(True)
plt.show()

# 2D view: first two components, one colour per class.
colors = ['r', 'g', 'b']
plt.figure(figsize=(12, 8))
for i, (color, target_name) in enumerate(zip(colors, wine.target_names)):
    members = y == i  # boolean mask selecting the samples of class i
    plt.scatter(
        X_pca[members, 0],
        X_pca[members, 1],
        color=color,
        alpha=.8,
        lw=2,
        label=target_name,
    )
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of Wine Dataset (2 components)')
plt.xlabel(f'First Principal Component ({explained_variance_ratio[0]:.2f})')
plt.ylabel(f'Second Principal Component ({explained_variance_ratio[1]:.2f})')
plt.show()

# 3D view: first three components, same colour coding.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
for i, (color, target_name) in enumerate(zip(colors, wine.target_names)):
    members = y == i
    ax.scatter(
        X_pca[members, 0],
        X_pca[members, 1],
        X_pca[members, 2],
        color=color,
        alpha=.8,
        label=target_name,
    )
ax.legend(loc='best', shadow=False, scatterpoints=1)
ax.set_title('PCA of Wine Dataset (3 components)')
ax.set_xlabel(f'First Principal Component ({explained_variance_ratio[0]:.2f})')
ax.set_ylabel(f'Second Principal Component ({explained_variance_ratio[1]:.2f})')
ax.set_zlabel(f'Third Principal Component ({explained_variance_ratio[2]:.2f})')
plt.show()

# Per-component report.
for i, ratio in enumerate(explained_variance_ratio):
    print(f"PC{i+1} explained variance ratio: {ratio:.4f}")

# Variance retained by the two projections plotted above.
print(f"\nCumulative explained variance ratio (2 components): {cumulative_variance_ratio[1]:.4f}")
print(f"Cumulative explained variance ratio (3 components): {cumulative_variance_ratio[2]:.4f}")

In [None]:
10) Implement a Singular Value Decomposition (SVD) on the given input matrix. Display the U matrix, the singular values, and the V-transpose matrix. Finally, reconstruct the original matrix and display the results.

In [None]:
import numpy as np

def _show(title, value):
    """Print a title, the value on the following line(s), then a blank line."""
    print(title, value, "", sep="\n")


# Input matrix for the demonstration (4 rows, 3 columns).
A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
_show("Original Matrix A:", A)

# Reduced ("economy") SVD: full_matrices=False keeps only as many columns of U
# as there are singular values.
U, s, Vt = np.linalg.svd(A, full_matrices=False)

_show("U matrix:", U)
_show("Singular values:", s)
_show("V transpose matrix:", Vt)

# np.linalg.svd returns the singular values as a 1-D array; place them on the
# diagonal of a square matrix so they can take part in the matrix product.
S = np.diag(s)

# A = U * S * Vt, evaluated right-to-left.
A_reconstructed = U @ (S @ Vt)
_show("Reconstructed Matrix A:", A_reconstructed)

# Compare with a tolerance, since the product is computed in floating point.
print(
    "The reconstruction is successful!"
    if np.allclose(A, A_reconstructed)
    else "There might be some numerical differences due to floating-point arithmetic."
)

# Element-wise absolute error between the original and its reconstruction.
diff = np.abs(A - A_reconstructed)
print("\nDifference between original and reconstructed matrices:")
print(diff)
print("\nMaximum difference:", np.max(diff))