# 1. Linear Regression using scikit-learn

In [1]:
# Import needed packages for regression
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Silence warning from sklearn
import warnings
warnings.filterwarnings('ignore')

# Read the sample instance from stdin: carat first, then table
carat = float(input())
table = float(input())

# Load the diamonds dataset
diamonds = pd.read_csv('diamonds.csv')

# Define input and output features
X = diamonds[['carat', 'table']]
y = diamonds['price']

# Initialize a multiple linear regression model
model = LinearRegression()

# Fit the multiple linear regression model to the input and output features
model.fit(X, y)

# Get estimated intercept weight
intercept = model.intercept_
print('Intercept is', round(intercept, 3))

# Get estimated weights for carat and table features
coefficients = model.coef_
print('Weights for carat and table features are', np.round(coefficients, 3))

# Predict the price of a diamond with the user-input carat and table values.
# The model was fitted on a DataFrame, so the sample is wrapped in a one-row
# DataFrame with the same column names; passing a bare ndarray makes sklearn
# emit a feature-name mismatch warning on every call.
sample = pd.DataFrame([[carat, table]], columns=X.columns)
prediction = model.predict(sample)
print('Predicted price is', np.round(prediction, 2))


0.5
60
Intercept is 1961.992
Weights for carat and table features are [7820.038  -74.301]
Predicted price is [1413.97]


# 2. Logistic Regression using scikit-learn

In [3]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load nbaallelo_log.csv into a dataframe
NBA = pd.read_csv('nbaallelo_log.csv')

# Create binary feature for game_result with 0 for L and 1 for W.
# Vectorised comparison: anything other than 'W' (including missing values)
# maps to 0, exactly as the row-wise lambda did.
NBA['win'] = (NBA['game_result'] == 'W').astype(int)

# Store relevant columns as variables
X = NBA[['elo_i']]
# 1-D NumPy array of labels. Series.to_numpy() replaces Series.ravel(),
# which is deprecated as of pandas 2.2.
y = NBA['win'].to_numpy()

# Initialize and fit the logistic model using the LogisticRegression() function
model = LogisticRegression()
model.fit(X, y)

# Print the weights for the fitted model
print('w1:', model.coef_)

# Print the intercept of the fitted model
print('w0:', model.intercept_)

# Find the proportion of instances correctly classified.
# Note: this is accuracy on the same data the model was fitted on.
y_pred = model.predict(X)
score = accuracy_score(y, y_pred)
print(round(score, 3))


w1: [[0.00437785]]
w0: [-6.54757518]
0.599


# 3. Support Vector Classifier using scikit-learn

In [4]:
# Import the necessary packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Read the heart dataset from disk
heart = pd.read_csv('heart.csv')

# Two input features (thalach, age) and the single-column output frame (target)
X = heart[['thalach', 'age']]
y = heart[['target']]

# Hold out 25% of the rows for testing; the split is fixed by random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

# Standardise the inputs: statistics are learned from the training rows only
# and then applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear support vector classifier with C=0.2 and at most 500 iterations,
# fitted on the training data (the one-column target frame is flattened to 1-D)
SVC = LinearSVC(C=0.2, max_iter=500, random_state=123).fit(
    X_train, y_train.to_numpy().ravel()
)

# Mean accuracy on the held-out test data
score = SVC.score(X_test, y_test.to_numpy().ravel())
print(np.round(score, 3))

# Fitted weights: w0 is the intercept, w1 and w2 are the feature coefficients
print('w0:', np.round(SVC.intercept_, 3))
print('w1 and w2:', np.round(SVC.coef_, 3))


0.671
w0: [0.125]
w1 and w2: [[ 0.39  -0.084]]


# 4. k-Nearest Neighbors using scikit-learn

In [5]:
# Import needed packages for classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Import packages for evaluation
from sklearn.metrics import accuracy_score

# Load the dataset
skySurvey = pd.read_csv('SDSS.csv')

# Create a new feature from u - g
skySurvey['u_g'] = skySurvey['u'] - skySurvey['g']

# Create dataframe X with features redshift and u_g
X = skySurvey[['redshift', 'u_g']]

# Create series y with feature class
y = skySurvey['class']

# Split data into training and test sets.
# The seed is passed directly to train_test_split instead of seeding NumPy's
# global generator: this cell does not import numpy, so np.random.seed(42)
# only worked when an earlier cell had already put `np` in the kernel.
# random_state=42 draws from the same seeded stream, so the split is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Initialize model with k=3
skySurveyKnn = KNeighborsClassifier(n_neighbors=3)

# Fit model using X_train and y_train
skySurveyKnn.fit(X_train, y_train)

# Find the predicted classes for X_test
y_pred = skySurveyKnn.predict(X_test)

# Calculate accuracy score
score = accuracy_score(y_test, y_pred)

# Print accuracy score
print('Accuracy score is ', end="")
print('%.3f' % score)


Accuracy score is 0.984


# 5. Naive Bayes using scikit-learn

In [6]:
# Import the necessary modules
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
skySurvey = pd.read_csv('SDSS.csv')

# Create a new feature from u - g
skySurvey['u_g'] = skySurvey['u'] - skySurvey['g']

# Create dataframe X with features redshift and u_g
X = skySurvey[['redshift', 'u_g']]

# Create 1-D array y with feature class.
# Series.to_numpy() replaces Series.ravel(), which is deprecated as of pandas 2.2.
y = skySurvey['class'].to_numpy()

# Initialize a Gaussian naive Bayes model
skySurveyNBModel = GaussianNB()

# Fit the model
skySurveyNBModel.fit(X, y)

# Calculate the proportion of instances correctly classified.
# Note: this is accuracy on the same data the model was fitted on.
y_pred = skySurveyNBModel.predict(X)
score = accuracy_score(y, y_pred)

# Print accuracy score
print('Accuracy score is ', end="")
print('%.3f' % score)


Accuracy score is 0.989


# 6.1. Bagging using scikit-learn

In [7]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor

# Read the cleaned mammal-sleep data
df = pd.read_csv('msleep_clean.csv')

# Output feature: sleep_rem
y = df['sleep_rem']

# Input features, in the required order: awake, brainwt, bodywt
X = df[['awake', 'brainwt', 'bodywt']]

# Bagging regressor: 30 base estimators, random state 10, with out-of-bag
# scoring enabled; constructed and fitted in one step (fit returns the estimator)
sleepModel = BaggingRegressor(
    n_estimators=30,
    random_state=10,
    oob_score=True,
).fit(X, y)

# Out-of-bag score (for a regressor, scikit-learn reports R^2 here rather than
# a classification accuracy)
oob_score = sleepModel.oob_score_
print(np.round(oob_score, 4))

# Out-of-bag prediction for each training instance
oob_predictions = sleepModel.oob_prediction_
print(np.round(oob_predictions, 4))


0.3144
[3.1    2.98   0.8    1.6867 0.7167 1.8533 2.3091 2.0727 1.5231 2.1727
 2.95   0.5375 2.9417 2.8727 0.9    0.7765 1.9818 2.5    1.8692 1.57
 0.9692 1.1778 1.5769 2.75   2.4    2.1364 4.1727 1.6765 0.71   1.3909
 2.13   1.9733 3.1429 3.3533 0.51   2.0077 1.4    2.0143 2.45   1.975
 2.6    2.3    1.2    0.58   2.8222 1.9222 0.7417 1.24  ]


# 6.2. Random forests using scikit-learn

In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# Read the cleaned mpg data
df = pd.read_csv('mpg_clean.csv')

# Output feature: origin
y = df['origin']
# Input features: every column except the car name and the output feature
X = df.drop(['name', 'origin'], axis=1)

# Hyperparameters are read from stdin: n_estimators first, then max_features
estimators = int(input())
max_features = int(input())

# Random forest classifier with the user-supplied parameters, constructed and
# fitted in one step (fit returns the estimator)
rfModel = RandomForestClassifier(
    n_estimators=estimators,
    max_features=max_features,
    random_state=123,
).fit(X, y)

# Mean accuracy on the data the forest was fitted on
score = rfModel.score(X, y)
print(round(score, 4))

# Permutation importance with default parameters and a random state of 123
result = permutation_importance(rfModel, X, y, random_state=123)

# Variable importance table, most important feature first
importance_table = pd.DataFrame({
    'feature': rfModel.feature_names_in_,
    'permutation importance': result.importances_mean,
})
importance_table = importance_table.sort_values(
    by='permutation importance', ascending=False
)

print(importance_table)


5
3
0.9796
        feature  permutation importance
2  displacement                0.453571
0           mpg                0.160204
4        weight                0.133673
3    horsepower                0.107653
5  acceleration                0.057143
6    model_year                0.051531
1     cylinders                0.012245


# 6.3. Boosting using scikit-learn

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Read the mpg data
mpg = pd.read_csv('mpg.csv')

# Input features (cylinders, weight, mpg) and output feature (origin)
X = mpg[['cylinders', 'weight', 'mpg']]
y = mpg['origin']

# Learning rate shared by both boosting models, read from stdin
lr = float(input())

# Hold out 30% of the rows for testing; the split is fixed by random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Adaptive boosting: fit on the training data, then report test accuracy
adaBoostModel = AdaBoostClassifier(learning_rate=lr, random_state=123).fit(
    X_train, y_train
)
# Gradient boosting: fitted with the same learning rate and seed
gradientBoostModel = GradientBoostingClassifier(
    learning_rate=lr, random_state=123
).fit(X_train, y_train)

# Prediction accuracy on the test data, adaptive boosting first
adaBoostScore = adaBoostModel.score(X_test, y_test)
print(round(adaBoostScore, 4))

# Prediction accuracy on the test data for gradient boosting
gradientBoostScore = gradientBoostModel.score(X_test, y_test)
print(round(gradientBoostScore, 4))


0.6
0.6417
0.6417
