In [None]:
#### Fit, Evaluate, and Interpret Logistic Regression ####

# Importing necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Import data
data_train = pd.read_csv("college_train.csv")
data_test = pd.read_csv("college_test.csv")

# Encode the target as integer category codes. The category levels are
# learned from the training split only and then re-used for the test split,
# so a given label always maps to the same code in both splits. (Calling
# .astype('category') on each split separately derives the levels from
# whatever values that split happens to contain, which can silently shift
# the codes if one split is missing a level.) A test label that never
# appears in training is encoded as -1 rather than colliding with a class.
private_train = data_train['Private'].astype('category')
y_train = private_train.cat.codes
y_test = data_test['Private'].astype(private_train.dtype).cat.codes


## Logistic Regression (1)
# Single predictor: the F.Undergrad column, kept as a one-column DataFrame.
X_train = data_train[["F.Undergrad"]]
X_test = data_test[["F.Undergrad"]]

# One-step pipeline wrapping a default LogisticRegression.
lr_pipeline = Pipeline(steps=[("lr", LogisticRegression())])
lr_pipeline = lr_pipeline.set_output(transform="pandas")
lr_fitted = lr_pipeline.fit(X_train, y_train)

# Score the fitted model on the test split.
y_pred = lr_fitted.predict(X_test)
acc_1 = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Model 1:", acc_1, sep="\n")

Accuracy of Model 1:
0.8653846153846154


In [None]:
## Logistic Regression (2)
# Two predictors; the same column list is applied to both splits.
predictors = ["F.Undergrad", "Room.Board"]
X_train = data_train[predictors]
X_test = data_test[predictors]

# One-step pipeline wrapping a default LogisticRegression.
lr_pipeline = Pipeline(steps=[("lr", LogisticRegression())])
lr_pipeline = lr_pipeline.set_output(transform="pandas")
lr_fitted = lr_pipeline.fit(X_train, y_train)

# Score the fitted model on the test split.
y_pred = lr_fitted.predict(X_test)
acc_2 = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Model 2:", acc_2, sep="\n")

Accuracy of Model 2:
0.9230769230769231


In [None]:
## Logistic Regression (3)
# Three predictors; the same column list is applied to both splits.
predictors = ["F.Undergrad", "Room.Board", "Top10perc"]
X_train = data_train[predictors]
X_test = data_test[predictors]

# One-step pipeline wrapping a default LogisticRegression.
lr_pipeline = Pipeline(steps=[("lr", LogisticRegression())])
lr_pipeline = lr_pipeline.set_output(transform="pandas")
lr_fitted = lr_pipeline.fit(X_train, y_train)

# Score the fitted model on the test split.
y_pred = lr_fitted.predict(X_test)
acc_3 = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Model 3:", acc_3, sep="\n")

Accuracy of Model 3:
0.9230769230769231


In [None]:
## Logistic Regression (4)
# Four predictors; the same column list is applied to both splits.
predictors = ["F.Undergrad", "Room.Board", "Top10perc", "Accept"]
X_train = data_train[predictors]
X_test = data_test[predictors]

# One-step pipeline wrapping a default LogisticRegression.
lr_pipeline = Pipeline(steps=[("lr", LogisticRegression())])
lr_pipeline = lr_pipeline.set_output(transform="pandas")
lr_fitted = lr_pipeline.fit(X_train, y_train)

# Score the fitted model on the test split.
y_pred = lr_fitted.predict(X_test)
acc_4 = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Model 4:", acc_4, sep="\n")

# Pair each feature name with its fitted coefficient. The pipeline has no
# scaling step, so each coefficient is expressed per raw unit of its own
# feature and magnitudes are not directly comparable across features.
fitted_lr = lr_fitted.named_steps["lr"]
coefs = pd.DataFrame({
    "Variable": X_train.columns,
    "Coefficient": fitted_lr.coef_[0],
})
print("Coefficients for Model 4:")
print(coefs)

Accuracy of Model 4:
0.9230769230769231
Coefficients for Model 4:
      Variable  Coefficient
0  F.Undergrad    -0.001525
1   Room.Board     0.000553
2    Top10perc     0.053456
3       Accept     0.001320


In [None]:
# Predictions for two Top10perc rates

# Create data frame for new observations: two hypothetical schools that are
# identical except for Top10perc (10 vs. 35).
new_school = pd.DataFrame({
    'F.Undergrad': [3680, 3680],
    'Room.Board': [4350, 4350],
    'Top10perc': [10, 35],
    'Accept': [2000, 2000],
})

# Select the training features explicitly from the new observations' columns
# instead of relying on whichever X_train an earlier cell left in the kernel.
# This keeps the fitted model and the rows being scored on the same features
# (and in the same order) regardless of cell execution order.
model_features = list(new_school.columns)
X_train_new_school = data_train[model_features]

# Build logistic regression model
lr_pipeline = Pipeline(
  [("lr", LogisticRegression())]
).set_output(transform="pandas")

lr_fitted = lr_pipeline.fit(X_train_new_school, y_train)

# Compute and print predictions.
# predict_proba returns one row per new observation and one column per class,
# ordered as in lr_fitted.classes_ (ascending target code: 0, then 1).
# NOTE(review): which 'Private' label corresponds to code 1 depends on the
# category ordering used when y_train was encoded -- confirm before reading
# the second column as "probability of being private".
y_pred = lr_fitted.predict_proba(new_school)
print("Predictions of Private, for Two Top10perc rates:")
print(y_pred)

Predictions of Private, for Two Top10perc rates:
[[0.5076264  0.4923736 ]
 [0.21317597 0.78682403]]
