In [148]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [149]:
applicants_files = [
    "2018Applicants.csv", "2019Applicants.csv", "2020Applicants.csv", 
    "2021Applicants.csv", "2022Applicants.csv", "2023Applicants.csv"
]
matriculants_files = [
    "2018Matriculants.csv", "2019Matriculants.csv", "2020Matriculants.csv", 
    "2021Matriculants.csv", "2022Matriculants.csv", "2023Matriculants.csv"
]
file_path_dicts = [
    "/Users/lonely/Documents/classes/cs445/finalproj/dataset_repo/final/COSC445FP/DataSets/apps_mats_state_and_race/",
    "/Users/lonely/Documents/classes/cs445/finalproj/dataset_repo/final/COSC445FP/DataSets/mcats_gpas_apps_mats_race/",
    "/Users/lonely/Documents/classes/cs445/finalproj/dataset_repo/final/COSC445FP/DataSets/mcats_gpas_apps_mats_states/"
]


In [150]:

data_path = "/Users/lonely/Documents/classes/cs445/finalproj/dataset_repo/final/COSC445FP/DataSets/mcats_gpas_apps_mats_states/"

# Initialize an empty DataFrame to combine all years
all_data = []

# Process applicants and matriculants for all years
for app_file, mat_file in zip(applicants_files, matriculants_files):
    # Load datasets
    applicants = pd.read_csv(data_path + app_file)
    matriculants = pd.read_csv(data_path + mat_file)
    
    # Add a 'Status' column: 0 for applicants, 1 for matriculants
    applicants['Status'] = 0
    matriculants['Status'] = 1
    
    # Combine the datasets
    combined = pd.concat([applicants, matriculants], ignore_index=True)
    all_data.append(combined)

# Combine all years into a single dataset
data = pd.concat(all_data, ignore_index=True)

# Define the target variable: 'Status' (0 = did not get in, 1 = got in)
target = 'Status'

# Define features for logistic regression (exclude non-numeric and redundant columns)
features = ['MCAT CPBS Mean', 'MCAT CARS Mean', 'MCAT BBLS Mean', 
            'MCAT PSBB Mean', 'GPA Science Mean', 'GPA Non-Science Mean', 'GPA Total Mean']

# Drop rows with missing values in the selected features or target
data = data.dropna(subset=features + [target])

# Ensure all features are numeric
X = data[features].apply(pd.to_numeric, errors='coerce')
X = X.dropna()  # Drop rows with non-numeric values after coercion
y = data.loc[X.index, target]  # Ensure target matches filtered rows

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = log_model.predict(X_test)

# Output results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Display the logistic regression coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': log_model.coef_[0]})
print("\nLogistic Regression Coefficients:\n", coefficients)


Accuracy: 0.9315068493150684

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.92        61
           1       0.95      0.93      0.94        85

    accuracy                           0.93       146
   macro avg       0.93      0.93      0.93       146
weighted avg       0.93      0.93      0.93       146


Logistic Regression Coefficients:
                 Feature  Coefficient
0        MCAT CPBS Mean     0.297256
1        MCAT CARS Mean    -0.565234
2        MCAT BBLS Mean     1.505108
3        MCAT PSBB Mean     0.886017
4      GPA Science Mean     3.244847
5  GPA Non-Science Mean     2.203159
6        GPA Total Mean     2.829632


In [151]:
data_path = "/Users/lonely/Documents/classes/cs445/finalproj/dataset_repo/final/COSC445FP/DataSets/apps_mats_state_and_race/"

# Initialize an empty DataFrame to combine all years
all_data = []

# Process applicants and matriculants for all years
for app_file, mat_file in zip(applicants_files, matriculants_files):
    # Load datasets
    applicants = pd.read_csv(data_path + app_file)
    matriculants = pd.read_csv(data_path + mat_file)
    
    # Add a 'Status' column: 0 for applicants, 1 for matriculants
    applicants['Status'] = 0
    matriculants['Status'] = 1
    
    # Combine the datasets
    combined = pd.concat([applicants, matriculants], ignore_index=True)
    all_data.append(combined)

# Combine all years into a single dataset
data = pd.concat(all_data, ignore_index=True)

# Define the target variable: 'Status' (0 = applicant, 1 = matriculant)
target = 'Status'

# Define features for logistic regression
features = ['American Indian or Alaska Native', 'Asian', 'Black or African American',
            'Hispanic, Latino, or of Spanish Origin', 'Native Hawaiian or Other Pacific Islander',
            'White', 'Other', 'Multiple Race/Ethnicity', 'Unknown Race/Ethnicity',
            'Non-U.S. Citizen and Non-Permanent Resident', 'Total']

# Drop rows with missing values in the selected features or target
data = data.dropna(subset=features + [target])

# Ensure all features are numeric
X = data[features].apply(pd.to_numeric, errors='coerce')
X = X.dropna()  # Drop rows with non-numeric values after coercion
y = data.loc[X.index, target]  # Ensure target matches filtered rows

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = log_model.predict(X_test)

# Output results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Display the logistic regression coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': log_model.coef_[0]})
print("\nLogistic Regression Coefficients:\n", coefficients)


Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.56      0.71        18
           1       0.64      1.00      0.78        14

    accuracy                           0.75        32
   macro avg       0.82      0.78      0.75        32
weighted avg       0.84      0.75      0.74        32


Logistic Regression Coefficients:
                                         Feature  Coefficient
0              American Indian or Alaska Native    -0.082521
1                                         Asian     0.136789
2                     Black or African American     0.125911
3        Hispanic, Latino, or of Spanish Origin     0.146298
4     Native Hawaiian or Other Pacific Islander    -0.611308
5                                         White     0.111187
6                                         Other     0.147749
7                       Multiple Race/Ethnicity     0.090818
8                        Unknown Race/Ethnicity 

In [152]:
# Aggregate statistics for applicants and matriculants
applicants_summary = data[data['Status'] == 0][features].mean()
matriculants_summary = data[data['Status'] == 1][features].mean()

# Combine into a summary table
summary = pd.DataFrame({
    'Applicants': applicants_summary,
    'Matriculants': matriculants_summary,
    'Difference': matriculants_summary - applicants_summary
})
print(summary)


                                             Applicants Matriculants  \
American Indian or Alaska Native               1.711538     0.692308   
Asian                                        241.153846   113.346154   
Black or African American                     89.576923    35.365385   
Hispanic, Latino, or of Spanish Origin        61.057692    28.673077   
Native Hawaiian or Other Pacific Islander      1.057692     0.365385   
White                                        405.923077   183.288462   
Other                                         24.173077     9.269231   
Multiple Race/Ethnicity                      115.346154    52.384615   
Unknown Race/Ethnicity                        32.384615    12.596154   
Non-U.S. Citizen and Non-Permanent Resident   13.096154     3.038462   
Total                                        985.480769   439.019231   

                                             Difference  
American Indian or Alaska Native              -1.019231  
Asian              

In [153]:
# File paths for applicants and matriculants
data_path = "/Users/lonely/Documents/classes/cs445/finalproj/dataset_repo/final/COSC445FP/DataSets/mcats_gpas_apps_mats_race/"

# Initialize an empty list to combine data from all years
all_data = []

# Loop through applicant and matriculant files
for app_file, mat_file in zip(applicants_files, matriculants_files):
    # Construct full file paths
    app_path = os.path.join(data_path, app_file)
    mat_path = os.path.join(data_path, mat_file)
    
    # Check if files exist
    if not os.path.exists(app_path):
        print(f"File not found: {app_path}")
        continue
    if not os.path.exists(mat_path):
        print(f"File not found: {mat_path}")
        continue
    
    # Load the datasets
    applicants = pd.read_csv(app_path)
    matriculants = pd.read_csv(mat_path)
    
    # Add a 'Status' column: 0 for applicants, 1 for matriculants
    applicants['Status'] = 0
    matriculants['Status'] = 1
    
    # Combine the datasets
    combined = pd.concat([applicants, matriculants], ignore_index=True)
    all_data.append(combined)

# Combine all years into a single dataset
if all_data:
    data = pd.concat(all_data, ignore_index=True)
else:
    raise ValueError("No data was loaded. Please check the file paths and contents.")

# Define features for logistic regression (ensure these match your dataset columns)
features = ['American Indian or Alaska Native', 'Asian', 'Black or African American',
            'Hispanic, Latino, or of Spanish Origin', 'Native Hawaiian or Other Pacific Islander',
            'White', 'Other', 'Multiple Race/Ethnicity', 'Unknown Race/Ethnicity',
            'Non-U.S. Citizen and Non-Permanent Resident', 'Total']
target = 'Status'

# Ensure selected features and target are present in the data
data = data.dropna(subset=features + [target])

# Ensure all features are numeric
X = data[features].apply(pd.to_numeric, errors='coerce')
X = X.dropna()  # Drop rows with non-numeric values after coercion
y = data.loc[X.index, target]  # Ensure target matches filtered rows

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = log_model.predict(X_test)

# Output results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Display the logistic regression coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': log_model.coef_[0]})
print("\nLogistic Regression Coefficients:\n", coefficients)


Accuracy: 0.6666666666666666

Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80        10
           1       0.00      0.00      0.00         5

    accuracy                           0.67        15
   macro avg       0.33      0.50      0.40        15
weighted avg       0.44      0.67      0.53        15


Logistic Regression Coefficients:
                                         Feature  Coefficient
0              American Indian or Alaska Native     0.917228
1                                         Asian    -0.097448
2                     Black or African American     0.721369
3        Hispanic, Latino, or of Spanish Origin    -0.208682
4     Native Hawaiian or Other Pacific Islander    -0.351564
5                                         White    -0.003833
6                                         Other    -0.042380
7                       Multiple Race/Ethnicity    -0.377995
8                        Unknown R

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
