'''
For Development, I knew that adding features would take a good amount of normalization and standardization, as there are several categories that don't simply fall between the GPA 0-4 scale. So, I began by only taking the features that were between these scales and training based of those. However there came the issue of missing values, so what I initially did was replacing them with zero, and then going from there after the model was built. I figured that since it was another Regression model, I'd use a structure similar to the one I did last time, and that is the SVR model with a linear kernel. Once this was figured out, I returned to my problem of missing values. After some research, I figured the SimpleImputer class with a mean estimate would get the job done nicely for replacing these missing values, and did just that. After creating and submitting my model, I was quite surprised to see I got a good score. Now to further optimize, I attempted to port in additional metrics such as the student's major or age. At first, I would convert all major's to a value from a range of 0 to 1 in a single column and appended that to my features list, but it did not seem to significantly impact my scores. However, upon a new attempt, I have instead opted to separate each major into a separate column and assign it either a 0 or a 1. Results seemed to improve.
'''


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/project-2-cosc-220-spring-2025/train.csv
/kaggle/input/project-2-cosc-220-spring-2025/test.csv
/kaggle/input/project-2-cosc-220-spring-2025/archive/train.json
/kaggle/input/project-2-cosc-220-spring-2025/archive/test.json


In [2]:
#Read data from the file and add it
import json
import numpy as np
import pandas as pd

train_path = '/kaggle/input/project-2-cosc-220-spring-2025/train.csv'
test_path = '/kaggle/input/project-2-cosc-220-spring-2025/test.csv'

trainData = pd.read_csv(train_path)
testData = pd.read_csv(test_path)

#data loaded
print("Data has been loaded")


#print(testData[:5])

#to hold the end gpa
y = trainData["GPA"]

#features to look for
features = ["Age at First Term","GPA Term 1","GPA Term 2","GPA Term 2", "GPA Term 3", "GPA Term 4", "GPA Term 5"]


print(testData)


Data has been loaded
       ID Gender Admit Type  Age at First Term                     Major  \
0    1001      M         FR                 18                Psychology   
1    1002      M        TRN                 22  Integrated Marketing Com   
2    1003      F         FR                 18  Integrated Marketing Com   
3    1004      F        TRN                 20  Integrated Marketing Com   
4    1005      M        TRN                 20                 Economics   
..    ...    ...        ...                ...                       ...   
720  1721      F        TRN                 20   Business Administration   
721  1722      M        IFR                 23   Business Administration   
722  1723      M        IFR                 20     Computer Science/Math   
723  1724      F        ITR                 20      Sports Medicine - BA   
724  1725      M        TRN                 19         Political Science   

     High School GPA  SAT Verbal  SAT Math  ACT Reading  ACT Engli

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [3]:
#data now needs to be loaded to a variable and preprocessed, my current plan is to simply replace missing values with the mean, but if that proves largely innacurate I may switch
#to a more precise prediction method
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

#model of imputing data
imputer = SimpleImputer(strategy='mean')
knn_imputer = KNNImputer(n_neighbors=5)

X = trainData[features]
X.replace(0.00, np.nan, inplace=True)
Xtest = testData[features]
Xtest.replace(0.00, np.nan, inplace=True)
'''
Previous method of replacing values:

#zeroes must first be replaced with NaN to be detected by the simple imputer


X.replace(0.00, np.nan, inplace=True)
#print(X.iloc[:,:])

#for [:, 1:], change the 1 depending on how many categorical features I have
#X.iloc[:, :] = imputer.fit_transform(X.iloc[:, :])


Xtest = pd.get_dummies(testData[features])
Xtest.replace(0.00, np.nan, inplace=True)
#print(X.iloc[:,:])

#for [:, 1:], change the 1 depending on how many categorical features I have
Xtest.iloc[:, :] = imputer.fit_transform(Xtest.iloc[:, :])

#code above doesn't seem to transform term 3-5, trying a second fit_transform
#trainData.iloc[:, 5:] = imputer.fit_transform(trainData.iloc[:, 5:])

print(Xtest.iloc[:,:])
'''
print(X)
#new method of replacing values with knearestneighbor:
X = knn_imputer.fit_transform(X) 
X = pd.DataFrame(X, columns = features)
Xtest = knn_imputer.fit_transform(Xtest) 
Xtest = pd.DataFrame(Xtest, columns = features)

print(X)


     Age at First Term  GPA Term 1  GPA Term 2  GPA Term 2  GPA Term 3  \
0                   18       3.811       3.352       3.352         NaN   
1                   20       2.889       3.032       3.032       2.700   
2                   22       2.473       2.358       2.358         NaN   
3                   21       3.093       3.366       3.366       3.390   
4                   18       3.656       3.691       3.691         NaN   
..                 ...         ...         ...         ...         ...   
701                 22       2.873       1.667       1.667         NaN   
702                 18       3.048       2.344       2.344       3.000   
703                 20       2.364       2.327       2.327       3.364   
704                 20       3.829       2.958       2.958         NaN   
705                 20       3.517       3.606       3.606       3.580   

     GPA Term 4  GPA Term 5  
0         3.761       3.933  
1         2.845       2.764  
2           NaN      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace(0.00, np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xtest.replace(0.00, np.nan, inplace=True)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [4]:
from sklearn.preprocessing import OneHotEncoder

# Initialize OneHotEncoder with 'handle_unknown' to ignore unseen labels
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Fit on train data and transform both train & test
Xmajor = onehot_encoder.fit_transform(trainData[["Major"]])  
Xmajor_test = onehot_encoder.transform(testData[["Major"]])  # Transforms without errors

# Convert to DataFrame
Xmajor = pd.DataFrame(Xmajor, columns=onehot_encoder.get_feature_names_out(["Major"]))
Xmajor_test = pd.DataFrame(Xmajor_test, columns=onehot_encoder.get_feature_names_out(["Major"]))

print(f"After encoding:\n{Xmajor.head()}")


X = X.join(Xmajor)
Xtest = Xtest.join(Xmajor_test)




After encoding:
   Major_Accounting  Major_Advertising  Major_Art  Major_Art History  \
0               0.0                0.0        0.0                0.0   
1               0.0                0.0        0.0                0.0   
2               0.0                0.0        0.0                0.0   
3               0.0                0.0        0.0                0.0   
4               0.0                0.0        0.0                0.0   

   Major_Biology -- BA  Major_Biology -- BS  Major_Business Administration  \
0                  0.0                  0.0                            0.0   
1                  0.0                  0.0                            0.0   
2                  0.0                  0.0                            0.0   
3                  0.0                  0.0                            0.0   
4                  0.0                  0.0                            0.0   

   Major_Chemistry -- BA  Major_Chemistry -- BS  Major_Cinematic Arts  ...  \
0   

In [5]:
print(X)
print(Xtest)

     Age at First Term  GPA Term 1  GPA Term 2  GPA Term 2  GPA Term 3  \
0                 18.0       3.811       3.352       3.352      3.9450   
1                 20.0       2.889       3.032       3.032      2.7000   
2                 22.0       2.473       2.358       2.358      3.4726   
3                 21.0       3.093       3.366       3.366      3.3900   
4                 18.0       3.656       3.691       3.691      3.8800   
..                 ...         ...         ...         ...         ...   
701               22.0       2.873       1.667       1.667      2.8384   
702               18.0       3.048       2.344       2.344      3.0000   
703               20.0       2.364       2.327       2.327      3.3640   
704               20.0       3.829       2.958       2.958      3.5132   
705               20.0       3.517       3.606       3.606      3.5800   

     GPA Term 4  GPA Term 5  Major_Accounting  Major_Advertising  Major_Art  \
0        3.7610      3.9330     

In [15]:
#train test split for some validation
#create model:
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import GridSearchCV

#imported models
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

'''
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.45, random_state = 35)


#make a function to find most optimal parameters

#SVR Parameters to check:

params_grid = {'kernel':['linear','rbf','poly'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma':[0.0001, 0.001, 0.01, 0.1],}
             
'''
#RandomForestRegressor Parameters to check:
'''
params_grid = {
    "n_estimators": [50, 100, 200, 300],  # Different numbers of trees
    "max_depth": [None, 10, 20, 30],      # Different tree depths
    "min_samples_split": [2, 5, 10],      # Minimum samples required to split a node
    "min_samples_leaf": [1, 2, 4],        # Minimum samples required at a leaf node
    "random_state": [12, 36, 153]        # Different random seeds
}


model = RandomForestRegressor()
#serves to tell me which estimators are best for my given parameters : might introduce overfitting into my data
print(f"GridSearchCV:")
grid_clf = GridSearchCV(model, params_grid)

print(f"Fitting:")
check = grid_clf.fit(X_train, y_train)


print(f"Best Parameters: {check.best_params_}")
print(f"Best Score: {check.best_score_:.4f}")


'''
'''
parameters = SVR(kernel = 'linear', C = 100, gamma = 0.0001)
#parameters = RandomForestRegressor(max_depth= 10, min_samples_leaf= 4, min_samples_split= 10, n_estimators = 200,random_state = 36)
#parameters = RandomForestRegressor(max_depth = None, min_samples_leaf= 4, min_samples_split=2, n_estimators= 50, random_state=36)


pipeline = make_pipeline(StandardScaler(), parameters)

pipeline.fit(X_train,y_train)

y_pred = pipeline.predict(X_test)
print('Pipeline created!')


print(f"Parameter: {parameters}\nMean Squared Error: {mean_squared_error(y_test, y_pred)}\n\n")
'''


Pipeline created!
Parameter: SVR(C=100, gamma=0.0001, kernel='linear')
Mean Squared Error: 0.19687099083987805




In [None]:

'''
Results of fitting:

Best Parameters: {'C': 100, 'gamma': 0.0001, 'kernel': 'linear'}
Best Score: 0.5289

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50, 'random_state': 36}
Best Score: 0.4778

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200, 'random_state': 36}
Best Score: 0.4671
'''

In [None]:
#find the best parameters from the above code:
'''
pd = pd.DataFrame(grid_clf.cv_results_)

print(pd)

parameters = {'C': 0.001, 'gamma': 0.0001, 'kernel': 'poly'}

'''

In [None]:

#create model:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

#parameters for SVR model
#pipeline = SVR(kernel = 'linear', C = 0.1, gamma = 0.0001)
parameters = SVR(kernel = 'linear', C = 100, gamma = 0.0001)
#pipeline = RandomForestRegressor(max_depth = None, min_samples_leaf= 4, min_samples_split=2, n_estimators= 50, random_state=36)
pipeline = make_pipeline(StandardScaler(), parameters)


pipeline.fit(X,y)

y_pred = pipeline.predict(Xtest)
print('Pipeline created!')

test_df = pd.DataFrame(testData)

output = pd.DataFrame({'ID': test_df["ID"], 'GPA': y_pred})

#print(output)

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

