In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import joblib

# Name of the column the models are trained to predict.
target_column = 'degerlendirme_puani'

# Load the cleaned dataset and discard rows that have no target value,
# since they cannot be used for supervised training or evaluation.
df = pd.read_csv('cleanedData.csv').dropna(subset=[target_column])

# Debugging aid: show which columns were actually loaded.
print("Columns in the DataFrame:", df.columns)

# Columns handed to the one-hot encoder of the classification model.
# The names must match the CSV header exactly (including '?', ',', "'" and
# embedded spaces).
cat_features = [
    'cinsiyet',
    'dogum_yeri',
    'ikametgah_sehri',
    'universite_adi',
    'universite_turu',
    'burslu_ise_burs_yuzdesi',
    'burs_aliyor_mu?',
    'bolum',
    'universite_kacinci_sinif',
    'universite_not_ortalamasi',
    'daha_once_baska_bir_universiteden_mezun_olmus',
    'lise_adi',
    'lise_adi_diger',
    'lise_sehir',
    'lise_turu',
    'lise_bolumu',
    'lise_bolum_diger',
    'lise_mezuniyet_notu',
    'baska_bir_kurumdan_burs_aliyor_mu?',
    'burs_aldigi_baska_kurum',
    'baska_kurumdan_aldigi_burs_miktari',
    'anne_egitim_durumu',
    'anne_calisma_durumu',
    'baba_egitim_durumu',
    'baba_calisma_durumu',
    'kardes_sayisi',
    'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz?',
    'uye_oldugunuz_kulubun_ismi',
    'profesyonel_bir_spor_daliyla_mesgul_musunuz?',
    'spor_dalindaki_rolunuz_nedir?',
    'aktif_olarak_bir_stk_uyesi_misiniz?',
    "hangi_stk'nin_uyesisiniz?",
    'stk_projesine_katildiniz_mi?',
    'girisimcilikle_ilgili_deneyiminiz_var_mi?',
    'ingilizce_biliyor_musunuz?',
    'ingilizce_seviyeniz?',
    'daha_onceden_mezun_olunduysa,_mezun_olunan_universite',
    'anne_sektor_encoded',
    'baba_sektor_encoded',
    'anne_Unknown',
    'anne_diger',
    'anne_kamu',
    'anne_ozel sektor',
    'baba_Unknown',
    'baba_diger',
    'baba_kamu',
    'baba_ozel sektor',
    'age',
]

# Free-text columns that feed the Word2Vec / TF-IDF features.
text_columns = ['girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz?']

# Missing free-text answers become empty strings so they can be joined and
# tokenized like any other value.
df[text_columns] = df[text_columns].fillna('')

# Collapse all text columns into a single space-separated string per row.
df['combined_text'] = df[text_columns].apply(' '.join, axis=1)

# Tokenization helper shared by Word2Vec training and document vectorization
def preprocess_text(text):
    """Split *text* into a list of tokens using gensim's ``simple_preprocess``.

    ``deacc=True`` asks gensim to strip accent marks from the tokens.
    """
    tokens = simple_preprocess(text, deacc=True)
    return tokens

# Train Word2Vec on the tokenized text of every row.
# min_count=1 keeps tokens that occur only once in the vocabulary.
sentences = list(map(preprocess_text, df['combined_text']))
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Document embedding = mean of the Word2Vec vectors of its tokens
def vectorize_text(text):
    """Return the average Word2Vec vector of the tokens found in *text*.

    Tokens that are not in the trained vocabulary are skipped. When no token
    has a vector (for example, an empty string), a zero vector of length
    ``word2vec_model.vector_size`` is returned instead.
    """
    keyed_vectors = word2vec_model.wv
    known = [keyed_vectors[tok] for tok in preprocess_text(text) if tok in keyed_vectors]
    if known:
        return np.mean(known, axis=0)
    return np.zeros(word2vec_model.vector_size)

# Embed every row's combined text, then stack the per-row vectors into one
# 2-D array (one row per document, one column per Word2Vec dimension).
df['text_vector'] = df['combined_text'].apply(vectorize_text)
word2vec_features = np.array(df['text_vector'].tolist())

# TF-IDF Vectorization (vocabulary capped at 1000 terms)
# NOTE(review): both the TF-IDF vocabulary and the Word2Vec model are fitted
# on all rows before any train/test split, so text from the held-out rows
# influences the features -- confirm whether this leakage is acceptable.
text_data = df['combined_text']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectors = tfidf_vectorizer.fit_transform(text_data)

# Combine Word2Vec and TF-IDF features column-wise into one sparse matrix
combined_features = hstack([tfidf_vectors, word2vec_features])

# Debugging step: Verify columns in df and missing columns in cat_features
missing_cols = [col for col in cat_features if col not in df.columns]
if missing_cols:
    print("Warning: The following columns are missing from the DataFrame:", missing_cols)

# Prepare data for classification model
# A missing column is reported together with the available columns and the
# KeyError is then re-raised, so the script still stops at this point.
try:
    X_cat = df[cat_features]
except KeyError as e:
    print("Error: One or more columns are missing:", e)
    print("Available columns:", df.columns)
    raise

# NOTE(review): the same score column is used as class labels for the random
# forest below and as the regression target for Lasso later -- confirm that
# treating each distinct score as its own class is intended.
y = df[target_column]

# ColumnTransformer for encoding categorical features
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising. remainder='passthrough' currently has nothing to
# pass through because X_cat contains only the columns in cat_features.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

# Classification pipeline: one-hot encoding followed by a random forest
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split data for classification model (80% train / 20% test, fixed seed)
X_cat_train, X_cat_test, y_train, y_test = train_test_split(X_cat, y, test_size=0.2, random_state=42)

# Train classification model
clf_pipeline.fit(X_cat_train, y_train)

# Evaluate classification model on the held-out rows
y_pred_cat = clf_pipeline.predict(X_cat_test)
print("Classification Model Accuracy:", accuracy_score(y_test, y_pred_cat))
print("Classification Report:\n", classification_report(y_test, y_pred_cat))

# Prepare data for regression model (text-derived features only)
X_combined = combined_features

# Split data for regression model
# This rebinds y_train / y_test. The split uses the same test_size and
# random_state as the classification split above, so it presumably selects
# the same rows -- verify if the two models are compared row by row.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# Standardize features for regression
# with_mean=False scales without centering, which keeps the sparse feature
# matrix sparse. The scaler is fitted on the training rows only.
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Lasso regression with manual early stopping.
#
# A single estimator is created up front with warm_start=True and max_iter=1,
# so every fit() call resumes from the coefficients of the previous call and
# runs one more coordinate-descent pass (one "epoch"). Building a new Lasso
# inside the loop would throw those coefficients away: warm_start would have
# nothing to resume from and each epoch would refit from scratch with a larger
# iteration cap. Because the default cyclic coordinate descent is
# deterministic, the incremental fit should follow the same optimization path
# at a fraction of the cost.
import copy
import warnings

from sklearn.exceptions import ConvergenceWarning

# Early stopping parameters
alpha = 0.001  # Lasso regularization parameter
max_epochs = 1000  # Maximum number of coordinate-descent passes
tolerance = 1e-4  # Minimum MSE improvement that counts as progress
patience = 10  # Number of epochs with no improvement before stopping

best_mse = float('inf')
best_model = None
no_improvement_count = 0

lasso_model = Lasso(alpha=alpha, max_iter=1, warm_start=True)

# NOTE(review): early stopping and the reported MSE both use X_test / y_test,
# so the final score is selected on the evaluation data. A separate validation
# split would give an unbiased estimate.
for epoch in range(1, max_epochs + 1):
    # One additional pass, continuing from the previous coefficients.
    # Stopping after a single pass is intentional, so the solver's
    # "did not converge" warning is expected here and only adds noise.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=ConvergenceWarning)
        lasso_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred_lasso = lasso_model.predict(X_test)

    # Calculate the Mean Squared Error
    mse_lasso = mean_squared_error(y_test, y_pred_lasso)

    # Check for improvement
    if mse_lasso < best_mse - tolerance:
        best_mse = mse_lasso
        # lasso_model keeps being updated in place by later epochs, so the
        # best state has to be captured as an independent copy.
        best_model = copy.deepcopy(lasso_model)
        no_improvement_count = 0
    else:
        no_improvement_count += 1

    # Print progress for each epoch
    print(f'Epoch {epoch}: MSE = {mse_lasso:.6f}, Best MSE = {best_mse:.6f}')

    # Early stopping if no improvement for 'patience' epochs
    if no_improvement_count >= patience:
        print(f"No improvement for {patience} epochs. Stopping early.")
        break

# Fail with a clear message instead of an AttributeError on None when no
# epoch produced a usable model (e.g. max_epochs was set below 1).
if best_model is None:
    raise RuntimeError("Lasso early stopping did not select a model.")

# Use the best model for final predictions
y_pred_lasso_final = best_model.predict(X_test)
print(f'\nFinal Best Lasso Regression Model Mean Squared Error: {best_mse:.6f}')

# Print sample actual vs predicted values for Lasso
print("\nSample Actual vs. Predicted Values (Lasso):")
for actual, predicted in zip(y_test.head(10), y_pred_lasso_final[:10]):
    print(f"Actual: {actual}, Predicted: {predicted}")


Columns in the DataFrame: Index(['Unnamed: 0', 'basvuru_yili', 'degerlendirme_puani', 'cinsiyet',
       'dogum_yeri', 'ikametgah_sehri', 'universite_adi', 'universite_turu',
       'burslu_ise_burs_yuzdesi', 'burs_aliyor_mu?', 'bolum',
       'universite_kacinci_sinif', 'universite_not_ortalamasi',
       'daha_once_baska_bir_universiteden_mezun_olmus', 'lise_adi',
       'lise_adi_diger', 'lise_sehir', 'lise_turu', 'lise_bolumu',
       'lise_bolum_diger', 'lise_mezuniyet_notu',
       'baska_bir_kurumdan_burs_aliyor_mu?', 'burs_aldigi_baska_kurum',
       'baska_kurumdan_aldigi_burs_miktari', 'anne_egitim_durumu',
       'anne_calisma_durumu', 'baba_egitim_durumu', 'baba_calisma_durumu',
       'kardes_sayisi', 'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz?',
       'uye_oldugunuz_kulubun_ismi',
       'profesyonel_bir_spor_daliyla_mesgul_musunuz?',
       'spor_dalindaki_rolunuz_nedir?', 'aktif_olarak_bir_stk_uyesi_misiniz?',
       'hangi_stk'nin_uyesisiniz?', 'stk_projesin

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 1: MSE = 217.774284, Best MSE = 217.774284


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 2: MSE = 210.559748, Best MSE = 210.559748


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 3: MSE = 208.882194, Best MSE = 208.882194


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 4: MSE = 208.168357, Best MSE = 208.168357


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 5: MSE = 207.680977, Best MSE = 207.680977


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 6: MSE = 207.282497, Best MSE = 207.282497


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 7: MSE = 206.974465, Best MSE = 206.974465


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 8: MSE = 206.762862, Best MSE = 206.762862


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 9: MSE = 206.640952, Best MSE = 206.640952


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 10: MSE = 206.584823, Best MSE = 206.584823


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 11: MSE = 206.565637, Best MSE = 206.565637


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 12: MSE = 206.562457, Best MSE = 206.562457


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 13: MSE = 206.547790, Best MSE = 206.547790


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 14: MSE = 206.507445, Best MSE = 206.507445


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 15: MSE = 206.435490, Best MSE = 206.435490


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 16: MSE = 206.326380, Best MSE = 206.326380


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 17: MSE = 206.192394, Best MSE = 206.192394


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 18: MSE = 206.031238, Best MSE = 206.031238


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 19: MSE = 205.861849, Best MSE = 205.861849


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 20: MSE = 205.694340, Best MSE = 205.694340


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 21: MSE = 205.535145, Best MSE = 205.535145


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 22: MSE = 205.390252, Best MSE = 205.390252


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 23: MSE = 205.262215, Best MSE = 205.262215


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 24: MSE = 205.153426, Best MSE = 205.153426


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 25: MSE = 205.064262, Best MSE = 205.064262


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 26: MSE = 204.998401, Best MSE = 204.998401


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 27: MSE = 204.949242, Best MSE = 204.949242


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 28: MSE = 204.915048, Best MSE = 204.915048


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 29: MSE = 204.890048, Best MSE = 204.890048


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 30: MSE = 204.873020, Best MSE = 204.873020


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 31: MSE = 204.862755, Best MSE = 204.862755


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 32: MSE = 204.856540, Best MSE = 204.856540


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 33: MSE = 204.854099, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 34: MSE = 204.854470, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 35: MSE = 204.856676, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 36: MSE = 204.860170, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 37: MSE = 204.864447, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 38: MSE = 204.869145, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 39: MSE = 204.873954, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 40: MSE = 204.878630, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 41: MSE = 204.882963, Best MSE = 204.854099


  model = cd_fast.sparse_enet_coordinate_descent(


Epoch 42: MSE = 204.886787, Best MSE = 204.854099
Epoch 43: MSE = 204.889934, Best MSE = 204.854099
No improvement for 10 epochs. Stopping early.

Final Best Lasso Regression Model Mean Squared Error: 204.854099

Sample Actual vs. Predicted Values (Lasso):
Actual: 41.0, Predicted: 27.945070597709602
Actual: 58.0, Predicted: 27.945070597709602
Actual: 46.0, Predicted: 48.21559781046307
Actual: 20.0, Predicted: 27.945070597709602
Actual: 9.0, Predicted: 27.945070597709602
Actual: 38.0, Predicted: 35.58831426283508
Actual: 28.0, Predicted: 27.945070597709602
Actual: 28.0, Predicted: 27.945070597709602
Actual: 13.0, Predicted: 27.945070597709602
Actual: 28.0, Predicted: 27.945070597709602


  model = cd_fast.sparse_enet_coordinate_descent(
