In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import joblib

# Load dataset
df = pd.read_csv('cleanedData.csv')

# Drop rows with missing target values
df = df.dropna(subset=['degerlendirme_puani'])

# Extract relevant columns for the classification model
cat_features = [
    'cinsiyet', 'dogum_yeri', 'ikametgah_sehri', 'universite_adi', 'universite_turu',
    'burslu_ise_burs_yuzdesi', 'burs_aliyor_mu?', 'universite_kacinci_sinif',
    'universite_not_ortalamasi', 'daha_once_baska_bir_universiteden_mezun_olmus',
     'lise_mezuniyet_notu', 'baska_bir_kurumdan_burs_aliyor_mu?',
    'baska_kurumdan_aldigi_burs_miktari', 
    'anne_calisma_durumu', 'baba_calisma_durumu', 'kardes_sayisi',
    'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz?',
    'profesyonel_bir_spor_daliyla_mesgul_musunuz?',
    'aktif_olarak_bir_stk_uyesi_misiniz?',
    'stk_projesine_katildiniz_mi?', 'girisimcilikle_ilgili_deneyiminiz_var_mi?',
    'ingilizce_biliyor_musunuz?', 
    'daha_onceden_mezun_olunduysa,_mezun_olunan_universite', 'anne_sektor_encoded',
    'baba_sektor_encoded', 'anne_Unknown', 'anne_diger', 'anne_kamu',
    'anne_ozel sektor', 'baba_Unknown', 'baba_diger', 'baba_kamu', 'baba_ozel sektor',
    'age'
]

text_columns = ['girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz?',"bolum","lise_adi",'lise_adi_diger', 'lise_sehir', 'lise_turu', 'lise_bolumu',
    'lise_bolum_diger', 'uye_oldugunuz_kulubun_ismi', 'burs_aldigi_baska_kurum','anne_egitim_durumu', 'baba_egitim_durumu', 'spor_dalindaki_rolunuz_nedir?', "hangi_stk'nin_uyesisiniz?", 'ingilizce_seviyeniz?']
target_column = 'degerlendirme_puani'

# Handle missing values in text columns
df[text_columns] = df[text_columns].fillna('')

# Combine text columns into one for processing
df['combined_text'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Tokenize and preprocess text data for Word2Vec
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)

# Prepare Word2Vec model
sentences = [preprocess_text(text) for text in df['combined_text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Create document vectors by averaging word vectors
def vectorize_text(text):
    tokens = preprocess_text(text)
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

df['text_vector'] = df['combined_text'].apply(vectorize_text)
word2vec_features = np.array(df['text_vector'].tolist())

# TF-IDF Vectorization
text_data = df['combined_text']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectors = tfidf_vectorizer.fit_transform(text_data)

# Combine Word2Vec and TF-IDF features
combined_features = hstack([tfidf_vectors, word2vec_features])

# ColumnTransformer for encoding categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

# Prepare data for regression model
X_cat = df[cat_features]
y = df[target_column]

# Split data for regression model
X_combined = combined_features
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Standardize features for regression
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create CatBoost Regressor model
catboost_model = CatBoostRegressor(iterations=1000, depth=6, learning_rate=0.1, loss_function='RMSE', random_seed=42, verbose=200)

# Train the model
catboost_model.fit(X_train, y_train)

# Predict using CatBoost Regressor
y_pred_catboost = catboost_model.predict(X_test)

# Calculate Root Mean Squared Error
rmse_catboost = np.sqrt(mean_squared_error(y_test, y_pred_catboost))
print(f'CatBoost Regression Model Root Mean Squared Error: {rmse_catboost:.6f}')

# Print sample actual vs predicted values for CatBoost
print("\nSample Actual vs. Predicted Values (CatBoost):")
for actual, predicted in zip(y_test.head(10), y_pred_catboost[:10]):
    print(f"Actual: {actual}, Predicted: {predicted}")


0:	learn: 14.8115503	total: 89.1ms	remaining: 1m 28s
200:	learn: 10.6401568	total: 16.6s	remaining: 1m 5s
400:	learn: 10.1537181	total: 32.5s	remaining: 48.6s
600:	learn: 9.7941462	total: 50.9s	remaining: 33.8s
800:	learn: 9.4846314	total: 1m 12s	remaining: 18s
999:	learn: 9.1864293	total: 1m 38s	remaining: 0us
CatBoost Regression Model Root Mean Squared Error: 10.838445

Sample Actual vs. Predicted Values (CatBoost):
Actual: 41.0, Predicted: 42.59256334186282
Actual: 58.0, Predicted: 48.79864289353746
Actual: 46.0, Predicted: 40.43759662310391
Actual: 20.0, Predicted: 19.414577178448504
Actual: 9.0, Predicted: 15.18669416879139
Actual: 38.0, Predicted: 46.03098987441725
Actual: 28.0, Predicted: 31.379420790587552
Actual: 28.0, Predicted: 16.234445450922244
Actual: 13.0, Predicted: 27.357250339881638
Actual: 28.0, Predicted: 22.420984965956986
