# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from catboost import CatBoostRegressor
import joblib

# Locations of the serialized CatBoost model and of the CSV to score.
model_path = "catboost_model.cbm"
input_csv_path = 'cleanedTest.csv'

# Restore the trained regressor from disk, then read the rows to predict on.
catboost_model = CatBoostRegressor()
catboost_model.load_model(model_path)
new_data = pd.read_csv(input_csv_path)

# Columns selected from the input frame as tabular model inputs.
# The list order is the column order of the selection `new_data[cat_features]`,
# so entries must not be reordered.
cat_features = [
    'cinsiyet',
    'dogum_yeri',
    'ikametgah_sehri',
    'universite_adi',
    'universite_turu',
    'burslu_ise_burs_yuzdesi',
    'burs_aliyor_mu?',
    'bolum',
    'universite_kacinci_sinif',
    'universite_not_ortalamasi',
    'daha_once_baska_bir_universiteden_mezun_olmus',
    'lise_adi',
    'lise_adi_diger',
    'lise_sehir',
    'lise_turu',
    'lise_bolumu',
    'lise_bolum_diger',
    'lise_mezuniyet_notu',
    'baska_bir_kurumdan_burs_aliyor_mu?',
    'burs_aldigi_baska_kurum',
    'baska_kurumdan_aldigi_burs_miktari',
    'anne_egitim_durumu',
    'anne_calisma_durumu',
    'baba_egitim_durumu',
    'baba_calisma_durumu',
    'kardes_sayisi',
    'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz?',
    'uye_oldugunuz_kulubun_ismi',
    'profesyonel_bir_spor_daliyla_mesgul_musunuz?',
    'spor_dalindaki_rolunuz_nedir?',
    'aktif_olarak_bir_stk_uyesi_misiniz?',
    "hangi_stk'nin_uyesisiniz?",
    'stk_projesine_katildiniz_mi?',
    'girisimcilikle_ilgili_deneyiminiz_var_mi?',
    'ingilizce_biliyor_musunuz?',
    'ingilizce_seviyeniz?',
    'daha_onceden_mezun_olunduysa,_mezun_olunan_universite',
    'anne_sektor_encoded',
    'baba_sektor_encoded',
    'anne_Unknown',
    'anne_diger',
    'anne_kamu',
    'anne_ozel sektor',
    'baba_Unknown',
    'baba_diger',
    'baba_kamu',
    'baba_ozel sektor',
    'age',
]

# Free-text columns; they are joined into a single `combined_text` column below.
text_columns = [
    'girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz?',
]

# Replace missing text with '' so every cell is a string that str.join accepts,
# then join each row's text columns with single spaces into `combined_text`.
filled_text = new_data[text_columns].fillna('')
new_data[text_columns] = filled_text
new_data['combined_text'] = filled_text.apply(' '.join, axis=1)

# Tokenization helper shared by the Word2Vec steps below.
def preprocess_text(text):
    """Tokenize *text* by delegating to ``simple_preprocess`` with ``deacc=True``.

    Returns whatever ``simple_preprocess`` returns for the given input.

    NOTE(review): ``simple_preprocess`` is not imported in the visible part of
    this file; presumably it is ``gensim.utils.simple_preprocess`` -- confirm
    that the import exists elsewhere.
    """
    tokens = simple_preprocess(text, deacc=True)
    return tokens

# Tokenize every combined text row with the shared preprocessing helper.
sentences_new = list(map(preprocess_text, new_data['combined_text']))

# Load the previously saved Word2Vec model (assumes it was saved under this name).
# NOTE(review): `Word2Vec` is not imported in the visible part of this file;
# presumably it is gensim.models.Word2Vec -- confirm the import exists elsewhere.
word2vec_model = Word2Vec.load('word2vec_model')
def vectorize_text(text):
    """Return the mean Word2Vec vector of the known tokens in *text*.

    The text is tokenized with ``preprocess_text``; tokens missing from
    ``word2vec_model.wv`` are skipped. When no token is known, a zero vector
    of length ``word2vec_model.vector_size`` is returned instead.
    """
    keyed_vectors = word2vec_model.wv
    known_vectors = [
        keyed_vectors[token]
        for token in preprocess_text(text)
        if token in keyed_vectors
    ]
    if not known_vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged embedding per row; kept on the frame and also stacked into a
# single array for feature assembly.
text_vectors = new_data['combined_text'].apply(vectorize_text)
new_data['text_vector'] = text_vectors
word2vec_features_new = np.array(text_vectors.tolist())

# TF-IDF Vectorization
# The vectorizer has to be the one that was fitted on the training corpus.
# Fitting a fresh TfidfVectorizer on the scoring data (as this code previously
# did) learns a different vocabulary and different IDF weights, so the TF-IDF
# columns would not line up with the features the loaded model presumably saw
# during training.
# NOTE(review): the artifact path below is an assumption -- point it at the
# vectorizer the training script saved (e.g. with joblib.dump).
# joblib.load unpickles the file, so only load artifacts from a trusted source.
tfidf_vectorizer_path = 'tfidf_vectorizer.joblib'
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
tfidf_vectors_new = tfidf_vectorizer.transform(new_data['combined_text'])

# Combine Word2Vec and TF-IDF features (TF-IDF columns first).
combined_features_new = hstack([tfidf_vectors_new, word2vec_features_new])

# Prepare categorical features
X_cat_new = new_data[cat_features]

# Reuse the preprocessor that was fitted at training time instead of fitting a
# new ColumnTransformer here. A scaler/one-hot encoder fitted on the scoring
# data learns different statistics and categories, which changes both the
# number and the meaning of the output columns. Loading the fitted object also
# removes the dependence on `numeric_features` / `categorical_features`, which
# are not defined in the visible part of this file.
# NOTE(review): the artifact path below is an assumption -- point it at the
# preprocessor the training script saved (e.g. with joblib.dump).
# joblib.load unpickles the file, so only load artifacts from a trusted source.
preprocessor_path = 'preprocessor.joblib'
preprocessor = joblib.load(preprocessor_path)
X_cat_preprocessed_new = preprocessor.transform(X_cat_new)

# Combine features for prediction (tabular columns first, then text features).
X_combined_new = hstack([X_cat_preprocessed_new, combined_features_new])

# Score the assembled feature matrix with the loaded model.
predictions = catboost_model.predict(X_combined_new)

# Attach the predictions to the input rows and write everything to CSV
# (without the index column).
output_csv_path = 'predictions.csv'
new_data['predictions'] = predictions
new_data.to_csv(output_csv_path, index=False)

print("Predictions have been saved to 'predictions.csv'")
