In [141]:
import pandas as pd
test_df = pd.read_csv('cleanTest-1.csv')
test_df["cinsiyet"] = test_df["cinsiyet"].fillna(0)
# Identify integer columns
integer_columns = test_df.select_dtypes(include=['int64']).columns

# Convert all integer columns to float
test_df[integer_columns] = test_df[integer_columns].astype(float)
df = pd.read_csv('cleanedData-1.csv', low_memory=False)

# Identify boolean columns
bool_columns = df.select_dtypes(include=['bool']).columns

# Convert boolean columns to object (string)
for col in bool_columns:
    df[col] = df[col].astype('object')

# Verify the changes
print("Data types of each column after conversion:")
print(df["anne_calisma_durumu"].dtypes)

Data types of each column after conversion:
float64


In [135]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack
from catboost import CatBoostRegressor
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Define the preprocessing and model paths
word2vec_model_path = 'word2vec_model.model'
tfidf_vectorizer_path = 'tfidf_vectorizer.joblib'
catboost_model_path = 'catboost_model.cbm'

# Define the columns
cat_features = [
    'cinsiyet', 'dogum_yeri', 'ikametgah_sehri', 'universite_adi', 'universite_turu',
    'burslu_ise_burs_yuzdesi', 'burs_aliyor_mu?', 'universite_kacinci_sinif',
    'universite_not_ortalamasi', 'daha_once_baska_bir_universiteden_mezun_olmus',
    'lise_mezuniyet_notu', 'baska_bir_kurumdan_burs_aliyor_mu?',
    'baska_kurumdan_aldigi_burs_miktari', 
    'anne_calisma_durumu', 'baba_calisma_durumu', 'kardes_sayisi',
    'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz?',
    'profesyonel_bir_spor_daliyla_mesgul_musunuz?',
    'aktif_olarak_bir_stk_uyesi_misiniz?',
    'stk_projesine_katildiniz_mi?', 'girisimcilikle_ilgili_deneyiminiz_var_mi?',
    'ingilizce_biliyor_musunuz?', 
    'daha_onceden_mezun_olunduysa_mezun_olunan_universite', 'anne_sektor_encoded',
    'baba_sektor_encoded', 'anne_unknown', 'anne_diger', 'anne_kamu',
    'anne_ozel_sektor', 'baba_unknown', 'baba_diger', 'baba_kamu', 'baba_ozel_sektor',
    'age'
]

text_columns = ['girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz?',"bolum","lise_adi",'lise_adi_diger', 'lise_sehir', 'lise_turu', 'lise_bolumu',
    'lise_bolum_diger', 'uye_oldugunuz_kulubun_ismi', 'burs_aldigi_baska_kurum','anne_egitim_durumu', 'baba_egitim_durumu', 'spor_dalindaki_rolunuz_nedir?', "hangi_stk_nin_uyesisiniz?", 'ingilizce_seviyeniz?']
target_column = 'degerlendirme_puani'

# Load the test data


# Ensure the same columns are present
missing_cols = [col for col in text_columns if col not in test_df.columns]
if missing_cols:
    print("Missing columns in test_df:", missing_cols)
    # Add missing columns with empty strings
    for col in missing_cols:
        test_df[col] = ''

# Handle missing values in text columns
test_df[text_columns] = test_df[text_columns].fillna('')

# Combine text columns into one for processing
test_df['combined_text'] = test_df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Check if the Word2Vec model and TF-IDF vectorizer exist
import os

# Load Word2Vec model
if os.path.exists(word2vec_model_path):
    word2vec_model = Word2Vec.load(word2vec_model_path)
else:
    raise FileNotFoundError(f"{word2vec_model_path} not found")

# Load TF-IDF vectorizer
if os.path.exists(tfidf_vectorizer_path):
    tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
else:
    raise FileNotFoundError(f"{tfidf_vectorizer_path} not found")

# Create document vectors by averaging word vectors
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)

def vectorize_text(text):
    tokens = preprocess_text(text)
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

test_word2vec_features = np.array([vectorize_text(text) for text in test_df['combined_text']])
test_tfidf_vectors = tfidf_vectorizer.transform(test_df['combined_text'])
test_combined_features = hstack([test_tfidf_vectors, test_word2vec_features])

# Prepare test data for regression model
test_X_cat = test_df[cat_features]

# Load the preprocessor (if saved, or recreate it)
if os.path.exists('preprocessor.joblib'):
    preprocessor = joblib.load('preprocessor.joblib')
else:
    raise FileNotFoundError("Preprocessor not found")

test_X_cat_preprocessed = preprocessor.transform(test_X_cat)

# Combine features
test_X_combined = hstack([test_X_cat_preprocessed, test_combined_features])

# Load the saved CatBoost model
if os.path.exists(catboost_model_path):
    catboost_model = CatBoostRegressor()
    catboost_model.load_model(catboost_model_path)
else:
    raise FileNotFoundError(f"{catboost_model_path} not found")

# Make predictions
test_predictions = catboost_model.predict(test_X_combined)

# Output the predictions
print("Predictions for the test data:")
print(test_predictions)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [11]:
from sklearn.metrics import mean_squared_error
import numpy as np
# Example actual and predicted values
y_true = np.array(puanlar)  # Real values
y_pred = np.array(test_predictions)  # Predicted values

# Calculate RMSE
rmse = mean_squared_error(y_true, y_pred, squared=False)
print(f'RMSE: {rmse:.4f}')

RMSE: 6.8757


