In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import joblib
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor
import os

# Load the data
df = pd.read_csv('cleanedData.csv')

# Drop rows with missing target values
df = df.drop(['burslu_ise_burs_yuzdesi', 'daha_once_baska_bir_universiteden_mezun_olmus', 'lise_adi_diger', 'lise_bolum_diger',
              'uye_oldugunuz_kulubun_ismi', 'stk_projesine_katildiniz_mi?', 'ingilizce_seviyeniz?',
              'daha_onceden_mezun_olunduysa_mezun_olunan_universite'], axis=1)
df = df.dropna(subset=['degerlendirme_puani'])

# Define relevant columns
cat_features = [
    'cinsiyet', 'dogum_yeri', 'ikametgah_sehri', 'universite_adi', 'universite_turu',
    'burs_aliyor_mu?', 'universite_kacinci_sinif', 'universite_not_ortalamasi',
    'lise_mezuniyet_notu', 'baska_bir_kurumdan_burs_aliyor_mu?', 'baska_kurumdan_aldigi_burs_miktari', 
    'anne_calisma_durumu', 'baba_calisma_durumu', 'kardes_sayisi',
    'girisimcilik_kulupleri_tarzi_bir_kulube_uye_misiniz?', 'profesyonel_bir_spor_daliyla_mesgul_musunuz?',
    'aktif_olarak_bir_stk_uyesi_misiniz?', 'girisimcilikle_ilgili_deneyiminiz_var_mi?', 'ingilizce_biliyor_musunuz?',
    'anne_sektor_encoded', 'baba_sektor_encoded', 'anne_unknown', 'anne_diger', 'anne_kamu',
    'anne_ozel_sektor', 'baba_unknown', 'baba_diger', 'baba_kamu', 'baba_ozel_sektor', 'age'
]

text_columns = ['girisimcilikle_ilgili_deneyiminizi_aciklayabilir_misiniz?', "bolum", "lise_adi", 'lise_sehir', 'lise_turu', 'lise_bolumu',
                'burs_aldigi_baska_kurum', 'anne_egitim_durumu', 'baba_egitim_durumu', 'spor_dalindaki_rolunuz_nedir?', "hangi_stk_nin_uyesisiniz?"]
target_column = 'degerlendirme_puani'

# Handle missing values in text columns
df[text_columns] = df[text_columns].fillna('')

# Combine text columns into one for processing
df['combined_text'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Tokenize and preprocess text data for Word2Vec
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)

# Check if the Word2Vec model already exists
if os.path.exists('word2vec_model.model'):
    word2vec_model = Word2Vec.load('word2vec_model.model')
else:
    sentences = [preprocess_text(text) for text in df['combined_text']]
    word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    word2vec_model.save('word2vec_model.model')

# Create document vectors by averaging word vectors
def vectorize_text(text):
    tokens = preprocess_text(text)
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

df['text_vector'] = df['combined_text'].apply(vectorize_text)
word2vec_features = np.array(df['text_vector'].tolist())

# TF-IDF Vectorization
text_data = df['combined_text']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectors = tfidf_vectorizer.fit_transform(text_data)

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Combine Word2Vec and TF-IDF features
combined_features = hstack([tfidf_vectors, word2vec_features])

# Debugging step: Verify columns in df and missing columns in cat_features
missing_cols = [col for col in cat_features if col not in df.columns]
if missing_cols:
    print("Warning: The following columns are missing from the DataFrame:", missing_cols)

# Prepare data for classification model
try:
    X_cat = df[cat_features]
except KeyError as e:
    print("Error: One or more columns are missing:", e)
    print("Available columns:", df.columns)
    raise

y = df[target_column]

# ColumnTransformer for encoding categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

# Fit the preprocessor on the data
X_cat_preprocessed = preprocessor.fit_transform(X_cat)

# Save the fitted preprocessor
joblib.dump(preprocessor, 'preprocessor.joblib')

# Classification pipeline
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split data for classification model
X_cat_train, X_cat_test, y_train, y_test = train_test_split(X_cat, y, test_size=0.2, random_state=42)

# Train classification model
clf_pipeline.fit(X_cat_train, y_train)

# Evaluate classification model
y_pred_cat = clf_pipeline.predict(X_cat_test)
print("Classification Model Accuracy:", accuracy_score(y_test, y_pred_cat))
print("Classification Report:\n", classification_report(y_test, y_pred_cat))

# Prepare data for regression model
# Debugging step: Print shapes and types
print("Shape of X_cat:", X_cat.shape)
print("Shape of combined_features:", combined_features.shape)
print("Type of X_cat:", type(X_cat))
print("Type of combined_features:", type(combined_features))
print("Dtype of X_cat:", X_cat.dtypes)
print("Dtype of combined_features:", combined_features.dtype)

# Identify numeric and categorical columns
numeric_features = X_cat.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_cat.select_dtypes(include=['object', 'bool']).columns

# Create preprocessor for numeric and categorical features
preprocessor_reg = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_features)
    ])

# Fit and transform the data
X_cat_preprocessed = preprocessor_reg.fit_transform(X_cat)

# Debugging step: Print shapes after preprocessing
print("Shape of X_cat_preprocessed:", X_cat_preprocessed.shape)
print("Shape of combined_features:", combined_features.shape)

# Combine features
X_combined = hstack([X_cat_preprocessed, combined_features])

# Debugging step: Print final combined shape
print("Shape of X_combined:", X_combined.shape)

# Split data for regression model
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.33, random_state=42)

# Load or train CatBoost model
catboost_model_path = 'catboost_model.cbm'
if os.path.exists(catboost_model_path):
    catboost_model = CatBoostRegressor()
    catboost_model.load_model(catboost_model_path)
else:
    # Modify CatBoost hyperparameters
    catboost_model = CatBoostRegressor(
        iterations=2000,  # Increased iterations
        learning_rate=0.01,  # Lowered learning rate
        depth=8,  # Increased depth
        l2_leaf_reg=3,  # Added L2 regularization
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=42,
        verbose=100,
        early_stopping_rounds=500  # Increased early stopping rounds
    )

    # Add feature selection
    from sklearn.feature_selection import SelectFromModel

    selector = SelectFromModel(estimator=CatBoostRegressor(iterations=100), threshold='median')
    X_selected = selector.fit_transform(X_combined, y)

    # Split data using selected features
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.33, random_state=42)

    # Train the model with selected features
    catboost_model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        use_best_model=True,
        verbose=100
    )

    # Save the CatBoost model
    catboost_model.save_model(catboost_model_path)

# Cross-validation
cv_scores = cross_val_score(catboost_model, X_combined, y, cv=5, scoring='neg_mean_squared_error')
print(f'Cross-validation RMSE: {(-cv_scores.mean())**0.5:.4f} (+/- {cv_scores.std() * 2:.4f})')

# Predict and evaluate regression model
y_pred_reg = catboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_reg)
rmse = mse ** 0.5
print(f'Regression Model RMSE: {rmse:.4f}')

# Print actual vs predicted values for regression
print("\nSample Actual vs. Predicted Values:")
for actual, predicted in zip(y_test[:10], y_pred_reg[:10]):
    print(f"Actual: {actual:.2f}, Predicted: {predicted:.2f}")

# Feature importance
feature_importance = catboost_model.feature_importances_
feature_names = preprocessor_reg.get_feature_names_out().tolist() + [f'text_feature_{i}' for i in range(combined_features.shape[1])]
for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:20]:
    print(f"Feature: {name}, Importance: {importance:.4f}")


Classification Model Accuracy: 0.193292815281094
Classification Report:
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         4
         2.0       0.00      0.00      0.00         2
         3.0       0.50      1.00      0.67         4
         4.0       0.45      0.45      0.45        11
         5.0       0.34      0.68      0.45        95
         6.0       0.37      0.74      0.50        86
         7.0       0.35      0.78      0.49       237
         8.0       0.42      0.52      0.47       154
         9.0       0.35      0.59      0.44       138
        10.0       0.32      0.37      0.34       134
        11.0       0.26      0.32      0.29       119
        12.0       0.42      0.47      0.44       274
        13.0       0.18      0.22      0.20       156
        14.0       0.32      0.34      0.33       236
        15.0       0.21      0.24      0.23       177
        16.0       0.27      0.27      0.27       188
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Shape of X_cat_preprocessed: (46068, 1828)
Shape of combined_features: (46068, 1100)
Shape of X_combined: (46068, 2928)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mustafa\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mustafa\anaconda3\Lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mustafa\anaconda3\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mustafa\anaconda3\Lib\site-packages\catboost\core.py", line 2367, in _prepare_train_params
    raise CatBoostError("To employ param {'use_best_model': True} provide non-empty 'eval_set'.")
_catboost.CatBoostError: To employ param {'use_best_model': True} provide non-empty 'eval_set'.


In [25]:
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)

# Check if the Word2Vec model already exists

# Create document vectors by averaging word vectors
def vectorize_text(text):
    tokens = preprocess_text(text)
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

df['text_vector'] = df['combined_text'].apply(vectorize_text)
word2vec_features = np.array(df['text_vector'].tolist())

# TF-IDF Vectorization
text_data = df['combined_text']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectors = tfidf_vectorizer.fit_transform(text_data)

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Combine Word2Vec and TF-IDF features
combined_features = hstack([tfidf_vectors, word2vec_features])

# Debugging step: Verify columns in df and missing columns in cat_features
missing_cols = [col for col in cat_features if col not in df.columns]
if missing_cols:
    print("Warning: The following columns are missing from the DataFrame:", missing_cols)

# Prepare data for classification model
try:
    X_cat = df[cat_features]
except KeyError as e:
    print("Error: One or more columns are missing:", e)
    print("Available columns:", df.columns)
    raise

y = df[target_column]

# ColumnTransformer for encoding categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor.joblib')

['preprocessor.joblib']

In [149]:
import pandas as pd

test_df = pd.read_csv('cleanTest.csv')
test_df["cinsiyet"] = test_df["cinsiyet"].fillna(1.0)
test_df = test_df.dropna(axis=0)

df[text_columns] = df[text_columns].fillna('')


print(len(test_df))
# Preprocess test data
test_df['combined_text'] = test_df[text_columns].apply(lambda x: ' '.join(x), axis=1)
print(test_df["combined_text"])


# Create document vectors for test data
test_df['text_vector'] = test_df['combined_text'].apply(vectorize_text)
test_word2vec_features = np.array(test_df['text_vector'].tolist())

# TF-IDF Vectorization for test data
test_tfidf_vectors = tfidf_vectorizer.transform(test_df['combined_text'])

# Combine Word2Vec and TF-IDF features for test data
test_combined_features = hstack([test_tfidf_vectors, test_word2vec_features])

# Prepare test data for the model
X_test_cat = test_df[cat_features]


# Preprocess test data
X_test_cat_preprocessed = preprocessor.transform(X_test_cat)

# Combine features for test data
X_test_combined = hstack([X_test_cat_preprocessed, test_combined_features])

# Apply feature selection
X_test_selected = selector.transform(X_test_combined)

# Load the saved CatBoost model
loaded_model = CatBoostRegressor()
loaded_model.load_model('catboost_model.cbm')

# Make predictions
test_predictions = loaded_model.predict(X_test_selected)

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test_df['id'],
    'Degerlendirme Puani': test_predictions
})

# Save the submission file
submission.to_csv('submissionenson.csv', index=False)

print("Predictions have been saved to 'submission.csv'")


11049
0        amasya amasya munzur universitesi devlet sosya...
1        konya konya hacettepe universitesi devlet ulus...
2        istanbul istanbul kapadokya universitesi ozel ...
3        mardin mardin mardin artuklu universitesi devl...
4        samsun istanbul bogazici universitesi devlet i...
                               ...                        
11044    yozgat ankara gazi universitesi devlet endustr...
11045    konya konya nigde omer halisdemir universitesi...
11046    ankara ankara karabuk universitesi devlet elek...
11047    adiyaman istanbul bursa uludag universitesi de...
11048    nigde ankara hacettepe universitesi devlet end...
Name: combined_text, Length: 11049, dtype: object
column = Unnamed: 0:<class 'str'>
column = basvuru_yili:<class 'str'>
column = cinsiyet:<class 'str'>
column = dogum_yeri:<class 'str'>
column = ikametgah_sehri:<class 'str'>
column = universite_adi:<class 'str'>
column = universite_turu:<class 'str'>
column = burslu_ise_burs_yuzdesi:<class 'st

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [49]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Example actual and predicted values
y_true = np.array(puanlar)  # Real values
y_pred = np.array(test_predictions)  # Predicted values

# Calculate RMSE
rmse = mean_squared_error(y_true, y_pred, squared=False)
print(f'RMSE: {rmse:.4f}')

RMSE: 6.9099




In [3]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

['preprocessor.joblib']