In [None]:
# Mount Google Drive at /content/drive (Colab only); every dataset path used
# later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Read the raw training CSV from the mounted Drive folder.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
train_data = pd.read_csv('/content/drive/MyDrive/datathon2024/datathon-2024/train.csv', low_memory=False)

In [None]:
# Drop unnecessary columns
drop_columns = [
    'id'
]

# Other column names left over from an earlier, commented-out version of this
# list (they are NOT dropped):
#   'Spor Dalindaki Rolunuz Nedir?', 'Uye Oldugunuz Kulubun Ismi', 'Basvuru Yili',
#   'Cinsiyet', 'Dogum Tarihi', 'Dogum Yeri'

# errors='ignore' keeps this cell re-runnable: after the first run `train_data`
# is reloaded without the dropped columns, so a second run would otherwise
# raise KeyError for the columns that are already gone.
train_data_cleaned = train_data.drop(columns=drop_columns, errors='ignore')
train_data_cleaned.to_csv('/content/drive/MyDrive/datathon2024/datathon-2024/processed_train.csv', index=False)

# Load new dataset
train_data = pd.read_csv('/content/drive/MyDrive/datathon2024/datathon-2024/processed_train.csv', low_memory=False)

In [None]:
# Calculate average for column 'Universite Not Ortalamasi'
def calculate_average(value):
    """Collapse a ``"low-high"`` range string to the midpoint of its two ends.

    Args:
        value: Any cell value. Only strings containing ``'-'`` are treated as
            ranges.

    Returns:
        The arithmetic mean of the two bounds as a float when ``value`` is a
        string of the form ``"<number>-<number>"`` (surrounding whitespace is
        ignored). ``None`` when the string contains ``'-'`` but does not split
        into exactly two parseable numbers. Every other value is returned
        unchanged.
    """
    # Anything that is not a dash-separated string is passed straight through.
    if not isinstance(value, str) or '-' not in value:
        return value

    try:
        # Unpacking raises ValueError as well when there are more than two
        # parts, so "1-2-3" is handled by the same except clause as "a-b".
        low_text, high_text = value.split('-')
        return (float(low_text.strip()) + float(high_text.strip())) / 2
    except ValueError:
        return None

gpa_column = 'Universite Not Ortalamasi'

# Collapse "low-high" ranges to their midpoint, coerce whatever is left to
# numbers (entries that cannot be parsed become NaN), then replace the NaN
# entries with the mean of the parsed values.
gpa_values = pd.to_numeric(train_data[gpa_column].apply(calculate_average), errors='coerce')
train_data[gpa_column] = gpa_values.fillna(gpa_values.mean())

# Save new dataset
train_data.to_csv('/content/drive/MyDrive/datathon2024/datathon-2024/processed1_train.csv', index=False)

In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/datathon2024/datathon-2024/processed1_train.csv', low_memory=False)

# Split the columns by dtype up front; the two groups are imputed independently.
# NOTE(review): numeric_cols covers every float64/int64 column, which would
# include the target column if it is numeric -- confirm that mean-filling the
# target is intended.
numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train_data.select_dtypes(include=['object']).columns

numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Numeric gaps are filled with the column mean, categorical gaps with the
# most frequent value of the column.
train_data[numeric_cols] = numeric_imputer.fit_transform(train_data[numeric_cols])
train_data[categorical_cols] = categorical_imputer.fit_transform(train_data[categorical_cols])

In [None]:
# Label-encode every categorical column. Each fitted encoder is kept in
# `label_encoders` (keyed by column name) so the string -> integer mapping can
# be inspected, inverted, or re-applied to other data later instead of being
# thrown away at the end of each loop iteration.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # astype(str) gives the encoder a uniform string column to sort and map.
    train_data[col] = encoder.fit_transform(train_data[col].astype(str))
    label_encoders[col] = encoder

# IQR method for contradictory values
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``data[column]`` and ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame to filter. It is not modified.
        column: Name of a numeric column in ``data``.
        factor: Fence width as a multiple of the IQR. Defaults to 1.5, the
            value that was previously hard-coded; larger values keep more rows.

    Returns:
        A DataFrame with only the rows inside the fences. Rows whose value in
        ``column`` is NaN are dropped as well, because NaN fails both
        comparisons.
    """
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    within_fences = (data[column] >= lower_bound) & (data[column] <= upper_bound)
    return data[within_fences]

# Remove outlier rows one numeric column at a time; every pass recomputes the
# quartiles on the rows that survived the previous passes.
for col in numeric_cols:
    train_data = remove_outliers_iqr(train_data, col)

# Drop sparse ROWS (not columns): keep only rows in which the share of missing
# fields is below the threshold.
threshold = 0.5  # rows with 50% or more missing values are removed
missing_ratio = train_data.isnull().mean(axis=1)
train_data = train_data.loc[missing_ratio < threshold]

# Fill any remaining gaps from the previous row first, then from the next row.
train_data = train_data.ffill().bfill()

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

target_col = 'Degerlendirme Puani'

features = train_data.drop(columns=[target_col])

# Rows with a known target train the imputation model; rows without one are
# the ones whose target gets predicted.
missing_target = train_data[target_col].isnull()
train_data_non_missing = train_data.dropna(subset=[target_col])
train_data_missing = train_data[missing_target]

model = xgb.XGBRegressor(n_estimators=100, random_state=42)

# NOTE(review): the earlier imputation cell mean-fills every float64/int64
# column and the cleaning cell forward/back-fills what is left, so there may be
# no missing targets by the time this cell runs -- confirm. The guard below
# avoids training a model and predicting on an empty frame in that case.
if missing_target.any():
    model.fit(features.loc[~missing_target], train_data_non_missing[target_col])
    predictions = model.predict(features.loc[missing_target])
    train_data.loc[missing_target, target_col] = predictions
    status_message = "Eksik değerler başarıyla tahmin edildi ve veri seti kaydedildi."
else:
    status_message = "Hedef sütununda eksik değer bulunamadı; veri seti değiştirilmeden kaydedildi."

train_data.to_csv('/content/drive/MyDrive/datathon2024/datathon-2024/processed2_train.csv', index=False)

print(status_message)

Eksik değerler başarıyla tahmin edildi ve veri seti kaydedildi.


In [None]:
# Hyperparameter optimization
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# This cell consumes objects that are created by the model-training cell
# further down in the notebook (train/validation split and the encoded test
# set). Fail early with an explicit message instead of a bare NameError when
# the notebook is executed top to bottom on a fresh kernel.
_required_names = ('X_train', 'y_train', 'X_val', 'y_val', 'test_data', 'test_data_processed')
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Run the cell that loads processed2_train.csv and calls train_test_split first; "
        "undefined: " + ", ".join(_missing_names)
    )

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [1, 1.5, 2]
}

# RandomizedSearchCV: 50 sampled configurations, 3-fold CV, scored by negative MSE.
random_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)


random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)

# With the default refit=True the search has already refitted the best
# configuration on all of (X_train, y_train), so best_estimator_ is ready to
# use; fitting it a second time only repeated that work.
best_model = random_search.best_estimator_

y_pred_val = best_model.predict(X_val)
print("Optimized Validation MSE:", mean_squared_error(y_val, y_pred_val))

test_predictions = best_model.predict(test_data_processed)

submission = pd.DataFrame({'id': test_data['id'], 'Degerlendirme Puani': test_predictions})
submission.to_csv('/content/drive/MyDrive/datathon2024/datathon-2024/submission_optimized.csv', index=False)

print("Tahminler başarıyla kaydedildi.")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.6}
Optimized Validation MSE: 34.48892829283832
Tahminler başarıyla kaydedildi.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# Load the data
data = pd.read_csv('/content/drive/MyDrive/datathon2024/datathon-2024/processed2_train.csv')  # preprocessed training data
test_data = pd.read_csv('/content/drive/MyDrive/datathon2024/datathon-2024/test_x.csv')     # test data

# Separate the labels from the features
target_name = 'Degerlendirme Puani'
y = data[target_name]                  # labels
X = data.drop(columns=[target_name])   # features

# Convert categorical data to numeric data
def preprocess_data(df, reference_df=None):
    """One-hot encode ``df``, optionally align it to a reference frame, and fill NaN.

    Args:
        df: DataFrame to encode. ``pd.get_dummies`` builds a new frame from it.
        reference_df: Optional already-encoded frame. When given, the result is
            reindexed to exactly ``reference_df.columns``: columns missing from
            the encoded ``df`` are created and filled with 0, and columns not
            present in the reference are dropped.

    Returns:
        The encoded (and possibly realigned) DataFrame in which remaining NaN
        values are replaced by the mean of their own column.
    """
    encoded = pd.get_dummies(df)
    if reference_df is not None:
        # Align with the columns of the reference data set
        encoded = encoded.reindex(columns=reference_df.columns, fill_value=0)
    # Fill missing values with the per-column mean of this frame
    return encoded.fillna(encoded.mean())

import warnings

# Encode the training features
X_processed = preprocess_data(X)

# Encode the test features and align them to the training columns
test_data_processed = preprocess_data(test_data, reference_df=X_processed)

# preprocess_data() zero-fills every training column that is absent from the
# encoded test frame, and does so silently. The training file was label-encoded
# before it was saved, whereas the test file is read raw here, so any test
# column that still holds strings is one-hot encoded under different column
# names and the matching training column ends up as all zeros.
# NOTE(review): assumes test_x.csv still contains raw string columns -- confirm.
# Surface the mismatch instead of predicting on zeroed features unnoticed.
test_encoded_columns = set(pd.get_dummies(test_data).columns)
zero_filled_columns = [c for c in X_processed.columns if c not in test_encoded_columns]
if zero_filled_columns:
    warnings.warn(
        f"{len(zero_filled_columns)} training feature(s) are missing from the encoded "
        f"test set and were filled with 0: {zero_filled_columns}"
    )

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Build and train the model
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model (optional)
y_pred_val = model.predict(X_val)
print("Validation MSE:", mean_squared_error(y_val, y_pred_val))

# Predict on the test set
test_predictions = model.predict(test_data_processed)

# Write the predictions to a CSV file
submission = pd.DataFrame({'id': test_data['id'], 'Degerlendirme Puani': test_predictions})
submission.to_csv('/content/drive/MyDrive/datathon2024/datathon-2024/submission4.csv', index=False)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Validation MSE: 35.94034923332127
