# Kütüphaneleri Ekleme

In [1]:
import numpy as np
import re
from html import unescape
import pandas as pd

# Veri setini içeri aktarma

In [2]:
data = pd.read_csv('profiles.csv' , low_memory=False)

# Hatalı verileri kontrol etme ve düzenleme

In [3]:
missing_data_reloaded = data.isnull().mean() * 100
data_types_reloaded = data.dtypes

In [4]:
data_summary = pd.DataFrame({
    "Missing Percentage": missing_data_reloaded,
    "Data Type": data_types_reloaded
}).sort_values(by="Missing Percentage", ascending=False)

In [5]:
threshold = 90
cols_to_drop = data_summary[data_summary['Missing Percentage'] > threshold].index
data_cleaned = data.drop(columns=cols_to_drop)

In [6]:
# eğer düşürülmezse Yaş için eksik değerleri medyanla hesaplıyoruz
if 'age' in data_cleaned.columns:
    data_cleaned['age'].fillna(data_cleaned['age'].median(), inplace=True)

In [7]:
# Kategorik verileri kategori tipine dönüştürüyoruz
categorical_cols = ['sex', 'status', 'orientation', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'job', 'pets', 'religion', 'sign', 'smokes', 'speaks']
for col in categorical_cols:
    if col in data_cleaned.columns:
        data_cleaned[col] = data_cleaned[col].astype('category')

In [8]:
# metin sütunlarını normalleştirme ve düzenleme yapıyoruz
def clean_text(text):
    if isinstance(text, str):
        text = unescape(text)  # Decode HTML entities
        text = re.sub('<br />', ' ', text)  # Replace breaks with spaces
        text = re.sub('<[^>]+>', '', text)  # Remove HTML tags
        return text.strip()
    return text

In [9]:
essay_cols = [col for col in data_cleaned.columns if col.startswith('essay')]
data_cleaned[essay_cols] = data_cleaned[essay_cols].apply(lambda x: x.apply(clean_text))

In [10]:
# Feature engineering :) Yaşı nesil gruplarına dönüştürdüm
def classify_generation(age):
    if 18 <= age <= 32:
        return 'Millennial'
    elif 33 <= age <= 47:
        return 'Gen X'
    elif 48 <= age <= 70:
        return 'Boomer'
    return 'Other'

data_cleaned['generation'] = data_cleaned['age'].apply(classify_generation)

print(data_cleaned.head())

    age       body_type    drinks      drugs  \
0  22.0  a little extra  socially      never   
1  35.0         average     often  sometimes   
2  38.0            thin  socially        NaN   
3  23.0            thin  socially        NaN   
4  29.0        athletic  socially      never   

                           education  \
0      working on college/university   
1              working on space camp   
2     graduated from masters program   
3      working on college/university   
4  graduated from college/university   

                                              essay0  \
0  about me: \n \ni would love to think that i wa...   
1  i am a chef: this is what that means. \n1. i a...   
2  i'm not ashamed of much, but writing public te...   
3          i work in a library and go to school. . .   
4  hey how's it going? currently vague on the pro...   

                                              essay1  \
0  currently working as an international agent fo...   
1  dedicating everyda

# Veriyi eğitime hazırlama

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

In [12]:
# Veri kümesindeki özellikleri temsil eden bir liste oluşturuyoruz
features = [
    'body_type', 'drinks', 'drugs', 'education', 'height', 
    'income', 'job', 'location', 'pets', 'religion', 
    'sex', 'sign', 'smokes', 'speaks', 'status'
]
# Hedef değişkenleri belirliyoruz : yaş ve nesil
target_age = 'age'
target_generation = 'generation'
print(data_cleaned.columns)


Index(['age', 'body_type', 'drinks', 'drugs', 'education', 'essay0', 'essay1',
       'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8',
       'essay9', 'ethnicity', 'height', 'income', 'job', 'last_online',
       'location', 'orientation', 'pets', 'religion', 'sex', 'sign', 'smokes',
       'speaks', 'status', 'generation'],
      dtype='object')


In [13]:
categorical_cols = data_cleaned.select_dtypes(include=['category']).columns
data_cleaned[categorical_cols] = data_cleaned[categorical_cols].apply(lambda x: x.cat.add_categories('Unknown') if 'Unknown' not in x.cat.categories else x)

In [14]:
# eksik değerleri doldurma
X = data_cleaned[features].fillna('Unknown')
X = data_cleaned[features].fillna('Unknown')
y_age = data_cleaned['age']
y_generation = data_cleaned['generation']


In [15]:
data_cleaned['height'] = pd.to_numeric(data_cleaned['height'], errors='coerce')
data_cleaned['income'] = pd.to_numeric(data_cleaned['income'], errors='coerce')
data_cleaned['height'].fillna(data_cleaned['height'].median(), inplace=True)
data_cleaned['income'].fillna(data_cleaned['income'].median(), inplace=True)
data_cleaned['location'] = data_cleaned['location'].astype('category')

# veri setini ayırma

In [16]:
X_train, X_test, y_train_age, y_test_age = train_test_split(data_cleaned[features], data_cleaned['age'], test_size=0.2, random_state=42)
X_train, X_test, y_train_gen, y_test_gen = train_test_split(data_cleaned[features], data_cleaned['generation'], test_size=0.2, random_state=42)

In [17]:
# # Kategorik özellikler için OneHotEncoder pipeline oluşturma
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), features)
])

# Model seçimi

In [18]:
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

classifier_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Modelleri eğitme

In [19]:
regressor_pipeline.fit(X_train, y_train_age)
classifier_pipeline.fit(X_train, y_train_gen)

# Değerlendirme

In [20]:
# Regresyon modelini tahmin etme ve değerlendirme
y_pred_age = regressor_pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test_age, y_pred_age))

In [21]:
# classification modelini tahmin etme ve değerlendirme
y_pred_gen = classifier_pipeline.predict(X_test)
accuracy = accuracy_score(y_test_gen, y_pred_gen)

In [22]:
print("RMSE on test set for age prediction:", rmse)
print("Accuracy on test set for generation prediction:", accuracy)

RMSE on test set for age prediction: 3.2364076756563263
Accuracy on test set for generation prediction: 0.9436875567665758
