In [1]:
import pickle

In [2]:
with open('cleaned_text.pkl', 'rb') as f:
    df = pickle.load(f)

In [3]:
df.head()

Unnamed: 0,combined_text,rating
0,great story line slow going first bad kept int...,5
1,rushed story skip ahead fast thing explained p...,2
2,lot history two meet get hot hurry good book s...,5
3,great fun read right length relaxing getting i...,5
4,gave book 3 star simply writing little awkward...,3


In [4]:
df["combined_text"].str.split().apply(len).describe()

count    9083.000000
mean       46.175052
std        38.628713
min         6.000000
25%        18.000000
50%        31.000000
75%        62.000000
max       202.000000
Name: combined_text, dtype: float64

In [5]:
MAX_LENGTH = 100  # Sabit uzunluk belirle

def pad_text(text, max_length=MAX_LENGTH):
    words = text.split()
    if len(words) < max_length:
        words += ["<PAD>"] * (max_length - len(words))
    return " ".join(words[:max_length])

df["combined_text"] = df["combined_text"].apply(pad_text)


In [7]:
from gensim.models import Word2Vec
import numpy as np

sentences = [text.split() for text in df["combined_text"]]

word2vec_model = Word2Vec(sentences,
                          vector_size=200,
                          window=5,
                          min_count=2,
                          workers=4,
                          sg=1,
                          epochs=20
                          )

word2vec_model.save("word2vec_model.bin")


In [8]:
def vectorize_text(text, model, vector_size=200):
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)  
    else:
        return np.zeros(vector_size) 


X_word2vec = np.array([vectorize_text(text, word2vec_model) for text in df["combined_text"].astype(str)])

print(f" Word2Vec özellik boyutu: {X_word2vec.shape}")


 Word2Vec özellik boyutu: (9083, 200)


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df["rating"], test_size=0.2, random_state=42)

print(f" Train Set: {X_train.shape}, Test Set: {X_test.shape}")


 Train Set: (7266, 200), Test Set: (1817, 200)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="r2", n_jobs=-1,verbose=2)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}


In [16]:
y_pred_train = best_rf.predict(X_train)
y_pred_test = best_rf.predict(X_test)


y_pred_train = [round(pred) for pred in y_pred_train]
y_pred_test = [round(pred) for pred in y_pred_test]


train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

Train Accuracy: 0.7348
Test Accuracy: 0.3330


In [18]:
import pickle

In [19]:
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(best_rf, f)