In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the CSV file
train_df = pd.read_csv("train.csv")

# Drop rows where the target column 'Degerlendirme Puani' is NaN
train_df = train_df.dropna(subset=['Degerlendirme Puani'])

# Extract the target variable
target = train_df['Degerlendirme Puani']

# Handle missing values in the text column by filling with empty strings
text_data = train_df['Girisimcilikle Ilgili Deneyiminizi Aciklayabilir misiniz?'].fillna("")

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectors = tfidf_vectorizer.fit_transform(text_data)

# Optional: Scale the TF-IDF features if needed (generally not required but can be useful in some cases)
# scaler = StandardScaler(with_mean=False)  # StandardScaler can't handle sparse matrices with mean centering
# tfidf_vectors = scaler.fit_transform(tfidf_vectors)

# Combine TF-IDF features with any other numeric features (if you have any)
# In this case, we are only using TF-IDF features
combined_features = tfidf_vectors

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, target, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

# Optionally, print some predictions vs actual values
for actual, pred in zip(y_test.head(10), y_pred[:10]):
    print(f"Actual: {actual}, Predicted: {pred}")



  train_df = pd.read_csv("train.csv")


Mean Squared Error: 315.77
Actual: 36.0, Predicted: 36.71044057145474
Actual: 68.0, Predicted: 44.85462039566596
Actual: 7.0, Predicted: 30.602363972883687
Actual: 30.0, Predicted: 41.16338201405769
Actual: 11.0, Predicted: 30.602363972883687
Actual: 46.0, Predicted: 30.602363972883687
Actual: 31.0, Predicted: 30.602363972883687
Actual: 21.0, Predicted: 30.602363972883687
Actual: 15.0, Predicted: 30.602363972883687
Actual: 15.0, Predicted: 30.602363972883687
