In [2]:
pip install pandas numpy scikit-learn gensim textblob scipy joblib

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from textblob import TextBlob
from sklearn.pipeline import Pipeline
import joblib

# Load the data
df = pd.read_csv('train.csv')

# Extract the relevant columns
text_columns = [
    'Girisimcilikle Ilgili Deneyiminiz Var Mi?',
    'Girisimcilikle Ilgili Deneyiminizi Aciklayabilir misiniz?'
]
target_column = 'Degerlendirme Puani'

# Drop rows with missing target values
df = df.dropna(subset=[target_column])

# Fill missing values in text columns with empty strings
df[text_columns] = df[text_columns].fillna('')

# Combine text columns into one for processing
df['combined_text'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

# Tokenize and preprocess the text data for Word2Vec
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)

# Prepare the Word2Vec model
sentences = [preprocess_text(text) for text in df['combined_text']]
word2vec_model = Word2Vec(sentences, vector_size=150, window=7, min_count=5, workers=4)

# Create document vectors by averaging word vectors
def vectorize_text(text):
    tokens = preprocess_text(text)
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

df['text_vector'] = df['combined_text'].apply(vectorize_text)
word2vec_features = np.array(df['text_vector'].tolist())

# TF-IDF Vectorization
text_data = df['combined_text']
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
tfidf_vectors = tfidf_vectorizer.fit_transform(text_data)

# Add sentiment analysis features
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

df[['sentiment_polarity', 'sentiment_subjectivity']] = df['combined_text'].apply(lambda x: pd.Series(get_sentiment(x)))

# Combine all features
text_features = hstack([tfidf_vectors, word2vec_features])
additional_features = df[['sentiment_polarity', 'sentiment_subjectivity']].values
combined_features = hstack([text_features, additional_features])

# Target variable
y = df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train different regression models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Support Vector Regression': SVR(kernel='rbf'),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'Random Forest Regressor': RandomForestRegressor()
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    cv_scores = cross_val_score(model, combined_features, y, cv=5, scoring='neg_mean_squared_error')
    print(f'{name} - Mean Squared Error: {mse:.2f}')
    print(f'{name} - Cross-Validated MSE: {-cv_scores.mean():.2f}')
    
    # Print actual vs predicted values
    print(f"\n{name} Sample Actual vs. Predicted Values:")
    for actual, predicted in zip(y_test.head(10), y_pred[:10]):
        print(f"Actual: {actual}, Predicted: {predicted}")

    # Save the best model
    joblib.dump(model, f'{name.lower().replace(" ", "_")}_model.pkl')
    joblib.dump(scaler, f'{name.lower().replace(" ", "_")}_scaler.pkl')


  df = pd.read_csv('train.csv')


Linear Regression - Mean Squared Error: 289.39
Linear Regression - Cross-Validated MSE: 328.86

Linear Regression Sample Actual vs. Predicted Values:
Actual: 36.0, Predicted: 36.64029577674991
Actual: 68.0, Predicted: 49.06229570185936
Actual: 7.0, Predicted: 28.27308949509944
Actual: 30.0, Predicted: 38.08947423088991
Actual: 11.0, Predicted: 28.27308949509944
Actual: 46.0, Predicted: 27.104119613713614
Actual: 31.0, Predicted: 28.27308949509944
Actual: 21.0, Predicted: 43.95125480186621
Actual: 15.0, Predicted: 28.27308949509944
Actual: 15.0, Predicted: 27.104119613713614
Ridge Regression - Mean Squared Error: 289.17
Ridge Regression - Cross-Validated MSE: 316.58

Ridge Regression Sample Actual vs. Predicted Values:
Actual: 36.0, Predicted: 35.81762681773813
Actual: 68.0, Predicted: 50.25253624113701
Actual: 7.0, Predicted: 28.27286445028412
Actual: 30.0, Predicted: 36.81022042069917
Actual: 11.0, Predicted: 28.27286445028412
Actual: 46.0, Predicted: 27.104201448663687
Actual: 31.0, 