In [11]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from collections import Counter
from sklearn.metrics import mean_squared_error, r2_score
from pycaret.regression import *

ImportError: Pandas requires version '3.0.0' or newer of 'jinja2' (version '2.11.2' currently installed).

In [None]:
!pip install upgrade pandas


In [5]:
# Read the training prompts and the student summaries from the working directory.
prompts_df, summaries_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./prompts_train.csv', './summaries_train.csv')
)

In [6]:
# Join every summary with the columns of its prompt via the shared 'prompt_id' key.
merged_df = summaries_df.merge(prompts_df, on='prompt_id')

In [41]:
def ngram_overlap(text1, text2, n=2):
    """Return the multiset overlap ratio of word n-grams shared by two texts.

    Both texts are split on whitespace and turned into n-gram counts. The
    result is (sum of per-n-gram minimum counts) / (sum of per-n-gram maximum
    counts), or 0 when neither text yields any n-gram.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare.
    n : int, default 2
        Number of consecutive tokens per n-gram.
    """
    # One Counter per text; the generator preserves the text1-then-text2 order.
    first_counts, second_counts = (
        Counter(ngrams(text.split(), n)) for text in (text1, text2)
    )

    # `&` keeps the minimum count per n-gram, `|` keeps the maximum.
    shared = sum((first_counts & second_counts).values())
    combined = sum((first_counts | second_counts).values())

    if combined > 0:
        return shared / combined
    return 0


In [54]:
def create_features(df):
    """Add summary-vs-article features to ``df`` and return the same frame.

    ``df`` must provide the columns ``text`` (the summary) and ``prompt_text``
    (the article); both are processed with ``len`` and ``str.split``, so they
    are expected to hold strings. The new columns are written onto ``df``
    itself (the input is mutated) and the same object is returned.

    Columns added
    -------------
    summary_length, article_length : character counts of each text.
    length_ratio : summary_length / article_length.
    cosine_similarity : TF-IDF cosine similarity between a row's summary and
        its article.
    vocab_richness : unique whitespace tokens / total tokens in the summary
        (0 for a summary without tokens).
    bigram_overlap, trigram_overlap : ``ngram_overlap`` with n=2 and n=3.

    Notes
    -----
    The TF-IDF vectorizer is fit on the frame passed in, so the IDF weights
    (and therefore ``cosine_similarity``) depend on which rows are in ``df``;
    separate calls on different frames use different vocabularies.
    """
    # Character-length features.
    df['summary_length'] = df['text'].apply(len)
    df['article_length'] = df['prompt_text'].apply(len)
    df['length_ratio'] = df['summary_length'] / df['article_length']

    # Fit one TF-IDF vocabulary over summaries followed by articles, so that
    # row i and row n_rows + i of the matrix belong to the same DataFrame row.
    n_rows = len(df)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'].tolist() + df['prompt_text'].tolist())
    summary_vectors = tfidf_matrix[:n_rows]
    article_vectors = tfidf_matrix[n_rows:]

    # Only the pairwise (row i vs row i) similarity is needed. TfidfVectorizer's
    # default norm='l2' yields unit-length rows, so the cosine similarity is the
    # row-wise dot product. This avoids materialising the dense n_rows x n_rows
    # matrix that cosine_similarity(...).diagonal() would build.
    df['cosine_similarity'] = np.asarray(
        summary_vectors.multiply(article_vectors).sum(axis=1)
    ).ravel()

    def _type_token_ratio(text):
        # Split once per summary instead of three times.
        tokens = text.split()
        return len(set(tokens)) / len(tokens) if tokens else 0

    # Vocabulary richness of the summary.
    df['vocab_richness'] = df['text'].apply(_type_token_ratio)

    # Bi-gram and tri-gram overlap between the summary and the article.
    df['bigram_overlap'] = df.apply(lambda row: ngram_overlap(row['text'], row['prompt_text'], n=2), axis=1)
    df['trigram_overlap'] = df.apply(lambda row: ngram_overlap(row['text'], row['prompt_text'], n=3), axis=1)

    return df

In [55]:
# Create features for the training set (summaries merged with their prompts)
merged_df = create_features(merged_df)

# Show first few rows of the feature-engineered training dataframe
merged_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,summary_length,article_length,length_ratio,cosine_similarity,vocab_richness,bigram_overlap,trigram_overlap
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,346,3566,0.097027,0.182623,0.836066,0.003063,0.0
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,1225,3566,0.343522,0.405863,0.679803,0.032383,0.005057
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,345,3566,0.096747,0.323222,0.833333,0.017107,0.007728
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,451,3566,0.126472,0.403937,0.776316,0.029186,0.012121
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,145,3566,0.040662,0.183623,0.925926,0.006483,0.0


In [46]:
# Hold out 20% of the feature-engineered training rows as a development set;
# random_state=42 makes the shuffle reproducible.
# NOTE(review): rows are split at random rather than grouped by prompt_id, so
# summaries of the same prompt can land on both sides of the split.
train_df, dev_df = train_test_split(merged_df, test_size=0.2, random_state=42)


In [61]:
# Two separate, identically configured regressors: one per target score.
gb_model_content, gb_model_wording = (
    GradientBoostingRegressor(random_state=42) for _ in range(2)
)

In [58]:
# Model input columns (all produced by create_features) and the two target columns.
features = [
    'summary_length',
    'article_length',
    'length_ratio',
    'cosine_similarity',
    'vocab_richness',
    'bigram_overlap',
    'trigram_overlap',
]
target_content, target_wording = 'content', 'wording'


In [59]:
# Feature matrices for the train and dev splits.
X_train, X_dev = train_df[features], dev_df[features]

# Target vectors: one pair per score being predicted.
y_train_content, y_train_wording = train_df[target_content], train_df[target_wording]
y_dev_content, y_dev_wording = dev_df[target_content], dev_df[target_wording]

In [60]:
def train_evaluate(model, X_train, y_train, X_dev, y_dev):
    """Fit ``model`` on the training data and score it on the dev data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets.
    X_dev, y_dev : held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(model, rmse)``: the same estimator object after fitting, and the
        root mean squared error of its predictions on ``X_dev``.
    """
    model.fit(X_train, y_train)
    dev_predictions = model.predict(X_dev)
    dev_mse = mean_squared_error(y_dev, dev_predictions)
    return model, np.sqrt(dev_mse)


In [64]:
# Fit the 'content' regressor and score it on the dev split.
gb_content, rmse_gb_content = train_evaluate(
    model=gb_model_content,
    X_train=X_train,
    y_train=y_train_content,
    X_dev=X_dev,
    y_dev=y_dev_content,
)

# Fit the 'wording' regressor and score it on the dev split.
gb_wording, rmse_gb_wording = train_evaluate(
    model=gb_model_wording,
    X_train=X_train,
    y_train=y_train_wording,
    X_dev=X_dev,
    y_dev=y_dev_wording,
)


In [65]:

# Show the evaluation results of the untuned models on the dev split.
# MCRMSE is reported as the plain average of the two per-target RMSE values.
print(f"Gradient model RMSE content:{rmse_gb_content}\nGradient model RMSE wording:{rmse_gb_wording}\nGradient model MCRMSE:{(rmse_gb_content + rmse_gb_wording)/2}")


Gradient model RMSE content:0.4344979354904358
Gradient model RMSE wording:0.5880323920195103
Gradient model MCRMSE:0.511265163754973


In [36]:
# Inspect the baseline hyperparameters. `gb_model` is not defined anywhere in this
# notebook (it only existed as leftover kernel state), so read the parameters from
# `gb_model_content`, which is created with random_state=42 and otherwise default
# settings, matching the recorded output.
print(gb_model_content.get_params())

{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [25]:
# Candidate hyperparameter values for Gradient Boosting, used by the
# grid search and the randomized search further down.
gb_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
)

In [26]:
# Base estimator for the hyperparameter searches. It was previously referenced
# without ever being defined in the notebook, which breaks Restart & Run All.
gb_model = GradientBoostingRegressor(random_state=42)

# Exhaustive search over every combination in gb_param_grid with 3-fold CV.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
gb_grid_search = GridSearchCV(gb_model, param_grid=gb_param_grid, cv=3)



In [27]:
def perform_grid_search(grid_search, X_train, y_train):
    """Fit a search object and return its winning model and parameters.

    Parameters
    ----------
    grid_search : object exposing ``fit(X, y)`` and, once fitted, the
        attributes ``best_estimator_`` and ``best_params_``.
    X_train, y_train : data passed straight to ``grid_search.fit``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` as read from ``grid_search``
        after fitting.
    """
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_


In [28]:
# Grid-search the hyperparameters for the 'content' target on the train split.
best_gb_model_content, best_gb_params_content = perform_grid_search(
    grid_search=gb_grid_search,
    X_train=train_df[features],
    y_train=train_df[target_content],
)




In [29]:

# Grid-search the hyperparameters for the 'wording' target on the train split.
best_gb_model_wording, best_gb_params_wording = perform_grid_search(
    grid_search=gb_grid_search,
    X_train=train_df[features],
    y_train=train_df[target_wording],
)


In [30]:
from sklearn.metrics import mean_squared_error
import numpy as np

def calculate_rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

# Dev-set predictions from the tuned models, one per target.
y_pred_gb_content, y_pred_gb_wording = (
    tuned_model.predict(dev_df[features])
    for tuned_model in (best_gb_model_content, best_gb_model_wording)
)

# Per-target RMSE on the dev split.
rmse_gb_content = calculate_rmse(y_true=dev_df[target_content], y_pred=y_pred_gb_content)
rmse_gb_wording = calculate_rmse(y_true=dev_df[target_wording], y_pred=y_pred_gb_wording)

# MCRMSE (Mean Column-wise Root Mean Squared Error): the mean of the two RMSE values.
mcrmse_gb = np.mean((rmse_gb_content, rmse_gb_wording))

print(f"Gradient Boosting: RMSE Content = {rmse_gb_content}, RMSE Wording = {rmse_gb_wording}, MCRMSE = {mcrmse_gb}")


Gradient Boosting: RMSE Content = 0.43833797730604956, RMSE Wording = 0.5866782896609213, MCRMSE = 0.5125081334834855


In [31]:
# Randomized search for Gradient Boosting: evaluates 5 sampled combinations from
# gb_param_grid with 3-fold CV. A fresh estimator is passed because `gb_model`
# is not defined anywhere in this notebook.
gb_random_search = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_distributions=gb_param_grid,
    n_iter=5,
    cv=3,
    random_state=42,
)

# Train and fine-tune model for 'content'; keep the winner before the search
# object is fitted again for the other target.
gb_random_search.fit(train_df[features], train_df[target_content])
best_gb_model_content = gb_random_search.best_estimator_

# Train and fine-tune model for 'wording'
gb_random_search.fit(train_df[features], train_df[target_wording])
best_gb_model_wording = gb_random_search.best_estimator_

# You can then evaluate these best models on your development set or make predictions on your test set.

In [32]:
# Score the randomized-search winners on the dev split.
dev_predictions = {
    'content': best_gb_model_content.predict(dev_df[features]),
    'wording': best_gb_model_wording.predict(dev_df[features]),
}
y_pred_gb_content, y_pred_gb_wording = dev_predictions.values()
del dev_predictions  # leave no extra name behind in the kernel namespace

rmse_gb_content = calculate_rmse(y_true=dev_df[target_content], y_pred=y_pred_gb_content)
rmse_gb_wording = calculate_rmse(y_true=dev_df[target_wording], y_pred=y_pred_gb_wording)

# Mean of the two per-target RMSE values.
mcrmse_gb = np.mean((rmse_gb_content, rmse_gb_wording))

print(f"Gradient Boosting: RMSE Content = {rmse_gb_content}, RMSE Wording = {rmse_gb_wording}, MCRMSE = {mcrmse_gb}")


Gradient Boosting: RMSE Content = 0.43833797730604956, RMSE Wording = 0.5866782896609213, MCRMSE = 0.5125081334834855


In [37]:
# Hyperparameters of the tuned 'content' model selected by the search above.
print(best_gb_model_content.get_params())


{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 42, 'subsample': 0.8, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [38]:
# Untuned baseline parameters, for comparison with the tuned model. `gb_model` is
# not defined anywhere in this notebook, so read them from `gb_model_content`,
# which is created with random_state=42 and otherwise default settings.
print(gb_model_content.get_params())


{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


# Test

In [56]:
# Load the test prompts and summaries and join them on 'prompt_id',
# the same way the training frames are joined.
prompts_test_df = pd.read_csv('prompts_test.csv')
summaries_test_df = pd.read_csv('summaries_test.csv')
merged_test_df = pd.merge(summaries_test_df, prompts_test_df, on='prompt_id')

# Apply the same feature engineering that was used for the training data.
merged_test_df = create_features(merged_test_df)

# Show first few rows of the feature-engineered test dataframe
# (.head() keeps the output small when the test set is large).
merged_test_df.head()

Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text,summary_length,article_length,length_ratio,cosine_similarity,vocab_richness,bigram_overlap,trigram_overlap
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...,14,15,0.933333,0.284005,1.0,0.0,0.0
1,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...,14,15,0.933333,0.284005,1.0,0.0,0.0
2,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...,14,15,0.933333,0.284005,1.0,0.0,0.0
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...,14,15,0.933333,0.284005,1.0,0.0,0.0


In [68]:
# Keep only the model input columns, in the order given by `features`.
test_features = merged_test_df.loc[:, features]

In [71]:
# Predict both scores for the test rows with the untuned models
# (gb_model_content / gb_model_wording), not the search-tuned best_gb_model_* ones.
print(f"Content prediction: {gb_model_content.predict(test_features)}")
print(f"Wording prediction: {gb_model_wording.predict(test_features)}")

Content prediction: [-0.93703228 -0.93703228 -0.93703228 -0.93703228]
Wording prediction: [0.25802261 0.25802261 0.25802261 0.25802261]
