In [1]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from collections import Counter
from sklearn.metrics import mean_squared_error, r2_score
from pycaret.regression import *

  from pandas.core.computation.check import NUMEXPR_INSTALLED


ModuleNotFoundError: No module named 'pycaret'

In [2]:
# Read the prompt and summary training tables from the notebook's working directory
prompts_df, summaries_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./prompts_train.csv', './summaries_train.csv')
)

In [3]:
# Attach the prompt columns to every summary row by inner-joining on the shared prompt_id key
merged_df = summaries_df.merge(prompts_df, how='inner', on='prompt_id')

In [4]:
# Function to calculate n-gram overlap between two texts
def ngram_overlap(text1, text2, n=2):
    """Return the multiset Jaccard overlap of word n-grams between two texts.

    Both texts are split on whitespace, every run of ``n`` consecutive tokens
    is counted, and the score is::

        sum(min(count1, count2)) / sum(max(count1, count2))

    taken over all distinct n-grams, so repeated n-grams are weighted by how
    often they occur.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare. A value without a ``split`` method (for example the
        ``None`` / ``NaN`` pandas uses for a missing cell) is treated as an
        empty text instead of raising ``AttributeError``.
    n : int, default 2
        Size of the n-grams (2 = bi-grams, 3 = tri-grams).

    Returns
    -------
    float
        A value in ``[0, 1]``; ``0.0`` when neither text contains an n-gram
        of the requested size.
    """
    def count_ngrams(text):
        # Missing values (None / float('nan')) have no tokens.
        tokens = text.split() if hasattr(text, 'split') else []
        # Zipping n progressively shifted views of the token list yields every
        # window of n consecutive tokens (nothing if the text is shorter than n).
        return Counter(zip(*(tokens[i:] for i in range(n))))

    counter1 = count_ngrams(text1)
    counter2 = count_ngrams(text2)

    # Counter & keeps the minimum count per n-gram, Counter | the maximum.
    common_ngrams = sum((counter1 & counter2).values())
    total_ngrams = sum((counter1 | counter2).values())

    return common_ngrams / total_ngrams if total_ngrams > 0 else 0.0


In [5]:
# Function to create features for a given DataFrame
def create_features(df):
    """Add length, similarity and n-gram overlap features for each summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column (the summary) and a ``prompt_text``
        column (the article being summarised).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object: the columns below are added to it in place
        and the frame is returned for convenience.

        * ``summary_length`` / ``article_length`` - character counts.
        * ``length_ratio`` - summary length divided by article length.
        * ``cosine_similarity`` - TF-IDF cosine similarity between each
          summary and the article on the same row.
        * ``vocab_richness`` - distinct / total whitespace tokens of the
          summary (0 for an empty summary).
        * ``bigram_overlap`` / ``trigram_overlap`` - ``ngram_overlap`` of the
          summary and the article with ``n=2`` and ``n=3``.

    Notes
    -----
    The TF-IDF vocabulary and IDF weights are fitted on the rows of ``df``
    itself, so ``cosine_similarity`` depends on which rows are passed in.
    """
    # Missing cells are read from CSV as NaN; treat them as empty strings so
    # the length, split and TF-IDF steps below do not raise on them.
    summaries = df['text'].fillna('').astype(str)
    articles = df['prompt_text'].fillna('').astype(str)

    # Text length features (in characters)
    df['summary_length'] = summaries.str.len()
    df['article_length'] = articles.str.len()
    df['length_ratio'] = df['summary_length'] / df['article_length']

    # TF-IDF cosine similarity between each summary and its own article.
    # One vectorizer is fitted on both columns so they share a vocabulary.
    n_rows = len(df)
    tfidf_vectorizer = TfidfVectorizer(norm='l2')
    tfidf_matrix = tfidf_vectorizer.fit_transform(summaries.tolist() + articles.tolist())
    summary_vectors = tfidf_matrix[:n_rows]
    article_vectors = tfidf_matrix[n_rows:]
    # Rows are L2-normalised, so the cosine similarity of summary i with
    # article i is their dot product. Computing only these row-wise products
    # avoids materialising the dense N x N all-pairs matrix, of which only the
    # diagonal was ever used.
    df['cosine_similarity'] = np.asarray(
        summary_vectors.multiply(article_vectors).sum(axis=1)
    ).ravel()

    # Vocabulary richness of the summary: distinct tokens / total tokens
    def _vocab_richness(text):
        tokens = text.split()
        return len(set(tokens)) / len(tokens) if tokens else 0

    df['vocab_richness'] = summaries.apply(_vocab_richness)

    # Bi-gram and tri-gram overlaps between the summary and the article
    text_pairs = list(zip(summaries, articles))
    df['bigram_overlap'] = [ngram_overlap(summary, article, n=2) for summary, article in text_pairs]
    df['trigram_overlap'] = [ngram_overlap(summary, article, n=3) for summary, article in text_pairs]

    return df

In [6]:
# Create features for the merged training data (built from prompts_train.csv and
# summaries_train.csv above). create_features adds its columns to the frame it
# is given and returns that same frame.
merged_df = create_features(merged_df)

# Show the first few rows of the feature-engineered training dataframe
merged_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,summary_length,article_length,length_ratio,cosine_similarity,vocab_richness,bigram_overlap,trigram_overlap
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,346,3566,0.097027,0.182623,0.836066,0.003063,0.0
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,1225,3566,0.343522,0.405863,0.679803,0.032383,0.005057
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,345,3566,0.096747,0.323222,0.833333,0.017107,0.007728
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,451,3566,0.126472,0.403937,0.776316,0.029186,0.012121
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,145,3566,0.040662,0.183623,0.925926,0.006483,0.0


In [7]:
# Hold out 20% of the feature-engineered training rows as a development set;
# the fixed random_state makes the split repeatable.
train_df, dev_df = train_test_split(
    merged_df,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Model `content` from the engineered features only. Passing the whole frame
# would also hand PyCaret the notebook's second target (`wording`), the id
# columns and the raw text columns as predictors; `wording` is a score that is
# not available at prediction time, so using it leaks label information.
content_model_columns = [
    'summary_length', 'article_length', 'length_ratio', 'cosine_similarity',
    'vocab_richness', 'bigram_overlap', 'trigram_overlap', 'content',
]
# session_id pins PyCaret's random seed so the experiment is reproducible
reg_content = setup(data=train_df[content_model_columns], target='content', session_id=42)

AttributeError: module 'pandas' has no attribute 'Int64Index'

In [9]:
# Engineered predictor columns (the columns added by create_features)
features = [
    'summary_length',
    'article_length',
    'length_ratio',
    'cosine_similarity',
    'vocab_richness',
    'bigram_overlap',
    'trigram_overlap',
]

# Names of the two target columns
target_content, target_wording = 'content', 'wording'


In [11]:
# Smoke-test the PyCaret installation on its 'diabetes' sample dataset
from pycaret.datasets import get_data
import pycaret.classification as pycaret_classification

data = get_data('diabetes')

# The classification API is reached through a module alias rather than
# `import *`: a star import here would rebind `setup` (and the other names the
# first cell star-imports from pycaret.regression), silently switching any
# later cell to the classification API.
s = pycaret_classification.setup(data, target='Class variable', session_id=123)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


AttributeError: module 'pandas' has no attribute 'Int64Index'