In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


In [None]:
# Read the CSV from the working directory into a DataFrame; later cells
# access its 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer='dataset.csv')


In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build the (still unfitted) TF-IDF vectorizer; the built-in English stop-word
# list is dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words='english',
)


In [None]:
# Fit on the training texts only, so the test split does not influence the
# fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Re-use the already-fitted vectorizer for the held-out texts.
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Train a classifier on the TF-IDF features.
#
# The evaluation cell scores predictions with f1_score, which is a
# classification metric and requires discrete class labels. A LinearRegression
# model emits continuous values, which f1_score rejects with a ValueError, so a
# LogisticRegression classifier is used instead: its predict() returns labels
# drawn from the classes seen in y_train.
#
# max_iter is raised above the library default to give the solver more room to
# converge on the wide, sparse TF-IDF matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [None]:
# Predict labels for the held-out split and score them with macro-averaged F1
# (per-class F1, averaged without class-frequency weighting).
y_pred = model.predict(X_test_tfidf)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f'Macro F1 score: {f1:.3f}')


In [None]:
# Apply the trained model to previously unseen sentences.
# The new texts must go through the SAME fitted vectorizer (transform, not
# fit_transform) so their columns line up with the features the model was
# trained on.
new_data = ['This is a test sentence.', 'Another test sentence.']
new_data_tfidf = vectorizer.transform(new_data)

# NOTE: despite the variable name, this holds the model's predictions for each
# input sentence, not a textual summary.
summary = model.predict(new_data_tfidf)
print(f'Summary: {summary}')