<a href="https://colab.research.google.com/github/dragonfire-09/projects/blob/main/trained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import csv
import sys
from io import StringIO

# Increase CSV field size limit
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt/10)

def read_large_csv(file_path):
    """
    Read large CSV files with custom parsing
    """
    print("Starting data loading process...")

    # Initialize empty lists for data
    rows = []
    headers = None

    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            # Read first line to get headers
            headers = file.readline().strip().split('\t')
            print(f"Found {len(headers)} columns:")
            print(headers)

            # Process rest of the file
            for line_num, line in enumerate(file, 2):
                try:
                    # Split line by tab
                    fields = line.strip().split('\t')

                    # Handle inconsistent number of fields
                    if len(fields) != len(headers):
                        print(f"Warning: Line {line_num} has {len(fields)} fields (expected {len(headers)})")
                        # Adjust field count to match headers
                        if len(fields) > len(headers):
                            fields = fields[:len(headers)]
                        else:
                            fields.extend([''] * (len(headers) - len(fields)))

                    rows.append(fields)

                    # Print progress
                    if line_num % 10000 == 0:
                        print(f"Processed {line_num} lines...")

                except Exception as e:
                    print(f"Error processing line {line_num}: {e}")
                    continue

    except Exception as e:
        print(f"Error reading file: {e}")
        return None

    print("Creating DataFrame...")
    df = pd.DataFrame(rows, columns=headers)
    return df

def clean_and_process_data(df):
    """
    Clean and process the loaded data
    """
    print("Starting data cleaning and processing...")

    # 1. Basic cleaning
    print("Performing basic cleaning...")
    df = df.replace('', np.nan)
    df = df.replace('None', np.nan)

    # 2. Clean numeric columns
    def clean_numeric(x):
        if pd.isna(x):
            return 0
        try:
            # Remove any non-numeric characters except decimal point and minus
            cleaned = ''.join(char for char in str(x) if char.isdigit() or char in '.-')
            return float(cleaned) if cleaned else 0
        except:
            return 0

    # 3. Process specific columns
    print("Processing specific columns...")

    # Financial columns
    numeric_columns = ['EU Contribution', 'Total Cost']
    for col in numeric_columns:
        if col in df.columns:
            df[f'{col}_Clean'] = df[col].apply(clean_numeric)
            print(f"Processed {col}")

    # Dates
    date_columns = ['Project Start Date', 'Project End Date', 'Call Deadline Date']
    for col in date_columns:
        if col in df.columns:
            df[f'{col}_Clean'] = pd.to_datetime(df[col], errors='coerce')
            print(f"Processed {col}")

    # 4. Create derived features
    print("Creating derived features...")
    if 'Project Start Date_Clean' in df.columns and 'Project End Date_Clean' in df.columns:
        df['Project_Duration_Days'] = (df['Project End Date_Clean'] - df['Project Start Date_Clean']).dt.days

    return df

def analyze_data(df):
    """
    Perform basic data analysis
    """
    print("\nData Analysis:")
    print("\nDataset Shape:", df.shape)

    print("\nColumns:")
    for col in df.columns:
        non_null = df[col].count()
        dtype = df[col].dtype
        print(f"{col}: {non_null} non-null values, dtype: {dtype}")

    print("\nSample of numeric columns:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(df[numeric_cols].describe())

def main():
    # File path
    file_path = '/content/Horizon projects dataset .csv'

    # Load data
    print("Loading data...")
    df = read_large_csv(file_path)

    if df is not None:
        # Process data
        print("\nProcessing data...")
        df_processed = clean_and_process_data(df)

        # Analyze data
        print("\nAnalyzing data...")
        analyze_data(df_processed)

        # Save processed data
        print("\nSaving processed data...")
        df_processed.to_csv('processed_horizon_data.csv', index=False)
        print("Data saved to 'processed_horizon_data.csv'")

        return df_processed
    else:
        print("Failed to load data")
        return None

# Run the script
if __name__ == "__main__":
    processed_df = main()

Loading data...
Starting data loading process...
Found 1 columns:
['Programme;Project Number;CORDIS Link;Project Start Year;Project End Date;Project End Year;Project Title;Project Acronym;Project Status;Call ID;Call Deadline Date;Call Deadline Year;Project Signature Date;Project Signature Year;Project Start Date;Thematic Priority Descr;Pillar Abbr;Pillar Descr;Topic Code;Topic Descr;Simplified ToA;Legal Name;General PIC;Partner Role;Partner Type;Legal Entity Type;Signed Grants;Participation;EU Contribution;Total Cost']
Processed 10000 lines...
Processed 20000 lines...
Processed 30000 lines...
Processed 40000 lines...
Processed 50000 lines...
Processed 60000 lines...
Creating DataFrame...

Processing data...
Starting data cleaning and processing...
Performing basic cleaning...
Processing specific columns...
Creating derived features...

Analyzing data...

Data Analysis:

Dataset Shape: (63985, 1)

Columns:
Programme;Project Number;CORDIS Link;Project Start Year;Project End Date;Project 

In [None]:
# First, let's look at the data structure
import pandas as pd

# Load and examine the data
df = pd.read_csv('processed_horizon_data.csv')
print("Columns in the dataset:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

Columns in the dataset:
['Programme;Project Number;CORDIS Link;Project Start Year;Project End Date;Project End Year;Project Title;Project Acronym;Project Status;Call ID;Call Deadline Date;Call Deadline Year;Project Signature Date;Project Signature Year;Project Start Date;Thematic Priority Descr;Pillar Abbr;Pillar Descr;Topic Code;Topic Descr;Simplified ToA;Legal Name;General PIC;Partner Role;Partner Type;Legal Entity Type;Signed Grants;Participation;EU Contribution;Total Cost']

First few rows:
  Programme;Project Number;CORDIS Link;Project Start Year;Project End Date;Project End Year;Project Title;Project Acronym;Project Status;Call ID;Call Deadline Date;Call Deadline Year;Project Signature Date;Project Signature Year;Project Start Date;Thematic Priority Descr;Pillar Abbr;Pillar Descr;Topic Code;Topic Descr;Simplified ToA;Legal Name;General PIC;Partner Role;Partner Type;Legal Entity Type;Signed Grants;Participation;EU Contribution;Total Cost
0  H2020;115797;http://cordis.europa.eu/pro

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from sklearn.preprocessing import LabelEncoder
import pickle
from tqdm import tqdm

class FastAcademicAdvisor:
    def __init__(self, data_path='processed_horizon_data.csv'):
        self.data_path = data_path
        self.encoder = SentenceTransformer('paraphrase-MiniLM-L3-v2')
        self.label_encoder = LabelEncoder()
        self.embeddings = None
        self.knowledge_base = {}

        self.categories = [
            'eligibility',
            'funding',
            'application',
            'topic',
            'partners'
        ]

    def process_data(self):
        """
        Process data with error handling
        """
        print("Processing data...")

        # Read data with proper handling
        try:
            df = pd.read_csv(
                self.data_path,
                sep=';',
                low_memory=False  # Handle mixed types
            )

            # Clean and prepare text data
            print("Cleaning data...")
            df = self.clean_dataframe(df)

            # Create embeddings
            print("Creating embeddings...")
            texts = df['combined_text'].tolist()
            texts = [str(text) for text in texts]  # Ensure all texts are strings

            # Process in batches
            batch_size = 32
            embeddings = []

            for i in tqdm(range(0, len(texts), batch_size)):
                batch = texts[i:i + batch_size]
                try:
                    batch_embeddings = self.encoder.encode(batch)
                    embeddings.extend(batch_embeddings)
                except Exception as e:
                    print(f"Error processing batch {i}: {e}")
                    continue

            self.embeddings = np.array(embeddings)

            # Build knowledge base
            print("Building knowledge base...")
            for idx, row in df.iterrows():
                self.knowledge_base[idx] = {
                    'title': str(row['Project Title']),
                    'topic': str(row['Topic Descr']),
                    'funding': self.clean_numeric(row['EU Contribution']),
                    'type': str(row['Legal Entity Type']),
                    'status': str(row['Project Status'])
                }

            print("Processing complete!")
            return df

        except Exception as e:
            print(f"Error processing data: {e}")
            return None

    def clean_dataframe(self, df):
        """
        Clean and prepare DataFrame
        """
        # Handle missing values
        df = df.fillna('')

        # Clean text columns
        text_columns = ['Project Title', 'Topic Descr', 'Thematic Priority Descr']
        for col in text_columns:
            if col in df.columns:
                df[col] = df[col].astype(str).apply(self.clean_text)

        # Combine text for embeddings
        df['combined_text'] = df.apply(
            lambda x: f"{x['Project Title']} {x['Topic Descr']} {x['Thematic Priority Descr']}",
            axis=1
        )

        return df

    @staticmethod
    def clean_text(text):
        """
        Clean text data
        """
        # Convert to string and clean
        text = str(text)
        text = text.lower()
        text = ' '.join(text.split())
        return text

    @staticmethod
    def clean_numeric(value):
        """
        Clean numeric values
        """
        try:
            if pd.isna(value):
                return 0
            value = str(value).replace(',', '').replace('€', '').strip()
            return float(value) if value else 0
        except:
            return 0

    def save_model(self, filename='fast_academic_advisor.pkl'):
        """
        Save model data
        """
        if self.embeddings is not None:
            model_data = {
                'embeddings': self.embeddings,
                'knowledge_base': self.knowledge_base,
                'categories': self.categories
            }

            with open(filename, 'wb') as f:
                pickle.dump(model_data, f)
            print("Model saved!")
        else:
            print("No model data to save!")

    def load_model(self, filename='fast_academic_advisor.pkl'):
        """
        Load model data
        """
        try:
            with open(filename, 'rb') as f:
                model_data = pickle.load(f)

            self.embeddings = model_data['embeddings']
            self.knowledge_base = model_data['knowledge_base']
            self.categories = model_data['categories']
            print("Model loaded!")
        except Exception as e:
            print(f"Error loading model: {e}")

    def get_response(self, query):
        """
        Generate response for query
        """
        try:
            # Clean and encode query
            query = self.clean_text(query)
            query_embedding = self.encoder.encode([query])

            # Find similar projects
            similarities = np.dot(self.embeddings, query_embedding.T).flatten()
            top_indices = similarities.argsort()[-3:][::-1]

            # Generate response
            response = "Relevant Projects:\n\n"
            for idx in top_indices:
                project = self.knowledge_base[idx]
                response += f"Project: {project['title']}\n"
                response += f"Topic: {project['topic']}\n"
                response += f"Funding: €{project['funding']:,.2f}\n"
                response += f"Type: {project['type']}\n"
                response += f"Status: {project['status']}\n\n"

            return response

        except Exception as e:
            return f"Error generating response: {e}"

def main():
    try:
        # Initialize
        print("Initializing Fast Academic Advisor...")
        advisor = FastAcademicAdvisor('processed_horizon_data.csv')

        # Process data
        df = advisor.process_data()

        if df is not None:
            # Save model
            advisor.save_model()

            # Test queries
            print("\nTesting queries...")
            test_queries = [
                "AI research projects",
                "climate change funding",
                "healthcare innovation"
            ]

            for query in test_queries:
                print(f"\nQuery: {query}")
                response = advisor.get_response(query)
                print(response)
        else:
            print("Failed to process data!")

    except Exception as e:
        print(f"Error in main execution: {e}")

if __name__ == "__main__":
    main()

Initializing Fast Academic Advisor...
Processing data...
Cleaning data...
Creating embeddings...


100%|██████████| 22038/22038 [1:16:41<00:00,  4.79it/s]


Building knowledge base...
Processing complete!
Model saved!

Testing queries...

Query: AI research projects
Relevant Projects:

Project: human-ai teaming platform for maintaining and evolving ai systems in manufacturing
Topic: artificial intelligence for manufacturing
Funding: €444.92
Type: HES
Status: CLOSED

Project: human-ai teaming platform for maintaining and evolving ai systems in manufacturing
Topic: artificial intelligence for manufacturing
Funding: €300.00
Type: PRC
Status: CLOSED

Project: human-ai teaming platform for maintaining and evolving ai systems in manufacturing
Topic: artificial intelligence for manufacturing
Funding: €311.38
Type: PRC
Status: CLOSED



Query: climate change funding
Relevant Projects:

Project: climate change experiment
Topic: -
Funding: €0.00
Type: -
Status: CLOSED

Project: climate change experiment
Topic: -
Funding: €0.00
Type: -
Status: CLOSED

Project: climate change experiment
Topic: -
Funding: €0.00
Type: -
Status: CLOSED



Query: healthca

In [2]:
# Install necessary libraries
!pip install transformers datasets scikit-learn nltk pandas numpy torch sentence-transformers

import pandas as pd
import numpy as np
import torch
import nltk
import re
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define a function to load and preprocess your data
def load_data(file_path):
    """
    Load research funding dataset.
    Expected format: JSON or CSV with at least 'question' and 'answer' fields.
    Additional metadata like 'category', 'source', etc. are helpful.
    """
    if file_path.endswith('.json'):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    elif file_path.endswith('.csv'):
        data = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file format")

    return data

# Load your dataset
data_path = '/content/processed_horizon_data.csv'  # Update with your actual path
try:
    data = load_data(data_path)
    print(f"Successfully loaded data with {len(data)} entries")
except Exception as e:
    print(f"Error loading data: {e}")

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using device: cpu
Successfully loaded data with 63985 entries


In [9]:
print(f"Input data type: {type(data)}")
if isinstance(data, pd.DataFrame):
    print(f"Input DataFrame shape: {data.shape}")
    print(f"Input DataFrame columns: {data.columns.tolist()}")
elif isinstance(data, list):
    print(f"Input list length: {len(data)}")
    if len(data) > 0:
        print(f"First item in list: {data[0]}")
else:
    print(f"Unexpected input data type: {type(data)}")

Input data type: <class 'pandas.core.frame.DataFrame'>
Input DataFrame shape: (63985, 1)
Input DataFrame columns: ['Programme;Project Number;CORDIS Link;Project Start Year;Project End Date;Project End Year;Project Title;Project Acronym;Project Status;Call ID;Call Deadline Date;Call Deadline Year;Project Signature Date;Project Signature Year;Project Start Date;Thematic Priority Descr;Pillar Abbr;Pillar Descr;Topic Code;Topic Descr;Simplified ToA;Legal Name;General PIC;Partner Role;Partner Type;Legal Entity Type;Signed Grants;Participation;EU Contribution;Total Cost']


In [10]:
def preprocess_data(data):
    """
    Preprocess the dataset for model training.
    """
    processed_data = []
    skipped_items = 0

    if isinstance(data, pd.DataFrame):
        # Process DataFrame
        for index, row in data.iterrows():
            if 'question' in row and 'answer' in row:
                processed_data.append({
                    'question': clean_text(row['question']),
                    'answer': clean_text(row['answer']),
                    'category': row.get('category', 'general'),
                    'index': index  # Add index for debugging
                })
            else:
                skipped_items += 1
    else:
        # Process JSON list
        for index, item in enumerate(data):
            if 'question' in item and 'answer' in item:
                processed_data.append({
                    'question': clean_text(item['question']),
                    'answer': clean_text(item['answer']),
                    'category': item.get('category', 'general'),
                    'index': index  # Add index for debugging
                })
            else:
                skipped_items += 1

    print(f"Processed {len(processed_data)} QA pairs")
    print(f"Skipped {skipped_items} items due to missing question or answer")

    return processed_data

In [11]:
processed_data = preprocess_data(data)
print(f"Processed {len(processed_data)} QA pairs")

if len(processed_data) > 0:
    print("First processed item:")
    print(processed_data[0])
else:
    print("No items were processed successfully.")

Processed 0 QA pairs
Skipped 63985 items due to missing question or answer
Processed 0 QA pairs
No items were processed successfully.


In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the CSV file with semicolon separator
df = pd.read_csv('/content/processed_horizon_data.csv', sep=';')
print("Original DataFrame shape:", df.shape)
print("Columns:", df.columns.tolist())

# Data preprocessing function
def preprocess_horizon_data(df):
    """
    Preprocess the Horizon dataset for model training.
    """
    processed_data = []

    for _, row in df.iterrows():
        qa_pairs = [
            {
                'question': 'What is the project title and its acronym?',
                'answer': f"Title: {row['Project Title']}\nAcronym: {row['Project Acronym']}",
                'category': 'project_overview'
            },
            {
                'question': 'What are the project details including timeline and status?',
                'answer': f"Start Date: {row['Project Start Date']}\nEnd Date: {row['Project End Date']}\nStatus: {row['Project Status']}\nThematic Priority: {row['Thematic Priority Descr']}",
                'category': 'project_timeline'
            },
            {
                'question': 'What is the project funding information?',
                'answer': f"EU Contribution: {row['EU Contribution']}\nTotal Cost: {row['Total Cost']}",
                'category': 'funding'
            },
            {
                'question': 'What are the project topic and pillar details?',
                'answer': f"Pillar: {row['Pillar Descr']}\nTopic: {row['Topic Descr']}",
                'category': 'classification'
            }
        ]

        processed_data.extend(qa_pairs)

    return processed_data

def clean_text(text):
    """Clean and normalize text."""
    if pd.isna(text) or not isinstance(text, str):
        return ""

    #<span class="ml-2" /><span class="inline-block w-3 h-3 rounded-full bg-neutral-a12 align-middle mb-[0.1rem]" />

Original DataFrame shape: (63985, 30)
Columns: ['Programme', 'Project Number', 'CORDIS Link', 'Project Start Year', 'Project End Date', 'Project End Year', 'Project Title', 'Project Acronym', 'Project Status', 'Call ID', 'Call Deadline Date', 'Call Deadline Year', 'Project Signature Date', 'Project Signature Year', 'Project Start Date', 'Thematic Priority Descr', 'Pillar Abbr', 'Pillar Descr', 'Topic Code', 'Topic Descr', 'Simplified ToA', 'Legal Name', 'General PIC', 'Partner Role', 'Partner Type', 'Legal Entity Type', 'Signed Grants', 'Participation', 'EU Contribution', 'Total Cost']


In [19]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# 1. Load the CSV file with semicolon separator
df = pd.read_csv('/content/processed_horizon_data.csv', sep=';')
print("Original DataFrame shape:", df.shape)
print("\nColumns in the dataset:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

def preprocess_horizon_data(df):
    processed_data = []

    for _, row in df.iterrows():
        qa_pairs = [
            # Project Overview
            {
                'question': 'What are the basic details of this Horizon project?',
                'answer': (f"Programme: {row['Programme']}\n"
                          f"Project Number: {row['Project Number']}\n"
                          f"Project Title: {row['Project Title']}\n"
                          f"Project Acronym: {row['Project Acronym']}"),
                'category': 'project_overview'
            },

            # Timeline Information
            {
                'question': 'What are the key dates and status of this project?',
                'answer': (f"Start Date: {row['Project Start Date']}\n"
                          f"End Date: {row['Project End Date']}\n"
                          f"Status: {row['Project Status']}\n"
                          f"Call Deadline: {row['Call Deadline Date']}"),
                'category': 'timeline'
            },

            # Financial Information
            {
                'question': 'What is the financial information for this project?',
                'answer': (f"EU Contribution: {row['EU Contribution']}\n"
                          f"Total Cost: {row['Total Cost']}\n"
                          f"Participation: {row['Participation']}"),
                'category': 'financial'
            },

            # Topic and Programme Information
            {
                'question': 'What are the topic and programme details?',
                'answer': (f"Topic Code: {row['Topic Code']}\n"
                          f"Topic Description: {row['Topic Descr']}\n"
                          f"Pillar: {row['Pillar Descr']}\n"
                          f"Thematic Priority: {row['Thematic Priority Descr']}"),
                'category': 'topic'
            },

            # Partner Information
            {
                'question': 'What are the partner details?',
                'answer': (f"Legal Name: {row['Legal Name']}\n"
                          f"Partner Role: {row['Partner Role']}\n"
                          f"Partner Type: {row['Partner Type']}\n"
                          f"Legal Entity Type: {row['Legal Entity Type']}"),
                'category': 'partner'
            }
        ]
        processed_data.extend(qa_pairs)

    return processed_data

def clean_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return "Not specified"
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s.,!?€$()-]', '', text)
    return text

# Clean the DataFrame
print("Cleaning data...")
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].apply(clean_text)

# Process the data
print("Processing data...")
processed_data = preprocess_horizon_data(df)
print(f"Processed {len(processed_data)} QA pairs")

# Split into train and validation sets
train_data, eval_data = train_test_split(processed_data, test_size=0.15, random_state=42)
print(f"Training examples: {len(train_data)}")
print(f"Evaluation examples: {len(eval_data)}")

# Convert to Hugging Face datasets format
def convert_to_hf_dataset(data_list):
    return Dataset.from_dict({
        'question': [item['question'] for item in data_list],
        'answer': [item['answer'] for item in data_list],
        'category': [item['category'] for item in data_list]
    })

# Create the datasets
print("Creating datasets...")
train_dataset = convert_to_hf_dataset(train_data)
eval_dataset = convert_to_hf_dataset(eval_data)

# Print sample from dataset to verify
print("\nSample from training dataset:")
print(train_dataset[0])

Original DataFrame shape: (63985, 30)

Columns in the dataset:
['Programme', 'Project Number', 'CORDIS Link', 'Project Start Year', 'Project End Date', 'Project End Year', 'Project Title', 'Project Acronym', 'Project Status', 'Call ID', 'Call Deadline Date', 'Call Deadline Year', 'Project Signature Date', 'Project Signature Year', 'Project Start Date', 'Thematic Priority Descr', 'Pillar Abbr', 'Pillar Descr', 'Topic Code', 'Topic Descr', 'Simplified ToA', 'Legal Name', 'General PIC', 'Partner Role', 'Partner Type', 'Legal Entity Type', 'Signed Grants', 'Participation', 'EU Contribution', 'Total Cost']

First few rows:
                                           Programme  Project Number  \
0  H2020;115797;http://cordis.europa.eu/project/i...             NaN   
1  H2020;115797;http://cordis.europa.eu/project/i...             NaN   
2  H2020;115797;http://cordis.europa.eu/project/i...             NaN   
3  H2020;115797;http://cordis.europa.eu/project/i...             NaN   
4  H2020;11579

In [21]:
# First, let's examine the data structure
print("Sample of raw data:")
print(df.iloc[0])
print("\nColumn value counts for a sample column:")
print(df['Programme'].value_counts().head())

# Modified preprocessing function with better error handling and data validation
def preprocess_horizon_data(df):
    processed_data = []
    valid_entries = 0

    for idx, row in df.iterrows():
        # Only create QA pairs if essential fields are present
        if pd.notna(row['Programme']) and pd.notna(row['Project Title']):
            qa_pairs = [
                # Project Overview
                {
                    'question': 'What are the basic details of this Horizon project?',
                    'answer': (f"Programme: {row['Programme']}\n"
                              f"Project Title: {row['Project Title']}\n"
                              f"Project Number: {row['Project Number']}\n"
                              f"Project Acronym: {row['Project Acronym']}").strip(),
                    'category': 'project_overview'
                }
            ]

            # Add timeline information if dates are available
            if pd.notna(row['Project Start Date']) or pd.notna(row['Project End Date']):
                qa_pairs.append({
                    'question': 'What are the key dates and status of this project?',
                    'answer': (f"Start Date: {row['Project Start Date']}\n"
                              f"End Date: {row['Project End Date']}\n"
                              f"Status: {row['Project Status']}").strip(),
                    'category': 'timeline'
                })

            # Add financial information if available
            if pd.notna(row['EU Contribution']) or pd.notna(row['Total Cost']):
                qa_pairs.append({
                    'question': 'What is the financial information for this project?',
                    'answer': (f"EU Contribution: {row['EU Contribution']}\n"
                              f"Total Cost: {row['Total Cost']}").strip(),
                    'category': 'financial'
                })

            # Add topic information if available
            if pd.notna(row['Topic Code']) or pd.notna(row['Topic Descr']):
                qa_pairs.append({
                    'question': 'What are the topic and programme details?',
                    'answer': (f"Topic Code: {row['Topic Code']}\n"
                              f"Topic Description: {row['Topic Descr']}\n"
                              f"Pillar: {row['Pillar Descr']}").strip(),
                    'category': 'topic'
                })

            # Add partner information if available
            if pd.notna(row['Legal Name']) or pd.notna(row['Partner Role']):
                qa_pairs.append({
                    'question': 'What are the partner details?',
                    'answer': (f"Legal Name: {row['Legal Name']}\n"
                              f"Partner Role: {row['Partner Role']}\n"
                              f"Partner Type: {row['Partner Type']}").strip(),
                    'category': 'partner'
                })

            processed_data.extend(qa_pairs)
            valid_entries += 1

        # Print progress every 1000 rows
        if idx % 1000 == 0:
            print(f"Processed {idx} rows, found {valid_entries} valid entries...")

    return processed_data

def clean_text(text):
    """Clean and normalize text while preserving meaningful content."""
    if pd.isna(text) or not isinstance(text, str):
        return ""

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Keep more special characters that might be meaningful
    text = re.sub(r'[^\w\s.,!?€$()-/]', '', text)

    return text if text else ""

# Clean the DataFrame
print("Cleaning data...")
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].apply(clean_text)

# Process the data
print("Processing data...")
processed_data = preprocess_horizon_data(df)
print(f"\nProcessed {len(processed_data)} total QA pairs")

# Print some statistics about the processed data
categories = [item['category'] for item in processed_data]
print("\nDistribution of QA pairs by category:")
for category in set(categories):
    count = categories.count(category)
    print(f"{category}: {count} pairs ({count/len(categories)*100:.2f}%)")

# Print some sample QA pairs
print("\nSample QA pairs:")
samples = []
seen_categories = set()
for item in processed_data:
    if item['category'] not in seen_categories and item['answer'].strip() != "":
        samples.append(item)
        seen_categories.add(item['category'])
        if len(seen_categories) == len(set(categories)):
            break

for sample in samples:
    print(f"\nCategory: {sample['category']}")
    print(f"Question: {sample['question']}")
    print(f"Answer: {sample['answer']}")

# Split into train and validation sets
train_data, eval_data = train_test_split(processed_data, test_size=0.15, random_state=42)
print(f"\nTraining examples: {len(train_data)}")
print(f"Evaluation examples: {len(eval_data)}")

# Convert to Hugging Face datasets format
train_dataset = Dataset.from_dict({
    'question': [item['question'] for item in train_data],
    'answer': [item['answer'] for item in train_data],
    'category': [item['category'] for item in train_data]
})

eval_dataset = Dataset.from_dict({
    'question': [item['question'] for item in eval_data],
    'answer': [item['answer'] for item in eval_data],
    'category': [item['category'] for item in eval_data]
})

# Verify the quality of the datasets
print("\nVerifying dataset quality...")
print("\nTraining dataset sample:")
for i in range(3):
    print(f"\nExample {i+1}:")
    print(f"Question: {train_dataset[i]['question']}")
    print(f"Answer: {train_dataset[i]['answer']}")
    print(f"Category: {train_dataset[i]['category']}")

Sample of raw data:
Programme                  H2020115797 approaches to disease modifying th...
Project Number                                                           NaN
CORDIS Link                                                    Not specified
Project Start Year                                             Not specified
Project End Date                                               Not specified
Project End Year                                               Not specified
Project Title                                                  Not specified
Project Acronym                                                Not specified
Project Status                                                 Not specified
Call ID                                                        Not specified
Call Deadline Date                                             Not specified
Call Deadline Year                                                       NaN
Project Signature Date                                  

In [23]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the data
df = pd.read_csv('/content/processed_horizon_data.csv', sep=';')
print("Original data shape:", df.shape)

def create_qa_pairs(df):
    """
    Create question-answer pairs from the Horizon dataset.
    """
    qa_pairs = []

    for _, row in df.iterrows():
        # Create multiple QA pairs for each project
        pairs = [
            {
                'question': 'What is the basic information about this project?',
                'answer': f"Project Title: {row['Project Title']}\nProgramme: {row['Programme']}\nProject Number: {row['Project Number']}\nAcronym: {row['Project Acronym']}",
                'category': 'basic_info'
            },
            {
                'question': 'What are the project dates and timeline?',
                'answer': f"Start Date: {row['Project Start Date']}\nEnd Date: {row['Project End Date']}\nStatus: {row['Project Status']}",
                'category': 'timeline'
            },
            {
                'question': 'What is the financial information?',
                'answer': f"EU Contribution: {row['EU Contribution']}\nTotal Cost: {row['Total Cost']}",
                'category': 'financial'
            },
            {
                'question': 'What are the project topics and priorities?',
                'answer': f"Topic: {row['Topic Descr']}\nPillar: {row['Pillar Descr']}\nThematic Priority: {row['Thematic Priority Descr']}",
                'category': 'topics'
            }
        ]
        qa_pairs.extend(pairs)

    return qa_pairs

def clean_text(text):
    """Clean and normalize text."""
    if pd.isna(text) or not isinstance(text, str):
        return "Not available"

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?€$()-]', '', text)
    return text

# Clean the DataFrame
print("Cleaning data...")
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].apply(clean_text)

# Create QA pairs
print("Creating QA pairs...")
qa_pairs = create_qa_pairs(df)
print(f"Created {len(qa_pairs)} QA pairs")

# Split into train and validation sets
train_data, eval_data = train_test_split(
    qa_pairs, test_size=0.15, random_state=42
)
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(eval_data)}")

# Convert to Hugging Face datasets format
def convert_to_hf_dataset(data_list):
    return Dataset.from_dict({
        'question': [item['question'] for item in data_list],
        'answer': [item['answer'] for item in data_list],
        'category': [item['category'] for item in data_list]
    })

# Create the datasets
print("Creating Hugging Face datasets...")
train_dataset = convert_to_hf_dataset(train_data)
eval_dataset = convert_to_hf_dataset(eval_data)

# Print some examples to verify the data
print("\nExample QA pairs from training dataset:")
for i in range(3):
    print(f"\nExample {i+1}:")
    print(f"Question: {train_dataset[i]['question']}")
    print(f"Answer: {train_dataset[i]['answer']}")
    print(f"Category: {train_dataset[i]['category']}")

# Print dataset statistics
print("\nDataset Statistics:")
categories = [item['category'] for item in qa_pairs]
for category in set(categories):
    count = categories.count(category)
    percentage = (count / len(categories)) * 100
    print(f"{category}: {count} pairs ({percentage:.2f}%)")

# Save datasets (optional)
train_dataset.save_to_disk('horizon_train_dataset')
eval_dataset.save_to_disk('horizon_eval_dataset')

Original data shape: (63985, 30)
Cleaning data...
Creating QA pairs...
Created 255940 QA pairs
Training set size: 217549
Validation set size: 38391
Creating Hugging Face datasets...

Example QA pairs from training dataset:

Example 1:
Question: What is the financial information?
Answer: EU Contribution: 165.506
Total Cost: 236.438
Category: financial

Example 2:
Question: What are the project dates and timeline?
Answer: Start Date: Not available
End Date: Not available
Status: Not available
Category: timeline

Example 3:
Question: What is the basic information about this project?
Answer: Project Title: Not available
Programme: H2020689682 Management of Barriers in European RiversAMBERCLOSEDH2020-SC5-2015-two-stage8.09.2015201529.04.201620161.06.2016Climate action, environment, resource efficiency and raw materialsEU.3.Societal ChallengesSC5-07-2015More effective ecosystem restoration in the EURIAUNIVERSIDAD DE OVIEDO999848647PARTICIPANTBENEFICIARYHES11209.129209.129
Project Number: nan

Saving the dataset (0/1 shards):   0%|          | 0/217549 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/38391 [00:00<?, ? examples/s]