# Merge of OpenAlex & arXiv

In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import os

def clean_text(text):
    """Return *text* as a tidy single-line string.

    Missing values (``None`` or anything ``pd.isna`` reports as missing)
    yield an empty string. Any other value is converted with ``str`` and
    then:

    1. newlines, carriage returns and tabs are replaced by spaces;
    2. every character that is not a word character, whitespace, or one of
       the whitelisted punctuation marks ``. , ; : ! ? ( ) [ ] { } - ' " &
       % $ # @ + = / \\`` is removed;
    3. runs of whitespace are collapsed to one space and the result is
       trimmed at both ends.
    """
    if text is None or pd.isna(text):
        return ""

    # Flatten line breaks and tabs in a single pass.
    line_breaks_to_spaces = str.maketrans({'\n': ' ', '\r': ' ', '\t': ' '})
    flattened = str(text).translate(line_breaks_to_spaces)

    # Drop everything outside the whitelist (word chars, whitespace, common punctuation).
    whitelisted = re.sub(r'[^\w\s\.\,\;\:\!\?\(\)\[\]\{\}\-\'\"\&\%\$\#\@\+\=\/\\]', '', flattened)

    # Removing characters can leave gaps of several spaces; squeeze them.
    squeezed = re.sub(r'\s+', ' ', whitelisted)
    return squeezed.strip()

def normalize_date(date_str):
    """Normalize a date-like value to a ``YYYY-MM-DD`` string.

    Args:
        date_str: The raw date value. It is converted with ``str`` and
            stripped of surrounding whitespace before parsing.

    Returns:
        * ``""`` when the value is missing (``pd.isna``), falsy, or contains
          nothing that looks like a date;
        * the full date when day, month and year are available, including
          when the value is a timestamp whose date is followed by ``T`` or
          whitespace and a time part (e.g. ``2023-05-12T17:59:59Z``);
        * ``YYYY-MM-01`` when only year and month are available;
        * ``YYYY-01-01`` when only a year is available, or when no format
          matches but the text contains a four-digit number.
    """
    if pd.isna(date_str) or not date_str:
        return ""

    raw = str(date_str).strip()

    # A timestamp carries a time component that none of the formats below
    # accept; without this step it would fall through to the year-only
    # fallback and lose its month and day. Keep just the calendar date.
    stamp = re.match(r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})[T\s]', raw)
    if stamp:
        raw = stamp.group(1)

    full_date_formats = ('%Y-%m-%d', '%Y/%m/%d', '%d-%m-%Y', '%d/%m/%Y')
    year_month_formats = ('%Y-%m', '%Y/%m', '%m-%Y', '%m/%Y')
    year_only_format = '%Y'

    # Try the most specific formats first so precision is never dropped.
    for fmt in (*full_date_formats, *year_month_formats, year_only_format):
        try:
            dt = datetime.strptime(raw, fmt)
        except ValueError:
            continue

        if fmt == year_only_format:
            return f"{dt.year}-01-01"  # only the year is available
        if fmt in year_month_formats:
            return f"{dt.year}-{dt.month:02d}-01"  # only year and month are available
        return dt.strftime('%Y-%m-%d')

    # Last resort: use the first four-digit number found in the text as the year.
    year_match = re.search(r'(\d{4})', raw)
    if year_match:
        return f"{year_match.group(1)}-01-01"

    return ""

def merge_and_clean_data(arxiv_file, openalex_file, output_file=None,
                         data_dir='data', min_year=2019, max_year=2024):
    """Merge an arXiv CSV and an OpenAlex CSV into one cleaned, deduplicated CSV.

    Steps: load both files, tag each row with its ``source``, rename the
    source-specific columns to a shared schema, concatenate, clean titles and
    abstracts with ``clean_text``, normalize dates with ``normalize_date``,
    back-fill missing years from the normalized date, drop duplicates on
    (lower-cased title, year), keep only the requested year window, assign
    sequential ``paper_N`` ids and write the result to CSV.

    Args:
        arxiv_file: File name of the arXiv CSV, relative to ``data_dir``.
        openalex_file: File name of the OpenAlex CSV, relative to ``data_dir``.
        output_file: Destination CSV path. When ``None`` a date-stamped
            ``merged_generative_ai_data_YYYYMMDD.csv`` is written in
            ``data_dir``.
        data_dir: Folder containing the input files (default ``'data'``).
        min_year: First publication year kept, inclusive (default 2019).
        max_year: Last publication year kept, inclusive (default 2024).

    Returns:
        The merged ``pandas.DataFrame`` that was written to ``output_file``.
    """
    print(f"Chargement des données arXiv depuis {arxiv_file}...")
    arxiv_df = pd.read_csv(os.path.join(data_dir, arxiv_file))

    print(f"Chargement des données OpenAlex depuis {openalex_file}...")
    openalex_df = pd.read_csv(os.path.join(data_dir, openalex_file))

    print(f"arXiv: {len(arxiv_df)} articles, OpenAlex: {len(openalex_df)} articles")

    arxiv_df['source'] = 'arxiv'
    openalex_df['source'] = 'openalex'

    # Source column name -> unified column name.
    arxiv_columns = {
        'id': 'source_id',
        'title': 'title',
        'authors': 'authors',
        'abstract': 'abstract',
        'published_date': 'publication_date',
        'year': 'year',
        'doi': 'doi',
        'categories': 'categories'
    }

    openalex_columns = {
        'id': 'source_id',
        'title': 'title',
        'authors': 'authors',
        'abstract': 'abstract',
        'publication_date': 'publication_date',
        'year': 'year',
        'doi': 'doi',
        'cited_by_count': 'cited_by_count',
        'concepts': 'concepts',
        'author_countries': 'countries',
        'author_institutions': 'institutions',
        'categories': 'categories'
    }

    # Unified schema: every target column of either mapping plus 'source'
    # (dict.fromkeys removes repeats while keeping a deterministic order).
    unified_cols = list(dict.fromkeys(
        [*arxiv_columns.values(), *openalex_columns.values(), 'source']
    ))

    # rename() ignores mapping keys absent from the frame, and reindex()
    # returns a new frame in which columns the source lacks are filled with
    # NaN, so nothing is ever assigned onto a slice of the loaded data.
    arxiv_df_selected = arxiv_df.rename(columns=arxiv_columns).reindex(columns=unified_cols)
    openalex_df_selected = openalex_df.rename(columns=openalex_columns).reindex(columns=unified_cols)

    print("Fusion des datasets...")
    merged_df = pd.concat([arxiv_df_selected, openalex_df_selected], ignore_index=True)

    print("Nettoyage des données...")

    merged_df['title'] = merged_df['title'].apply(clean_text)
    merged_df['abstract'] = merged_df['abstract'].apply(clean_text)

    merged_df['publication_date'] = merged_df['publication_date'].apply(normalize_date)

    # Coerce to numbers so stray non-numeric values become NaN instead of
    # crashing the integer casts further down.
    merged_df['year'] = pd.to_numeric(merged_df['year'], errors='coerce')
    merged_df['cited_by_count'] = pd.to_numeric(merged_df['cited_by_count'], errors='coerce')

    # Back-fill missing years from the leading component of the normalized
    # date; an empty date yields NaN and leaves the year missing.
    year_from_date = pd.to_numeric(
        merged_df['publication_date'].astype(str).str.split('-', n=1).str[0],
        errors='coerce',
    )
    merged_df['year'] = merged_df['year'].fillna(year_from_date)

    print("Suppression des doublons...")
    merged_df['title_lower'] = merged_df['title'].str.lower()
    # Most-cited copy first. A stable sort makes the surviving duplicate
    # deterministic when counts tie or are missing: the row that came first
    # in the concatenation (arXiv before OpenAlex) wins.
    merged_df = merged_df.sort_values(
        'cited_by_count', ascending=False, na_position='last', kind='mergesort'
    )
    # NOTE(review): rows whose cleaned title is empty all share the key
    # ('', year), so only one of them survives per year — confirm this is intended.
    merged_df = merged_df.drop_duplicates(subset=['title_lower', 'year'], keep='first')
    merged_df = merged_df.drop(columns=['title_lower'])

    merged_df['year'] = merged_df['year'].fillna(0).astype(int)
    merged_df['cited_by_count'] = merged_df['cited_by_count'].fillna(0).astype(int)

    # Rows without a usable year were set to 0 above and are dropped here too.
    # copy() gives an independent frame so the id assignment below is safe.
    in_window = (merged_df['year'] >= min_year) & (merged_df['year'] <= max_year)
    merged_df = merged_df[in_window].copy()

    merged_df['id'] = [f"paper_{n}" for n in range(1, len(merged_df) + 1)]

    cols_order = ['id', 'source', 'source_id', 'title', 'authors', 'abstract', 'publication_date',
                  'year', 'doi', 'categories', 'concepts', 'cited_by_count',
                  'countries', 'institutions']

    final_cols = [col for col in cols_order if col in merged_df.columns]
    merged_df = merged_df[final_cols]

    print(f"Processus terminé. Dataset final: {len(merged_df)} articles uniques.")

    if output_file is None:
        output_file = os.path.join(
            data_dir,
            f'merged_generative_ai_data_{datetime.now().strftime("%Y%m%d")}.csv',
        )

    merged_df.to_csv(output_file, index=False)
    print(f"Données fusionnées et nettoyées sauvegardées dans {output_file}")

    return merged_df

if __name__ == "__main__":
    # List the data folder once. A missing folder is treated like an empty
    # one so the user gets the "no data files" hint below instead of a
    # FileNotFoundError.
    data_files = os.listdir('data') if os.path.isdir('data') else []
    arxiv_files = sorted(
        f for f in data_files
        if f.startswith('arxiv_generative_ai_data') and f.endswith('.csv')
    )
    openalex_files = sorted(
        f for f in data_files
        if f.startswith('openalex_generative_ai_data') and f.endswith('.csv')
    )

    if arxiv_files and openalex_files:
        # The last name in lexicographic order is used as the "most recent"
        # file — presumably the names carry a sortable date stamp; verify
        # against the collection scripts.
        arxiv_file = arxiv_files[-1]
        openalex_file = openalex_files[-1]

        print("Utilisation des fichiers les plus récents:")
        print(f"  arXiv: {arxiv_file}")
        print(f"  OpenAlex: {openalex_file}")

        # Kept as a module-level name on purpose: later notebook cells read it.
        merged_df = merge_and_clean_data(arxiv_file, openalex_file)

        print("\nRésumé des données fusionnées:")
        print(f"Nombre total d'articles: {len(merged_df)}")

        # Articles per publication year, in chronological order.
        years_count = merged_df['year'].value_counts().sort_index()
        print("\nDistribution par année:")
        for year, count in years_count.items():
            print(f"  {year}: {count} articles")

        # Articles per originating source, most frequent first.
        sources_count = merged_df['source'].value_counts()
        print("\nDistribution par source:")
        for source, count in sources_count.items():
            print(f"  {source}: {count} articles")
    else:
        print("Aucun fichier de données trouvé. Veuillez d'abord exécuter les scripts de collecte de données.")

In [None]:
# Count the missing values in each column of the merged dataset
# (merged_df is the module-level frame produced by the cell above).
merged_df.isnull().sum()