In [None]:
import json
from geopy.geocoders import Nominatim
import pandas as pd
from collections import Counter
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import re
import os

# Récupération des métadonnées :

In [None]:
# Load the JSON export. The encoding is given explicitly so that non-ASCII
# text is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('data/fr.sputniknews.africa--20220630--20230630.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

def flatten_all(metadata, keys_to_keep=None):
    """
    Flatten the 'all' section of the metadata into a one-row DataFrame.

    Nested keys are joined with '_' by ``pd.json_normalize`` and a constant
    ``level`` column set to 'all' is added.

    Args:
        metadata: Mapping that may contain an 'all' entry.
        keys_to_keep: Optional iterable of top-level keys of the 'all' entry
            to keep; when falsy, every key is kept.

    Returns:
        The flattened DataFrame.
    """
    section = metadata.get('all', {})
    if keys_to_keep:
        # Keep the order of keys_to_keep and silently skip absent keys.
        section = {name: section[name] for name in keys_to_keep if name in section}
    return pd.json_normalize(section, sep='_').assign(level='all')

def flatten_year_data(year, year_info, keys_to_keep=None):
    """
    Flatten one year's metadata into a long-format DataFrame.

    A dict-valued category yields one row per (key, value) item; any other
    value yields a single row whose ``key`` is the category name.

    Args:
        year: Label stored in the ``year`` column of every row.
        year_info: Mapping of category name to either a dict or a single value.
        keys_to_keep: Optional collection of category names to keep; when
            falsy, every category is kept.

    Returns:
        DataFrame with the columns ``key``, ``value``, ``year`` and
        ``category``. It is empty (but still has these columns) when no
        category is kept.
    """
    columns = ['key', 'value', 'year', 'category']
    year_flat = []
    for category, values in year_info.items():
        if keys_to_keep and category not in keys_to_keep:
            continue
        if isinstance(values, dict):
            category_df = pd.DataFrame(list(values.items()), columns=['key', 'value'])
        else:
            category_df = pd.DataFrame({'key': [category], 'value': [values]})
        category_df['year'] = year
        category_df['category'] = category
        year_flat.append(category_df)
    if not year_flat:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected schema instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(year_flat, ignore_index=True)

def flatten_month_data(year, month, month_info, keys_to_keep=None):
    """
    Flatten one month's metadata into a long-format DataFrame.

    A dict-valued category yields one row per (key, value) item; any other
    value yields a single row whose ``key`` is the category name.

    Args:
        year: Label stored in the ``year`` column of every row.
        month: Label stored in the ``month`` column of every row.
        month_info: Mapping of category name to either a dict or a single value.
        keys_to_keep: Optional collection of category names to keep; when
            falsy, every category is kept.

    Returns:
        DataFrame with the columns ``key``, ``value``, ``year``, ``month`` and
        ``category``. It is empty (but still has these columns) when no
        category is kept.
    """
    columns = ['key', 'value', 'year', 'month', 'category']
    month_flat = []
    for category, values in month_info.items():
        if keys_to_keep and category not in keys_to_keep:
            continue
        if isinstance(values, dict):
            category_df = pd.DataFrame(list(values.items()), columns=['key', 'value'])
        else:
            category_df = pd.DataFrame({'key': [category], 'value': [values]})
        category_df['year'] = year
        category_df['month'] = month
        category_df['category'] = category
        month_flat.append(category_df)
    if not month_flat:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected schema instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(month_flat, ignore_index=True)

# Only these metadata categories are exported.
keys_to_keep = ['kws', 'loc', 'org']

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing anything into it.
os.makedirs('metadata', exist_ok=True)

# Look the section up once instead of on every use.
metadata = data.get('metadata', {})

all_df = flatten_all(metadata, keys_to_keep=keys_to_keep)
all_df.to_csv('metadata/all_data_chunk.csv', index=False)

# One CSV per year; the frames are also collected in yearly_dfs.
yearly_dfs = []
for year, year_info in metadata.get('year', {}).items():
    yearly_df = flatten_year_data(year, year_info, keys_to_keep=keys_to_keep)
    yearly_df.to_csv(f'metadata/year_data_{year}_chunk.csv', index=False)
    yearly_dfs.append(yearly_df)

# One CSV per (year, month); the frames are also collected in monthly_dfs.
monthly_dfs = []
for year, months in metadata.get('month', {}).items():
    for month, month_info in months.items():
        monthly_df = flatten_month_data(year, month, month_info, keys_to_keep=keys_to_keep)
        monthly_df.to_csv(f'metadata/month_data_{year}_{month}_chunk.csv', index=False)
        monthly_dfs.append(monthly_df)

print("Data processing complete. Chunks saved as individual CSV files.")

In [None]:
folder_path = "metadata"
output_file = "metadata/year_data_2022.csv"
output_name = os.path.basename(output_file)

# The merged output itself starts with 'year_data_2022', so it must be
# excluded; otherwise re-running this cell would merge the previous result
# back in and duplicate every row. Sorting makes the merge order deterministic
# (os.listdir returns entries in arbitrary order).
csv_files = sorted(
    f for f in os.listdir(folder_path)
    if f.startswith('year_data_2022') and f != output_name
)
if not csv_files:
    raise FileNotFoundError(f"No 'year_data_2022*' chunk files found in '{folder_path}'")

dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
merged_df = pd.concat(dataframes, ignore_index=True)

merged_df.to_csv(output_file, index=False)

print(f"All CSV files merged into {output_file}")

# Récupération des données :

In [None]:
# Read the JSON export (decoded as UTF-8) into `data`; process_data() below
# consumes its 'data' section.
with open('data/fr.sputniknews.africa--20220630--20230630.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


def flatten_data(year, month, day, day_data):
    """
    Turn one day's list of records into a DataFrame tagged with its date.

    Args:
        year, month, day: Labels written to the ``year``, ``month`` and
            ``day`` columns of every row.
        day_data: Expected to be a list of records accepted by
            ``pd.DataFrame``.

    Returns:
        The tagged DataFrame, or an empty DataFrame (after printing a
        message) when ``day_data`` is not a list.
    """
    # Guard clause: anything that is not a list is reported and skipped.
    if not isinstance(day_data, list):
        print(f"Unexpected data format for {year}-{month}-{day}: {day_data}")
        return pd.DataFrame()

    frame = pd.DataFrame(day_data)
    for column, label in (('year', year), ('month', month), ('day', day)):
        frame[column] = label
    return frame

def process_data(data, output_dir='output_data', chunk_size=10000):
    """
    Write every day of the 'data' section to CSV files of bounded size.

    The section is walked as year -> month -> day. Each day is flattened with
    ``flatten_data`` and written to ``output_dir`` as
    ``data_<year>_<month>_<day>_chunk<k>.csv`` (k starts at 1), with at most
    ``chunk_size`` rows per file. Days that flatten to an empty frame are
    skipped. The path of every written file is printed.

    Args:
        data: Mapping that may contain a 'data' entry.
        output_dir: Directory receiving the CSV files; created if missing.
        chunk_size: Maximum number of rows per CSV file.
    """
    os.makedirs(output_dir, exist_ok=True)
    for year, months in data.get('data', {}).items():
        for month, days in months.items():
            for day, day_data in days.items():
                frame = flatten_data(year, month, day, day_data)
                if frame.empty:
                    continue

                # Integer division plus one over-counts by one when the row
                # count is an exact multiple of chunk_size; the resulting
                # empty trailing slice is skipped below.
                chunk_total = len(frame) // chunk_size + 1
                for number in range(1, chunk_total + 1):
                    start = (number - 1) * chunk_size
                    part = frame[start:start + chunk_size]
                    if part.empty:
                        continue
                    output_file = os.path.join(output_dir, f'data_{year}_{month}_{day}_chunk{number}.csv')
                    part.to_csv(output_file, index=False)
                    print(f"Saved {output_file}")

# Export the 'data' section of the loaded JSON using process_data's defaults
# (output directory 'output_data', 10000 rows per chunk).
process_data(data)

print("Data processing complete. CSV chunks saved.")

In [None]:
directory = 'output_data'
output_filepath = os.path.join(directory, 'data.csv')
output_name = os.path.basename(output_filepath)

# Collect the chunk files. The merged output ('data.csv') also matches the
# "data*.csv" pattern, so it is excluded explicitly; otherwise a second run
# would merge the previous result back in and duplicate every row. Sorting
# makes the merge order deterministic.
chunk_paths = sorted(
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.startswith("data") and filename.endswith(".csv") and filename != output_name
)

dataframes = [pd.read_csv(filepath) for filepath in chunk_paths]

if chunk_paths:
    merged_dataframe = pd.concat(dataframes, ignore_index=True)
    merged_dataframe.to_csv(output_filepath, index=False)

    # Delete the chunks only after the merged file has been written, so a
    # failure while reading, concatenating or writing cannot lose data.
    for filepath in chunk_paths:
        os.remove(filepath)

    print(f"Merged file saved at: {output_filepath}")
elif os.path.exists(output_filepath):
    # Nothing left to merge (the cell was already run): reuse the existing file.
    merged_dataframe = pd.read_csv(output_filepath)
    print(f"No chunk files left to merge; keeping existing {output_filepath}")
else:
    raise FileNotFoundError(f"No 'data*.csv' chunk files found in '{directory}'")

# Code pour récupérer les données de géolocalisation :

In [None]:
# Forward slash instead of a backslash: "\y" is not a valid escape sequence,
# and a backslash separator only resolves on Windows.
df = pd.read_csv("metadata/year_data_2022.csv")
# .copy() makes location_df an independent frame, so assigning to its columns
# later does not operate on a slice of df.
location_df = df[df["category"] == "loc"].copy()


def is_valid_location(location):
    """
    Clean a raw location label and return it, or None when it is unusable.

    Cleaning removes double quotes (straight and typographic), one leading
    French article or preposition (l', la, le, les, de, du, des) and the
    surrounding whitespace.

    Args:
        location: Raw value from the 'key' column; may be NaN or any
            non-string object.

    Returns:
        The cleaned string, or None when the value is not a string, contains
        a digit, contains a character other than word characters, whitespace,
        '-' or the À-ÿ range, or is shorter than 3 characters after cleaning.
    """
    # Non-strings (NaN, None, booleans, numbers, lists, ...) are rejected up
    # front; a type check cannot raise on list-like values the way a
    # truth-tested pd.isna() can.
    if not isinstance(location, str):
        return None

    # Drop double quotes, then leading whitespace so that the anchored
    # article pattern below can match.
    cleaned = re.sub(r"[\"“”„]", "", location).lstrip()

    # Remove one leading article/preposition, then trim what is left so the
    # returned value matches the string whose length is checked.
    cleaned = re.sub(r"^(l'|la |le |les |de |du |des )", "", cleaned, flags=re.IGNORECASE).strip()

    if len(cleaned) < 3:
        return None

    # Labels containing any digit are treated as noise.
    if re.search(r"\d", cleaned):
        return None

    # Whitelist of characters; this also rejects labels starting with '@' or '#'.
    # NOTE(review): apostrophes are not in the whitelist, so names such as
    # "Côte d'Ivoire" are rejected - confirm this is intended.
    if not re.match(r"^[\w\s\-À-ÿ]+$", cleaned):
        return None

    return cleaned

# Replace each raw label with its cleaned form (None when rejected). assign()
# builds a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopyWarning.
location_df = location_df.assign(key=location_df["key"].apply(is_valid_location))

# Keep only the rows whose label survived the cleaning.
location_df = location_df[location_df["key"].notnull()]

# NOTE(review): the file name says 2023 while the input read above is
# year_data_2022.csv - confirm the intended year. The name is kept because the
# geocoding cell reads this exact path.
location_df.to_csv("localisation_2023.csv", index=False)
print("Dataset nettoyé sauvegardé")


In [None]:
import pandas as pd
import requests
import time

def get_geocode(location):
    """
    Look up a place name with the OpenStreetMap Nominatim search API.

    Only the first hit is requested (``limit=1``) and results are asked for in
    English (``accept-language: en``).

    Args:
        location: Free-text place name to search for.

    Returns:
        A ``(latitude, longitude, country)`` tuple. ``country`` is the last
        comma-separated part of the hit's ``display_name`` (or the hit's
        ``name`` when there is no comma). ``(None, None, "Inconnu")`` is
        returned when the request fails, the status is not 200, nothing is
        found, or the hit carries no usable coordinates.
    """
    not_found = (None, None, "Inconnu")
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": location,
        "format": "json",
        "limit": 1,
        "accept-language": "en"
    }
    headers = {"User-Agent": "MyGeocoderApp/1.0"}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Erreur : Statut {response.status_code} pour la localisation {location}")
            return not_found
        results = response.json()
        if not results:
            return not_found
        result = results[0]
        # Index instead of .get(..., 0): a hit without coordinates must not be
        # reported as the point (0.0, 0.0).
        lat = float(result["lat"])
        lon = float(result["lon"])
        display_name = result.get("display_name", "Inconnu")
        country = display_name.split(",")[-1].strip() if "," in display_name else result.get("name", "Inconnu")
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération des données pour {location} : {e}")
        return not_found
    except (KeyError, TypeError, ValueError, AttributeError) as e:
        # Malformed payload: invalid JSON, unexpected structure, or
        # missing / non-numeric coordinates.
        print(f"Erreur lors de la récupération des données pour {location} : {e}")
        return not_found
    return lat, lon, country

# Load the CSV of cleaned locations
df = pd.read_csv('localisation_2023.csv')
df['latitude'] = None
df['longitude'] = None
df['country'] = None

# Geocode each location of the 'key' column. Results are cached per distinct
# name so a repeated name costs neither a second HTTP request nor a second
# pause.
geocode_cache = {}
for index, raw_location in df['key'].items():
    location = raw_location.strip()
    if location not in geocode_cache:
        geocode_cache[location] = get_geocode(location)
        # Pause after every real request (Nominatim's usage policy allows at
        # most one request per second).
        time.sleep(1)
    lat, lon, country = geocode_cache[location]
    df.at[index, 'latitude'] = lat
    df.at[index, 'longitude'] = lon
    df.at[index, 'country'] = country


output_file = 'metadata/geocoded_locations_2023.csv'
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)
print(f"Données géocodées sauvegardées dans {output_file}")

# Code pour compter combien de fois des organisations sont citées ensemble :

In [None]:
import ast
from itertools import combinations

import pandas as pd


keywords_file = 'metadata/year_data_2022.csv'
articles_file = 'output_data/data.csv'

# NOTE(review): keywords_data is loaded and filtered to the 'org' category but
# is not used anywhere else in this cell - confirm whether it is still needed.
keywords_data = pd.read_csv(keywords_file)

keywords_data = keywords_data[keywords_data['category'] == 'org']

articles_data = pd.read_csv(articles_file)

if 'org' not in articles_data.columns or articles_data['org'].isnull().all():
    raise ValueError("The 'org' column is missing or empty in the articles file.")


def _parse_org(raw):
    """Parse one serialized 'org' cell into a dict; missing cells become {}."""
    if pd.isna(raw):
        return {}
    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute code the way eval() would.
    return ast.literal_eval(raw)


try:
    articles_data['org_dict'] = articles_data['org'].apply(_parse_org)
except Exception as e:
    raise ValueError(f"Failed to parse 'org' column: {e}") from e

# Build the list of organisation pairs, one entry per article in which both
# organisations appear.
keyword_pairs = []

for org_dict in articles_data['org_dict']:
    # Sorting puts each pair in a canonical order, so (A, B) and (B, A) are
    # counted as the same relationship whatever the order inside an article.
    keywords = sorted(org_dict.keys())
    keyword_pairs.extend(combinations(keywords, 2))

# Convert to a DataFrame
keyword_pairs_df = pd.DataFrame(keyword_pairs, columns=['keyword', 'related_keyword'])

# Count how many articles contain each pair
keyword_pairs_count = keyword_pairs_df.value_counts().reset_index(name='count')

# Save results to a CSV file
keyword_pairs_count.to_csv('output_data/org_relationships.csv', index=False)
print("Keyword relationships saved to 'output_data/org_relationships.csv'")