In [61]:
import json
import pandas as pd
from collections import defaultdict

In [62]:
# Read the raw JSON dataset into memory; `data` is what the following cells iterate over.
dataset_path = 'datasets/dataset_crunchbase_ycombinator_all.json'
with open(dataset_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [63]:
# Funding stages that are kept; records with any other `last_funding_type` are skipped.
tipos_permitidos = ['seed', 'pre_seed', 'angel', 'series_a', 'series_b']


def _nested_value(record, key):
    """Return ``record[key]["value"]``, or None when ``key`` is missing or null.

    ``record.get(key, {})`` only falls back to ``{}`` when the key is absent;
    a key that is present with a JSON ``null`` would make the chained
    ``.get("value")`` raise AttributeError, hence the ``or {}``.
    """
    return (record.get(key) or {}).get("value")


def _joined_values(record, key):
    """Join the non-empty "value" entries of the list stored under ``key``.

    A missing or null list yields "". Entries that are null or whose "value"
    is missing/empty are skipped, so the result never contains blank items
    (e.g. ", ") that would slip past an empty-string check.
    """
    entries = record.get(key) or []
    return ", ".join(
        entry["value"] for entry in entries if entry and entry.get("value")
    )


# Build one flat row (dict) per record that passes the funding-stage filter.
resultado = []
for item in data:
    # Keep only the allowed funding stages.
    if item.get("last_funding_type") not in tipos_permitidos:
        continue

    # Key order below defines the column order of the resulting table.
    resultado.append({
        "company_name": item.get("name"),
        "equity_funding_total": _nested_value(item, "equity_funding_total"),
        "num_articles": item.get("num_articles"),
        "last_equity_funding_total": _nested_value(item, "last_equity_funding_total"),
        "num_investments_funding_rounds": item.get("num_investments_funding_rounds"),
        "founded_on": _nested_value(item, "founded_on"),
        "ipqwery_num_trademark_registered": item.get("ipqwery_num_trademark_registered"),
        "ipqwery_num_patent_granted": item.get("ipqwery_num_patent_granted"),
        "num_investors": item.get("num_investors"),
        "operating_status": item.get("operating_status"),
        "num_lead_investors": item.get("num_lead_investors"),
        "num_employees_enum": item.get("num_employees_enum"),
        "funding_total": _nested_value(item, "funding_total"),
        "num_funding_rounds": item.get("num_funding_rounds"),
        "last_funding_type": item.get("last_funding_type"),
        "last_funding_at": item.get("last_funding_at"),
        "last_funding_total": _nested_value(item, "last_funding_total"),
        "num_founders": item.get("num_founders"),
        "linkedin": _nested_value(item, "linkedin"),
        "facebook": _nested_value(item, "facebook"),
        "twitter": _nested_value(item, "twitter"),
        # Comma-separated founder names and category names.
        "founders": _joined_values(item, "founder_identifiers"),
        "categories": _joined_values(item, "category_groups"),
    })


In [64]:
# Turn the list of row dicts into a DataFrame; the dict keys become the columns.
df = pd.DataFrame(resultado)

In [65]:
# Convert empty strings in the required columns to pd.NA so they count as
# missing values. Nothing is removed in this step.
columnas_requeridas = [
    "founders",
    "last_equity_funding_total",
    "num_lead_investors",
    "categories",
    "num_employees_enum",
]
df[columnas_requeridas] = df[columnas_requeridas].replace("", pd.NA)

In [66]:
# Keep only the rows that have a value in every one of the required columns.
columnas_requeridas = [
    "founders",
    "last_equity_funding_total",
    "num_lead_investors",
    "categories",
    "num_employees_enum",
]
df = df.dropna(subset=columnas_requeridas)

In [67]:
# Optional: keep a copy of the original name before it is rewritten.
# df["original_company_name"] = df["company_name"]

# Step 1: flag every row whose company name occurs more than once
# (keep=False marks all occurrences, including the first one).
duplicados_mask = df["company_name"].duplicated(keep=False)

# Names that are left untouched. A generated "<name>_<k>" must not reuse one
# of these (e.g. "Foo", "Foo" and an unrelated company literally named "Foo_1"),
# otherwise the column would still contain duplicates after renaming.
nombres_en_uso = set(df.loc[~duplicados_mask, "company_name"])

# Step 2: per-name counter used to build the numeric suffix.
contador = defaultdict(int)

# Step 3: build the new name column in row order.
nombres_actualizados = []

for name, es_duplicado in zip(df["company_name"], duplicados_mask):
    if not es_duplicado:
        nombres_actualizados.append(name)
        continue

    # Advance the counter until the suffixed name is free.
    while True:
        contador[name] += 1
        candidato = f"{name}_{contador[name]}"
        if candidato not in nombres_en_uso:
            break

    nombres_en_uso.add(candidato)
    nombres_actualizados.append(candidato)

# Step 4: overwrite the column with the de-duplicated names.
df["company_name"] = nombres_actualizados

In [68]:
# Sanity check: print any company names that still repeat; an empty Series
# means every name is unique.
sigue_duplicado = df["company_name"].duplicated()
duplicados = df.loc[sigue_duplicado]
print(duplicados["company_name"])

Series([], Name: company_name, dtype: object)


In [69]:
# Write the filtered table to CSV; index=False leaves the row index out of the file.
df.to_csv("datasets/startups_etapa_temprana.csv", index=False)

In [70]:
# Two-column view: company name and its comma-separated categories string.
categorias = df[["company_name","categories"]]

In [71]:
# Write the name/categories table to its own CSV, without the row index.
categorias.to_csv("datasets/categorias_startups.csv", index=False)

In [11]:
# Accepted funding stages for the founders export.
# NOTE(review): the earlier early-stage filter in this notebook also accepts
# "angel"; confirm that leaving it out here is intentional.
tipos_permitidos = {
    "pre_seed", "seed", "series_a", "series_b"
}

# Load the JSON file.
# NOTE(review): this path (and the CSV written below) has no "datasets/" prefix,
# unlike the other cells; confirm the working directory this cell expects.
with open('dataset_crunchbase_ycombinator_all.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per (founder, company) for companies in an accepted funding stage.
# `or []` covers records where "founder_identifiers" is present but null
# (dict.get's default only applies when the key is absent); identifiers that
# are null or have no "value" are skipped.
founders_data = [
    {
        "founder_name": founder["value"],
        "company_name": item.get("name"),
        "last_funding_type": item.get("last_funding_type"),
    }
    for item in data
    if item.get("last_funding_type") in tipos_permitidos
    for founder in (item.get("founder_identifiers") or [])
    if founder and founder.get("value")
]

# Build the DataFrame and write it to CSV without the row index.
# NOTE(review): this rebinds `df` (as well as `data` and `tipos_permitidos`),
# replacing the objects created by the earlier cells.
df = pd.DataFrame(founders_data)
df.to_csv("fundadores_filtrados_etapa_temprana.csv", index=False)
