Base de dato de injuries

In [None]:
import pandas as pd
import re

def extract_injury_info(text):
    if not isinstance(text, str) or pd.isna(text):
        return pd.Series({
            'anatomical_area': 'not_specified',
            'injury_type': 'not_specified',
            'status': 'not_specified'
        })
    
    # Limpieza básica del texto
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)  # quita espacios duplicados
    text = re.sub(r'[(){}[\]]', '', text)  # quita paréntesis y corchetes

    # Correcciones de errores comunes
    typo_corrections = {
        "outindefinetily": "out indefinitely",
        "outindefinately": "out indefinitely",
        "out indefinately": "out indefinitely",
        "out indefinetly": "out indefinitely",
        "out indefinetely": "out indefinitely"
    }
    for typo, correction in typo_corrections.items():
        text = text.replace(typo, correction)

    # Anatomical areas
    anatomical_areas = {
        'knee': r'knee|patella|acl|mcl|pcl',
        'ankle': r'ankle',
        'shoulder': r'shoulder',
        'wrist': r'wrist',
        'back': r'back',
        'hamstring': r'hamstring',
        'achilles': r'achilles',
        'foot': r'foot',
        'hip': r'hip',
        'elbow': r'elbow',
        'hand': r'hand|finger|pinky|thumb',
        'neck': r'neck',
        'groin': r'groin',
        'quad': r'quad'
    }

    # Injury types
    injury_types = {
        'fracture': r'fracture|broken|stress fracture',
        'sprain': r'sprain',
        'dislocation': r'dislocat|luxation',
        'tear': r'tear|rupture|torn',
        'surgery': r'surgery|procedure',
        'strain': r'strain|pull',
        'contusion': r'contusion|bruise',
        'pain': r'pain|soreness|tightness'
    }

    # Anatomical area
    anatomical_area = 'not_specified'
    for area, pattern in anatomical_areas.items():
        if re.search(pattern, text):
            anatomical_area = area
            break

    # Injury type
    injury_type = 'other'
    for injury, pattern in injury_types.items():
        if re.search(pattern, text):
            injury_type = injury
            break

    # Injury status
    if "out indefinitely" in text:
        status = 'out_indefinitely'
    elif re.search(r'out for season|il|inactive', text):
        status = 'IL'
    elif re.search(r'returned to lineup|activated|available', text):
        status = 'returned'
    else:
        status = 'active'

    return pd.Series({
        'anatomical_area': anatomical_area,
        'injury_type': injury_type,
        'status': status
    })


# Load the dataset
lesiones = pd.read_csv(r'C:\Users\CamiUribe\OneDrive\Desktop\Lesiones 2010-2020 con ID.csv')

# Clean column names
lesiones.columns = lesiones.columns.str.strip()

# Print column names to verify
print("Columnas en el DataFrame:", lesiones.columns)

# Check the first few rows of the DataFrame
print(lesiones.head())

# Apply the function to the 'Notes' column
if 'Notes' in lesiones.columns:
    # Apply the function and create a DataFrame directly
    parsed_data = lesiones['Notes'].apply(extract_injury_info)
    
    # No need to convert to list, parsed_data is already a DataFrame
    lesiones = pd.concat([lesiones, parsed_data], axis=1)

    # Display the updated DataFrame
    print(lesiones[['Notes', 'anatomical_area', 'injury_type', 'status']].head())
else:
    print("La columna 'Notes' no se encuentra en el DataFrame.")


print(lesiones)





Analiis EDA del archivo 

In [None]:
#tipos de datos en la tabla
lesiones.info()

In [None]:
# hallazgo de nulos
lesiones.isnull().sum()

In [None]:
duplicado = lesiones.duplicated()
print('Los duplicados son:', duplicado)

ANALISIS DESCRIPTIVO 

In [None]:
# Analisis descriptivo
lesiones.describe()


In [None]:
# Analisis descptivo (graficas)
import seaborn as sns
import matplotlib.pyplot as plt

print(lesiones.head())

#calculo de frecuencias absolutas

columnas = ['Relinquished', 'Notes', 'ID', 'anatomical_area', 'injury_type', 'status']

for col in columnas:
    print(f"Frecuencia absoluta de '{col}':")
    print(lesiones[col].value_counts(dropna=False))
    print("\n" + "-"*50 + "\n")

In [None]:
#calculo de frecuencias absolutas

columnas = ['Relinquished', 'Notes', 'anatomical_area', 'injury_type', 'status']


for col in columnas:
    frecuencias = lesiones[col].value_counts(dropna=False).head(10)  # Solo las 10 más frecuentes

    plt.figure(figsize=(10, 5))
    frecuencias.plot(kind='bar', color='skyblue', edgecolor='black')

    plt.title(f"Top 10 categorías de '{col}'", fontsize=14)
    plt.xlabel(col, fontsize=12)
    plt.ylabel("Frecuencia", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.grid(axis='y', linestyle='--', alpha=0.7)

In [None]:
#Frecuencia relativas> 

columns = ['Relinquished', 'Notes', 'anatomical_area', 'injury_type', 'status']

for col in columns:
    frecuencia_rel = lesiones[col].value_counts(normalize=True, dropna=False).head(10) * 100

    plt.figure(figsize=(10, 5))
    frecuencia_rel.plot(kind='bar', color='red', edgecolor='black')

    plt.title(f"Top 10 categorías de '{col}'", fontsize=14)
    plt.xlabel(col, fontsize=12)
    plt.ylabel("Frecuencia relativa (%)", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.show()


In [13]:
# MODA 
columnas = ['Date','Relinquished', 'Notes', 'anatomical_area', 'injury_type', 'status']

for col in columnas:
    moda = lesiones[col].mode()
    print(f"Moda de '{col}': {list(moda)}")  # En lista por si hay más de una
    print("-" * 50)

Moda de 'Date': ['16-04-14']
--------------------------------------------------
Moda de 'Relinquished': ['Kevin Love']
--------------------------------------------------
Moda de 'Notes': ['activated from IL']
--------------------------------------------------
Moda de 'anatomical_area': ['not_specified']
--------------------------------------------------
Moda de 'injury_type': ['other']
--------------------------------------------------
Moda de 'status': ['IL']
--------------------------------------------------


In [18]:
# Media de la fecha
lesiones['Date'] = pd.to_datetime(lesiones['Date'])

Media_Date= lesiones['Date'].mean()
print("la media es:", Media_Date)

la media es: 2015-09-05 05:46:29.551743232
