Base de datos de injuries

In [None]:
import pandas as pd
import re

# Ordered (label, compiled pattern) tables. They are scanned top to bottom and
# the first pattern that matches wins, so the order of the entries is part of
# the classification rules.
_ANATOMICAL_AREAS = tuple(
    (label, re.compile(pattern))
    for label, pattern in (
        ('knee', r'knee|patella|acl|mcl|pcl'),
        ('ankle', r'ankle'),
        ('shoulder', r'shoulder'),
        ('wrist', r'wrist'),
        ('back', r'back'),
        ('hamstring', r'hamstring'),
        ('achilles', r'achilles'),
        ('foot', r'foot'),
        ('hip', r'hip'),
        ('elbow', r'elbow'),
        ('hand', r'hand'),
        ('neck', r'neck'),
        ('groin', r'groin'),
        ('quad', r'quad'),
    )
)

_INJURY_TYPES = tuple(
    (label, re.compile(pattern))
    for label, pattern in (
        ('fracture', r'fracture|broken|stress fracture'),
        ('sprain', r'sprain'),
        ('dislocation', r'dislocat|luxation'),
        ('tear', r'tear|rupture|torn'),
        ('surgery', r'surgery|procedure'),
        ('strain', r'strain|pull'),
        ('contusion', r'contusion|bruise'),
        ('pain', r'pain|soreness|tightness'),
    )
)

_RETURNED_RE = re.compile(r'returned to lineup|activated|available')
# 'il' must be a standalone word: as a bare substring it also matches
# "achilles", "available", "until", ... and mislabels those notes as 'IL'.
_SIDELINED_RE = re.compile(r'out for season|\bil\b|inactive')


def _first_match(text, table, default):
    """Return the label of the first pattern in *table* found in *text*."""
    for label, pattern in table:
        if pattern.search(text):
            return label
    return default


def extract_injury_info(text):
    """Classify a free-text injury note into area, injury type and status.

    Args:
        text: The note to classify. Matching is case-insensitive (the note is
            lower-cased before searching).

    Returns:
        A ``pd.Series`` with three string entries:

        * ``anatomical_area`` -- first matching body area, or
          ``'not_specified'`` when none matches.
        * ``injury_type`` -- first matching injury category, or ``'other'``
          when none matches.
        * ``status`` -- ``'returned'``, ``'IL'`` or ``'active'`` (default).

        When *text* is not a string (``None``, ``NaN``, numbers, ...) all
        three entries are ``'not_specified'``.
    """
    # A str is never NA, so the isinstance check alone covers missing values.
    if not isinstance(text, str):
        return pd.Series({
            'anatomical_area': 'not_specified',
            'injury_type': 'not_specified',
            'status': 'not_specified'
        })

    text = text.lower().strip()

    anatomical_area = _first_match(text, _ANATOMICAL_AREAS, 'not_specified')
    injury_type = _first_match(text, _INJURY_TYPES, 'other')

    # Test the "returned" wording first: a note such as "activated from IL"
    # mentions the injured list but describes a player coming back.
    if _RETURNED_RE.search(text):
        status = 'returned'
    elif _SIDELINED_RE.search(text):
        status = 'IL'
    else:
        status = 'active'

    return pd.Series({
        'anatomical_area': anatomical_area,
        'injury_type': injury_type,
        'status': status
    })

# Read the injuries CSV export from its fixed local path.
lesiones = pd.read_csv(
    r'C:\Users\CamiUribe\OneDrive\Desktop\Lesiones 2010-2020 con ID.csv'
)

# Header cells may carry stray surrounding whitespace; normalise them so the
# 'Notes' lookup below is reliable.
lesiones.columns = lesiones.columns.str.strip()

# Quick sanity check: column names followed by the first rows.
print("Columnas en el DataFrame:", lesiones.columns)
print(lesiones.head(5))

if 'Notes' not in lesiones.columns:
    # Nothing to parse without the free-text column.
    print("La columna 'Notes' no se encuentra en el DataFrame.")
else:
    # Each note yields a Series, so apply() returns a DataFrame with one
    # column per extracted field.
    parsed_data = lesiones['Notes'].apply(extract_injury_info)

    # Append the extracted fields next to the original columns.
    lesiones = pd.concat((lesiones, parsed_data), axis='columns')

    # Preview the note alongside what was extracted from it.
    print(
        lesiones[['Notes', 'anatomical_area', 'injury_type', 'status']].head(5)
    )


print(lesiones)





Columnas en el DataFrame: Index(['Date', 'Team', 'Nombre', 'Acquired', 'Relinquished', 'Notes', 'ID'], dtype='object')
       Date     Team         Nombre Acquired   Relinquished  \
0  03-10-10    Bulls  Carlos Boozer      NaN  Carlos Boozer   
1  06-10-10  Pistons  Jonas Jerebko      NaN  Jonas Jerebko   
2  06-10-10  Pistons  Terrico White      NaN  Terrico White   
3  08-10-10  Blazers     Jeff Ayres      NaN     Jeff Ayres   
4  08-10-10     Nets    Troy Murphy      NaN    Troy Murphy   

                                               Notes      ID  
0  fractured bone in right pinky finger (out inde...    2430  
1      torn right Achilles tendon (out indefinitely)  201973  
2  broken fifth metatarsal in right foot (out ind...  202358  
3          torn ACL in right knee (out indefinitely)  201965  
4             strained lower back (out indefinitely)    2211  
                                               Notes anatomical_area  \
0  fractured bone in right pinky finger (out inde...

Análisis EDA del archivo