## Import/data load

In [1]:
import pandas as pd

df1 = pd.read_csv('Data/superhero_info - superhero_info.csv')
df2 = pd.read_csv('Data/superhero_powers - superhero_powers.csv')

In [2]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hero|Publisher  463 non-null    object
 1   Gender          463 non-null    object
 2   Race            463 non-null    object
 3   Alignment       463 non-null    object
 4   Hair color      463 non-null    object
 5   Eye color       463 non-null    object
 6   Skin color      463 non-null    object
 7   Measurements    463 non-null    object
dtypes: object(8)
memory usage: 29.1+ KB


In [3]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hero_names  667 non-null    object
 1   Powers      667 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [4]:
df = pd.concat([df1, df2], axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hero|Publisher  463 non-null    object
 1   Gender          463 non-null    object
 2   Race            463 non-null    object
 3   Alignment       463 non-null    object
 4   Hair color      463 non-null    object
 5   Eye color       463 non-null    object
 6   Skin color      463 non-null    object
 7   Measurements    463 non-null    object
 8   hero_names      667 non-null    object
 9   Powers          667 non-null    object
dtypes: object(10)
memory usage: 52.2+ KB


## Advanced Transformations

In [6]:
df[['Hero', 'Publisher']] = df['Hero|Publisher'].str.split('|', 1, expand=True)
df.head()

  df[['Hero', 'Publisher']] = df['Hero|Publisher'].str.split('|', 1, expand=True)


Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,hero_names,Powers,Hero,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",3-D Man,"Agility,Super Strength,Stamina,Super Speed",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Abe Sapien,Dark Horse Comics
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...",Abin Sur,DC Comics
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abin Sur,Lantern Power Ring,Abomination,Marvel Comics
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Abomination,"Accelerated Healing,Intelligence,Super Strengt...",Absorbing Man,Marvel Comics


In [7]:
df.drop(['Hero|Publisher','hero_names'], axis=1, inplace=True)

In [8]:
test_mes = df.loc[1, 'Measurements']
test_mes

"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"

In [9]:
# a function to extract the height and weight values from the 'Measurements' column

def get_height_weight(measurements):
    if pd.isna(measurements):
        return (None, None)
    else:
        # Convert the string to a dictionary using the 'eval' function
        measurements_dict = eval(measurements)
        # Extract the height and weight values
        height = measurements_dict['Height'].replace(' cm', '')
        weight = measurements_dict['Weight'].replace(' kg', '')
        # Return the height and weight values as a tuple
        return (float(height), float(weight))

# two new columns for height and weight
df[['Height', 'Weight']] = df['Measurements'].apply(lambda x: pd.Series(get_height_weight(x)))

# drop original 'Measurements' column
df.drop(['Measurements'], axis=1, inplace=True)

df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Powers,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,"Agility,Super Strength,Stamina,Super Speed",A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"Accelerated Healing,Durability,Longevity,Super...",Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,"Agility,Accelerated Healing,Cold Resistance,Du...",Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Lantern Power Ring,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,"Accelerated Healing,Intelligence,Super Strengt...",Absorbing Man,Marvel Comics,193.0,122.0


In [10]:
df['Powers'].value_counts

<bound method IndexOpsMixin.value_counts of 0             Agility,Super Strength,Stamina,Super Speed
1      Accelerated Healing,Durability,Longevity,Super...
2      Agility,Accelerated Healing,Cold Resistance,Du...
3                                     Lantern Power Ring
4      Accelerated Healing,Intelligence,Super Strengt...
                             ...                        
662                   Flight,Energy Blasts,Size Changing
663    Cold Resistance,Durability,Longevity,Super Str...
664    Agility,Stealth,Danger Sense,Marksmanship,Weap...
665    Cryokinesis,Telepathy,Magic,Fire Control,Proba...
666    Super Speed,Intangibility,Time Travel,Time Man...
Name: Powers, Length: 667, dtype: object>

In [11]:
# split 'Powers' column by comma
df['Powers'] = df['Powers'].str.split(',')

# a series of values for each power
powers_series = pd.Series(df['Powers']).explode()

# new columns for each power
df = pd.concat([df, pd.get_dummies(powers_series)], axis=1)

In [12]:
df.drop('Powers', axis=1, inplace=True)

df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Vision - Thermal,Vision - X-Ray,Vitakinesis,Wallcrawling,Water Control,Weapon-based Powers,Weapons Master,Weather Control,Web Creation,Wind Control
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,0,0,0
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,0,0,0
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,0,0,0
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,0,0,0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5874 entries, 0 to 666
Columns: 177 entries, Gender to Wind Control
dtypes: float64(2), object(8), uint8(167)
memory usage: 1.4+ MB


In [14]:
# average weight of superheroes with 'Super Speed'
super_speed_mask = df['Super Speed'] == True
super_speed_avg_weight = df.loc[super_speed_mask, 'Weight'].mean()

# average weight of superheroes without 'Super Speed'
no_super_speed_mask = df['Super Speed'] == False
no_super_speed_avg_weight = df.loc[no_super_speed_mask, 'Weight'].mean()

# Print the results
print("Average weight of superheroes with 'Super Speed':", super_speed_avg_weight)
print("Average weight of superheroes without 'Super Speed':", no_super_speed_avg_weight)

Average weight of superheroes with 'Super Speed': 113.39181286549707
Average weight of superheroes without 'Super Speed': 112.16013673415725


In [15]:
# average height of heroes for each publisher
avg_height_by_publisher = df.groupby('Publisher')['Height'].mean()

print(avg_height_by_publisher)

Publisher
DC Comics            180.684744
Dark Horse Comics    180.136054
George Lucas         175.090909
Image Comics         211.000000
Marvel Comics        193.923941
Shueisha             170.666667
Star Trek            181.666667
Team Epic TV         181.153846
Unknown              178.000000
Name: Height, dtype: float64
