1. Clean the files and combine them into one final DataFrame.

This dataframe should have the following columns:
   - Hero (Just the name of the Hero)
   
   - Publisher
   
   - Gender
   
   - Eye color
   
   - Race
   
   - Hair color
   
   - Height (numeric)
   
   - Skin color
   
   - Alignment
   
   - Weight (numeric)
    
    **Plus, one-hot-encoded columns for every power that appears in the dataset. E.g.:**
    
   - Agility
   
   - Flight
   
   - Superspeed
   
   - etc.
Hint: There is a space in "100 kg" or "52.5 cm"



2. Use your combined DataFrame to answer the following questions.

- Compare the average weight of super powers who have Super Speed to those who do not.

- What is the average height of heroes for each publisher?


In [1]:
# Imports
import numpy as np
import pandas as pd
import json

In [2]:
# Make Data folder
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")

['superhero_info - superhero_info.csv',
 'superhero_powers - superhero_powers.csv']

In [3]:
# Load datasets
info = pd.read_csv('Data/superhero_info - superhero_info.csv')
powers = pd.read_csv('Data/superhero_powers - superhero_powers.csv')

In [4]:
# Display first five rows of both datasets
display(info.head(), powers.head())

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


## Clean Data

### Split Hero|Publisher column

In [5]:
# Split Hero and Publisher into two columns
info[["Hero", "Publisher"]]  = info["Hero|Publisher"].str.split('|', expand = True)
# Drop Hero|Publisher column
info.drop(columns = 'Hero|Publisher', inplace = True)
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


## Split Height and Weight

In [6]:
# Import ast to convert he strings to python dictionaries
import ast

# Converting Measurements strings to dictionaries
info['Measurements'] = info['Measurements'].apply(ast.literal_eval)

# Split Measurements into two columns
info['Height'] = info['Measurements'].apply(lambda x: x['Height'])
info['Weight'] = info['Measurements'].apply(lambda x: x['Weight'])

# Remove the original Measurements column
info.drop(columns = 'Measurements', inplace = True)

info.head()


Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


## Remove cm and kg from height and weight columns

In [7]:
# Remove 'cm' and 'kg' from 'Height' and 'Weight' columns
info['Height'] = info['Height'].str.replace(' cm', '').astype(float)
info['Weight'] = info['Weight'].str.replace(' kg', '').astype(float)

## Split Power Column

In [8]:
### Get a list of all powers
possible_powers = powers['Powers'].str.split(',').explode().value_counts().index
possible_powers

Index(['Super Strength', 'Stamina', 'Durability', 'Super Speed', 'Agility',
       'Flight', 'Accelerated Healing', 'Reflexes', 'Intelligence',
       'Energy Blasts',
       ...
       'Omnitrix', 'Thirstokinesis', 'Anti-Gravity', 'Hyperkinesis',
       'Speed Force', 'Electrical Transport', 'Molecular Dissipation',
       'Banish', 'Biokinesis', 'Changing Armor'],
      dtype='object', name='Powers', length=167)

In [9]:
# OHE power column
for col in possible_powers:
    powers[col] = powers['Powers'].str.contains(col)
    
# Remove the original Powers column
powers.drop(columns = 'Powers', inplace = True)

# Display new powers dataframe
powers.head()

  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'

Unnamed: 0,hero_names,Super Strength,Stamina,Durability,Super Speed,Agility,Flight,Accelerated Healing,Reflexes,Intelligence,...,Omnitrix,Thirstokinesis,Anti-Gravity,Hyperkinesis,Speed Force,Electrical Transport,Molecular Dissipation,Banish,Biokinesis,Changing Armor
0,3-D Man,True,True,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,True,True,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,True,False,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,True,True,False,True,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False


## Merge Dataframes 


In [10]:
# Merge info and powers
df = pd.merge(info, powers, left_on = 'Hero', right_on = 'hero_names')

# Set Hero to index
df = df.set_index('Hero')

df.head()

Unnamed: 0_level_0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Publisher,Height,Weight,hero_names,...,Omnitrix,Thirstokinesis,Anti-Gravity,Hyperkinesis,Speed Force,Electrical Transport,Molecular Dissipation,Banish,Biokinesis,Changing Armor
Hero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A-Bomb,Male,Human,good,No Hair,yellow,Unknown,Marvel Comics,203.0,441.0,A-Bomb,...,False,False,False,False,False,False,False,False,False,False
Abe Sapien,Male,Icthyo Sapien,good,No Hair,blue,blue,Dark Horse Comics,191.0,65.0,Abe Sapien,...,False,False,False,False,False,False,False,False,False,False
Abin Sur,Male,Ungaran,good,No Hair,blue,red,DC Comics,185.0,90.0,Abin Sur,...,False,False,False,False,False,False,False,False,False,False
Abomination,Male,Human / Radiation,bad,No Hair,green,Unknown,Marvel Comics,203.0,441.0,Abomination,...,False,False,False,False,False,False,False,False,False,False
Absorbing Man,Male,Human,bad,No Hair,blue,Unknown,Marvel Comics,193.0,122.0,Absorbing Man,...,False,False,False,False,False,False,False,False,False,False


# Questions:

## Compare the average weight of super powers who have Super Speed to those who do not.

In [11]:
# Create filter for super speed
speed_filter = df["Super Speed"]

# Calculate avererage weight of each superhero group
ave_weight_with = round(df.loc[speed_filter, 'Weight'].mean(), 2)
ave_weight_without = round(df.loc[~speed_filter, 'Weight'].mean(), 2)

print(f'The average weight of heros with super speed abilities is {ave_weight_with} kg.')
print(f'The average weight of heros without super speed abilities is {ave_weight_without} kg.')


The average weight of heros with super speed abilities is 129.4 kg.
The average weight of heros without super speed abilities is 101.77 kg.


## What is the average height of heroes for each publisher?

In [12]:
df.groupby('Publisher')['Height'].mean().round(2).sort_values(ascending = False)

Publisher
Image Comics         211.00
Marvel Comics        191.55
DC Comics            181.92
Star Trek            181.50
Team Epic TV         180.75
Unknown              178.00
Dark Horse Comics    176.91
Shueisha             171.50
George Lucas         159.60
Name: Height, dtype: float64